In [1]:
%autosave 0

Autosave disabled


#### In this work, same dataset has been considered as in 'reg1_sklearn.ipynb' as we seek if xgboost will outperform the best found estimator there. Please refer to 'reg1_sklearn.ipynb' the see the details of the dataset and rationale concerning the preprocessing phase. 

### 0. Dependencies

In [30]:
# linear algebra and data manipulation
import numpy as np
import pandas as pd

# Model selection
from sklearn.model_selection import train_test_split

# Preprocessing
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Dimensionality reduction
from sklearn.decomposition import PCA

# xgboost and hypertuning tool
import xgboost
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Pipelining
from sklearn.pipeline import make_pipeline

# Metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

## <center>1. Data preparation</center>

In [3]:
# loading in the data
df = pd.read_csv('prediktiv_data.csv')

In [4]:
# duplicates removal
df.drop_duplicates(inplace=True)

In [5]:
# NaNs treatment
for col in df.columns:
    if (df[col].isnull().sum() == 1):
        df.drop(df.loc[pd.isna(df[col])].index, inplace=True)
df.replace(np.nan, 'NA', inplace=True)

In [6]:
# data prep
df.drop(columns='id', inplace=True)
df.drop(df.loc[(df['feature01']>3)].index, inplace=True)
df.drop(df.loc[(df['feature06']<4)].index, inplace=True)
df.drop(df.loc[(df['feature06']>10)].index, inplace=True)
df.drop(columns='feature03', inplace=True)
df.drop(df.loc[(df['feature08']<4)].index, inplace=True)
df.drop(df.loc[(df['feature08']>9)].index, inplace=True)
df.drop(df.loc[(df['feature09']<4)].index, inplace=True)
df.drop(df.loc[(df['feature09']>8)].index, inplace=True)
df.drop(df.loc[(df['feature10.1']==3)].index, inplace=True)
df.drop(df.loc[(df['feature10.1']==0)].index, inplace=True)
df['feature10.1'] = df['feature10.1'].apply(lambda x: 0 if x==2 else 1)
df.drop(df.loc[(df['feature14']>2)].index, inplace=True)
df.drop(df.loc[(df['feature17']==0)].index, inplace=True)
df.drop(df.loc[(df['feature17']>4)].index, inplace=True)
df.drop(columns='feature18', inplace=True)
df.drop(df.loc[(df['feature21']==2)].index, inplace=True)
df.drop(df.loc[(df['feature12']=='frog')].index, inplace=True)
df.drop(df.loc[(df['feature13']=='green')].index, inplace=True)
df.drop(df.loc[(df['feature13']=='yellow')].index, inplace=True)
df['feature15'] = df['feature15'].apply(lambda x: 'good' if x=='amazing' else x)
df['feature15'] = df['feature15'].apply(lambda x: 'bad' if x=='horrible' else x)
df.drop(df.loc[(df['feature16']=='okay')].index, inplace=True)

df = df.loc[:, ['target', 'feature01', 'feature04', 'feature07', 'feature08', 'feature09',
       'feature10', 'feature14', 'feature17', 'feature22', 'feature23',
       'feature11', 'feature12', 'feature13', 'feature15', 'feature16',
       'feature10.1', 'feature21']]

In [7]:
# train/test split
y = df['target']
X = df.drop('target', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)

Note, that all statistical interference has been applied before train/test split. See 'reg1_sklearn.ipynb' for details. Here, train/test split is done at this point only for convienience (data preprocessing operations therefore applies to both, train and test, data).

## <center>2. Xgboost hypertuning</center>

#### Dimentionality reduction might be worth considering, especially that dummy variables will be necessary. PCA will be applied. 

In [8]:
numeric_features = ['feature01','feature04','feature07','feature08','feature09','feature10','feature14',
                    'feature17','feature22','feature23']
nominal_features = ['feature11','feature12','feature13','feature15','feature16','feature10.1','feature21']

#### In order to apply PCA, some preprocessing is needed :

In [10]:
prep = make_column_transformer((StandardScaler(), numeric_features),
                               (OneHotEncoder(drop='first'), nominal_features))

#### Parameter space is given below:

In [11]:
params={
 "xgbregressor__learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
 "xgbregressor__max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
 "xgbregressor__min_child_weight" : [ 1, 3, 5, 7 ],
 "xgbregressor__gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 "xgbregressor__colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
}

#### Pipeline will be, of course, essential here : 

In [12]:
pipe = make_pipeline(prep, PCA(n_components=10, svd_solver='full'), XGBRegressor())
# 10 PCA components deliver 90% of variability

#### GridSearch setup:

In [13]:
gs = GridSearchCV(pipe, params, scoring='roc_auc', n_jobs=-1, cv=10)

#### Hyperparameters tuning:

In [14]:
gs.fit(X_train,y_train)



In [15]:
gs.best_params_

{'xgbregressor__colsample_bytree': 0.3,
 'xgbregressor__gamma': 0.0,
 'xgbregressor__learning_rate': 0.05,
 'xgbregressor__max_depth': 3,
 'xgbregressor__min_child_weight': 1}

## <center>3. Predictions and evaluation</center>

In [21]:
y_pred = gs.predict(X_test)

In [27]:
# Some minor formatting
y_test = y_test.reset_index(drop=True)
y_pred = pd.Series(y_pred)

#### Well, this time xgboost did not beat its sklearn competitors.

In [32]:
print('xgboost determination coeff: {}'.format(round(r2_score(y_test, y_pred), 5)))

xgboost determination coeff: 0.83197


In [33]:
print('xgboost MAE: {}'.format(round(mean_absolute_error(y_test, y_pred), 5)))

xgboost MAE: 20243.40406


In [34]:
print('GradientBoostingRegressor determination coeff: 0.87513')

GradientBoostingRegressor determination coeff: 0.87513


In [35]:
print('GradientBoostingRegressor MAE: 17386.16431')

GradientBoostingRegressor MAE: 17386.16431
