In [1]:
%autosave 0

Autosave disabled


#### In this work, same dataset has been considered as in 'clf1_sklearn.ipynb' as we seek if the famous xgboost will outperform the best, so far, logistic regression. Please refer to 'clf1_sklearn.ipynb' the see the details of the dataset and rationale concerning the preprocessing phase. 

### 0. Dependencies

In [2]:
#linear algebra and data manipulation
import numpy as np
import pandas as pd

# train/test split
from sklearn.model_selection import train_test_split

# statistical interference
import scipy.stats
from scipy import stats
from scipy.stats import skewtest
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from scipy.stats import chi2

# xgboost and hypertuning tool
from sklearn.model_selection import GridSearchCV
import xgboost

# metrics
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

## <center>1. Data preparation</center>

#### For details, please refer to 'clf1_sklearn.ipynb'.

In [3]:
# loading in the data
df = pd.read_csv('or_data.csv')

In [4]:
# invariant variable exclusion
df = df.drop('R9', axis=1)

In [5]:
# dtype conversion
for col in df.columns:
    if df[col].dtype == object:
        df = df[~(df[col].str.contains(','))]
for col in df.columns:
    if df[col].dtype == object:
        df[col] = df[col].astype('float')

In [6]:
# zero-rows exclusion
temp_df = df.drop(['YEAR', 'Y'], axis=1)
indicies = []
for i in temp_df.index:
    if (temp_df.loc[i].sum() == 0):
        indicies.append(i)
for i in indicies:
    df = df.drop(index=i)

In [7]:
# duplicates exclusion
df.drop_duplicates(inplace = True)

In [8]:
# train/test split
y = df['Y']
X = df.drop('Y', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8, stratify=df['Y'])

In [9]:
# exclusion of irrelevant numeric variables
df = pd.concat([X_train, y_train], axis=1)
spearman_list = []
for col in df.columns:
    if (col == 'Y')|(col=='YEAR'):
        continue
    if (scipy.stats.spearmanr(df[['Y', col]]).pvalue < 0.05):
        spearman_list.append(col)

pearson_list = []
for col in df.columns:
    if (col == 'Y')|(col=='YEAR'):
        continue
    if (scipy.stats.pearsonr(df['Y'], df[col])[1] < 0.05):
        pearson_list.append(col)

kendall_list = []
for col in df.columns:
    if (col == 'Y')|(col=='YEAR'):
        continue
    tau, p_value = stats.kendalltau(df['Y'], df[col])
    if (p_value < 0.05):
        kendall_list.append(col)       

siginificant_features = set(spearman_list + pearson_list + kendall_list)

features = []
for col in df.columns:
    if (col == 'Y')|(col=='YEAR'):
        continue
    for var in siginificant_features:
        if col == var:
            features.append(col)

variables = ['YEAR']
for col in df.columns:
    for feature in features:
        if col == feature:
            variables.append(col)
            
X_train = X_train.loc[:, variables]
X_test = X_test.loc[:, variables]

In [10]:
# exclusion of irrelevant nominal variables
X_train = X_train.drop('YEAR', axis=1)
X_test = X_test.drop('YEAR', axis=1)

In [11]:
# exclusion of colinear variables
X_train = X_train.drop('R1', axis=1)
X_test = X_test.drop('R1', axis=1)
X_train = X_train.drop('R16', axis=1)
X_test = X_test.drop('R16', axis=1)

## <center>2. Xgboost hypertuning</center>

#### Here is our parameter space:

In [12]:
params={
 "learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
 "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
 "min_child_weight" : [ 1, 3, 5, 7 ],
 "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
}

#### Model instantiation:

In [13]:
clf = xgboost.XGBClassifier()

#### Hyperparameters tuning:

In [14]:
gs = GridSearchCV(clf, param_grid=params, scoring='roc_auc', n_jobs=-1, cv=10)
gs.fit(X_train,y_train)

In [15]:
gs.best_params_

{'colsample_bytree': 0.4,
 'gamma': 0.0,
 'learning_rate': 0.05,
 'max_depth': 5,
 'min_child_weight': 3}

#### Finally, full specification of the best xgboost estimator :

In [16]:
gs.best_estimator_

## <center>3. Predictions and evaluation</center>

#### Of course, we will employ the best estimator that has been found in section 2.

In [17]:
clf = xgboost.XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.4,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0.0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.05, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=5, max_leaves=0, min_child_weight=3, 
              monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1)

#### Training and predicting

In [18]:
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

### <center>Evaluation</center>

In [19]:
# Some minor formatting
y_pred = pd.Series(y_pred)
y_test = y_test.reset_index(drop=True)

#### In terms od accuracy, tie would probably be a fair description :

In [20]:
print('xgboost accuracy: {}'.format(round(accuracy_score(y_test, y_pred), 5)))

xgboost accuracy: 0.875


In [21]:
print('LogisticRegression accuracy: 0.88068')

LogisticRegression accuracy: 0.88068


#### It's no suprise that area under the ROC curve brings no clear winner either : 

In [22]:
print('xgboost accuracy: {}'.format(round(roc_auc_score(y_test, y_pred), 5)))

xgboost accuracy: 0.85418


In [23]:
print('LogisticRegression accuracy: 0.85403')

LogisticRegression accuracy: 0.85403
