# Building Models

In [1]:
# load required packages
import pandas as pd
import numpy as np
import random

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

## Train and Test Data

In [2]:
# read train and test data

from datetime import datetime
import os

# train_data
train_data_files = sorted([f for f in os.listdir("data") if (f.endswith(".csv") and (f.startswith("train_data_")))], reverse=True)
latest_train_data = train_data_files[0]
train_data = pd.read_csv(f"data/{latest_train_data}")

# drop new generated index column
train_data.drop(train_data.columns[0], axis=1, inplace=True)
display(train_data.head())

# split train_data for models
y_train = train_data['Survived']
X_train = train_data.drop('Survived', axis=1)


# test_data
test_data_files = sorted([f for f in os.listdir("data") if (f.endswith(".csv") and (f.startswith("test_data_")))], reverse=True)
latest_test_data = test_data_files[0]
test_data = pd.read_csv(f"data/{latest_test_data}")

#drop new generated index column
test_data.drop(test_data.columns[0], axis=1, inplace=True)
display(test_data.head())

# split test_data for models
y_test = test_data['Survived']
X_test = test_data.drop('Survived', axis=1)

Unnamed: 0,Pclass,SibSp,Parch,Age_true,AgeGroup,FareGroup,CabinLvl,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Mr,Title_Mrs,Title_Ms,Title_Noble,Survived
0,1,0,2,1,0,4,7,0,0,1,1,0,0,0,0,1
1,3,0,0,0,3,1,0,0,0,1,0,1,0,0,0,0
2,3,1,1,1,0,2,0,0,0,1,0,0,0,1,0,1
3,2,1,2,1,4,3,0,0,0,1,0,1,0,0,0,0
4,2,1,1,1,4,3,0,0,0,1,0,1,0,0,0,0


Unnamed: 0,Pclass,SibSp,Parch,Age_true,AgeGroup,FareGroup,CabinLvl,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Mr,Title_Mrs,Title_Ms,Title_Noble,Survived
0,3,1,1,0,0,2,0,1,0,0,1,0,0,0,0,1
1,2,0,0,1,3,1,0,0,0,1,0,1,0,0,0,0
2,3,0,0,1,2,1,0,0,0,1,0,1,0,0,0,0
3,2,0,1,1,0,3,0,0,0,1,0,0,0,1,0,1
4,3,1,0,1,1,2,0,1,0,0,0,0,0,1,0,1


## Baseline Model

We build a model which predicts "Survival" (Class 1) for first-class passengers and "No Survival" (Class 0) if a passenger has ticket class 2 or 3.

In [3]:
# Option A - predict "No Survival" for all passengers
train_data.groupby('Survived').size()

baseline_pred_A = pd.Series(np.zeros(len(y_test)))

baseline_acc_A = accuracy_score(y_test, baseline_pred_A)
print(baseline_acc_A)

print("Classification Report:")
print(classification_report(y_test, baseline_pred_A))

print("Confusion Matrix:")
print(confusion_matrix(y_test, baseline_pred_A))

0.585820895522388
Classification Report:
              precision    recall  f1-score   support

           0       0.59      1.00      0.74       157
           1       0.00      0.00      0.00       111

    accuracy                           0.59       268
   macro avg       0.29      0.50      0.37       268
weighted avg       0.34      0.59      0.43       268

Confusion Matrix:
[[157   0]
 [111   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [4]:
# Option B - predict "Survival" or "No Survival" based on 'Pclass'

# for each 'PClass' find number of passengers that survived and did not survive
print(train_data.groupby(['Pclass', 'Survived']).size())
# if 'Pclass'==1, we predict 'Survived'=1, else we predict 'Survived'=0

Pclass  Survived
1       0            56
        1            83
2       0            69
        1            63
3       0           267
        1            85
dtype: int64


In [5]:
# make prediction
X_test['baseline_pred_B'] = 0
X_test.loc[X_test['Pclass'] == 1, 'baseline_pred_B'] = 1
baseline_pred_B = X_test.baseline_pred_B
X_test.drop('baseline_pred_B', axis=1, inplace=True)

# print performance measures
baseline_acc_B = accuracy_score(y_test, baseline_pred_B)
print(baseline_acc_B)

print("Classification Report:")
print(classification_report(y_test, baseline_pred_B))

print("Confusion Matrix:")
print(confusion_matrix(y_test, baseline_pred_B))

0.6940298507462687
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.85      0.76       157
           1       0.69      0.48      0.56       111

    accuracy                           0.69       268
   macro avg       0.69      0.66      0.66       268
weighted avg       0.69      0.69      0.68       268

Confusion Matrix:
[[133  24]
 [ 58  53]]


## XGBoost

https://www.datacamp.com/tutorial/xgboost-in-python  
https://thinkingneuron.com/how-to-create-a-classification-model-using-xgboost-in-python/  
https://towardsdatascience.com/a-guide-to-xgboost-hyperparameters-87980c7f44a9 (Hyperparameter Cheatsheet)
https://towardsdatascience.com/beyond-grid-search-hypercharge-hyperparameter-tuning-for-xgboost-7c78f7a2929d (Step by Step Tuning)

In [3]:
from xgboost import XGBClassifier

### Simple XGB-Classifier with default parameters

In [4]:
# simple XGB-Classifier with default parameters

random.seed(10)

xgb_simple = XGBClassifier()
xgb_simple.fit(X_train, y_train)
xgb_simple_pred = xgb_simple.predict(X_test)
xgb_simple_acc = accuracy_score(y_test, xgb_simple_pred)
print(xgb_simple_acc)

print("Classification Report:")
print(classification_report(y_test, xgb_simple_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, xgb_simple_pred))

0.8097014925373134
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.89      0.84       157
           1       0.81      0.70      0.75       111

    accuracy                           0.81       268
   macro avg       0.81      0.79      0.80       268
weighted avg       0.81      0.81      0.81       268

Confusion Matrix:
[[139  18]
 [ 33  78]]


### Hyperparameter-Tuning for best parameter setting

In [5]:
# Grid Search

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

random.seed(10)

# create an estimator
xgb = XGBClassifier()

# specify the parameter grid
xgb_parameters = {
    'max_depth': [3, 5]
    , 'subsample': [0.3, 0.8]
    , 'colsample_bytree': [0.3, 0.8]
    , 'colsample_bylevel': [0.3, 0.8]
    , 'learning_rate': [0.3, 0.7]
    , 'n_estimators': [50, 100]
    #, 'gamma': [0.5, 1, 3]
}

# specify the cross validation
stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# create grid search instance
xgb_grid_search = GridSearchCV(xgb, xgb_parameters, scoring='accuracy', cv=stratified_10_fold_cv)
#cv=stratified_10_fold_cv OR cv=10

# run the grid search
xgb_grid_search.fit(X_train, y_train)

# print the results of all hyper-parameter combinations
xgb_grid_search_results = pd.DataFrame(xgb_grid_search.cv_results_)
display(xgb_grid_search_results)

# print the best parameter setting
print("best score is {} with params {}".format(xgb_grid_search.best_score_, xgb_grid_search.best_params_))
# cv=10: best score is 0.8410906298003071 with params {'colsample_bytree': 0.8, 'learning_rate': 0.3, 'max_depth': 5, 'n_estimators': 50}
# stratcv: best score is 0.8344854070660522 with params {'colsample_bytree': 0.8, 'learning_rate': 0.3, 'max_depth': 3, 'n_estimators': 50}

# stratcv: best score is 0.8360983102918587 with params {'colsample_bylevel': 0.3, 'colsample_bytree': 0.3, 'learning_rate': 0.7, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.8}

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bylevel,param_colsample_bytree,param_learning_rate,param_max_depth,param_n_estimators,param_subsample,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.037898,0.004635,0.005885,0.000537,0.3,0.3,0.3,3,50,0.3,...,0.854839,0.790323,0.806452,0.790323,0.822581,0.693548,0.854839,0.818433,0.049987,41
1,0.035405,0.000804,0.005785,0.001163,0.3,0.3,0.3,3,50,0.8,...,0.854839,0.806452,0.870968,0.790323,0.822581,0.677419,0.854839,0.824885,0.055524,23
2,0.063929,0.001916,0.005785,0.000399,0.3,0.3,0.3,3,100,0.3,...,0.854839,0.822581,0.854839,0.790323,0.822581,0.741935,0.838710,0.834485,0.040747,3
3,0.064627,0.001163,0.005486,0.000499,0.3,0.3,0.3,3,100,0.8,...,0.838710,0.806452,0.887097,0.806452,0.822581,0.709677,0.854839,0.836073,0.052347,2
4,0.036103,0.000746,0.005785,0.000399,0.3,0.3,0.3,5,50,0.3,...,0.854839,0.774194,0.806452,0.806452,0.806452,0.693548,0.870968,0.818433,0.052027,41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,0.099732,0.017440,0.006386,0.001114,0.8,0.8,0.7,3,100,0.8,...,0.854839,0.790323,0.806452,0.822581,0.758065,0.693548,0.806452,0.823067,0.065056,33
60,0.058144,0.008638,0.006184,0.000598,0.8,0.8,0.7,5,50,0.3,...,0.838710,0.741935,0.741935,0.790323,0.838710,0.709677,0.854839,0.794470,0.050498,63
61,0.064428,0.012028,0.006785,0.000981,0.8,0.8,0.7,5,50,0.8,...,0.854839,0.790323,0.774194,0.822581,0.822581,0.709677,0.838710,0.816846,0.053297,50
62,0.126063,0.022533,0.007381,0.001017,0.8,0.8,0.7,5,100,0.3,...,0.806452,0.741935,0.741935,0.838710,0.774194,0.725806,0.838710,0.803917,0.053965,59


best score is 0.8360983102918587 with params {'colsample_bylevel': 0.3, 'colsample_bytree': 0.3, 'learning_rate': 0.7, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.8}


In [6]:
# Fit and evaluate best model

xgb_best = XGBClassifier(colsample_bylevel = 0.3, colsample_bytree = 0.3, learning_rate = 0.7, max_depth = 3, n_estimators = 50, subsample = 0.8)
#0.8208955223880597
xgb_best.fit(X_train, y_train)
xgb_best_pred = xgb_best.predict(X_test)
xgb_best_acc = accuracy_score(y_test, xgb_best_pred)
print(xgb_best_acc)

print("Classification Report:")
print(classification_report(y_test, xgb_best_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, xgb_best_pred))

0.8208955223880597
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.89      0.85       157
           1       0.82      0.72      0.77       111

    accuracy                           0.82       268
   macro avg       0.82      0.81      0.81       268
weighted avg       0.82      0.82      0.82       268

Confusion Matrix:
[[140  17]
 [ 31  80]]


### Step by Step Grid Search

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [27]:
# Step 1: max_depth
random.seed(10)

xgb = XGBClassifier()

xgb_parameters_1 = {'max_depth': [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]}

stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
xgb_grid_search = GridSearchCV(xgb, xgb_parameters_1, scoring='accuracy', cv=stratified_10_fold_cv)

xgb_grid_search.fit(X_train, y_train)

xgb_grid_search_results = pd.DataFrame(xgb_grid_search.cv_results_)
display(xgb_grid_search_results)

print("best score is {} with params {}".format(xgb_grid_search.best_score_, xgb_grid_search.best_params_))

# best values for max_depth = [2,3,4,5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.087154,0.004174,0.010836,0.001057,1,{'max_depth': 1},0.84127,0.873016,0.888889,0.887097,0.83871,0.854839,0.822581,0.790323,0.645161,0.83871,0.828059,0.067254,5
1,0.094481,0.002924,0.010386,0.000974,2,{'max_depth': 2},0.904762,0.857143,0.873016,0.870968,0.790323,0.790323,0.822581,0.822581,0.758065,0.822581,0.831234,0.042811,3
2,0.110572,0.006613,0.011512,0.001681,3,{'max_depth': 3},0.873016,0.904762,0.888889,0.870968,0.806452,0.774194,0.83871,0.83871,0.725806,0.870968,0.839247,0.052996,2
3,0.126249,0.005697,0.011529,0.000684,4,{'max_depth': 4},0.873016,0.904762,0.857143,0.870968,0.822581,0.774194,0.854839,0.83871,0.758065,0.854839,0.840911,0.0428,1
4,0.124245,0.005004,0.010724,0.001062,5,{'max_depth': 5},0.888889,0.904762,0.825397,0.83871,0.822581,0.741935,0.854839,0.83871,0.725806,0.854839,0.829647,0.053972,4
5,0.135976,0.006056,0.010989,0.001387,6,{'max_depth': 6},0.888889,0.888889,0.84127,0.854839,0.806452,0.774194,0.83871,0.822581,0.709677,0.83871,0.826421,0.050897,6
6,0.153583,0.004269,0.011954,0.001374,7,{'max_depth': 7},0.904762,0.888889,0.825397,0.83871,0.790323,0.790323,0.83871,0.822581,0.725806,0.83871,0.826421,0.048206,8
7,0.164184,0.004594,0.011235,0.001078,8,{'max_depth': 8},0.888889,0.888889,0.825397,0.83871,0.790323,0.758065,0.83871,0.822581,0.725806,0.83871,0.821608,0.049133,12
8,0.175875,0.004721,0.011879,0.001131,9,{'max_depth': 9},0.904762,0.888889,0.84127,0.83871,0.790323,0.741935,0.83871,0.822581,0.709677,0.83871,0.821557,0.057062,13
9,0.184306,0.006106,0.010497,0.001322,10,{'max_depth': 10},0.888889,0.888889,0.84127,0.83871,0.790323,0.774194,0.83871,0.806452,0.709677,0.854839,0.823195,0.052047,10


best score is 0.8409114183307731 with params {'max_depth': 4}


In [28]:
# Step 2: subsample, colsample_bytree, colsample_bylevel
random.seed(10)

xgb = XGBClassifier()

xgb_parameters_2 = {'max_depth': [2,3,4,5]
                   , 'subsample': [0.1, 0.3, 0.5, 0.7, 0.9, 1]
                   , 'colsample_bytree': [0.1, 0.3, 0.5, 0.7, 0.9, 1]
                   , 'colsample_bylevel': [0.1, 0.3, 0.5, 0.7, 0.9, 1]}

stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
xgb_grid_search = GridSearchCV(xgb, xgb_parameters_2, scoring='accuracy', cv=stratified_10_fold_cv)

xgb_grid_search.fit(X_train, y_train)

xgb_grid_search_results_2 = pd.DataFrame(xgb_grid_search.cv_results_)
display(xgb_grid_search_results_2)

print("best score is {} with params {}".format(xgb_grid_search.best_score_, xgb_grid_search.best_params_))

# best values for
# max_depth = [2,4]
# subsample = [0.5, 0.7, 0.9]
# colsample_bytree = [0.3, 0.5, 0.9]
# colsample_bylevel = [0.1, 0.3, 0.7]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bylevel,param_colsample_bytree,param_max_depth,param_subsample,params,split0_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.094148,0.005155,0.009989,0.001835,0.1,0.1,2,0.1,"{'colsample_bylevel': 0.1, 'colsample_bytree':...",0.809524,...,0.806452,0.838710,0.774194,0.774194,0.758065,0.677419,0.838710,0.795981,0.051484,847
1,0.094689,0.005821,0.010009,0.001320,0.1,0.1,2,0.3,"{'colsample_bylevel': 0.1, 'colsample_bytree':...",0.841270,...,0.790323,0.822581,0.758065,0.838710,0.838710,0.661290,0.838710,0.813569,0.060517,717
2,0.103950,0.004508,0.010321,0.000866,0.1,0.1,2,0.5,"{'colsample_bylevel': 0.1, 'colsample_bytree':...",0.841270,...,0.838710,0.822581,0.822581,0.822581,0.822581,0.645161,0.854839,0.821633,0.061811,555
3,0.101710,0.010839,0.010511,0.001010,0.1,0.1,2,0.7,"{'colsample_bylevel': 0.1, 'colsample_bytree':...",0.857143,...,0.790323,0.822581,0.822581,0.822581,0.806452,0.661290,0.870968,0.819995,0.059893,612
4,0.124227,0.060625,0.009744,0.001571,0.1,0.1,2,0.9,"{'colsample_bylevel': 0.1, 'colsample_bytree':...",0.857143,...,0.822581,0.822581,0.870968,0.822581,0.790323,0.693548,0.870968,0.828085,0.051964,303
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
859,0.144559,0.005306,0.011619,0.001396,1,1,5,0.3,"{'colsample_bylevel': 1, 'colsample_bytree': 1...",0.825397,...,0.854839,0.774194,0.822581,0.822581,0.838710,0.725806,0.838710,0.821710,0.040200,543
860,0.150761,0.004301,0.011664,0.000834,1,1,5,0.5,"{'colsample_bylevel': 1, 'colsample_bytree': 1...",0.873016,...,0.854839,0.790323,0.790323,0.806452,0.822581,0.741935,0.870968,0.828059,0.043837,317
861,0.154240,0.007147,0.012484,0.000650,1,1,5,0.7,"{'colsample_bylevel': 1, 'colsample_bytree': 1...",0.888889,...,0.870968,0.806452,0.758065,0.822581,0.822581,0.709677,0.854839,0.824834,0.054132,483
862,0.151982,0.003265,0.011645,0.000631,1,1,5,0.9,"{'colsample_bylevel': 1, 'colsample_bytree': 1...",0.904762,...,0.854839,0.806452,0.774194,0.806452,0.838710,0.693548,0.854839,0.824808,0.057412,491


best score is 0.8441372247823861 with params {'colsample_bylevel': 0.1, 'colsample_bytree': 0.9, 'max_depth': 2, 'subsample': 0.7}


In [43]:
# save dataframe

from datetime import datetime

# save data with date to track changes 
date = str(datetime.now().date()).replace("-", "")

xgb_grid_search_results_2.to_csv(f"data/xgb_results_step2_{date}.csv")

In [37]:
cols_2 = ['param_max_depth', 'param_subsample', 'param_colsample_bytree', 'param_colsample_bylevel', 'mean_test_score', 'rank_test_score']
df_2 = xgb_grid_search_results_2[cols_2]
df_2_sorted = df_2.sort_values(by='rank_test_score')
df_2_sorted.head(10)

Unnamed: 0,param_colsample_bylevel,param_colsample_bytree,param_max_depth,param_subsample,mean_test_score,rank_test_score
99,0.1,0.9,2,0.7,0.844137,1
483,0.7,0.5,2,0.7,0.84406,2
170,0.3,0.3,2,0.5,0.842576,3
26,0.1,0.3,2,0.5,0.842576,3
76,0.1,0.7,2,0.9,0.84255,5
98,0.1,0.9,2,0.5,0.842499,6
77,0.1,0.7,2,1.0,0.840963,7
315,0.5,0.3,2,0.7,0.840937,8
459,0.7,0.3,2,0.7,0.840937,8
857,1.0,1.0,4,1.0,0.840911,10


In [38]:
# Step 3: learning_rate
random.seed(10)

xgb = XGBClassifier()

xgb_parameters_3 = {'max_depth': [2,4]
                   , 'subsample': [0.5, 0.7, 0.9]
                   , 'colsample_bytree': [0.3, 0.5, 0.9]
                   , 'colsample_bylevel': [0.1, 0.3, 0.7]
                   , 'learning_rate': [0.1, 0.3, 0.5, 0.7, 0.9]
                   , 'n_estimators': [50, 100, 150, 200, 250, 300]}

stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
xgb_grid_search = GridSearchCV(xgb, xgb_parameters_3, scoring='accuracy', cv=stratified_10_fold_cv)

xgb_grid_search.fit(X_train, y_train)

xgb_grid_search_results_3 = pd.DataFrame(xgb_grid_search.cv_results_)
display(xgb_grid_search_results_3)

print("best score is {} with params {}".format(xgb_grid_search.best_score_, xgb_grid_search.best_params_))

# best values for
# max_depth = [2,4]
# subsample = [0.5, 0.7, 0.9]
# colsample_bytree = [0.3, 0.5, 0.9]
# colsample_bylevel = [0.1, 0.3, 0.7]
# learning_rate = []
# n_estimators = []

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bylevel,param_colsample_bytree,param_learning_rate,param_max_depth,param_n_estimators,param_subsample,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.063518,0.004715,0.009346,0.001981,0.1,0.3,0.1,2,50,0.5,...,0.838710,0.854839,0.806452,0.838710,0.790323,0.709677,0.774194,0.821608,0.050230,936
1,0.058864,0.003199,0.009017,0.001919,0.1,0.3,0.1,2,50,0.7,...,0.854839,0.838710,0.790323,0.838710,0.790323,0.709677,0.822581,0.818484,0.050169,1104
2,0.060230,0.003488,0.010143,0.001395,0.1,0.3,0.1,2,50,0.9,...,0.854839,0.838710,0.790323,0.822581,0.790323,0.709677,0.806452,0.818433,0.048299,1115
3,0.106949,0.010550,0.009827,0.001825,0.1,0.3,0.1,2,100,0.5,...,0.822581,0.822581,0.806452,0.822581,0.806452,0.693548,0.838710,0.820020,0.048275,1033
4,0.110194,0.008693,0.010148,0.002095,0.1,0.3,0.1,2,100,0.7,...,0.838710,0.806452,0.806452,0.838710,0.806452,0.677419,0.838710,0.824782,0.057512,764
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1615,0.181315,0.001596,0.007281,0.000457,0.7,0.9,0.9,4,250,0.7,...,0.838710,0.741935,0.741935,0.822581,0.822581,0.725806,0.790323,0.803943,0.050455,1557
1616,0.178622,0.002114,0.007081,0.000299,0.7,0.9,0.9,4,250,0.9,...,0.806452,0.790323,0.774194,0.822581,0.806452,0.693548,0.838710,0.811956,0.057165,1429
1617,0.214528,0.002377,0.007680,0.000457,0.7,0.9,0.9,4,300,0.5,...,0.822581,0.709677,0.774194,0.790323,0.790323,0.677419,0.774194,0.781490,0.053158,1618
1618,0.215025,0.002491,0.007381,0.000488,0.7,0.9,0.9,4,300,0.7,...,0.838710,0.758065,0.725806,0.774194,0.806452,0.677419,0.854839,0.799104,0.060610,1591


best score is 0.847363031233999 with params {'colsample_bylevel': 0.7, 'colsample_bytree': 0.5, 'learning_rate': 0.3, 'max_depth': 2, 'n_estimators': 50, 'subsample': 0.7}


In [39]:
# save dataframe

from datetime import datetime

# save data with date to track changes 
date = str(datetime.now().date()).replace("-", "")

xgb_grid_search_results_3.to_csv(f"data/xgb_results_step3_{date}.csv")

In [15]:
cols_3 = ['param_max_depth', 'param_subsample', 'param_colsample_bytree', 'param_colsample_bylevel', 'param_learning_rate', 'param_n_estimators', 'mean_test_score', 'rank_test_score']
df_3 = xgb_grid_search_results_3[cols_3]
df_3_sorted = df_3.sort_values(by='rank_test_score')
df_3_sorted.head(10)

NameError: name 'xgb_grid_search_results_3' is not defined

#### Full Tuning

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [25]:
# Full Tuning
random.seed(10)

xgb = XGBClassifier()

xgb_parameters = {'max_depth': [1,2,3,4,5]
                   , 'subsample': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
                   , 'colsample_bytree': [0.3, 0.4, 0.5, 0.6, 0.8, 0.9]
                   , 'colsample_bylevel': [0.1, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8]
                   , 'learning_rate': [0.1, 0.15, 0.2, 0.25, 0.3, 0.4]
                   , 'n_estimators': [50, 100, 150, 200, 250, 300, 400, 500]}

stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
xgb_grid_search = GridSearchCV(xgb, xgb_parameters, scoring='accuracy', cv=stratified_10_fold_cv)

xgb_grid_search.fit(X_train, y_train)

xgb_grid_search_results_full = pd.DataFrame(xgb_grid_search.cv_results_)
display(xgb_grid_search_results_full)

print("best score is {} with params {}".format(xgb_grid_search.best_score_, xgb_grid_search.best_params_))

# best values for

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.092642,0.005757,0.010668,0.001535,1,{'max_depth': 1},0.84127,0.873016,0.888889,0.887097,0.83871,0.854839,0.822581,0.790323,0.645161,0.83871,0.828059,0.067254,5
1,0.09611,0.005208,0.010446,0.001452,2,{'max_depth': 2},0.904762,0.857143,0.873016,0.870968,0.790323,0.790323,0.822581,0.822581,0.758065,0.822581,0.831234,0.042811,3
2,0.107289,0.004847,0.009863,0.001459,3,{'max_depth': 3},0.873016,0.904762,0.888889,0.870968,0.806452,0.774194,0.83871,0.83871,0.725806,0.870968,0.839247,0.052996,2
3,0.117434,0.005594,0.009368,0.001473,4,{'max_depth': 4},0.873016,0.904762,0.857143,0.870968,0.822581,0.774194,0.854839,0.83871,0.758065,0.854839,0.840911,0.0428,1
4,0.131997,0.007728,0.011317,0.001337,5,{'max_depth': 5},0.888889,0.904762,0.825397,0.83871,0.822581,0.741935,0.854839,0.83871,0.725806,0.854839,0.829647,0.053972,4
5,0.146077,0.007255,0.01093,0.001784,6,{'max_depth': 6},0.888889,0.888889,0.84127,0.854839,0.806452,0.774194,0.83871,0.822581,0.709677,0.83871,0.826421,0.050897,6
6,0.222718,0.085059,0.009438,0.00118,7,{'max_depth': 7},0.904762,0.888889,0.825397,0.83871,0.790323,0.790323,0.83871,0.822581,0.725806,0.83871,0.826421,0.048206,8
7,0.219484,0.0199,0.010934,0.001361,8,{'max_depth': 8},0.888889,0.888889,0.825397,0.83871,0.790323,0.758065,0.83871,0.822581,0.725806,0.83871,0.821608,0.049133,12
8,0.234315,0.020496,0.00976,0.001204,9,{'max_depth': 9},0.904762,0.888889,0.84127,0.83871,0.790323,0.741935,0.83871,0.822581,0.709677,0.83871,0.821557,0.057062,13
9,0.243091,0.029045,0.011674,0.002686,10,{'max_depth': 10},0.888889,0.888889,0.84127,0.83871,0.790323,0.774194,0.83871,0.806452,0.709677,0.854839,0.823195,0.052047,10


best score is 0.8409114183307731 with params {'max_depth': 4}


In [None]:
# save dataframe

from datetime import datetime

# save data with date to track changes 
date = str(datetime.now().date()).replace("-", "")

xgb_grid_search_results_full.to_csv(f"data/xgb_results_full_{date}.csv")

In [None]:
cols_full = ['param_max_depth', 'param_subsample', 'param_colsample_bytree', 'param_colsample_bylevel', 'param_learning_rate', 'param_n_estimators', 'mean_test_score', 'rank_test_score']
df_full = xgb_grid_search_results_full[cols_full]
df_full_sorted = df_full.sort_values(by='rank_test_score')
df_full_sorted.head(10)

#### Full Tuning in Parts - Round 1

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [8]:
# Full Tuning - Part 1
random.seed(10)

xgb = XGBClassifier()

xgb_parameters = {'max_depth': [2,3,4,5]
                   , 'subsample': [0.5, 0.6, 0.7, 0.8, 0.9]
                   , 'colsample_bytree': [0.3, 0.4, 0.5, 0.6, 0.9]
                   , 'colsample_bylevel': [0.1]
                   , 'learning_rate': [0.1, 0.15, 0.2, 0.25, 0.3, 0.4]
                   , 'n_estimators': [100, 150, 200, 250, 300]}

stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
xgb_grid_search = GridSearchCV(xgb, xgb_parameters, scoring='accuracy', cv=stratified_10_fold_cv)

xgb_grid_search.fit(X_train, y_train)

xgb_grid_search_results_full_1 = pd.DataFrame(xgb_grid_search.cv_results_)
display(xgb_grid_search_results_full_1)

print("best score is {} with params {}".format(xgb_grid_search.best_score_, xgb_grid_search.best_params_))

# best values for

# save dataframe
from datetime import datetime
date = str(datetime.now().date()).replace("-", "")
xgb_grid_search_results_full_1.to_csv(f"data/xgb_results_full_1_bylevel=0.1_{date}.csv")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bylevel,param_colsample_bytree,param_learning_rate,param_max_depth,param_n_estimators,param_subsample,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.064527,0.004136,0.005885,5.376236e-04,0.1,0.3,0.1,2,100,0.5,...,0.822581,0.822581,0.806452,0.822581,0.806452,0.693548,0.838710,0.820020,0.048275,2651
1,0.064327,0.002493,0.005885,5.374151e-04,0.1,0.3,0.1,2,100,0.6,...,0.838710,0.822581,0.806452,0.838710,0.806452,0.661290,0.854839,0.823221,0.059057,2358
2,0.063231,0.001954,0.005785,3.990535e-04,0.1,0.3,0.1,2,100,0.7,...,0.838710,0.806452,0.806452,0.838710,0.806452,0.677419,0.838710,0.824782,0.057512,2252
3,0.062235,0.001017,0.006082,3.001655e-04,0.1,0.3,0.1,2,100,0.8,...,0.854839,0.822581,0.806452,0.854839,0.806452,0.661290,0.838710,0.824834,0.059251,2175
4,0.062333,0.002285,0.005784,3.991850e-04,0.1,0.3,0.1,2,100,0.9,...,0.854839,0.822581,0.838710,0.838710,0.806452,0.677419,0.838710,0.828059,0.053977,1747
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,0.180018,0.001496,0.007081,2.993507e-04,0.1,0.9,0.4,5,300,0.5,...,0.887097,0.806452,0.774194,0.822581,0.822581,0.725806,0.854839,0.828085,0.048801,1701
2996,0.182212,0.003339,0.006982,4.460618e-04,0.1,0.9,0.4,5,300,0.6,...,0.870968,0.774194,0.758065,0.806452,0.838710,0.741935,0.854839,0.826421,0.052840,1992
2997,0.181813,0.001548,0.007082,2.990570e-04,0.1,0.9,0.4,5,300,0.7,...,0.870968,0.790323,0.790323,0.790323,0.854839,0.709677,0.822581,0.824808,0.053239,2204
2998,0.181666,0.001723,0.006982,4.529953e-07,0.1,0.9,0.4,5,300,0.8,...,0.870968,0.790323,0.774194,0.806452,0.854839,0.741935,0.822581,0.823272,0.043729,2305


best score is 0.8505376344086022 with params {'colsample_bylevel': 0.1, 'colsample_bytree': 0.6, 'learning_rate': 0.15, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.6}


In [14]:
cols_full = ['param_max_depth', 'param_subsample', 'param_colsample_bytree', 'param_colsample_bylevel', 'param_learning_rate', 'param_n_estimators', 'mean_test_score', 'rank_test_score']
df_full_1 = xgb_grid_search_results_full_1[cols_full]
df_full_1_sorted = df_full_1.sort_values(by='rank_test_score')
df_full_1_sorted.head(10)

# best values for colsample_bylevel = [0.1]
# max_depth = [4, 5]
# subsample = [0.6, 0.8]
# colsample_bytree = [0.6, 0.9]
# learning_rate = [0.1, 0.15]
# n_estimators = [100, 150]
# 0.850538

Unnamed: 0,param_max_depth,param_subsample,param_colsample_bytree,param_colsample_bylevel,param_learning_rate,param_n_estimators,mean_test_score,rank_test_score
1976,5,0.6,0.6,0.1,0.15,100,0.850538,1
2456,4,0.6,0.9,0.1,0.1,150,0.848925,2
1853,4,0.8,0.6,0.1,0.1,100,0.847312,3
1951,4,0.6,0.6,0.1,0.15,100,0.845776,4
1801,2,0.6,0.6,0.1,0.1,100,0.845776,4
1850,4,0.5,0.6,0.1,0.1,100,0.845776,4
1328,3,0.8,0.5,0.1,0.15,100,0.845776,4
2475,5,0.5,0.9,0.1,0.1,100,0.845776,4
2508,2,0.8,0.9,0.1,0.15,150,0.845776,9
1802,2,0.7,0.6,0.1,0.1,100,0.845776,9


In [9]:
# Full Tuning - Part 2
random.seed(10)

xgb = XGBClassifier()

xgb_parameters = {'max_depth': [2,3,4,5]
                   , 'subsample': [0.5, 0.6, 0.7, 0.8, 0.9]
                   , 'colsample_bytree': [0.3, 0.4, 0.5, 0.6, 0.9]
                   , 'colsample_bylevel': [0.2]
                   , 'learning_rate': [0.1, 0.15, 0.2, 0.25, 0.3, 0.4]
                   , 'n_estimators': [100, 150, 200, 250, 300]}

stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
xgb_grid_search = GridSearchCV(xgb, xgb_parameters, scoring='accuracy', cv=stratified_10_fold_cv)

xgb_grid_search.fit(X_train, y_train)

xgb_grid_search_results_full_2 = pd.DataFrame(xgb_grid_search.cv_results_)
display(xgb_grid_search_results_full_2)

print("best score is {} with params {}".format(xgb_grid_search.best_score_, xgb_grid_search.best_params_))

# best values for

# save dataframe
from datetime import datetime
date = str(datetime.now().date()).replace("-", "")
xgb_grid_search_results_full_2.to_csv(f"data/xgb_results_full_2_bylevel=0.2_{date}.csv")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bylevel,param_colsample_bytree,param_learning_rate,param_max_depth,param_n_estimators,param_subsample,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.062133,0.002485,0.006284,0.000639,0.2,0.3,0.1,2,100,0.5,...,0.822581,0.822581,0.806452,0.822581,0.806452,0.693548,0.838710,0.820020,0.048275,2663
1,0.063131,0.003027,0.006683,0.000457,0.2,0.3,0.1,2,100,0.6,...,0.838710,0.822581,0.806452,0.838710,0.806452,0.661290,0.854839,0.823221,0.059057,2338
2,0.060438,0.000798,0.006782,0.000398,0.2,0.3,0.1,2,100,0.7,...,0.838710,0.806452,0.806452,0.838710,0.806452,0.677419,0.838710,0.824782,0.057512,2222
3,0.059541,0.000779,0.006682,0.000457,0.2,0.3,0.1,2,100,0.8,...,0.854839,0.822581,0.806452,0.854839,0.806452,0.661290,0.838710,0.824834,0.059251,2132
4,0.059840,0.001092,0.006782,0.000399,0.2,0.3,0.1,2,100,0.9,...,0.854839,0.822581,0.838710,0.838710,0.806452,0.677419,0.838710,0.828059,0.053977,1661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,0.195577,0.001636,0.017653,0.031683,0.2,0.9,0.4,5,300,0.5,...,0.822581,0.806452,0.774194,0.790323,0.790323,0.741935,0.822581,0.819918,0.052433,2744
2996,0.221308,0.011234,0.006782,0.000399,0.2,0.9,0.4,5,300,0.6,...,0.838710,0.806452,0.774194,0.822581,0.838710,0.693548,0.822581,0.819995,0.058351,2693
2997,0.226194,0.002309,0.007082,0.000299,0.2,0.9,0.4,5,300,0.7,...,0.838710,0.790323,0.758065,0.806452,0.854839,0.725806,0.822581,0.821582,0.054587,2581
2998,0.199167,0.006213,0.007381,0.000489,0.2,0.9,0.4,5,300,0.8,...,0.838710,0.822581,0.774194,0.838710,0.822581,0.725806,0.790323,0.823195,0.052389,2389


best score is 0.8505376344086022 with params {'colsample_bylevel': 0.2, 'colsample_bytree': 0.6, 'learning_rate': 0.15, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.6}


In [None]:
cols_full = ['param_max_depth', 'param_subsample', 'param_colsample_bytree', 'param_colsample_bylevel', 'param_learning_rate', 'param_n_estimators', 'mean_test_score', 'rank_test_score']
df_full_1 = xgb_grid_search_results_full_1[cols_full]
df_full_1_sorted = df_full_1.sort_values(by='rank_test_score')
df_full_1_sorted.head(10)

# best values for colsample_bylevel = [0.1]
# max_depth = []
# subsample = []
# colsample_bytree = []
# learning_rate = []
# n_estimators = []

In [10]:
# Full Tuning - Part 3
random.seed(10)

xgb = XGBClassifier()

xgb_parameters = {'max_depth': [2,3,4,5]
                   , 'subsample': [0.5, 0.6, 0.7, 0.8, 0.9]
                   , 'colsample_bytree': [0.3, 0.4, 0.5, 0.6, 0.9]
                   , 'colsample_bylevel': [0.3]
                   , 'learning_rate': [0.1, 0.15, 0.2, 0.25, 0.3, 0.4]
                   , 'n_estimators': [100, 150, 200, 250, 300]}

stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
xgb_grid_search = GridSearchCV(xgb, xgb_parameters, scoring='accuracy', cv=stratified_10_fold_cv)

xgb_grid_search.fit(X_train, y_train)

xgb_grid_search_results_full_3 = pd.DataFrame(xgb_grid_search.cv_results_)
display(xgb_grid_search_results_full_3)

print("best score is {} with params {}".format(xgb_grid_search.best_score_, xgb_grid_search.best_params_))

# best values for

# save dataframe
from datetime import datetime
date = str(datetime.now().date()).replace("-", "")
xgb_grid_search_results_full_3.to_csv(f"data/xgb_results_full_3_bylevel=0.3_{date}.csv")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bylevel,param_colsample_bytree,param_learning_rate,param_max_depth,param_n_estimators,param_subsample,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.062034,0.001596,0.006882,0.000537,0.3,0.3,0.1,2,100,0.5,...,0.822581,0.822581,0.806452,0.822581,0.806452,0.693548,0.838710,0.820020,0.048275,2509
1,0.068327,0.005910,0.006573,0.000502,0.3,0.3,0.1,2,100,0.6,...,0.838710,0.822581,0.806452,0.838710,0.806452,0.661290,0.854839,0.823221,0.059057,2140
2,0.065225,0.003191,0.006683,0.000457,0.3,0.3,0.1,2,100,0.7,...,0.838710,0.806452,0.806452,0.838710,0.806452,0.677419,0.838710,0.824782,0.057512,2021
3,0.063630,0.001596,0.006483,0.000499,0.3,0.3,0.1,2,100,0.8,...,0.854839,0.822581,0.806452,0.854839,0.806452,0.661290,0.838710,0.824834,0.059251,1898
4,0.061435,0.001493,0.006483,0.000498,0.3,0.3,0.1,2,100,0.9,...,0.854839,0.822581,0.838710,0.838710,0.806452,0.677419,0.838710,0.828059,0.053977,1394
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,0.206226,0.002782,0.007182,0.000598,0.3,0.9,0.4,5,300,0.5,...,0.838710,0.790323,0.774194,0.838710,0.822581,0.709677,0.822581,0.813646,0.044452,2900
2996,0.207998,0.002457,0.007280,0.000457,0.3,0.9,0.4,5,300,0.6,...,0.854839,0.774194,0.790323,0.854839,0.806452,0.693548,0.854839,0.818459,0.053856,2648
2997,0.207744,0.000898,0.007480,0.000499,0.3,0.9,0.4,5,300,0.7,...,0.854839,0.790323,0.758065,0.822581,0.806452,0.709677,0.822581,0.813594,0.056215,2908
2998,0.208343,0.001863,0.007280,0.000457,0.3,0.9,0.4,5,300,0.8,...,0.854839,0.790323,0.790323,0.822581,0.806452,0.693548,0.822581,0.815207,0.053961,2850


best score is 0.850563236047107 with params {'colsample_bylevel': 0.3, 'colsample_bytree': 0.5, 'learning_rate': 0.15, 'max_depth': 2, 'n_estimators': 100, 'subsample': 0.7}


In [None]:
cols_full = ['param_max_depth', 'param_subsample', 'param_colsample_bytree', 'param_colsample_bylevel', 'param_learning_rate', 'param_n_estimators', 'mean_test_score', 'rank_test_score']
df_full_1 = xgb_grid_search_results_full_1[cols_full]
df_full_1_sorted = df_full_1.sort_values(by='rank_test_score')
df_full_1_sorted.head(10)

# best values for colsample_bylevel = [0.1]
# max_depth = []
# subsample = []
# colsample_bytree = []
# learning_rate = []
# n_estimators = []

In [11]:
# Full Tuning - Part 4
random.seed(10)

xgb = XGBClassifier()

xgb_parameters = {'max_depth': [2,3,4,5]
                   , 'subsample': [0.5, 0.6, 0.7, 0.8, 0.9]
                   , 'colsample_bytree': [0.3, 0.4, 0.5, 0.6, 0.9]
                   , 'colsample_bylevel': [0.4]
                   , 'learning_rate': [0.1, 0.15, 0.2, 0.25, 0.3, 0.4]
                   , 'n_estimators': [100, 150, 200, 250, 300]}

stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
xgb_grid_search = GridSearchCV(xgb, xgb_parameters, scoring='accuracy', cv=stratified_10_fold_cv)

xgb_grid_search.fit(X_train, y_train)

xgb_grid_search_results_full_4 = pd.DataFrame(xgb_grid_search.cv_results_)
display(xgb_grid_search_results_full_4)

print("best score is {} with params {}".format(xgb_grid_search.best_score_, xgb_grid_search.best_params_))

# best values for

# save dataframe
from datetime import datetime
date = str(datetime.now().date()).replace("-", "")
xgb_grid_search_results_full_4.to_csv(f"data/xgb_results_full_4_bylevel=0.4_{date}.csv")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bylevel,param_colsample_bytree,param_learning_rate,param_max_depth,param_n_estimators,param_subsample,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.061735,0.001296,0.006482,0.000669,0.4,0.3,0.1,2,100,0.5,...,0.822581,0.822581,0.806452,0.822581,0.806452,0.693548,0.838710,0.820020,0.048275,2502
1,0.063031,0.002778,0.006483,0.000669,0.4,0.3,0.1,2,100,0.6,...,0.838710,0.822581,0.806452,0.838710,0.806452,0.661290,0.854839,0.823221,0.059057,2077
2,0.060638,0.000746,0.006782,0.000399,0.4,0.3,0.1,2,100,0.7,...,0.838710,0.806452,0.806452,0.838710,0.806452,0.677419,0.838710,0.824782,0.057512,1941
3,0.060339,0.000919,0.006483,0.000499,0.4,0.3,0.1,2,100,0.8,...,0.854839,0.822581,0.806452,0.854839,0.806452,0.661290,0.838710,0.824834,0.059251,1818
4,0.059840,0.001180,0.006682,0.000457,0.4,0.3,0.1,2,100,0.9,...,0.854839,0.822581,0.838710,0.838710,0.806452,0.677419,0.838710,0.828059,0.053977,1289
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,0.218216,0.004179,0.007181,0.000399,0.4,0.9,0.4,5,300,0.5,...,0.838710,0.758065,0.790323,0.806452,0.790323,0.741935,0.822581,0.808807,0.046180,2975
2996,0.221009,0.003402,0.007480,0.000499,0.4,0.9,0.4,5,300,0.6,...,0.838710,0.790323,0.806452,0.838710,0.774194,0.725806,0.790323,0.812007,0.049754,2922
2997,0.221308,0.004515,0.007380,0.000489,0.4,0.9,0.4,5,300,0.7,...,0.838710,0.790323,0.790323,0.790323,0.822581,0.725806,0.822581,0.816795,0.050406,2770
2998,0.219213,0.003908,0.007194,0.000425,0.4,0.9,0.4,5,300,0.8,...,0.838710,0.790323,0.806452,0.838710,0.790323,0.725806,0.806452,0.816820,0.047212,2763


best score is 0.850563236047107 with params {'colsample_bylevel': 0.4, 'colsample_bytree': 0.5, 'learning_rate': 0.15, 'max_depth': 2, 'n_estimators': 100, 'subsample': 0.7}


In [None]:
cols_full = ['param_max_depth', 'param_subsample', 'param_colsample_bytree', 'param_colsample_bylevel', 'param_learning_rate', 'param_n_estimators', 'mean_test_score', 'rank_test_score']
df_full_1 = xgb_grid_search_results_full_1[cols_full]
df_full_1_sorted = df_full_1.sort_values(by='rank_test_score')
df_full_1_sorted.head(10)

# best values for colsample_bylevel = [0.1]
# max_depth = []
# subsample = []
# colsample_bytree = []
# learning_rate = []
# n_estimators = []

In [12]:
# Full Tuning - Part 5
random.seed(10)

xgb = XGBClassifier()

xgb_parameters = {'max_depth': [2,3,4,5]
                   , 'subsample': [0.5, 0.6, 0.7, 0.8, 0.9]
                   , 'colsample_bytree': [0.3, 0.4, 0.5, 0.6, 0.9]
                   , 'colsample_bylevel': [0.6]
                   , 'learning_rate': [0.1, 0.15, 0.2, 0.25, 0.3, 0.4]
                   , 'n_estimators': [100, 150, 200, 250, 300]}

stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
xgb_grid_search = GridSearchCV(xgb, xgb_parameters, scoring='accuracy', cv=stratified_10_fold_cv)

xgb_grid_search.fit(X_train, y_train)

xgb_grid_search_results_full_5 = pd.DataFrame(xgb_grid_search.cv_results_)
display(xgb_grid_search_results_full_5)

print("best score is {} with params {}".format(xgb_grid_search.best_score_, xgb_grid_search.best_params_))

# best values for

# save dataframe
from datetime import datetime
date = str(datetime.now().date()).replace("-", "")
xgb_grid_search_results_full_5.to_csv(f"data/xgb_results_full_5_bylevel=0.6_{date}.csv")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bylevel,param_colsample_bytree,param_learning_rate,param_max_depth,param_n_estimators,param_subsample,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.063530,0.002446,0.006383,0.000662,0.6,0.3,0.1,2,100,0.5,...,0.870968,0.822581,0.838710,0.854839,0.822581,0.709677,0.838710,0.834537,0.044892,215
1,0.065725,0.003351,0.006882,0.000299,0.6,0.3,0.1,2,100,0.6,...,0.854839,0.822581,0.854839,0.854839,0.822581,0.693548,0.854839,0.832949,0.048291,320
2,0.062236,0.001196,0.006780,0.000398,0.6,0.3,0.1,2,100,0.7,...,0.887097,0.838710,0.854839,0.854839,0.838710,0.709677,0.854839,0.842601,0.046342,17
3,0.061335,0.001798,0.006583,0.000489,0.6,0.3,0.1,2,100,0.8,...,0.870968,0.822581,0.870968,0.838710,0.822581,0.693548,0.854839,0.834562,0.050284,208
4,0.060937,0.000698,0.006483,0.000669,0.6,0.3,0.1,2,100,0.9,...,0.870968,0.822581,0.887097,0.822581,0.806452,0.677419,0.854839,0.832924,0.057218,327
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,0.219613,0.001532,0.007380,0.000488,0.6,0.9,0.4,5,300,0.5,...,0.806452,0.741935,0.774194,0.806452,0.790323,0.741935,0.806452,0.805504,0.051226,2997
2996,0.222105,0.001787,0.006882,0.000698,0.6,0.9,0.4,5,300,0.6,...,0.854839,0.790323,0.758065,0.822581,0.741935,0.725806,0.806452,0.810317,0.059764,2968
2997,0.221906,0.002534,0.007281,0.000457,0.6,0.9,0.4,5,300,0.7,...,0.838710,0.774194,0.790323,0.822581,0.774194,0.725806,0.822581,0.818331,0.051641,2706
2998,0.224599,0.011023,0.007380,0.000489,0.6,0.9,0.4,5,300,0.8,...,0.838710,0.790323,0.758065,0.822581,0.806452,0.709677,0.806452,0.816718,0.054669,2825


best score is 0.8458269329237071 with params {'colsample_bylevel': 0.6, 'colsample_bytree': 0.4, 'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 100, 'subsample': 0.8}


In [None]:
cols_full = ['param_max_depth', 'param_subsample', 'param_colsample_bytree', 'param_colsample_bylevel', 'param_learning_rate', 'param_n_estimators', 'mean_test_score', 'rank_test_score']
df_full_1 = xgb_grid_search_results_full_1[cols_full]
df_full_1_sorted = df_full_1.sort_values(by='rank_test_score')
df_full_1_sorted.head(10)

# best values for colsample_bylevel = [0.1]
# max_depth = []
# subsample = []
# colsample_bytree = []
# learning_rate = []
# n_estimators = []

In [13]:
# Full Tuning - Part 6
random.seed(10)

xgb = XGBClassifier()

xgb_parameters = {'max_depth': [2,3,4,5]
                   , 'subsample': [0.5, 0.6, 0.7, 0.8, 0.9]
                   , 'colsample_bytree': [0.3, 0.4, 0.5, 0.6, 0.9]
                   , 'colsample_bylevel': [0.7]
                   , 'learning_rate': [0.1, 0.15, 0.2, 0.25, 0.3, 0.4]
                   , 'n_estimators': [100, 150, 200, 250, 300]}

stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
xgb_grid_search = GridSearchCV(xgb, xgb_parameters, scoring='accuracy', cv=stratified_10_fold_cv)

xgb_grid_search.fit(X_train, y_train)

xgb_grid_search_results_full_6 = pd.DataFrame(xgb_grid_search.cv_results_)
display(xgb_grid_search_results_full_6)

print("best score is {} with params {}".format(xgb_grid_search.best_score_, xgb_grid_search.best_params_))

# best values for

# save dataframe
from datetime import datetime
date = str(datetime.now().date()).replace("-", "")
xgb_grid_search_results_full_6.to_csv(f"data/xgb_results_full_6_bylevel=0.7_{date}.csv")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bylevel,param_colsample_bytree,param_learning_rate,param_max_depth,param_n_estimators,param_subsample,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.063128,0.002527,0.006483,0.000669,0.7,0.3,0.1,2,100,0.5,...,0.870968,0.822581,0.838710,0.854839,0.822581,0.709677,0.838710,0.834537,0.044892,218
1,0.065325,0.003097,0.006882,0.000537,0.7,0.3,0.1,2,100,0.6,...,0.854839,0.822581,0.854839,0.854839,0.822581,0.693548,0.854839,0.832949,0.048291,327
2,0.061886,0.000566,0.006583,0.000489,0.7,0.3,0.1,2,100,0.7,...,0.887097,0.838710,0.854839,0.854839,0.838710,0.709677,0.854839,0.842601,0.046342,16
3,0.061137,0.000638,0.006882,0.000299,0.7,0.3,0.1,2,100,0.8,...,0.870968,0.822581,0.870968,0.838710,0.822581,0.693548,0.854839,0.834562,0.050284,210
4,0.060537,0.000638,0.006682,0.000457,0.7,0.3,0.1,2,100,0.9,...,0.870968,0.822581,0.887097,0.822581,0.806452,0.677419,0.854839,0.832924,0.057218,333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,0.228489,0.002462,0.007480,0.000499,0.7,0.9,0.4,5,300,0.5,...,0.822581,0.774194,0.806452,0.822581,0.774194,0.709677,0.774194,0.802355,0.049888,2998
2996,0.228688,0.002142,0.007281,0.000457,0.7,0.9,0.4,5,300,0.6,...,0.822581,0.774194,0.790323,0.822581,0.790323,0.693548,0.790323,0.800768,0.052893,2999
2997,0.228987,0.001197,0.007580,0.000488,0.7,0.9,0.4,5,300,0.7,...,0.870968,0.774194,0.774194,0.790323,0.790323,0.709677,0.790323,0.810317,0.057193,2964
2998,0.228788,0.001620,0.007181,0.000399,0.7,0.9,0.4,5,300,0.8,...,0.822581,0.790323,0.774194,0.822581,0.790323,0.709677,0.806452,0.816692,0.055624,2812


best score is 0.8490015360983103 with params {'colsample_bylevel': 0.7, 'colsample_bytree': 0.4, 'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 150, 'subsample': 0.9}


In [None]:
cols_full = ['param_max_depth', 'param_subsample', 'param_colsample_bytree', 'param_colsample_bylevel', 'param_learning_rate', 'param_n_estimators', 'mean_test_score', 'rank_test_score']
df_full_1 = xgb_grid_search_results_full_1[cols_full]
df_full_1_sorted = df_full_1.sort_values(by='rank_test_score')
df_full_1_sorted.head(10)

# best values for colsample_bylevel = [0.1]
# max_depth = []
# subsample = []
# colsample_bytree = []
# learning_rate = []
# n_estimators = []

#### Combine single hyperparameter tuning to find best values for hyperparameters

In [18]:
a = xgb_grid_search_results_full_1.copy()
b = xgb_grid_search_results_full_2.copy()
c = xgb_grid_search_results_full_3.copy()
d = xgb_grid_search_results_full_4.copy()
e = xgb_grid_search_results_full_5.copy()
f = xgb_grid_search_results_full_6.copy()

In [19]:
xgb_grid_search_results_full = pd.concat([a,b,c,d,e,f])
from datetime import datetime
date = str(datetime.now().date()).replace("-", "")
xgb_grid_search_results_full.to_csv(f"data/xgb_results_full_combined_{date}.csv")

In [43]:
cols_full = ['param_max_depth', 'param_subsample', 'param_colsample_bytree', 'param_colsample_bylevel', 'param_learning_rate', 'param_n_estimators', 'mean_test_score']
df_full = xgb_grid_search_results_full[cols_full]
df_full_sorted = df_full.sort_values(by='mean_test_score', ascending=False)
df_full_sorted['rank_test_score'] = range(1, len(df_full_sorted)+1)
df_full_sorted.head(30)

Unnamed: 0,param_max_depth,param_subsample,param_colsample_bytree,param_colsample_bylevel,param_learning_rate,param_n_estimators,mean_test_score,rank_test_score
1302,2,0.7,0.5,0.4,0.15,100,0.850563,1
1302,2,0.7,0.5,0.3,0.15,100,0.850563,2
1976,5,0.6,0.6,0.2,0.15,100,0.850538,3
1976,5,0.6,0.6,0.1,0.15,100,0.850538,4
2419,2,0.9,0.9,0.2,0.1,250,0.849002,5
609,2,0.9,0.4,0.7,0.1,150,0.849002,6
2603,2,0.8,0.9,0.2,0.2,100,0.84895,7
2456,4,0.6,0.9,0.1,0.1,150,0.848925,8
1329,3,0.9,0.5,0.4,0.15,100,0.847312,9
1329,3,0.9,0.5,0.3,0.15,100,0.847312,10


In [44]:
from datetime import datetime
date = str(datetime.now().date()).replace("-", "")
df_full_sorted.to_csv(f"data/xgb_results_full_sorted_{date}.csv")

In [65]:
# Fit and evaluate best model - 1

#colsample_bylevel = 0.3 liefert gleiche Ergebnisse
xgb_best = XGBClassifier(max_depth = 2, subsample = 0.7, colsample_bytree = 0.5, colsample_bylevel = 0.4, learning_rate = 0.15, n_estimators = 100)
#0.8246268656716418
xgb_best.fit(X_train, y_train)
xgb_best_pred = xgb_best.predict(X_test)
xgb_best_acc = accuracy_score(y_test, xgb_best_pred)
print(xgb_best_acc)

print("Classification Report:")
print(classification_report(y_test, xgb_best_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, xgb_best_pred))

0.8246268656716418
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.89      0.86       157
           1       0.83      0.73      0.78       111

    accuracy                           0.82       268
   macro avg       0.83      0.81      0.82       268
weighted avg       0.82      0.82      0.82       268

Confusion Matrix:
[[140  17]
 [ 30  81]]


In [69]:
# Fit and evaluate best model - 2

#colsample_bylevel = 0.2 liefert gleiche Ergebnisse
#ebenso wie Kombi: max_depth = 2, subsample = 0.9, colsample_bytree = 0.9, colsample_bylevel = 0.2, learning_rate = 0.1, n_estimators = 250
xgb_best = XGBClassifier(max_depth = 5, subsample = 0.6, colsample_bytree = 0.6, colsample_bylevel = 0.2, learning_rate = 0.15, n_estimators = 100)
#0.8283582089552238
xgb_best.fit(X_train, y_train)
xgb_best_pred = xgb_best.predict(X_test)
xgb_best_acc = accuracy_score(y_test, xgb_best_pred)
print(xgb_best_acc)

print("Classification Report:")
print(classification_report(y_test, xgb_best_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, xgb_best_pred))

0.8283582089552238
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.89      0.86       157
           1       0.82      0.75      0.78       111

    accuracy                           0.83       268
   macro avg       0.83      0.82      0.82       268
weighted avg       0.83      0.83      0.83       268

Confusion Matrix:
[[139  18]
 [ 28  83]]


In [64]:
for v in df_full_sorted['param_n_estimators'].unique():
    h = df_full_sorted.rank_test_score[df_full_sorted['param_n_estimators'] == v]
    print(v, min(h))

100 1
250 5
150 6
200 35
300 58


beim nächsten Tuning:  
- param_subsample ungleich 0.5
- param_colsample_bytree ungleich 0.3
- param_colsample_bylevel ungleich 0.6
- param_learning_rate kleiner gleich 0.2
- param_n_estimators ungleich 200 und ungleich 300

#### Full Tuning in Parts - Round 2

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [8]:
# Full Tuning - Part 1
random.seed(10)

xgb = XGBClassifier()

xgb_parameters = {'max_depth': [2,3,4,5]
                   , 'subsample': [0.6, 0.65, 0.7, 0,75, 0.8, 0.85, 0.9]
                   , 'colsample_bytree': [0.4, 0.5, 0.55, 0.6, 0.85, 0.9]
                   , 'colsample_bylevel': [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4]
                   , 'learning_rate': [0.1, 0.15, 0.2, 0.25, 0.3]
                   , 'n_estimators': [30, 50, 80, 100, 150, 250]}

stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
xgb_grid_search = GridSearchCV(xgb, xgb_parameters, scoring='accuracy', cv=stratified_10_fold_cv)

xgb_grid_search.fit(X_train, y_train)

xgb_grid_search_results_full_1 = pd.DataFrame(xgb_grid_search.cv_results_)
display(xgb_grid_search_results_full_1)

print("best score is {} with params {}".format(xgb_grid_search.best_score_, xgb_grid_search.best_params_))

# best values for

# save dataframe
from datetime import datetime
date = str(datetime.now().date()).replace("-", "")
xgb_grid_search_results_full_1.to_csv(f"data/xgb_results_full_1_bylevel=0.1_{date}.csv")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bylevel,param_colsample_bytree,param_learning_rate,param_max_depth,param_n_estimators,param_subsample,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.064527,0.004136,0.005885,5.376236e-04,0.1,0.3,0.1,2,100,0.5,...,0.822581,0.822581,0.806452,0.822581,0.806452,0.693548,0.838710,0.820020,0.048275,2651
1,0.064327,0.002493,0.005885,5.374151e-04,0.1,0.3,0.1,2,100,0.6,...,0.838710,0.822581,0.806452,0.838710,0.806452,0.661290,0.854839,0.823221,0.059057,2358
2,0.063231,0.001954,0.005785,3.990535e-04,0.1,0.3,0.1,2,100,0.7,...,0.838710,0.806452,0.806452,0.838710,0.806452,0.677419,0.838710,0.824782,0.057512,2252
3,0.062235,0.001017,0.006082,3.001655e-04,0.1,0.3,0.1,2,100,0.8,...,0.854839,0.822581,0.806452,0.854839,0.806452,0.661290,0.838710,0.824834,0.059251,2175
4,0.062333,0.002285,0.005784,3.991850e-04,0.1,0.3,0.1,2,100,0.9,...,0.854839,0.822581,0.838710,0.838710,0.806452,0.677419,0.838710,0.828059,0.053977,1747
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,0.180018,0.001496,0.007081,2.993507e-04,0.1,0.9,0.4,5,300,0.5,...,0.887097,0.806452,0.774194,0.822581,0.822581,0.725806,0.854839,0.828085,0.048801,1701
2996,0.182212,0.003339,0.006982,4.460618e-04,0.1,0.9,0.4,5,300,0.6,...,0.870968,0.774194,0.758065,0.806452,0.838710,0.741935,0.854839,0.826421,0.052840,1992
2997,0.181813,0.001548,0.007082,2.990570e-04,0.1,0.9,0.4,5,300,0.7,...,0.870968,0.790323,0.790323,0.790323,0.854839,0.709677,0.822581,0.824808,0.053239,2204
2998,0.181666,0.001723,0.006982,4.529953e-07,0.1,0.9,0.4,5,300,0.8,...,0.870968,0.790323,0.774194,0.806452,0.854839,0.741935,0.822581,0.823272,0.043729,2305


best score is 0.8505376344086022 with params {'colsample_bylevel': 0.1, 'colsample_bytree': 0.6, 'learning_rate': 0.15, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.6}


In [14]:
cols_full = ['param_max_depth', 'param_subsample', 'param_colsample_bytree', 'param_colsample_bylevel', 'param_learning_rate', 'param_n_estimators', 'mean_test_score', 'rank_test_score']
df_full_1 = xgb_grid_search_results_full_1[cols_full]
df_full_1_sorted = df_full_1.sort_values(by='rank_test_score')
df_full_1_sorted.head(10)

# best values for colsample_bylevel = [0.1]
# max_depth = [4, 5]
# subsample = [0.6, 0.8]
# colsample_bytree = [0.6, 0.9]
# learning_rate = [0.1, 0.15]
# n_estimators = [100, 150]
# 0.850538

Unnamed: 0,param_max_depth,param_subsample,param_colsample_bytree,param_colsample_bylevel,param_learning_rate,param_n_estimators,mean_test_score,rank_test_score
1976,5,0.6,0.6,0.1,0.15,100,0.850538,1
2456,4,0.6,0.9,0.1,0.1,150,0.848925,2
1853,4,0.8,0.6,0.1,0.1,100,0.847312,3
1951,4,0.6,0.6,0.1,0.15,100,0.845776,4
1801,2,0.6,0.6,0.1,0.1,100,0.845776,4
1850,4,0.5,0.6,0.1,0.1,100,0.845776,4
1328,3,0.8,0.5,0.1,0.15,100,0.845776,4
2475,5,0.5,0.9,0.1,0.1,100,0.845776,4
2508,2,0.8,0.9,0.1,0.15,150,0.845776,9
1802,2,0.7,0.6,0.1,0.1,100,0.845776,9


In [9]:
# Full Tuning - Part 2
random.seed(10)

xgb = XGBClassifier()

xgb_parameters = {'max_depth': [2,3,4,5]
                   , 'subsample': [0.6, 0.65, 0.7, 0,75, 0.8, 0.85, 0.9]
                   , 'colsample_bytree': [0.4, 0.5, 0.55, 0.6, 0.85, 0.9]
                   , 'colsample_bylevel': [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4]
                   , 'learning_rate': [0.1, 0.15, 0.2, 0.25, 0.3]
                   , 'n_estimators': [30, 50, 80, 100, 150, 250]}

stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
xgb_grid_search = GridSearchCV(xgb, xgb_parameters, scoring='accuracy', cv=stratified_10_fold_cv)

xgb_grid_search.fit(X_train, y_train)

xgb_grid_search_results_full_2 = pd.DataFrame(xgb_grid_search.cv_results_)
display(xgb_grid_search_results_full_2)

print("best score is {} with params {}".format(xgb_grid_search.best_score_, xgb_grid_search.best_params_))

# best values for

# save dataframe
from datetime import datetime
date = str(datetime.now().date()).replace("-", "")
xgb_grid_search_results_full_2.to_csv(f"data/xgb_results_full_2_bylevel=0.2_{date}.csv")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bylevel,param_colsample_bytree,param_learning_rate,param_max_depth,param_n_estimators,param_subsample,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.062133,0.002485,0.006284,0.000639,0.2,0.3,0.1,2,100,0.5,...,0.822581,0.822581,0.806452,0.822581,0.806452,0.693548,0.838710,0.820020,0.048275,2663
1,0.063131,0.003027,0.006683,0.000457,0.2,0.3,0.1,2,100,0.6,...,0.838710,0.822581,0.806452,0.838710,0.806452,0.661290,0.854839,0.823221,0.059057,2338
2,0.060438,0.000798,0.006782,0.000398,0.2,0.3,0.1,2,100,0.7,...,0.838710,0.806452,0.806452,0.838710,0.806452,0.677419,0.838710,0.824782,0.057512,2222
3,0.059541,0.000779,0.006682,0.000457,0.2,0.3,0.1,2,100,0.8,...,0.854839,0.822581,0.806452,0.854839,0.806452,0.661290,0.838710,0.824834,0.059251,2132
4,0.059840,0.001092,0.006782,0.000399,0.2,0.3,0.1,2,100,0.9,...,0.854839,0.822581,0.838710,0.838710,0.806452,0.677419,0.838710,0.828059,0.053977,1661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,0.195577,0.001636,0.017653,0.031683,0.2,0.9,0.4,5,300,0.5,...,0.822581,0.806452,0.774194,0.790323,0.790323,0.741935,0.822581,0.819918,0.052433,2744
2996,0.221308,0.011234,0.006782,0.000399,0.2,0.9,0.4,5,300,0.6,...,0.838710,0.806452,0.774194,0.822581,0.838710,0.693548,0.822581,0.819995,0.058351,2693
2997,0.226194,0.002309,0.007082,0.000299,0.2,0.9,0.4,5,300,0.7,...,0.838710,0.790323,0.758065,0.806452,0.854839,0.725806,0.822581,0.821582,0.054587,2581
2998,0.199167,0.006213,0.007381,0.000489,0.2,0.9,0.4,5,300,0.8,...,0.838710,0.822581,0.774194,0.838710,0.822581,0.725806,0.790323,0.823195,0.052389,2389


best score is 0.8505376344086022 with params {'colsample_bylevel': 0.2, 'colsample_bytree': 0.6, 'learning_rate': 0.15, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.6}


In [None]:
cols_full = ['param_max_depth', 'param_subsample', 'param_colsample_bytree', 'param_colsample_bylevel', 'param_learning_rate', 'param_n_estimators', 'mean_test_score', 'rank_test_score']
df_full_1 = xgb_grid_search_results_full_1[cols_full]
df_full_1_sorted = df_full_1.sort_values(by='rank_test_score')
df_full_1_sorted.head(10)

# best values for colsample_bylevel = [0.1]
# max_depth = []
# subsample = []
# colsample_bytree = []
# learning_rate = []
# n_estimators = []

In [10]:
# Full Tuning - Part 3
random.seed(10)

xgb = XGBClassifier()

xgb_parameters = {'max_depth': [2,3,4,5]
                   , 'subsample': [0.6, 0.65, 0.7, 0,75, 0.8, 0.85, 0.9]
                   , 'colsample_bytree': [0.4, 0.5, 0.55, 0.6, 0.85, 0.9]
                   , 'colsample_bylevel': [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4]
                   , 'learning_rate': [0.1, 0.15, 0.2, 0.25, 0.3]
                   , 'n_estimators': [30, 50, 80, 100, 150, 250]}

stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
xgb_grid_search = GridSearchCV(xgb, xgb_parameters, scoring='accuracy', cv=stratified_10_fold_cv)

xgb_grid_search.fit(X_train, y_train)

xgb_grid_search_results_full_3 = pd.DataFrame(xgb_grid_search.cv_results_)
display(xgb_grid_search_results_full_3)

print("best score is {} with params {}".format(xgb_grid_search.best_score_, xgb_grid_search.best_params_))

# best values for

# save dataframe
from datetime import datetime
date = str(datetime.now().date()).replace("-", "")
xgb_grid_search_results_full_3.to_csv(f"data/xgb_results_full_3_bylevel=0.3_{date}.csv")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bylevel,param_colsample_bytree,param_learning_rate,param_max_depth,param_n_estimators,param_subsample,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.062034,0.001596,0.006882,0.000537,0.3,0.3,0.1,2,100,0.5,...,0.822581,0.822581,0.806452,0.822581,0.806452,0.693548,0.838710,0.820020,0.048275,2509
1,0.068327,0.005910,0.006573,0.000502,0.3,0.3,0.1,2,100,0.6,...,0.838710,0.822581,0.806452,0.838710,0.806452,0.661290,0.854839,0.823221,0.059057,2140
2,0.065225,0.003191,0.006683,0.000457,0.3,0.3,0.1,2,100,0.7,...,0.838710,0.806452,0.806452,0.838710,0.806452,0.677419,0.838710,0.824782,0.057512,2021
3,0.063630,0.001596,0.006483,0.000499,0.3,0.3,0.1,2,100,0.8,...,0.854839,0.822581,0.806452,0.854839,0.806452,0.661290,0.838710,0.824834,0.059251,1898
4,0.061435,0.001493,0.006483,0.000498,0.3,0.3,0.1,2,100,0.9,...,0.854839,0.822581,0.838710,0.838710,0.806452,0.677419,0.838710,0.828059,0.053977,1394
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,0.206226,0.002782,0.007182,0.000598,0.3,0.9,0.4,5,300,0.5,...,0.838710,0.790323,0.774194,0.838710,0.822581,0.709677,0.822581,0.813646,0.044452,2900
2996,0.207998,0.002457,0.007280,0.000457,0.3,0.9,0.4,5,300,0.6,...,0.854839,0.774194,0.790323,0.854839,0.806452,0.693548,0.854839,0.818459,0.053856,2648
2997,0.207744,0.000898,0.007480,0.000499,0.3,0.9,0.4,5,300,0.7,...,0.854839,0.790323,0.758065,0.822581,0.806452,0.709677,0.822581,0.813594,0.056215,2908
2998,0.208343,0.001863,0.007280,0.000457,0.3,0.9,0.4,5,300,0.8,...,0.854839,0.790323,0.790323,0.822581,0.806452,0.693548,0.822581,0.815207,0.053961,2850


best score is 0.850563236047107 with params {'colsample_bylevel': 0.3, 'colsample_bytree': 0.5, 'learning_rate': 0.15, 'max_depth': 2, 'n_estimators': 100, 'subsample': 0.7}


In [None]:
cols_full = ['param_max_depth', 'param_subsample', 'param_colsample_bytree', 'param_colsample_bylevel', 'param_learning_rate', 'param_n_estimators', 'mean_test_score', 'rank_test_score']
df_full_1 = xgb_grid_search_results_full_1[cols_full]
df_full_1_sorted = df_full_1.sort_values(by='rank_test_score')
df_full_1_sorted.head(10)

# best values for colsample_bylevel = [0.1]
# max_depth = []
# subsample = []
# colsample_bytree = []
# learning_rate = []
# n_estimators = []

In [11]:
# Full Tuning - Part 4
random.seed(10)

xgb = XGBClassifier()

xgb_parameters = {'max_depth': [2,3,4,5]
                   , 'subsample': [0.6, 0.65, 0.7, 0,75, 0.8, 0.85, 0.9]
                   , 'colsample_bytree': [0.4, 0.5, 0.55, 0.6, 0.85, 0.9]
                   , 'colsample_bylevel': [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4]
                   , 'learning_rate': [0.1, 0.15, 0.2, 0.25, 0.3]
                   , 'n_estimators': [30, 50, 80, 100, 150, 250]}

stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
xgb_grid_search = GridSearchCV(xgb, xgb_parameters, scoring='accuracy', cv=stratified_10_fold_cv)

xgb_grid_search.fit(X_train, y_train)

xgb_grid_search_results_full_4 = pd.DataFrame(xgb_grid_search.cv_results_)
display(xgb_grid_search_results_full_4)

print("best score is {} with params {}".format(xgb_grid_search.best_score_, xgb_grid_search.best_params_))

# best values for

# save dataframe
from datetime import datetime
date = str(datetime.now().date()).replace("-", "")
xgb_grid_search_results_full_4.to_csv(f"data/xgb_results_full_4_bylevel=0.4_{date}.csv")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bylevel,param_colsample_bytree,param_learning_rate,param_max_depth,param_n_estimators,param_subsample,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.061735,0.001296,0.006482,0.000669,0.4,0.3,0.1,2,100,0.5,...,0.822581,0.822581,0.806452,0.822581,0.806452,0.693548,0.838710,0.820020,0.048275,2502
1,0.063031,0.002778,0.006483,0.000669,0.4,0.3,0.1,2,100,0.6,...,0.838710,0.822581,0.806452,0.838710,0.806452,0.661290,0.854839,0.823221,0.059057,2077
2,0.060638,0.000746,0.006782,0.000399,0.4,0.3,0.1,2,100,0.7,...,0.838710,0.806452,0.806452,0.838710,0.806452,0.677419,0.838710,0.824782,0.057512,1941
3,0.060339,0.000919,0.006483,0.000499,0.4,0.3,0.1,2,100,0.8,...,0.854839,0.822581,0.806452,0.854839,0.806452,0.661290,0.838710,0.824834,0.059251,1818
4,0.059840,0.001180,0.006682,0.000457,0.4,0.3,0.1,2,100,0.9,...,0.854839,0.822581,0.838710,0.838710,0.806452,0.677419,0.838710,0.828059,0.053977,1289
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,0.218216,0.004179,0.007181,0.000399,0.4,0.9,0.4,5,300,0.5,...,0.838710,0.758065,0.790323,0.806452,0.790323,0.741935,0.822581,0.808807,0.046180,2975
2996,0.221009,0.003402,0.007480,0.000499,0.4,0.9,0.4,5,300,0.6,...,0.838710,0.790323,0.806452,0.838710,0.774194,0.725806,0.790323,0.812007,0.049754,2922
2997,0.221308,0.004515,0.007380,0.000489,0.4,0.9,0.4,5,300,0.7,...,0.838710,0.790323,0.790323,0.790323,0.822581,0.725806,0.822581,0.816795,0.050406,2770
2998,0.219213,0.003908,0.007194,0.000425,0.4,0.9,0.4,5,300,0.8,...,0.838710,0.790323,0.806452,0.838710,0.790323,0.725806,0.806452,0.816820,0.047212,2763


best score is 0.850563236047107 with params {'colsample_bylevel': 0.4, 'colsample_bytree': 0.5, 'learning_rate': 0.15, 'max_depth': 2, 'n_estimators': 100, 'subsample': 0.7}


In [None]:
cols_full = ['param_max_depth', 'param_subsample', 'param_colsample_bytree', 'param_colsample_bylevel', 'param_learning_rate', 'param_n_estimators', 'mean_test_score', 'rank_test_score']
df_full_1 = xgb_grid_search_results_full_1[cols_full]
df_full_1_sorted = df_full_1.sort_values(by='rank_test_score')
df_full_1_sorted.head(10)

# best values for colsample_bylevel = [0.1]
# max_depth = []
# subsample = []
# colsample_bytree = []
# learning_rate = []
# n_estimators = []

In [12]:
# Full Tuning - Part 5
random.seed(10)

xgb = XGBClassifier()

xgb_parameters = {'max_depth': [2,3,4,5]
                   , 'subsample': [0.6, 0.65, 0.7, 0,75, 0.8, 0.85, 0.9]
                   , 'colsample_bytree': [0.4, 0.5, 0.55, 0.6, 0.85, 0.9]
                   , 'colsample_bylevel': [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4]
                   , 'learning_rate': [0.1, 0.15, 0.2, 0.25, 0.3]
                   , 'n_estimators': [30, 50, 80, 100, 150, 250]}

stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
xgb_grid_search = GridSearchCV(xgb, xgb_parameters, scoring='accuracy', cv=stratified_10_fold_cv)

xgb_grid_search.fit(X_train, y_train)

xgb_grid_search_results_full_5 = pd.DataFrame(xgb_grid_search.cv_results_)
display(xgb_grid_search_results_full_5)

print("best score is {} with params {}".format(xgb_grid_search.best_score_, xgb_grid_search.best_params_))

# best values for

# save dataframe
from datetime import datetime
date = str(datetime.now().date()).replace("-", "")
xgb_grid_search_results_full_5.to_csv(f"data/xgb_results_full_5_bylevel=0.6_{date}.csv")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bylevel,param_colsample_bytree,param_learning_rate,param_max_depth,param_n_estimators,param_subsample,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.063530,0.002446,0.006383,0.000662,0.6,0.3,0.1,2,100,0.5,...,0.870968,0.822581,0.838710,0.854839,0.822581,0.709677,0.838710,0.834537,0.044892,215
1,0.065725,0.003351,0.006882,0.000299,0.6,0.3,0.1,2,100,0.6,...,0.854839,0.822581,0.854839,0.854839,0.822581,0.693548,0.854839,0.832949,0.048291,320
2,0.062236,0.001196,0.006780,0.000398,0.6,0.3,0.1,2,100,0.7,...,0.887097,0.838710,0.854839,0.854839,0.838710,0.709677,0.854839,0.842601,0.046342,17
3,0.061335,0.001798,0.006583,0.000489,0.6,0.3,0.1,2,100,0.8,...,0.870968,0.822581,0.870968,0.838710,0.822581,0.693548,0.854839,0.834562,0.050284,208
4,0.060937,0.000698,0.006483,0.000669,0.6,0.3,0.1,2,100,0.9,...,0.870968,0.822581,0.887097,0.822581,0.806452,0.677419,0.854839,0.832924,0.057218,327
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,0.219613,0.001532,0.007380,0.000488,0.6,0.9,0.4,5,300,0.5,...,0.806452,0.741935,0.774194,0.806452,0.790323,0.741935,0.806452,0.805504,0.051226,2997
2996,0.222105,0.001787,0.006882,0.000698,0.6,0.9,0.4,5,300,0.6,...,0.854839,0.790323,0.758065,0.822581,0.741935,0.725806,0.806452,0.810317,0.059764,2968
2997,0.221906,0.002534,0.007281,0.000457,0.6,0.9,0.4,5,300,0.7,...,0.838710,0.774194,0.790323,0.822581,0.774194,0.725806,0.822581,0.818331,0.051641,2706
2998,0.224599,0.011023,0.007380,0.000489,0.6,0.9,0.4,5,300,0.8,...,0.838710,0.790323,0.758065,0.822581,0.806452,0.709677,0.806452,0.816718,0.054669,2825


best score is 0.8458269329237071 with params {'colsample_bylevel': 0.6, 'colsample_bytree': 0.4, 'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 100, 'subsample': 0.8}


In [None]:
cols_full = ['param_max_depth', 'param_subsample', 'param_colsample_bytree', 'param_colsample_bylevel', 'param_learning_rate', 'param_n_estimators', 'mean_test_score', 'rank_test_score']
df_full_1 = xgb_grid_search_results_full_1[cols_full]
df_full_1_sorted = df_full_1.sort_values(by='rank_test_score')
df_full_1_sorted.head(10)

# best values for colsample_bylevel = [0.1]
# max_depth = []
# subsample = []
# colsample_bytree = []
# learning_rate = []
# n_estimators = []

In [13]:
# Full Tuning - Part 6
random.seed(10)

xgb = XGBClassifier()

xgb_parameters = {'max_depth': [2,3,4,5]
                   , 'subsample': [0.6, 0.65, 0.7, 0,75, 0.8, 0.85, 0.9]
                   , 'colsample_bytree': [0.4, 0.5, 0.55, 0.6, 0.85, 0.9]
                   , 'colsample_bylevel': [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4]
                   , 'learning_rate': [0.1, 0.15, 0.2, 0.25, 0.3]
                   , 'n_estimators': [30, 50, 80, 100, 150, 250]}

stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
xgb_grid_search = GridSearchCV(xgb, xgb_parameters, scoring='accuracy', cv=stratified_10_fold_cv)

xgb_grid_search.fit(X_train, y_train)

xgb_grid_search_results_full_6 = pd.DataFrame(xgb_grid_search.cv_results_)
display(xgb_grid_search_results_full_6)

print("best score is {} with params {}".format(xgb_grid_search.best_score_, xgb_grid_search.best_params_))

# best values for

# save dataframe
from datetime import datetime
date = str(datetime.now().date()).replace("-", "")
xgb_grid_search_results_full_6.to_csv(f"data/xgb_results_full_6_bylevel=0.7_{date}.csv")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bylevel,param_colsample_bytree,param_learning_rate,param_max_depth,param_n_estimators,param_subsample,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.063128,0.002527,0.006483,0.000669,0.7,0.3,0.1,2,100,0.5,...,0.870968,0.822581,0.838710,0.854839,0.822581,0.709677,0.838710,0.834537,0.044892,218
1,0.065325,0.003097,0.006882,0.000537,0.7,0.3,0.1,2,100,0.6,...,0.854839,0.822581,0.854839,0.854839,0.822581,0.693548,0.854839,0.832949,0.048291,327
2,0.061886,0.000566,0.006583,0.000489,0.7,0.3,0.1,2,100,0.7,...,0.887097,0.838710,0.854839,0.854839,0.838710,0.709677,0.854839,0.842601,0.046342,16
3,0.061137,0.000638,0.006882,0.000299,0.7,0.3,0.1,2,100,0.8,...,0.870968,0.822581,0.870968,0.838710,0.822581,0.693548,0.854839,0.834562,0.050284,210
4,0.060537,0.000638,0.006682,0.000457,0.7,0.3,0.1,2,100,0.9,...,0.870968,0.822581,0.887097,0.822581,0.806452,0.677419,0.854839,0.832924,0.057218,333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,0.228489,0.002462,0.007480,0.000499,0.7,0.9,0.4,5,300,0.5,...,0.822581,0.774194,0.806452,0.822581,0.774194,0.709677,0.774194,0.802355,0.049888,2998
2996,0.228688,0.002142,0.007281,0.000457,0.7,0.9,0.4,5,300,0.6,...,0.822581,0.774194,0.790323,0.822581,0.790323,0.693548,0.790323,0.800768,0.052893,2999
2997,0.228987,0.001197,0.007580,0.000488,0.7,0.9,0.4,5,300,0.7,...,0.870968,0.774194,0.774194,0.790323,0.790323,0.709677,0.790323,0.810317,0.057193,2964
2998,0.228788,0.001620,0.007181,0.000399,0.7,0.9,0.4,5,300,0.8,...,0.822581,0.790323,0.774194,0.822581,0.790323,0.709677,0.806452,0.816692,0.055624,2812


best score is 0.8490015360983103 with params {'colsample_bylevel': 0.7, 'colsample_bytree': 0.4, 'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 150, 'subsample': 0.9}


In [None]:
cols_full = ['param_max_depth', 'param_subsample', 'param_colsample_bytree', 'param_colsample_bylevel', 'param_learning_rate', 'param_n_estimators', 'mean_test_score', 'rank_test_score']
df_full_1 = xgb_grid_search_results_full_1[cols_full]
df_full_1_sorted = df_full_1.sort_values(by='rank_test_score')
df_full_1_sorted.head(10)

# best values for colsample_bylevel = [0.1]
# max_depth = []
# subsample = []
# colsample_bytree = []
# learning_rate = []
# n_estimators = []

In [None]:
# Full Tuning - Part 7
random.seed(10)

xgb = XGBClassifier()

xgb_parameters = {'max_depth': [2,3,4,5]
                   , 'subsample': [0.6, 0.65, 0.7, 0,75, 0.8, 0.85, 0.9]
                   , 'colsample_bytree': [0.4, 0.5, 0.55, 0.6, 0.85, 0.9]
                   , 'colsample_bylevel': [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4]
                   , 'learning_rate': [0.1, 0.15, 0.2, 0.25, 0.3]
                   , 'n_estimators': [30, 50, 80, 100, 150, 250]}

stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
xgb_grid_search = GridSearchCV(xgb, xgb_parameters, scoring='accuracy', cv=stratified_10_fold_cv)

xgb_grid_search.fit(X_train, y_train)

xgb_grid_search_results_full_7 = pd.DataFrame(xgb_grid_search.cv_results_)
display(xgb_grid_search_results_full_7)

print("best score is {} with params {}".format(xgb_grid_search.best_score_, xgb_grid_search.best_params_))

# best values for

# save dataframe
from datetime import datetime
date = str(datetime.now().date()).replace("-", "")
xgb_grid_search_results_full_7.to_csv(f"data/xgb_results_full_7_bylevel=0.7_{date}.csv")

In [None]:
cols_full = ['param_max_depth', 'param_subsample', 'param_colsample_bytree', 'param_colsample_bylevel', 'param_learning_rate', 'param_n_estimators', 'mean_test_score', 'rank_test_score']
df_full_1 = xgb_grid_search_results_full_1[cols_full]
df_full_1_sorted = df_full_1.sort_values(by='rank_test_score')
df_full_1_sorted.head(10)

# best values for colsample_bylevel = [0.1]
# max_depth = []
# subsample = []
# colsample_bytree = []
# learning_rate = []
# n_estimators = []

### cross_val_score and cross_val_predict
Das sagt uns nur für welchen Fold der Estimator am besten ist, aber gibt kein Modell

In [12]:
# cross_val_score

from sklearn.model_selection import cross_val_score

xgb_cv = XGBClassifier()
xgb_cv_score = cross_val_score(xgb_cv, X_train, y_train, cv=10, scoring = 'accuracy') #scoring='f1_macro', 'f1_micro'

for i, acc in enumerate(xgb_cv_score):
    print("Fold {}: Accuracy = {}%".format(i, acc*100.0))
print ("Average Accuracy = {}%".format(xgb_cv_score.mean()*100.0))

Fold 0: Accuracy = 84.12698412698413%
Fold 1: Accuracy = 80.95238095238095%
Fold 2: Accuracy = 87.3015873015873%
Fold 3: Accuracy = 87.09677419354838%
Fold 4: Accuracy = 82.25806451612904%
Fold 5: Accuracy = 72.58064516129032%
Fold 6: Accuracy = 82.25806451612904%
Fold 7: Accuracy = 83.87096774193549%
Fold 8: Accuracy = 82.25806451612904%
Fold 9: Accuracy = 90.32258064516128%
Average Accuracy = 83.3026113671275%


In [13]:
# cross_val_predict

from sklearn.model_selection import cross_val_predict

xgb_cv_pred = cross_val_predict(xgb_cv, X_train, y_train, cv=10)
xgb_cv_acc = accuracy_score(y_train, xgb_cv_pred)
xgb_cv_acc

0.8330658105939005

In [14]:
# Nested CV

from sklearn.model_selection import cross_val_score

# create an estimator
xgb_nested = XGBClassifier()

# specify the parameter grid
parameters = {
    'learning_rate': [0.3, 0.7]
    , 'max_depth': [3, 5]
    , 'colsample_bytree': [0.3, 0.8]
    , 'n_estimators': [50, 100]
    #, 'gamma': [0.5, 1, 3]
}

# specify the cross validation
#stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# create grid search instance
xgb_nested_grid_search = GridSearchCV(xgb_nested, parameters, scoring='accuracy', cv=5)
#cv=stratified_10_fold_cv

xgb_nested_cv_score = cross_val_score(xgb_nested_grid_search, X_train, y_train, cv=5, scoring = 'accuracy') # kein Modell
display(xgb_nested_cv_score.mean())

# run the grid search
xgb_nested_grid_search.fit(X_train, y_train)

# print the results of all hyper-parameter combinations
xgb_nested_grid_search_results = pd.DataFrame(xgb_nested_grid_search.cv_results_)
display(xgb_nested_grid_search_results)

# print the best parameter setting
print("best score is {} with params {}".format(xgb_nested_grid_search.best_score_, xgb_nested_grid_search.best_params_))
#best score is 0.8298838709677421 with params {'colsample_bytree': 0.3, 'learning_rate': 0.3, 'max_depth': 3, 'n_estimators': 100}

0.8235096774193549

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bytree,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.092668,0.068974,0.010241,0.004584,0.3,0.3,3,50,"{'colsample_bytree': 0.3, 'learning_rate': 0.3...",0.8,0.856,0.8,0.798387,0.854839,0.821845,0.027422,11
1,0.288282,0.145202,0.013407,0.008197,0.3,0.3,3,100,"{'colsample_bytree': 0.3, 'learning_rate': 0.3...",0.792,0.888,0.792,0.814516,0.862903,0.829884,0.038921,1
2,0.120098,0.070939,0.015562,0.011004,0.3,0.3,5,50,"{'colsample_bytree': 0.3, 'learning_rate': 0.3...",0.8,0.864,0.784,0.814516,0.854839,0.823471,0.031034,8
3,0.114842,0.010915,0.009151,0.001635,0.3,0.3,5,100,"{'colsample_bytree': 0.3, 'learning_rate': 0.3...",0.784,0.848,0.808,0.814516,0.862903,0.823484,0.028404,7
4,0.052624,0.002154,0.00751,0.000662,0.3,0.7,3,50,"{'colsample_bytree': 0.3, 'learning_rate': 0.7...",0.792,0.848,0.792,0.814516,0.870968,0.823497,0.031364,6
5,0.090101,0.019783,0.008794,0.001738,0.3,0.7,3,100,"{'colsample_bytree': 0.3, 'learning_rate': 0.7...",0.768,0.848,0.784,0.846774,0.879032,0.825161,0.042077,5
6,0.059273,0.005971,0.007694,0.001034,0.3,0.7,5,50,"{'colsample_bytree': 0.3, 'learning_rate': 0.7...",0.792,0.848,0.784,0.830645,0.854839,0.821897,0.02889,10
7,0.082045,0.010339,0.007185,0.000484,0.3,0.7,5,100,"{'colsample_bytree': 0.3, 'learning_rate': 0.7...",0.8,0.832,0.776,0.846774,0.854839,0.821923,0.029638,9
8,0.04089,0.003696,0.007092,0.000434,0.8,0.3,3,50,"{'colsample_bytree': 0.8, 'learning_rate': 0.3...",0.808,0.888,0.776,0.814516,0.854839,0.828271,0.039002,3
9,0.085391,0.00237,0.008212,0.00073,0.8,0.3,3,100,"{'colsample_bytree': 0.8, 'learning_rate': 0.3...",0.808,0.872,0.784,0.814516,0.862903,0.828284,0.03368,2


best score is 0.8298838709677421 with params {'colsample_bytree': 0.3, 'learning_rate': 0.3, 'max_depth': 3, 'n_estimators': 100}
