# Modeling: Random Forest and XGBoost

### Importing Libraries

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV, cross_val_score
from tqdm import tqdm_notebook as tqdm
import _pickle as pickle

### Loading in the DataFrame

In [2]:
with open("main_df.pickle",'rb') as fp:
    main_df = pickle.load(fp)

### Preparing Data for Modeling

In [3]:
# Assigning appropriate variables for modeling
X = main_df.drop('price_will_increase?', axis=1)
y = main_df['price_will_increase?']

# Splitting the data 80/20
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

## Modeling - Random Forest

In [8]:
from sklearn.ensemble import RandomForestClassifier

In [7]:
# Fitting and training
forest = RandomForestClassifier(n_estimators=100, max_depth=5)
forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [13]:
# Accuracy of the RandomForest
score = forest.score(X_test, y_test) * 100
print("Forest Accuracy:", score)

# Testing Area under the Curve
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print("AUC:", round(roc_auc, 4))

Forest Accuracy: 65.53518334985134
AUC: 0.5679


In [23]:
y_pred = forest.predict(X_test)
print("Confusion Matrix \n-----------------")
pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

Confusion Matrix 
-----------------


Predicted,False,True,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,320,1205,1525
True,186,2325,2511
All,506,3530,4036


In [24]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.63      0.21      0.32      1525
        True       0.66      0.93      0.77      2511

   micro avg       0.66      0.66      0.66      4036
   macro avg       0.65      0.57      0.54      4036
weighted avg       0.65      0.66      0.60      4036



### Improving RandomForest with GridSearchCV

In [28]:
# Using Cross Validation
rf_clf = RandomForestClassifier(n_estimators=100)
rf_cv_score = cross_val_score(rf_clf, X, y, cv=3)
mean_rf_cv_score = np.mean(rf_cv_score)

print("Mean Cross Validation Score:", mean_rf_cv_score*100)

Mean Cross Validation Score: 63.83821829582883


In [29]:
# Setting the Paramters to be tested
rf_param_grid = {'n_estimators': [10,30,100],
                 'criterion': ['gini', 'entropy'],
                 'max_depth': [None, 2, 6, 10],
                 'min_samples_split': [5,10],
                 'min_samples_leaf': [1, 2, 5]}

#### Finding the Best Parameters along with Accuracy

In [31]:
rf_grid_search = GridSearchCV(rf_clf, rf_param_grid, cv=3, return_train_score=True, verbose=10)
rf_grid_search.fit(X, y)

Fitting 3 folds for each of 144 candidates, totalling 432 fits
[CV] criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=10 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=10, score=0.6109128754088612, total=   0.5s
[CV] criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=10 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


[CV]  criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=10, score=0.6050557620817844, total=   0.5s
[CV] criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=10 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.0s remaining:    0.0s


[CV]  criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=10, score=0.6215613382899629, total=   0.5s
[CV] criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=30 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.6s remaining:    0.0s


[CV]  criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=30, score=0.6329170383586084, total=   1.5s
[CV] criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=30 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    3.2s remaining:    0.0s


[CV]  criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=30, score=0.635092936802974, total=   1.5s
[CV] criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=30 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    4.8s remaining:    0.0s


[CV]  criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=30, score=0.6353903345724907, total=   1.7s
[CV] criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    6.5s remaining:    0.0s


[CV]  criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100, score=0.6347011596788582, total=   4.9s
[CV] criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   11.7s remaining:    0.0s


[CV]  criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100, score=0.6408921933085502, total=   5.2s
[CV] criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   17.1s remaining:    0.0s


[CV]  criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100, score=0.6446096654275093, total=   4.8s
[CV] criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=10 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   22.1s remaining:    0.0s


[CV]  criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=10, score=0.611804936068986, total=   0.5s
[CV] criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=10 
[CV]  criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=10, score=0.6156133828996283, total=   0.5s
[CV] criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=10 
[CV]  criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=10, score=0.6169516728624536, total=   0.5s
[CV] criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=30 
[CV]  criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=30, score=0.6326196848052335, total=   1.4s
[CV] criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=30 
[CV]  criterion=gini, max_depth=None, min_samples_leaf=1, min_samples

[CV]  criterion=gini, max_depth=None, min_samples_leaf=5, min_samples_split=10, n_estimators=10, score=0.6293487957181089, total=   0.5s
[CV] criterion=gini, max_depth=None, min_samples_leaf=5, min_samples_split=10, n_estimators=10 
[CV]  criterion=gini, max_depth=None, min_samples_leaf=5, min_samples_split=10, n_estimators=10, score=0.6261710037174721, total=   0.5s
[CV] criterion=gini, max_depth=None, min_samples_leaf=5, min_samples_split=10, n_estimators=10 
[CV]  criterion=gini, max_depth=None, min_samples_leaf=5, min_samples_split=10, n_estimators=10, score=0.6286988847583643, total=   0.5s
[CV] criterion=gini, max_depth=None, min_samples_leaf=5, min_samples_split=10, n_estimators=30 
[CV]  criterion=gini, max_depth=None, min_samples_leaf=5, min_samples_split=10, n_estimators=30, score=0.636485280999108, total=   1.3s
[CV] criterion=gini, max_depth=None, min_samples_leaf=5, min_samples_split=10, n_estimators=30 
[CV]  criterion=gini, max_depth=None, min_samples_leaf=5, min_samples

[CV]  criterion=gini, max_depth=2, min_samples_leaf=2, min_samples_split=10, n_estimators=10, score=0.6353903345724907, total=   0.1s
[CV] criterion=gini, max_depth=2, min_samples_leaf=2, min_samples_split=10, n_estimators=10 
[CV]  criterion=gini, max_depth=2, min_samples_leaf=2, min_samples_split=10, n_estimators=10, score=0.6478810408921933, total=   0.1s
[CV] criterion=gini, max_depth=2, min_samples_leaf=2, min_samples_split=10, n_estimators=30 
[CV]  criterion=gini, max_depth=2, min_samples_leaf=2, min_samples_split=10, n_estimators=30, score=0.6373773416592329, total=   0.3s
[CV] criterion=gini, max_depth=2, min_samples_leaf=2, min_samples_split=10, n_estimators=30 
[CV]  criterion=gini, max_depth=2, min_samples_leaf=2, min_samples_split=10, n_estimators=30, score=0.6376208178438662, total=   0.3s
[CV] criterion=gini, max_depth=2, min_samples_leaf=2, min_samples_split=10, n_estimators=30 
[CV]  criterion=gini, max_depth=2, min_samples_leaf=2, min_samples_split=10, n_estimators=30

[CV]  criterion=gini, max_depth=6, min_samples_leaf=1, min_samples_split=10, n_estimators=10, score=0.6459479553903346, total=   0.4s
[CV] criterion=gini, max_depth=6, min_samples_leaf=1, min_samples_split=10, n_estimators=30 
[CV]  criterion=gini, max_depth=6, min_samples_leaf=1, min_samples_split=10, n_estimators=30, score=0.6473386856972941, total=   0.9s
[CV] criterion=gini, max_depth=6, min_samples_leaf=1, min_samples_split=10, n_estimators=30 
[CV]  criterion=gini, max_depth=6, min_samples_leaf=1, min_samples_split=10, n_estimators=30, score=0.6446096654275093, total=   0.7s
[CV] criterion=gini, max_depth=6, min_samples_leaf=1, min_samples_split=10, n_estimators=30 
[CV]  criterion=gini, max_depth=6, min_samples_leaf=1, min_samples_split=10, n_estimators=30, score=0.6492193308550186, total=   0.9s
[CV] criterion=gini, max_depth=6, min_samples_leaf=1, min_samples_split=10, n_estimators=100 
[CV]  criterion=gini, max_depth=6, min_samples_leaf=1, min_samples_split=10, n_estimators=1

[CV]  criterion=gini, max_depth=6, min_samples_leaf=5, min_samples_split=10, n_estimators=30, score=0.6421349985132322, total=   0.7s
[CV] criterion=gini, max_depth=6, min_samples_leaf=5, min_samples_split=10, n_estimators=30 
[CV]  criterion=gini, max_depth=6, min_samples_leaf=5, min_samples_split=10, n_estimators=30, score=0.6423791821561339, total=   0.7s
[CV] criterion=gini, max_depth=6, min_samples_leaf=5, min_samples_split=10, n_estimators=30 
[CV]  criterion=gini, max_depth=6, min_samples_leaf=5, min_samples_split=10, n_estimators=30, score=0.6511524163568774, total=   0.6s
[CV] criterion=gini, max_depth=6, min_samples_leaf=5, min_samples_split=10, n_estimators=100 
[CV]  criterion=gini, max_depth=6, min_samples_leaf=5, min_samples_split=10, n_estimators=100, score=0.6474873624739815, total=   2.2s
[CV] criterion=gini, max_depth=6, min_samples_leaf=5, min_samples_split=10, n_estimators=100 
[CV]  criterion=gini, max_depth=6, min_samples_leaf=5, min_samples_split=10, n_estimators

[CV]  criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=30, score=0.6449598572702944, total=   1.0s
[CV] criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=30 
[CV]  criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=30, score=0.6453531598513012, total=   0.9s
[CV] criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=30 
[CV]  criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=30, score=0.6510037174721189, total=   1.0s
[CV] criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=100 
[CV]  criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=100, score=0.6461492714837942, total=   3.3s
[CV] criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=100 
[CV]  criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=10, n_e

[CV]  criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=30, score=0.6387154326494202, total=   2.1s
[CV] criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=30 
[CV]  criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=30, score=0.6342007434944238, total=   2.2s
[CV] criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=30 
[CV]  criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=30, score=0.6410408921933085, total=   2.1s
[CV] criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100 
[CV]  criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100, score=0.6465953018138567, total=   6.8s
[CV] criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100 
[CV]  criterion=entropy, max_depth=None, 

[CV]  criterion=entropy, max_depth=None, min_samples_leaf=5, min_samples_split=10, n_estimators=10, score=0.6333085501858736, total=   0.6s
[CV] criterion=entropy, max_depth=None, min_samples_leaf=5, min_samples_split=10, n_estimators=30 
[CV]  criterion=entropy, max_depth=None, min_samples_leaf=5, min_samples_split=10, n_estimators=30, score=0.6381207255426702, total=   1.9s
[CV] criterion=entropy, max_depth=None, min_samples_leaf=5, min_samples_split=10, n_estimators=30 
[CV]  criterion=entropy, max_depth=None, min_samples_leaf=5, min_samples_split=10, n_estimators=30, score=0.6377695167286246, total=   1.9s
[CV] criterion=entropy, max_depth=None, min_samples_leaf=5, min_samples_split=10, n_estimators=30 
[CV]  criterion=entropy, max_depth=None, min_samples_leaf=5, min_samples_split=10, n_estimators=30, score=0.6468401486988847, total=   1.9s
[CV] criterion=entropy, max_depth=None, min_samples_leaf=5, min_samples_split=10, n_estimators=100 
[CV]  criterion=entropy, max_depth=None, mi

[CV]  criterion=entropy, max_depth=2, min_samples_leaf=2, min_samples_split=10, n_estimators=30, score=0.6477847160273565, total=   0.4s
[CV] criterion=entropy, max_depth=2, min_samples_leaf=2, min_samples_split=10, n_estimators=30 
[CV]  criterion=entropy, max_depth=2, min_samples_leaf=2, min_samples_split=10, n_estimators=30, score=0.6324163568773234, total=   0.3s
[CV] criterion=entropy, max_depth=2, min_samples_leaf=2, min_samples_split=10, n_estimators=30 
[CV]  criterion=entropy, max_depth=2, min_samples_leaf=2, min_samples_split=10, n_estimators=30, score=0.6434200743494424, total=   0.4s
[CV] criterion=entropy, max_depth=2, min_samples_leaf=2, min_samples_split=10, n_estimators=100 
[CV]  criterion=entropy, max_depth=2, min_samples_leaf=2, min_samples_split=10, n_estimators=100, score=0.6361879274457329, total=   1.3s
[CV] criterion=entropy, max_depth=2, min_samples_leaf=2, min_samples_split=10, n_estimators=100 
[CV]  criterion=entropy, max_depth=2, min_samples_leaf=2, min_sam

[CV]  criterion=entropy, max_depth=6, min_samples_leaf=1, min_samples_split=10, n_estimators=30, score=0.647636039250669, total=   0.9s
[CV] criterion=entropy, max_depth=6, min_samples_leaf=1, min_samples_split=10, n_estimators=30 
[CV]  criterion=entropy, max_depth=6, min_samples_leaf=1, min_samples_split=10, n_estimators=30, score=0.6422304832713754, total=   1.0s
[CV] criterion=entropy, max_depth=6, min_samples_leaf=1, min_samples_split=10, n_estimators=30 
[CV]  criterion=entropy, max_depth=6, min_samples_leaf=1, min_samples_split=10, n_estimators=30, score=0.6504089219330855, total=   1.0s
[CV] criterion=entropy, max_depth=6, min_samples_leaf=1, min_samples_split=10, n_estimators=100 
[CV]  criterion=entropy, max_depth=6, min_samples_leaf=1, min_samples_split=10, n_estimators=100, score=0.6500148676776687, total=   3.5s
[CV] criterion=entropy, max_depth=6, min_samples_leaf=1, min_samples_split=10, n_estimators=100 
[CV]  criterion=entropy, max_depth=6, min_samples_leaf=1, min_samp

[CV]  criterion=entropy, max_depth=6, min_samples_leaf=5, min_samples_split=10, n_estimators=30, score=0.6504608980077312, total=   0.9s
[CV] criterion=entropy, max_depth=6, min_samples_leaf=5, min_samples_split=10, n_estimators=30 
[CV]  criterion=entropy, max_depth=6, min_samples_leaf=5, min_samples_split=10, n_estimators=30, score=0.6449070631970261, total=   0.9s
[CV] criterion=entropy, max_depth=6, min_samples_leaf=5, min_samples_split=10, n_estimators=30 
[CV]  criterion=entropy, max_depth=6, min_samples_leaf=5, min_samples_split=10, n_estimators=30, score=0.6492193308550186, total=   0.9s
[CV] criterion=entropy, max_depth=6, min_samples_leaf=5, min_samples_split=10, n_estimators=100 
[CV]  criterion=entropy, max_depth=6, min_samples_leaf=5, min_samples_split=10, n_estimators=100, score=0.648528099910794, total=   2.9s
[CV] criterion=entropy, max_depth=6, min_samples_leaf=5, min_samples_split=10, n_estimators=100 
[CV]  criterion=entropy, max_depth=6, min_samples_leaf=5, min_samp

[CV]  criterion=entropy, max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=10, score=0.6408921933085502, total=   0.4s
[CV] criterion=entropy, max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=30 
[CV]  criterion=entropy, max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=30, score=0.6467439785905441, total=   1.3s
[CV] criterion=entropy, max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=30 
[CV]  criterion=entropy, max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=30, score=0.644460966542751, total=   1.3s
[CV] criterion=entropy, max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=30 
[CV]  criterion=entropy, max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=30, score=0.64817843866171, total=   1.3s
[CV] criterion=entropy, max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=100 
[CV]  criterion=entropy, max_depth=10, min_samples_leaf=2, min

[Parallel(n_jobs=1)]: Done 432 out of 432 | elapsed: 12.0min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [10, 30, 100], 'criterion': ['gini', 'entropy'], 'max_depth': [None, 2, 6, 10], 'min_samples_split': [5, 10], 'min_samples_leaf': [1, 2, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=10)

In [32]:
print(f"Testing Accuracy: {rf_grid_search.best_score_*100}")
print(f"Optimal Parameters: {rf_grid_search.best_params_}")

Testing Accuracy: 64.9534099920698
Optimal Parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 100}


#### Using the RandomForest parameters from GridSearch

In [33]:
# Fitting and training
forest = RandomForestClassifier(n_estimators=100, max_depth=10, criterion='entropy', min_samples_leaf=5, min_samples_split=5)
forest.fit(X_train, y_train)

# Accuracy of the RandomForest
score = forest.score(X_test, y_test) * 100
print("Forest Accuracy:", score)

# Testing Area under the Curve
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print("AUC:", round(roc_auc, 4))

Forest Accuracy: 65.70862239841428
AUC: 0.5679


In [36]:
# Classification Report
print(classification_report(y_test, y_pred))

# Confusion Matrix
y_pred = forest.predict(X_test)
print("Confusion Matrix \n-------------------------")
pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

              precision    recall  f1-score   support

       False       0.63      0.22      0.33      1525
        True       0.66      0.92      0.77      2511

   micro avg       0.66      0.66      0.66      4036
   macro avg       0.65      0.57      0.55      4036
weighted avg       0.65      0.66      0.60      4036

Confusion Matrix 
-------------------------


Predicted,False,True,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,336,1189,1525
True,195,2316,2511
All,531,3505,4036


##### GridSearch slightly improved the model by increasing the accuracy by around 0.2% and AUC remained largely unchanged.

## Modeling - XGBoost

In [4]:
# Preventing error from occuring: XGBoost causes kernel to die.
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from xgboost import XGBClassifier
import xgboost as xgb

In [5]:
x_clf = xgb.XGBClassifier()
x_clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [11]:
# Predictions and accuracy from XGBoost
y_pred = x_clf.predict(X_test)

acc = accuracy_score(y_test, y_pred)
print("XGBoost Accuracy:", acc)

# Area under the curve
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print("AUC:", round(roc_auc, 4))

XGBoost Accuracy: 0.6546085232903865
AUC: 0.5705


In [12]:
# Classification Report
print(classification_report(y_test, y_pred))

# Confusion Matrix
y_pred = x_clf.predict(X_test)
print("Confusion Matrix \n-------------------------")
pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

              precision    recall  f1-score   support

       False       0.62      0.23      0.33      1525
        True       0.66      0.91      0.77      2511

   micro avg       0.65      0.65      0.65      4036
   macro avg       0.64      0.57      0.55      4036
weighted avg       0.64      0.65      0.60      4036

Confusion Matrix 
-------------------------


Predicted,False,True,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,345,1180,1525
True,214,2297,2511
All,559,3477,4036


### Improving XGBoost with GridSearch

In [14]:
# Using Cross Validation for XGB
x_clf = xgb.XGBClassifier()
x_cv_score = cross_val_score(x_clf, X, y, cv=3)
mean_x_cv_score = np.mean(x_cv_score)

print("Mean Cross Validation Score:", mean_x_cv_score*100)

Mean Cross Validation Score: 64.87907585315966


In [20]:
x_param_grid = {'learning_rate': [1, .1, .01],
                'max_depth': [2, 4, 6],
                'min_child_weight': [4, 10, 20],
                'subsample': [.7, .5, .8],
                'n_estimators': [5,30,100,250]}

In [21]:
x_grid_search = GridSearchCV(x_clf, x_param_grid, cv=3, return_train_score=True, verbose=5)
x_grid_search.fit(X, y)

Fitting 3 folds for each of 324 candidates, totalling 972 fits
[CV] learning_rate=1, max_depth=2, min_child_weight=4, n_estimators=5, subsample=0.7 
[CV]  learning_rate=1, max_depth=2, min_child_weight=4, n_estimators=5, subsample=0.7, score=0.6424323520666072, total=   0.2s
[CV] learning_rate=1, max_depth=2, min_child_weight=4, n_estimators=5, subsample=0.7 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV]  learning_rate=1, max_depth=2, min_child_weight=4, n_estimators=5, subsample=0.7, score=0.641189591078067, total=   0.2s
[CV] learning_rate=1, max_depth=2, min_child_weight=4, n_estimators=5, subsample=0.7 
[CV]  learning_rate=1, max_depth=2, min_child_weight=4, n_estimators=5, subsample=0.7, score=0.6456505576208178, total=   0.1s
[CV] learning_rate=1, max_depth=2, min_child_weight=4, n_estimators=5, subsample=0.5 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.5s remaining:    0.0s


[CV]  learning_rate=1, max_depth=2, min_child_weight=4, n_estimators=5, subsample=0.5, score=0.6419863217365448, total=   0.2s
[CV] learning_rate=1, max_depth=2, min_child_weight=4, n_estimators=5, subsample=0.5 
[CV]  learning_rate=1, max_depth=2, min_child_weight=4, n_estimators=5, subsample=0.5, score=0.6423791821561339, total=   0.1s
[CV] learning_rate=1, max_depth=2, min_child_weight=4, n_estimators=5, subsample=0.5 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.7s remaining:    0.0s


[CV]  learning_rate=1, max_depth=2, min_child_weight=4, n_estimators=5, subsample=0.5, score=0.6471375464684015, total=   0.2s
[CV] learning_rate=1, max_depth=2, min_child_weight=4, n_estimators=5, subsample=0.8 
[CV]  learning_rate=1, max_depth=2, min_child_weight=4, n_estimators=5, subsample=0.8, score=0.6464466250371692, total=   0.2s
[CV] learning_rate=1, max_depth=2, min_child_weight=4, n_estimators=5, subsample=0.8 
[CV]  learning_rate=1, max_depth=2, min_child_weight=4, n_estimators=5, subsample=0.8, score=0.6441635687732342, total=   0.2s
[CV] learning_rate=1, max_depth=2, min_child_weight=4, n_estimators=5, subsample=0.8 
[CV]  learning_rate=1, max_depth=2, min_child_weight=4, n_estimators=5, subsample=0.8, score=0.6478810408921933, total=   0.1s
[CV] learning_rate=1, max_depth=2, min_child_weight=4, n_estimators=30, subsample=0.7 
[CV]  learning_rate=1, max_depth=2, min_child_weight=4, n_estimators=30, subsample=0.7, score=0.6257805530776093, total=   0.7s
[CV] learning_rate=

[CV]  learning_rate=1, max_depth=2, min_child_weight=10, n_estimators=5, subsample=0.8, score=0.6443122676579925, total=   0.2s
[CV] learning_rate=1, max_depth=2, min_child_weight=10, n_estimators=5, subsample=0.8 
[CV]  learning_rate=1, max_depth=2, min_child_weight=10, n_estimators=5, subsample=0.8, score=0.6471375464684015, total=   0.1s
[CV] learning_rate=1, max_depth=2, min_child_weight=10, n_estimators=30, subsample=0.7 
[CV]  learning_rate=1, max_depth=2, min_child_weight=10, n_estimators=30, subsample=0.7, score=0.6271186440677966, total=   0.6s
[CV] learning_rate=1, max_depth=2, min_child_weight=10, n_estimators=30, subsample=0.7 
[CV]  learning_rate=1, max_depth=2, min_child_weight=10, n_estimators=30, subsample=0.7, score=0.6353903345724907, total=   0.6s
[CV] learning_rate=1, max_depth=2, min_child_weight=10, n_estimators=30, subsample=0.7 
[CV]  learning_rate=1, max_depth=2, min_child_weight=10, n_estimators=30, subsample=0.7, score=0.6377695167286246, total=   0.6s
[CV] l

[CV]  learning_rate=1, max_depth=2, min_child_weight=20, n_estimators=30, subsample=0.7, score=0.6308355634849836, total=   0.6s
[CV] learning_rate=1, max_depth=2, min_child_weight=20, n_estimators=30, subsample=0.7 
[CV]  learning_rate=1, max_depth=2, min_child_weight=20, n_estimators=30, subsample=0.7, score=0.6397026022304833, total=   0.6s
[CV] learning_rate=1, max_depth=2, min_child_weight=20, n_estimators=30, subsample=0.7 
[CV]  learning_rate=1, max_depth=2, min_child_weight=20, n_estimators=30, subsample=0.7, score=0.6325650557620818, total=   0.6s
[CV] learning_rate=1, max_depth=2, min_child_weight=20, n_estimators=30, subsample=0.5 
[CV]  learning_rate=1, max_depth=2, min_child_weight=20, n_estimators=30, subsample=0.5, score=0.6296461492714838, total=   0.6s
[CV] learning_rate=1, max_depth=2, min_child_weight=20, n_estimators=30, subsample=0.5 
[CV]  learning_rate=1, max_depth=2, min_child_weight=20, n_estimators=30, subsample=0.5, score=0.6327137546468401, total=   0.6s
[CV

[CV]  learning_rate=1, max_depth=4, min_child_weight=4, n_estimators=30, subsample=0.7, score=0.614721189591078, total=   1.0s
[CV] learning_rate=1, max_depth=4, min_child_weight=4, n_estimators=30, subsample=0.5 
[CV]  learning_rate=1, max_depth=4, min_child_weight=4, n_estimators=30, subsample=0.5, score=0.5948557835266132, total=   1.0s
[CV] learning_rate=1, max_depth=4, min_child_weight=4, n_estimators=30, subsample=0.5 
[CV]  learning_rate=1, max_depth=4, min_child_weight=4, n_estimators=30, subsample=0.5, score=0.6035687732342008, total=   0.9s
[CV] learning_rate=1, max_depth=4, min_child_weight=4, n_estimators=30, subsample=0.5 
[CV]  learning_rate=1, max_depth=4, min_child_weight=4, n_estimators=30, subsample=0.5, score=0.6053531598513011, total=   0.9s
[CV] learning_rate=1, max_depth=4, min_child_weight=4, n_estimators=30, subsample=0.8 
[CV]  learning_rate=1, max_depth=4, min_child_weight=4, n_estimators=30, subsample=0.8, score=0.6057091882247992, total=   1.1s
[CV] learning

[CV]  learning_rate=1, max_depth=4, min_child_weight=10, n_estimators=30, subsample=0.5, score=0.6035687732342008, total=   0.9s
[CV] learning_rate=1, max_depth=4, min_child_weight=10, n_estimators=30, subsample=0.5 
[CV]  learning_rate=1, max_depth=4, min_child_weight=10, n_estimators=30, subsample=0.5, score=0.6013382899628252, total=   0.9s
[CV] learning_rate=1, max_depth=4, min_child_weight=10, n_estimators=30, subsample=0.8 
[CV]  learning_rate=1, max_depth=4, min_child_weight=10, n_estimators=30, subsample=0.8, score=0.6123996431757359, total=   1.1s
[CV] learning_rate=1, max_depth=4, min_child_weight=10, n_estimators=30, subsample=0.8 
[CV]  learning_rate=1, max_depth=4, min_child_weight=10, n_estimators=30, subsample=0.8, score=0.6113011152416357, total=   1.1s
[CV] learning_rate=1, max_depth=4, min_child_weight=10, n_estimators=30, subsample=0.8 
[CV]  learning_rate=1, max_depth=4, min_child_weight=10, n_estimators=30, subsample=0.8, score=0.6194795539033457, total=   1.2s
[CV

[CV]  learning_rate=1, max_depth=4, min_child_weight=20, n_estimators=30, subsample=0.8, score=0.620130835563485, total=   1.1s
[CV] learning_rate=1, max_depth=4, min_child_weight=20, n_estimators=30, subsample=0.8 
[CV]  learning_rate=1, max_depth=4, min_child_weight=20, n_estimators=30, subsample=0.8, score=0.6130855018587361, total=   1.0s
[CV] learning_rate=1, max_depth=4, min_child_weight=20, n_estimators=30, subsample=0.8 
[CV]  learning_rate=1, max_depth=4, min_child_weight=20, n_estimators=30, subsample=0.8, score=0.6141263940520446, total=   1.1s
[CV] learning_rate=1, max_depth=4, min_child_weight=20, n_estimators=100, subsample=0.7 
[CV]  learning_rate=1, max_depth=4, min_child_weight=20, n_estimators=100, subsample=0.7, score=0.5900981266726137, total=   3.3s
[CV] learning_rate=1, max_depth=4, min_child_weight=20, n_estimators=100, subsample=0.7 
[CV]  learning_rate=1, max_depth=4, min_child_weight=20, n_estimators=100, subsample=0.7, score=0.5952416356877324, total=   3.3s


[CV]  learning_rate=1, max_depth=6, min_child_weight=4, n_estimators=30, subsample=0.8, score=0.5931598513011153, total=   1.6s
[CV] learning_rate=1, max_depth=6, min_child_weight=4, n_estimators=100, subsample=0.7 
[CV]  learning_rate=1, max_depth=6, min_child_weight=4, n_estimators=100, subsample=0.7, score=0.5798394290811775, total=   4.9s
[CV] learning_rate=1, max_depth=6, min_child_weight=4, n_estimators=100, subsample=0.7 
[CV]  learning_rate=1, max_depth=6, min_child_weight=4, n_estimators=100, subsample=0.7, score=0.5751672862453532, total=   5.0s
[CV] learning_rate=1, max_depth=6, min_child_weight=4, n_estimators=100, subsample=0.7 
[CV]  learning_rate=1, max_depth=6, min_child_weight=4, n_estimators=100, subsample=0.7, score=0.5823048327137547, total=   4.9s
[CV] learning_rate=1, max_depth=6, min_child_weight=4, n_estimators=100, subsample=0.5 
[CV]  learning_rate=1, max_depth=6, min_child_weight=4, n_estimators=100, subsample=0.5, score=0.5704727921498662, total=   4.3s
[CV]

[CV]  learning_rate=1, max_depth=6, min_child_weight=10, n_estimators=100, subsample=0.7, score=0.5763568773234201, total=   4.7s
[CV] learning_rate=1, max_depth=6, min_child_weight=10, n_estimators=100, subsample=0.7 
[CV]  learning_rate=1, max_depth=6, min_child_weight=10, n_estimators=100, subsample=0.7, score=0.5846840148698885, total=   4.7s
[CV] learning_rate=1, max_depth=6, min_child_weight=10, n_estimators=100, subsample=0.5 
[CV]  learning_rate=1, max_depth=6, min_child_weight=10, n_estimators=100, subsample=0.5, score=0.5730002973535534, total=   4.1s
[CV] learning_rate=1, max_depth=6, min_child_weight=10, n_estimators=100, subsample=0.5 
[CV]  learning_rate=1, max_depth=6, min_child_weight=10, n_estimators=100, subsample=0.5, score=0.5644609665427509, total=   4.2s
[CV] learning_rate=1, max_depth=6, min_child_weight=10, n_estimators=100, subsample=0.5 
[CV]  learning_rate=1, max_depth=6, min_child_weight=10, n_estimators=100, subsample=0.5, score=0.5815613382899628, total=  

[CV]  learning_rate=1, max_depth=6, min_child_weight=20, n_estimators=100, subsample=0.5, score=0.5674992566161166, total=   4.0s
[CV] learning_rate=1, max_depth=6, min_child_weight=20, n_estimators=100, subsample=0.5 
[CV]  learning_rate=1, max_depth=6, min_child_weight=20, n_estimators=100, subsample=0.5, score=0.5601486988847584, total=   4.0s
[CV] learning_rate=1, max_depth=6, min_child_weight=20, n_estimators=100, subsample=0.5 
[CV]  learning_rate=1, max_depth=6, min_child_weight=20, n_estimators=100, subsample=0.5, score=0.5904832713754646, total=   4.1s
[CV] learning_rate=1, max_depth=6, min_child_weight=20, n_estimators=100, subsample=0.8 
[CV]  learning_rate=1, max_depth=6, min_child_weight=20, n_estimators=100, subsample=0.8, score=0.5851917930419268, total=   5.5s
[CV] learning_rate=1, max_depth=6, min_child_weight=20, n_estimators=100, subsample=0.8 
[CV]  learning_rate=1, max_depth=6, min_child_weight=20, n_estimators=100, subsample=0.8, score=0.5843866171003718, total=  

[CV]  learning_rate=0.1, max_depth=2, min_child_weight=4, n_estimators=100, subsample=0.5, score=0.6539776951672862, total=   1.8s
[CV] learning_rate=0.1, max_depth=2, min_child_weight=4, n_estimators=100, subsample=0.8 
[CV]  learning_rate=0.1, max_depth=2, min_child_weight=4, n_estimators=100, subsample=0.8, score=0.6491228070175439, total=   2.2s
[CV] learning_rate=0.1, max_depth=2, min_child_weight=4, n_estimators=100, subsample=0.8 
[CV]  learning_rate=0.1, max_depth=2, min_child_weight=4, n_estimators=100, subsample=0.8, score=0.6469888475836432, total=   2.1s
[CV] learning_rate=0.1, max_depth=2, min_child_weight=4, n_estimators=100, subsample=0.8 
[CV]  learning_rate=0.1, max_depth=2, min_child_weight=4, n_estimators=100, subsample=0.8, score=0.6547211895910781, total=   2.0s
[CV] learning_rate=0.1, max_depth=2, min_child_weight=4, n_estimators=250, subsample=0.7 
[CV]  learning_rate=0.1, max_depth=2, min_child_weight=4, n_estimators=250, subsample=0.7, score=0.6470413321439191,

[CV]  learning_rate=0.1, max_depth=2, min_child_weight=10, n_estimators=100, subsample=0.8, score=0.6472862453531598, total=   2.2s
[CV] learning_rate=0.1, max_depth=2, min_child_weight=10, n_estimators=100, subsample=0.8 
[CV]  learning_rate=0.1, max_depth=2, min_child_weight=10, n_estimators=100, subsample=0.8, score=0.6529368029739777, total=   2.1s
[CV] learning_rate=0.1, max_depth=2, min_child_weight=10, n_estimators=250, subsample=0.7 
[CV]  learning_rate=0.1, max_depth=2, min_child_weight=10, n_estimators=250, subsample=0.7, score=0.6470413321439191, total=   4.9s
[CV] learning_rate=0.1, max_depth=2, min_child_weight=10, n_estimators=250, subsample=0.7 
[CV]  learning_rate=0.1, max_depth=2, min_child_weight=10, n_estimators=250, subsample=0.7, score=0.6489219330855018, total=   4.9s
[CV] learning_rate=0.1, max_depth=2, min_child_weight=10, n_estimators=250, subsample=0.7 
[CV]  learning_rate=0.1, max_depth=2, min_child_weight=10, n_estimators=250, subsample=0.7, score=0.65263940

[CV]  learning_rate=0.1, max_depth=2, min_child_weight=20, n_estimators=250, subsample=0.7, score=0.6474873624739815, total=   5.0s
[CV] learning_rate=0.1, max_depth=2, min_child_weight=20, n_estimators=250, subsample=0.7 
[CV]  learning_rate=0.1, max_depth=2, min_child_weight=20, n_estimators=250, subsample=0.7, score=0.6507063197026022, total=   5.6s
[CV] learning_rate=0.1, max_depth=2, min_child_weight=20, n_estimators=250, subsample=0.7 
[CV]  learning_rate=0.1, max_depth=2, min_child_weight=20, n_estimators=250, subsample=0.7, score=0.6487732342007435, total=   5.0s
[CV] learning_rate=0.1, max_depth=2, min_child_weight=20, n_estimators=250, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=2, min_child_weight=20, n_estimators=250, subsample=0.5, score=0.6458519179304193, total=   5.0s
[CV] learning_rate=0.1, max_depth=2, min_child_weight=20, n_estimators=250, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=2, min_child_weight=20, n_estimators=250, subsample=0.5, score=0.65026022

[CV]  learning_rate=0.1, max_depth=4, min_child_weight=4, n_estimators=250, subsample=0.7, score=0.6460966542750929, total=   8.5s
[CV] learning_rate=0.1, max_depth=4, min_child_weight=4, n_estimators=250, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=4, min_child_weight=4, n_estimators=250, subsample=0.5, score=0.6391614629794826, total=   7.5s
[CV] learning_rate=0.1, max_depth=4, min_child_weight=4, n_estimators=250, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=4, min_child_weight=4, n_estimators=250, subsample=0.5, score=0.6389591078066914, total=   7.5s
[CV] learning_rate=0.1, max_depth=4, min_child_weight=4, n_estimators=250, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=4, min_child_weight=4, n_estimators=250, subsample=0.5, score=0.6434200743494424, total=   7.5s
[CV] learning_rate=0.1, max_depth=4, min_child_weight=4, n_estimators=250, subsample=0.8 
[CV]  learning_rate=0.1, max_depth=4, min_child_weight=4, n_estimators=250, subsample=0.8, score=0.643919119833482, 

[CV]  learning_rate=0.1, max_depth=4, min_child_weight=10, n_estimators=250, subsample=0.5, score=0.6324163568773234, total=   8.1s
[CV] learning_rate=0.1, max_depth=4, min_child_weight=10, n_estimators=250, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=4, min_child_weight=10, n_estimators=250, subsample=0.5, score=0.6446096654275093, total=   7.8s
[CV] learning_rate=0.1, max_depth=4, min_child_weight=10, n_estimators=250, subsample=0.8 
[CV]  learning_rate=0.1, max_depth=4, min_child_weight=10, n_estimators=250, subsample=0.8, score=0.64049955396967, total=   8.7s
[CV] learning_rate=0.1, max_depth=4, min_child_weight=10, n_estimators=250, subsample=0.8 
[CV]  learning_rate=0.1, max_depth=4, min_child_weight=10, n_estimators=250, subsample=0.8, score=0.6413382899628253, total=   8.9s
[CV] learning_rate=0.1, max_depth=4, min_child_weight=10, n_estimators=250, subsample=0.8 
[CV]  learning_rate=0.1, max_depth=4, min_child_weight=10, n_estimators=250, subsample=0.8, score=0.6496654275

[CV]  learning_rate=0.1, max_depth=4, min_child_weight=20, n_estimators=250, subsample=0.8, score=0.6373773416592329, total=   8.5s
[CV] learning_rate=0.1, max_depth=4, min_child_weight=20, n_estimators=250, subsample=0.8 
[CV]  learning_rate=0.1, max_depth=4, min_child_weight=20, n_estimators=250, subsample=0.8, score=0.6410408921933085, total=   8.6s
[CV] learning_rate=0.1, max_depth=4, min_child_weight=20, n_estimators=250, subsample=0.8 
[CV]  learning_rate=0.1, max_depth=4, min_child_weight=20, n_estimators=250, subsample=0.8, score=0.6483271375464684, total=   8.4s
[CV] learning_rate=0.1, max_depth=6, min_child_weight=4, n_estimators=5, subsample=0.7 
[CV]  learning_rate=0.1, max_depth=6, min_child_weight=4, n_estimators=5, subsample=0.7, score=0.6451085340469819, total=   0.3s
[CV] learning_rate=0.1, max_depth=6, min_child_weight=4, n_estimators=5, subsample=0.7 
[CV]  learning_rate=0.1, max_depth=6, min_child_weight=4, n_estimators=5, subsample=0.7, score=0.6460966542750929, to

[CV]  learning_rate=0.1, max_depth=6, min_child_weight=4, n_estimators=250, subsample=0.8, score=0.6364312267657992, total=  12.7s
[CV] learning_rate=0.1, max_depth=6, min_child_weight=10, n_estimators=5, subsample=0.7 
[CV]  learning_rate=0.1, max_depth=6, min_child_weight=10, n_estimators=5, subsample=0.7, score=0.644513826940232, total=   0.3s
[CV] learning_rate=0.1, max_depth=6, min_child_weight=10, n_estimators=5, subsample=0.7 
[CV]  learning_rate=0.1, max_depth=6, min_child_weight=10, n_estimators=5, subsample=0.7, score=0.6443122676579925, total=   0.3s
[CV] learning_rate=0.1, max_depth=6, min_child_weight=10, n_estimators=5, subsample=0.7 
[CV]  learning_rate=0.1, max_depth=6, min_child_weight=10, n_estimators=5, subsample=0.7, score=0.6501115241635688, total=   0.3s
[CV] learning_rate=0.1, max_depth=6, min_child_weight=10, n_estimators=5, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=6, min_child_weight=10, n_estimators=5, subsample=0.5, score=0.6430270591733571, total=  

[CV]  learning_rate=0.1, max_depth=6, min_child_weight=20, n_estimators=5, subsample=0.7, score=0.6465427509293681, total=   0.3s
[CV] learning_rate=0.1, max_depth=6, min_child_weight=20, n_estimators=5, subsample=0.7 
[CV]  learning_rate=0.1, max_depth=6, min_child_weight=20, n_estimators=5, subsample=0.7, score=0.6459479553903346, total=   0.3s
[CV] learning_rate=0.1, max_depth=6, min_child_weight=20, n_estimators=5, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=6, min_child_weight=20, n_estimators=5, subsample=0.5, score=0.6415402914064823, total=   0.3s
[CV] learning_rate=0.1, max_depth=6, min_child_weight=20, n_estimators=5, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=6, min_child_weight=20, n_estimators=5, subsample=0.5, score=0.6499628252788104, total=   0.3s
[CV] learning_rate=0.1, max_depth=6, min_child_weight=20, n_estimators=5, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=6, min_child_weight=20, n_estimators=5, subsample=0.5, score=0.6474349442379183, total=  

[CV]  learning_rate=0.01, max_depth=2, min_child_weight=4, n_estimators=5, subsample=0.5, score=0.6510556051144811, total=   0.1s
[CV] learning_rate=0.01, max_depth=2, min_child_weight=4, n_estimators=5, subsample=0.5 
[CV]  learning_rate=0.01, max_depth=2, min_child_weight=4, n_estimators=5, subsample=0.5, score=0.6407434944237919, total=   0.1s
[CV] learning_rate=0.01, max_depth=2, min_child_weight=4, n_estimators=5, subsample=0.5 
[CV]  learning_rate=0.01, max_depth=2, min_child_weight=4, n_estimators=5, subsample=0.5, score=0.6450557620817844, total=   0.1s
[CV] learning_rate=0.01, max_depth=2, min_child_weight=4, n_estimators=5, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=2, min_child_weight=4, n_estimators=5, subsample=0.8, score=0.6498661909009813, total=   0.1s
[CV] learning_rate=0.01, max_depth=2, min_child_weight=4, n_estimators=5, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=2, min_child_weight=4, n_estimators=5, subsample=0.8, score=0.6408921933085502, total=  

[CV]  learning_rate=0.01, max_depth=2, min_child_weight=10, n_estimators=5, subsample=0.5, score=0.6407434944237919, total=   0.1s
[CV] learning_rate=0.01, max_depth=2, min_child_weight=10, n_estimators=5, subsample=0.5 
[CV]  learning_rate=0.01, max_depth=2, min_child_weight=10, n_estimators=5, subsample=0.5, score=0.6450557620817844, total=   0.1s
[CV] learning_rate=0.01, max_depth=2, min_child_weight=10, n_estimators=5, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=2, min_child_weight=10, n_estimators=5, subsample=0.8, score=0.6498661909009813, total=   0.2s
[CV] learning_rate=0.01, max_depth=2, min_child_weight=10, n_estimators=5, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=2, min_child_weight=10, n_estimators=5, subsample=0.8, score=0.6408921933085502, total=   0.1s
[CV] learning_rate=0.01, max_depth=2, min_child_weight=10, n_estimators=5, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=2, min_child_weight=10, n_estimators=5, subsample=0.8, score=0.6457992565055762,

[CV]  learning_rate=0.01, max_depth=2, min_child_weight=20, n_estimators=5, subsample=0.8, score=0.6498661909009813, total=   0.2s
[CV] learning_rate=0.01, max_depth=2, min_child_weight=20, n_estimators=5, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=2, min_child_weight=20, n_estimators=5, subsample=0.8, score=0.6408921933085502, total=   0.2s
[CV] learning_rate=0.01, max_depth=2, min_child_weight=20, n_estimators=5, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=2, min_child_weight=20, n_estimators=5, subsample=0.8, score=0.6457992565055762, total=   0.2s
[CV] learning_rate=0.01, max_depth=2, min_child_weight=20, n_estimators=30, subsample=0.7 
[CV]  learning_rate=0.01, max_depth=2, min_child_weight=20, n_estimators=30, subsample=0.7, score=0.6506095747844187, total=   0.6s
[CV] learning_rate=0.01, max_depth=2, min_child_weight=20, n_estimators=30, subsample=0.7 
[CV]  learning_rate=0.01, max_depth=2, min_child_weight=20, n_estimators=30, subsample=0.7, score=0.6408921933085

[CV]  learning_rate=0.01, max_depth=4, min_child_weight=4, n_estimators=5, subsample=0.8, score=0.6469888475836432, total=   0.2s
[CV] learning_rate=0.01, max_depth=4, min_child_weight=4, n_estimators=5, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=4, min_child_weight=4, n_estimators=5, subsample=0.8, score=0.6521933085501859, total=   0.3s
[CV] learning_rate=0.01, max_depth=4, min_child_weight=4, n_estimators=30, subsample=0.7 
[CV]  learning_rate=0.01, max_depth=4, min_child_weight=4, n_estimators=30, subsample=0.7, score=0.651352958667856, total=   1.0s
[CV] learning_rate=0.01, max_depth=4, min_child_weight=4, n_estimators=30, subsample=0.7 
[CV]  learning_rate=0.01, max_depth=4, min_child_weight=4, n_estimators=30, subsample=0.7, score=0.6456505576208178, total=   1.0s
[CV] learning_rate=0.01, max_depth=4, min_child_weight=4, n_estimators=30, subsample=0.7 
[CV]  learning_rate=0.01, max_depth=4, min_child_weight=4, n_estimators=30, subsample=0.7, score=0.6518959107806691, tot

[CV]  learning_rate=0.01, max_depth=4, min_child_weight=10, n_estimators=5, subsample=0.8, score=0.652639405204461, total=   0.3s
[CV] learning_rate=0.01, max_depth=4, min_child_weight=10, n_estimators=30, subsample=0.7 
[CV]  learning_rate=0.01, max_depth=4, min_child_weight=10, n_estimators=30, subsample=0.7, score=0.651947665774606, total=   1.4s
[CV] learning_rate=0.01, max_depth=4, min_child_weight=10, n_estimators=30, subsample=0.7 
[CV]  learning_rate=0.01, max_depth=4, min_child_weight=10, n_estimators=30, subsample=0.7, score=0.6466914498141264, total=   1.4s
[CV] learning_rate=0.01, max_depth=4, min_child_weight=10, n_estimators=30, subsample=0.7 
[CV]  learning_rate=0.01, max_depth=4, min_child_weight=10, n_estimators=30, subsample=0.7, score=0.6510037174721189, total=   1.4s
[CV] learning_rate=0.01, max_depth=4, min_child_weight=10, n_estimators=30, subsample=0.5 
[CV]  learning_rate=0.01, max_depth=4, min_child_weight=10, n_estimators=30, subsample=0.5, score=0.65477252453

[CV]  learning_rate=0.01, max_depth=4, min_child_weight=20, n_estimators=30, subsample=0.7, score=0.651947665774606, total=   1.0s
[CV] learning_rate=0.01, max_depth=4, min_child_weight=20, n_estimators=30, subsample=0.7 
[CV]  learning_rate=0.01, max_depth=4, min_child_weight=20, n_estimators=30, subsample=0.7, score=0.6471375464684015, total=   1.0s
[CV] learning_rate=0.01, max_depth=4, min_child_weight=20, n_estimators=30, subsample=0.7 
[CV]  learning_rate=0.01, max_depth=4, min_child_weight=20, n_estimators=30, subsample=0.7, score=0.6524907063197026, total=   1.1s
[CV] learning_rate=0.01, max_depth=4, min_child_weight=20, n_estimators=30, subsample=0.5 
[CV]  learning_rate=0.01, max_depth=4, min_child_weight=20, n_estimators=30, subsample=0.5, score=0.6534344335414808, total=   1.0s
[CV] learning_rate=0.01, max_depth=4, min_child_weight=20, n_estimators=30, subsample=0.5 
[CV]  learning_rate=0.01, max_depth=4, min_child_weight=20, n_estimators=30, subsample=0.5, score=0.646096654

[CV]  learning_rate=0.01, max_depth=6, min_child_weight=4, n_estimators=30, subsample=0.7, score=0.6457992565055762, total=   1.6s
[CV] learning_rate=0.01, max_depth=6, min_child_weight=4, n_estimators=30, subsample=0.7 
[CV]  learning_rate=0.01, max_depth=6, min_child_weight=4, n_estimators=30, subsample=0.7, score=0.6521933085501859, total=   1.7s
[CV] learning_rate=0.01, max_depth=6, min_child_weight=4, n_estimators=30, subsample=0.5 
[CV]  learning_rate=0.01, max_depth=6, min_child_weight=4, n_estimators=30, subsample=0.5, score=0.648230746357419, total=   1.5s
[CV] learning_rate=0.01, max_depth=6, min_child_weight=4, n_estimators=30, subsample=0.5 
[CV]  learning_rate=0.01, max_depth=6, min_child_weight=4, n_estimators=30, subsample=0.5, score=0.6486245353159852, total=   1.5s
[CV] learning_rate=0.01, max_depth=6, min_child_weight=4, n_estimators=30, subsample=0.5 
[CV]  learning_rate=0.01, max_depth=6, min_child_weight=4, n_estimators=30, subsample=0.5, score=0.6532342007434945, 

[CV]  learning_rate=0.01, max_depth=6, min_child_weight=10, n_estimators=30, subsample=0.7, score=0.6495167286245354, total=   1.6s
[CV] learning_rate=0.01, max_depth=6, min_child_weight=10, n_estimators=30, subsample=0.5 
[CV]  learning_rate=0.01, max_depth=6, min_child_weight=10, n_estimators=30, subsample=0.5, score=0.6486767766874814, total=   1.4s
[CV] learning_rate=0.01, max_depth=6, min_child_weight=10, n_estimators=30, subsample=0.5 
[CV]  learning_rate=0.01, max_depth=6, min_child_weight=10, n_estimators=30, subsample=0.5, score=0.6475836431226766, total=   1.3s
[CV] learning_rate=0.01, max_depth=6, min_child_weight=10, n_estimators=30, subsample=0.5 
[CV]  learning_rate=0.01, max_depth=6, min_child_weight=10, n_estimators=30, subsample=0.5, score=0.652639405204461, total=   1.3s
[CV] learning_rate=0.01, max_depth=6, min_child_weight=10, n_estimators=30, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=6, min_child_weight=10, n_estimators=30, subsample=0.8, score=0.643027059

[CV]  learning_rate=0.01, max_depth=6, min_child_weight=20, n_estimators=30, subsample=0.5, score=0.647933392804044, total=   1.3s
[CV] learning_rate=0.01, max_depth=6, min_child_weight=20, n_estimators=30, subsample=0.5 
[CV]  learning_rate=0.01, max_depth=6, min_child_weight=20, n_estimators=30, subsample=0.5, score=0.64817843866171, total=   1.3s
[CV] learning_rate=0.01, max_depth=6, min_child_weight=20, n_estimators=30, subsample=0.5 
[CV]  learning_rate=0.01, max_depth=6, min_child_weight=20, n_estimators=30, subsample=0.5, score=0.6523420074349442, total=   1.3s
[CV] learning_rate=0.01, max_depth=6, min_child_weight=20, n_estimators=30, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=6, min_child_weight=20, n_estimators=30, subsample=0.8, score=0.6462979482604817, total=   1.6s
[CV] learning_rate=0.01, max_depth=6, min_child_weight=20, n_estimators=30, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=6, min_child_weight=20, n_estimators=30, subsample=0.8, score=0.64594795539

[Parallel(n_jobs=1)]: Done 972 out of 972 | elapsed: 54.1min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=6, min_child_weight=10, missing=None,
       n_estimators=30, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=0.7, verbosity=1),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'learning_rate': [1, 0.1, 0.01], 'max_depth': [2, 4, 6], 'min_child_weight': [4, 10, 20], 'subsample': [0.7, 0.5, 0.8], 'n_estimators': [5, 30, 100, 250]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=5)

In [22]:
print(f"Testing Accuracy: {x_grid_search.best_score_*100}")
print(f"Optimal Parameters: {x_grid_search.best_params_}")

Testing Accuracy: 65.13679619349723
Optimal Parameters: {'learning_rate': 0.1, 'max_depth': 2, 'min_child_weight': 4, 'n_estimators': 100, 'subsample': 0.5}


In [23]:
# Fitting and training XGB
x_clf = xgb.XGBClassifier(learning_rate=.1, max_depth=2, min_child_weight=4, n_estimators=100, subsample=.5)
x_clf.fit(X_train, y_train)

# Predictions and accuracy from XGBoost
y_pred = x_clf.predict(X_test)

acc = accuracy_score(y_test, y_pred)
print("XGBoost Accuracy:", acc)

# Area under the curve
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print("AUC:", round(roc_auc, 4))

XGBoost Accuracy: 0.6585728444003964
AUC: 0.5736


In [19]:
# Classification Report
print(classification_report(y_test, y_pred))

# Confusion Matrix
y_pred = x_clf.predict(X_test)
print("Confusion Matrix \n-------------------------")
pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

              precision    recall  f1-score   support

       False       0.62      0.24      0.35      1525
        True       0.66      0.91      0.77      2511

   micro avg       0.66      0.66      0.66      4036
   macro avg       0.64      0.58      0.56      4036
weighted avg       0.65      0.66      0.61      4036

Confusion Matrix 
-------------------------


Predicted,False,True,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,368,1157,1525
True,225,2286,2511
All,593,3443,4036
