# Modeling: Random Forest and XGBoost

### Importing Libraries

In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, roc_curve, auc, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV, cross_val_score
from tqdm import tqdm_notebook as tqdm
import _pickle as pickle

### Loading in the DataFrame

In [33]:
with open("final_df.pickle",'rb') as fp:
    final_df = pickle.load(fp)

### Preparing Data for Modeling

In [34]:
# Assigning appropriate variables for modeling
X = final_df.drop('price_will_increase?', axis=1)
y = final_df['price_will_increase?']

# Splitting the data 80/20
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

### Function to measure each model's success

In [35]:
def print_metrics(test, preds):
    print("Precision Score: {}".format(precision_score(test, preds)))
    print("Recall Score: {}".format(recall_score(test, preds)))
    print("Accuracy Score: {}".format(accuracy_score(test, preds)))
    print("F1 Score: {}".format(f1_score(test, preds)))

## Modeling - Random Forest

In [36]:
from sklearn.ensemble import RandomForestClassifier

In [37]:
# Fitting and training
forest = RandomForestClassifier(n_estimators=100, max_depth=5)
forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

### Results from the Random Forest Model

In [39]:
# Predictions from RF
y_pred = forest.predict(X_test)

print_metrics(y_test, y_pred)

Precision Score: 0.6298507462686567
Recall Score: 0.6762820512820513
Accuracy Score: 0.6237458193979933
F1 Score: 0.6522411128284389


In [40]:
# Testing Area under the Curve
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print("AUC:", round(roc_auc, 4))

AUC: 0.6214


In [41]:
print("Confusion Matrix \n-----------------")
pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

Confusion Matrix 
-----------------


Predicted,False,True,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,162,124,286
True,101,211,312
All,263,335,598


In [42]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.62      0.57      0.59       286
        True       0.63      0.68      0.65       312

   micro avg       0.62      0.62      0.62       598
   macro avg       0.62      0.62      0.62       598
weighted avg       0.62      0.62      0.62       598



### Improving RandomForest with GridSearchCV

In [43]:
# Using Cross Validation
rf_clf = RandomForestClassifier(n_estimators=100)
rf_cv_score = cross_val_score(rf_clf, X, y, cv=3)
mean_rf_cv_score = np.mean(rf_cv_score)

print("Mean Cross Validation Score:", mean_rf_cv_score*100)

Mean Cross Validation Score: 60.95044160545603


In [49]:
# Setting the Paramters to be tested
rf_param_grid = {'n_estimators': [10,30, 60,100],
                 'criterion': ['gini', 'entropy'],
                 'max_depth': [None, 2, 5, 10],
                 'min_samples_split': [5,10],
                 'min_samples_leaf': [1, 2, 5]}

#### Finding the Best Parameters along with Accuracy

In [50]:
rf_grid_search = GridSearchCV(rf_clf, rf_param_grid, cv=3, return_train_score=True, verbose=2)
rf_grid_search.fit(X, y)

Fitting 3 folds for each of 192 candidates, totalling 576 fits
[CV] criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=10 
[CV]  criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=10, total=   0.1s
[CV] criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=10 
[CV]  criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=10, total=   0.1s
[CV] criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=10 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV]  criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=10, total=   0.1s
[CV] criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=30 
[CV]  criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=30, total=   0.2s
[CV] criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=30 
[CV]  criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=30, total=   0.2s
[CV] criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=30 
[CV]  criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=30, total=   0.2s
[CV] criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=60 
[CV]  criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=60, total=   0.4s
[CV] criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_s

[CV]  criterion=gini, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=60, total=   0.5s
[CV] criterion=gini, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=60 
[CV]  criterion=gini, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=60, total=   0.4s
[CV] criterion=gini, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=60 
[CV]  criterion=gini, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=60, total=   0.4s
[CV] criterion=gini, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=100 
[CV]  criterion=gini, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=100, total=   0.8s
[CV] criterion=gini, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=100 
[CV]  criterion=gini, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=100, total=   0.8s
[CV] criterion=gini, max_depth=None, min_samples_leaf=2, 

[CV]  criterion=gini, max_depth=2, min_samples_leaf=1, min_samples_split=5, n_estimators=100, total=   0.2s
[CV] criterion=gini, max_depth=2, min_samples_leaf=1, min_samples_split=5, n_estimators=100 
[CV]  criterion=gini, max_depth=2, min_samples_leaf=1, min_samples_split=5, n_estimators=100, total=   0.2s
[CV] criterion=gini, max_depth=2, min_samples_leaf=1, min_samples_split=10, n_estimators=10 
[CV]  criterion=gini, max_depth=2, min_samples_leaf=1, min_samples_split=10, n_estimators=10, total=   0.0s
[CV] criterion=gini, max_depth=2, min_samples_leaf=1, min_samples_split=10, n_estimators=10 
[CV]  criterion=gini, max_depth=2, min_samples_leaf=1, min_samples_split=10, n_estimators=10, total=   0.0s
[CV] criterion=gini, max_depth=2, min_samples_leaf=1, min_samples_split=10, n_estimators=10 
[CV]  criterion=gini, max_depth=2, min_samples_leaf=1, min_samples_split=10, n_estimators=10, total=   0.0s
[CV] criterion=gini, max_depth=2, min_samples_leaf=1, min_samples_split=10, n_estimators

[CV]  criterion=gini, max_depth=2, min_samples_leaf=5, min_samples_split=5, n_estimators=30, total=   0.1s
[CV] criterion=gini, max_depth=2, min_samples_leaf=5, min_samples_split=5, n_estimators=30 
[CV]  criterion=gini, max_depth=2, min_samples_leaf=5, min_samples_split=5, n_estimators=30, total=   0.1s
[CV] criterion=gini, max_depth=2, min_samples_leaf=5, min_samples_split=5, n_estimators=60 
[CV]  criterion=gini, max_depth=2, min_samples_leaf=5, min_samples_split=5, n_estimators=60, total=   0.1s
[CV] criterion=gini, max_depth=2, min_samples_leaf=5, min_samples_split=5, n_estimators=60 
[CV]  criterion=gini, max_depth=2, min_samples_leaf=5, min_samples_split=5, n_estimators=60, total=   0.1s
[CV] criterion=gini, max_depth=2, min_samples_leaf=5, min_samples_split=5, n_estimators=60 
[CV]  criterion=gini, max_depth=2, min_samples_leaf=5, min_samples_split=5, n_estimators=60, total=   0.1s
[CV] criterion=gini, max_depth=2, min_samples_leaf=5, min_samples_split=5, n_estimators=100 
[CV]

[CV]  criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=10, n_estimators=100, total=   0.4s
[CV] criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=10, n_estimators=100 
[CV]  criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=10, n_estimators=100, total=   0.4s
[CV] criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=10, n_estimators=100 
[CV]  criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=10, n_estimators=100, total=   0.4s
[CV] criterion=gini, max_depth=5, min_samples_leaf=2, min_samples_split=5, n_estimators=10 
[CV]  criterion=gini, max_depth=5, min_samples_leaf=2, min_samples_split=5, n_estimators=10, total=   0.0s
[CV] criterion=gini, max_depth=5, min_samples_leaf=2, min_samples_split=5, n_estimators=10 
[CV]  criterion=gini, max_depth=5, min_samples_leaf=2, min_samples_split=5, n_estimators=10, total=   0.0s
[CV] criterion=gini, max_depth=5, min_samples_leaf=2, min_samples_split=5, n_estimators

[CV]  criterion=gini, max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators=30, total=   0.1s
[CV] criterion=gini, max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators=30 
[CV]  criterion=gini, max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators=30, total=   0.1s
[CV] criterion=gini, max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators=30 
[CV]  criterion=gini, max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators=30, total=   0.1s
[CV] criterion=gini, max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators=60 
[CV]  criterion=gini, max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators=60, total=   0.2s
[CV] criterion=gini, max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators=60 
[CV]  criterion=gini, max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators=60, total=   0.2s
[CV] criterion=gini, max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators

[CV]  criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=60, total=   0.3s
[CV] criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100 
[CV]  criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100, total=   0.6s
[CV] criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100 
[CV]  criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100, total=   0.6s
[CV] criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100 
[CV]  criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100, total=   0.6s
[CV] criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=10 
[CV]  criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=10, total=   0.1s
[CV] criterion=gini, max_depth=10, min_samples_leaf=2, min_samples_split=10, n_e

[CV]  criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=10, total=   0.1s
[CV] criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=10 
[CV]  criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=10, total=   0.1s
[CV] criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=30 
[CV]  criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=30, total=   0.3s
[CV] criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=30 
[CV]  criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=30, total=   0.3s
[CV] criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=30 
[CV]  criterion=entropy, max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=30, total=   0.3s
[CV] criterion=entropy, max_depth=None, min

[CV]  criterion=entropy, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=30, total=   0.3s
[CV] criterion=entropy, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=30 
[CV]  criterion=entropy, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=30, total=   0.3s
[CV] criterion=entropy, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=60 
[CV]  criterion=entropy, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=60, total=   0.6s
[CV] criterion=entropy, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=60 
[CV]  criterion=entropy, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=60, total=   0.6s
[CV] criterion=entropy, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=60 
[CV]  criterion=entropy, max_depth=None, min_samples_leaf=2, min_samples_split=10, n_estimators=60, total=   0.6s
[CV] criterion=entropy, max_depth=

[CV]  criterion=entropy, max_depth=2, min_samples_leaf=1, min_samples_split=5, n_estimators=60, total=   0.2s
[CV] criterion=entropy, max_depth=2, min_samples_leaf=1, min_samples_split=5, n_estimators=100 
[CV]  criterion=entropy, max_depth=2, min_samples_leaf=1, min_samples_split=5, n_estimators=100, total=   0.3s
[CV] criterion=entropy, max_depth=2, min_samples_leaf=1, min_samples_split=5, n_estimators=100 
[CV]  criterion=entropy, max_depth=2, min_samples_leaf=1, min_samples_split=5, n_estimators=100, total=   0.3s
[CV] criterion=entropy, max_depth=2, min_samples_leaf=1, min_samples_split=5, n_estimators=100 
[CV]  criterion=entropy, max_depth=2, min_samples_leaf=1, min_samples_split=5, n_estimators=100, total=   0.3s
[CV] criterion=entropy, max_depth=2, min_samples_leaf=1, min_samples_split=10, n_estimators=10 
[CV]  criterion=entropy, max_depth=2, min_samples_leaf=1, min_samples_split=10, n_estimators=10, total=   0.0s
[CV] criterion=entropy, max_depth=2, min_samples_leaf=1, min_s

[CV]  criterion=entropy, max_depth=2, min_samples_leaf=5, min_samples_split=5, n_estimators=30, total=   0.1s
[CV] criterion=entropy, max_depth=2, min_samples_leaf=5, min_samples_split=5, n_estimators=30 
[CV]  criterion=entropy, max_depth=2, min_samples_leaf=5, min_samples_split=5, n_estimators=30, total=   0.1s
[CV] criterion=entropy, max_depth=2, min_samples_leaf=5, min_samples_split=5, n_estimators=60 
[CV]  criterion=entropy, max_depth=2, min_samples_leaf=5, min_samples_split=5, n_estimators=60, total=   0.2s
[CV] criterion=entropy, max_depth=2, min_samples_leaf=5, min_samples_split=5, n_estimators=60 
[CV]  criterion=entropy, max_depth=2, min_samples_leaf=5, min_samples_split=5, n_estimators=60, total=   0.2s
[CV] criterion=entropy, max_depth=2, min_samples_leaf=5, min_samples_split=5, n_estimators=60 
[CV]  criterion=entropy, max_depth=2, min_samples_leaf=5, min_samples_split=5, n_estimators=60, total=   0.2s
[CV] criterion=entropy, max_depth=2, min_samples_leaf=5, min_samples_s

[CV]  criterion=entropy, max_depth=5, min_samples_leaf=1, min_samples_split=10, n_estimators=60, total=   0.3s
[CV] criterion=entropy, max_depth=5, min_samples_leaf=1, min_samples_split=10, n_estimators=100 
[CV]  criterion=entropy, max_depth=5, min_samples_leaf=1, min_samples_split=10, n_estimators=100, total=   0.5s
[CV] criterion=entropy, max_depth=5, min_samples_leaf=1, min_samples_split=10, n_estimators=100 
[CV]  criterion=entropy, max_depth=5, min_samples_leaf=1, min_samples_split=10, n_estimators=100, total=   0.5s
[CV] criterion=entropy, max_depth=5, min_samples_leaf=1, min_samples_split=10, n_estimators=100 
[CV]  criterion=entropy, max_depth=5, min_samples_leaf=1, min_samples_split=10, n_estimators=100, total=   0.5s
[CV] criterion=entropy, max_depth=5, min_samples_leaf=2, min_samples_split=5, n_estimators=10 
[CV]  criterion=entropy, max_depth=5, min_samples_leaf=2, min_samples_split=5, n_estimators=10, total=   0.1s
[CV] criterion=entropy, max_depth=5, min_samples_leaf=2, 

[CV]  criterion=entropy, max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators=30, total=   0.2s
[CV] criterion=entropy, max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators=30 
[CV]  criterion=entropy, max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators=30, total=   0.1s
[CV] criterion=entropy, max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators=30 
[CV]  criterion=entropy, max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators=30, total=   0.2s
[CV] criterion=entropy, max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators=60 
[CV]  criterion=entropy, max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators=60, total=   0.3s
[CV] criterion=entropy, max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators=60 
[CV]  criterion=entropy, max_depth=5, min_samples_leaf=5, min_samples_split=10, n_estimators=60, total=   0.3s
[CV] criterion=entropy, max_depth=5, min_samples_leaf=5, min_

[CV]  criterion=entropy, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=60, total=   0.5s
[CV] criterion=entropy, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=60 
[CV]  criterion=entropy, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=60, total=   0.5s
[CV] criterion=entropy, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100 
[CV]  criterion=entropy, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100, total=   0.8s
[CV] criterion=entropy, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100 
[CV]  criterion=entropy, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100, total=   0.8s
[CV] criterion=entropy, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100 
[CV]  criterion=entropy, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100, total=   0.8s
[CV] criterion=entropy, max_depth=10, min_samples_leaf=

[CV]  criterion=entropy, max_depth=10, min_samples_leaf=5, min_samples_split=10, n_estimators=100, total=   0.8s


[Parallel(n_jobs=1)]: Done 576 out of 576 | elapsed:  2.8min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [10, 30, 60, 100], 'criterion': ['gini', 'entropy'], 'max_depth': [None, 2, 5, 10], 'min_samples_split': [5, 10], 'min_samples_leaf': [1, 2, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=2)

In [51]:
print(f"Testing Accuracy: {rf_grid_search.best_score_*100}")
print(f"Optimal Parameters: {rf_grid_search.best_params_}")

Testing Accuracy: 60.98459477561956
Optimal Parameters: {'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 30}


#### Using the RandomForest parameters from GridSearch

In [52]:
# Fitting and training
forest = RandomForestClassifier(n_estimators=30, max_depth=5, criterion='gini', min_samples_leaf=1, min_samples_split=5)
forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

### Results from Using the parameters in GridSearch for Random Forest

In [53]:
# Predictions from RF
y_pred = forest.predict(X_test)

print_metrics(y_test, y_pred)

Precision Score: 0.6231884057971014
Recall Score: 0.6891025641025641
Accuracy Score: 0.6204013377926422
F1 Score: 0.6544901065449011


In [54]:
# Testing Area under the Curve
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print("AUC:", round(roc_auc, 4))

AUC: 0.6173


In [55]:
# Classification Report
print(classification_report(y_test, y_pred))

# Confusion Matrix
y_pred = forest.predict(X_test)
print("Confusion Matrix \n-------------------------")
pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

              precision    recall  f1-score   support

       False       0.62      0.55      0.58       286
        True       0.62      0.69      0.65       312

   micro avg       0.62      0.62      0.62       598
   macro avg       0.62      0.62      0.62       598
weighted avg       0.62      0.62      0.62       598

Confusion Matrix 
-------------------------


Predicted,False,True,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,156,130,286
True,97,215,312
All,253,345,598


##### GridSearch slightly improved the model by increasing the accuracy by around 0.2% and AUC remained largely unchanged.

## Modeling - XGBoost

In [56]:
# Preventing error from occuring: XGBoost causes kernel to die.
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from xgboost import XGBClassifier
import xgboost as xgb

In [57]:
# Fitting and training the model
x_clf = xgb.XGBClassifier()
x_clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

### Results from XGBoost

In [58]:
y_pred = x_clf.predict(X_test)

print_metrics(y_test, y_pred)

Precision Score: 0.6181229773462783
Recall Score: 0.6121794871794872
Accuracy Score: 0.6003344481605352
F1 Score: 0.6151368760064413


In [59]:
# Area under the curve
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print("AUC:", round(roc_auc, 4))

AUC: 0.5998


In [60]:
# Classification Report
print(classification_report(y_test, y_pred))

# Confusion Matrix
y_pred = x_clf.predict(X_test)
print("Confusion Matrix \n-------------------------")
pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

              precision    recall  f1-score   support

       False       0.58      0.59      0.58       286
        True       0.62      0.61      0.62       312

   micro avg       0.60      0.60      0.60       598
   macro avg       0.60      0.60      0.60       598
weighted avg       0.60      0.60      0.60       598

Confusion Matrix 
-------------------------


Predicted,False,True,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,168,118,286
True,121,191,312
All,289,309,598


### Improving XGBoost with GridSearch

In [61]:
# Using Cross Validation for XGB
x_clf = xgb.XGBClassifier()
x_cv_score = cross_val_score(x_clf, X, y, cv=3)
mean_x_cv_score = np.mean(x_cv_score)

print("Mean Cross Validation Score:", mean_x_cv_score*100)

Mean Cross Validation Score: 59.711077711247405


In [62]:
x_param_grid = {'learning_rate': [1, .1, .01],
                'max_depth': [2, 3, 6],
                'min_child_weight': [1, 2, 5],
                'subsample': [.7, .5, .8],
                'n_estimators': [5,30,100,250]}

In [64]:
x_grid_search = GridSearchCV(x_clf, x_param_grid, cv=3, return_train_score=True, verbose=2)
x_grid_search.fit(X, y)

Fitting 3 folds for each of 324 candidates, totalling 972 fits
[CV] learning_rate=1, max_depth=2, min_child_weight=1, n_estimators=5, subsample=0.7 
[CV]  learning_rate=1, max_depth=2, min_child_weight=1, n_estimators=5, subsample=0.7, total=   0.0s
[CV] learning_rate=1, max_depth=2, min_child_weight=1, n_estimators=5, subsample=0.7 
[CV]  learning_rate=1, max_depth=2, min_child_weight=1, n_estimators=5, subsample=0.7, total=   0.0s
[CV] learning_rate=1, max_depth=2, min_child_weight=1, n_estimators=5, subsample=0.7 
[CV]  learning_rate=1, max_depth=2, min_child_weight=1, n_estimators=5, subsample=0.7, total=   0.0s
[CV] learning_rate=1, max_depth=2, min_child_weight=1, n_estimators=5, subsample=0.5 
[CV]  learning_rate=1, max_depth=2, min_child_weight=1, n_estimators=5, subsample=0.5, total=   0.0s
[CV] learning_rate=1, max_depth=2, min_child_weight=1, n_estimators=5, subsample=0.5 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV]  learning_rate=1, max_depth=2, min_child_weight=1, n_estimators=5, subsample=0.5, total=   0.0s
[CV] learning_rate=1, max_depth=2, min_child_weight=1, n_estimators=5, subsample=0.5 
[CV]  learning_rate=1, max_depth=2, min_child_weight=1, n_estimators=5, subsample=0.5, total=   0.0s
[CV] learning_rate=1, max_depth=2, min_child_weight=1, n_estimators=5, subsample=0.8 
[CV]  learning_rate=1, max_depth=2, min_child_weight=1, n_estimators=5, subsample=0.8, total=   0.0s
[CV] learning_rate=1, max_depth=2, min_child_weight=1, n_estimators=5, subsample=0.8 
[CV]  learning_rate=1, max_depth=2, min_child_weight=1, n_estimators=5, subsample=0.8, total=   0.0s
[CV] learning_rate=1, max_depth=2, min_child_weight=1, n_estimators=5, subsample=0.8 
[CV]  learning_rate=1, max_depth=2, min_child_weight=1, n_estimators=5, subsample=0.8, total=   0.0s
[CV] learning_rate=1, max_depth=2, min_child_weight=1, n_estimators=30, subsample=0.7 
[CV]  learning_rate=1, max_depth=2, min_child_weight=1, n_estima

[CV]  learning_rate=1, max_depth=2, min_child_weight=2, n_estimators=30, subsample=0.5, total=   0.1s
[CV] learning_rate=1, max_depth=2, min_child_weight=2, n_estimators=30, subsample=0.5 
[CV]  learning_rate=1, max_depth=2, min_child_weight=2, n_estimators=30, subsample=0.5, total=   0.1s
[CV] learning_rate=1, max_depth=2, min_child_weight=2, n_estimators=30, subsample=0.5 
[CV]  learning_rate=1, max_depth=2, min_child_weight=2, n_estimators=30, subsample=0.5, total=   0.1s
[CV] learning_rate=1, max_depth=2, min_child_weight=2, n_estimators=30, subsample=0.8 
[CV]  learning_rate=1, max_depth=2, min_child_weight=2, n_estimators=30, subsample=0.8, total=   0.2s
[CV] learning_rate=1, max_depth=2, min_child_weight=2, n_estimators=30, subsample=0.8 
[CV]  learning_rate=1, max_depth=2, min_child_weight=2, n_estimators=30, subsample=0.8, total=   0.2s
[CV] learning_rate=1, max_depth=2, min_child_weight=2, n_estimators=30, subsample=0.8 
[CV]  learning_rate=1, max_depth=2, min_child_weight=2,

[CV]  learning_rate=1, max_depth=2, min_child_weight=5, n_estimators=100, subsample=0.7, total=   0.5s
[CV] learning_rate=1, max_depth=2, min_child_weight=5, n_estimators=100, subsample=0.5 
[CV]  learning_rate=1, max_depth=2, min_child_weight=5, n_estimators=100, subsample=0.5, total=   0.4s
[CV] learning_rate=1, max_depth=2, min_child_weight=5, n_estimators=100, subsample=0.5 
[CV]  learning_rate=1, max_depth=2, min_child_weight=5, n_estimators=100, subsample=0.5, total=   0.4s
[CV] learning_rate=1, max_depth=2, min_child_weight=5, n_estimators=100, subsample=0.5 
[CV]  learning_rate=1, max_depth=2, min_child_weight=5, n_estimators=100, subsample=0.5, total=   0.4s
[CV] learning_rate=1, max_depth=2, min_child_weight=5, n_estimators=100, subsample=0.8 
[CV]  learning_rate=1, max_depth=2, min_child_weight=5, n_estimators=100, subsample=0.8, total=   0.5s
[CV] learning_rate=1, max_depth=2, min_child_weight=5, n_estimators=100, subsample=0.8 
[CV]  learning_rate=1, max_depth=2, min_child

[CV]  learning_rate=1, max_depth=3, min_child_weight=1, n_estimators=250, subsample=0.7, total=   1.6s
[CV] learning_rate=1, max_depth=3, min_child_weight=1, n_estimators=250, subsample=0.7 
[CV]  learning_rate=1, max_depth=3, min_child_weight=1, n_estimators=250, subsample=0.7, total=   1.6s
[CV] learning_rate=1, max_depth=3, min_child_weight=1, n_estimators=250, subsample=0.5 
[CV]  learning_rate=1, max_depth=3, min_child_weight=1, n_estimators=250, subsample=0.5, total=   1.5s
[CV] learning_rate=1, max_depth=3, min_child_weight=1, n_estimators=250, subsample=0.5 
[CV]  learning_rate=1, max_depth=3, min_child_weight=1, n_estimators=250, subsample=0.5, total=   1.4s
[CV] learning_rate=1, max_depth=3, min_child_weight=1, n_estimators=250, subsample=0.5 
[CV]  learning_rate=1, max_depth=3, min_child_weight=1, n_estimators=250, subsample=0.5, total=   1.4s
[CV] learning_rate=1, max_depth=3, min_child_weight=1, n_estimators=250, subsample=0.8 
[CV]  learning_rate=1, max_depth=3, min_child

[CV]  learning_rate=1, max_depth=3, min_child_weight=5, n_estimators=5, subsample=0.5, total=   0.0s
[CV] learning_rate=1, max_depth=3, min_child_weight=5, n_estimators=5, subsample=0.5 
[CV]  learning_rate=1, max_depth=3, min_child_weight=5, n_estimators=5, subsample=0.5, total=   0.0s
[CV] learning_rate=1, max_depth=3, min_child_weight=5, n_estimators=5, subsample=0.8 
[CV]  learning_rate=1, max_depth=3, min_child_weight=5, n_estimators=5, subsample=0.8, total=   0.0s
[CV] learning_rate=1, max_depth=3, min_child_weight=5, n_estimators=5, subsample=0.8 
[CV]  learning_rate=1, max_depth=3, min_child_weight=5, n_estimators=5, subsample=0.8, total=   0.0s
[CV] learning_rate=1, max_depth=3, min_child_weight=5, n_estimators=5, subsample=0.8 
[CV]  learning_rate=1, max_depth=3, min_child_weight=5, n_estimators=5, subsample=0.8, total=   0.0s
[CV] learning_rate=1, max_depth=3, min_child_weight=5, n_estimators=30, subsample=0.7 
[CV]  learning_rate=1, max_depth=3, min_child_weight=5, n_estima

[CV]  learning_rate=1, max_depth=6, min_child_weight=1, n_estimators=30, subsample=0.5, total=   0.3s
[CV] learning_rate=1, max_depth=6, min_child_weight=1, n_estimators=30, subsample=0.5 
[CV]  learning_rate=1, max_depth=6, min_child_weight=1, n_estimators=30, subsample=0.5, total=   0.3s
[CV] learning_rate=1, max_depth=6, min_child_weight=1, n_estimators=30, subsample=0.5 
[CV]  learning_rate=1, max_depth=6, min_child_weight=1, n_estimators=30, subsample=0.5, total=   0.3s
[CV] learning_rate=1, max_depth=6, min_child_weight=1, n_estimators=30, subsample=0.8 
[CV]  learning_rate=1, max_depth=6, min_child_weight=1, n_estimators=30, subsample=0.8, total=   0.4s
[CV] learning_rate=1, max_depth=6, min_child_weight=1, n_estimators=30, subsample=0.8 
[CV]  learning_rate=1, max_depth=6, min_child_weight=1, n_estimators=30, subsample=0.8, total=   0.4s
[CV] learning_rate=1, max_depth=6, min_child_weight=1, n_estimators=30, subsample=0.8 
[CV]  learning_rate=1, max_depth=6, min_child_weight=1,

[CV]  learning_rate=1, max_depth=6, min_child_weight=2, n_estimators=100, subsample=0.7, total=   0.9s
[CV] learning_rate=1, max_depth=6, min_child_weight=2, n_estimators=100, subsample=0.5 
[CV]  learning_rate=1, max_depth=6, min_child_weight=2, n_estimators=100, subsample=0.5, total=   0.8s
[CV] learning_rate=1, max_depth=6, min_child_weight=2, n_estimators=100, subsample=0.5 
[CV]  learning_rate=1, max_depth=6, min_child_weight=2, n_estimators=100, subsample=0.5, total=   0.8s
[CV] learning_rate=1, max_depth=6, min_child_weight=2, n_estimators=100, subsample=0.5 
[CV]  learning_rate=1, max_depth=6, min_child_weight=2, n_estimators=100, subsample=0.5, total=   0.9s
[CV] learning_rate=1, max_depth=6, min_child_weight=2, n_estimators=100, subsample=0.8 
[CV]  learning_rate=1, max_depth=6, min_child_weight=2, n_estimators=100, subsample=0.8, total=   1.0s
[CV] learning_rate=1, max_depth=6, min_child_weight=2, n_estimators=100, subsample=0.8 
[CV]  learning_rate=1, max_depth=6, min_child

[CV]  learning_rate=1, max_depth=6, min_child_weight=5, n_estimators=250, subsample=0.7, total=   1.9s
[CV] learning_rate=1, max_depth=6, min_child_weight=5, n_estimators=250, subsample=0.7 
[CV]  learning_rate=1, max_depth=6, min_child_weight=5, n_estimators=250, subsample=0.7, total=   1.9s
[CV] learning_rate=1, max_depth=6, min_child_weight=5, n_estimators=250, subsample=0.5 
[CV]  learning_rate=1, max_depth=6, min_child_weight=5, n_estimators=250, subsample=0.5, total=   1.6s
[CV] learning_rate=1, max_depth=6, min_child_weight=5, n_estimators=250, subsample=0.5 
[CV]  learning_rate=1, max_depth=6, min_child_weight=5, n_estimators=250, subsample=0.5, total=   1.6s
[CV] learning_rate=1, max_depth=6, min_child_weight=5, n_estimators=250, subsample=0.5 
[CV]  learning_rate=1, max_depth=6, min_child_weight=5, n_estimators=250, subsample=0.5, total=   1.6s
[CV] learning_rate=1, max_depth=6, min_child_weight=5, n_estimators=250, subsample=0.8 
[CV]  learning_rate=1, max_depth=6, min_child

[CV]  learning_rate=0.1, max_depth=2, min_child_weight=1, n_estimators=250, subsample=0.8, total=   1.2s
[CV] learning_rate=0.1, max_depth=2, min_child_weight=2, n_estimators=5, subsample=0.7 
[CV]  learning_rate=0.1, max_depth=2, min_child_weight=2, n_estimators=5, subsample=0.7, total=   0.0s
[CV] learning_rate=0.1, max_depth=2, min_child_weight=2, n_estimators=5, subsample=0.7 
[CV]  learning_rate=0.1, max_depth=2, min_child_weight=2, n_estimators=5, subsample=0.7, total=   0.0s
[CV] learning_rate=0.1, max_depth=2, min_child_weight=2, n_estimators=5, subsample=0.7 
[CV]  learning_rate=0.1, max_depth=2, min_child_weight=2, n_estimators=5, subsample=0.7, total=   0.0s
[CV] learning_rate=0.1, max_depth=2, min_child_weight=2, n_estimators=5, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=2, min_child_weight=2, n_estimators=5, subsample=0.5, total=   0.0s
[CV] learning_rate=0.1, max_depth=2, min_child_weight=2, n_estimators=5, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=2, min_c

[CV]  learning_rate=0.1, max_depth=2, min_child_weight=5, n_estimators=5, subsample=0.8, total=   0.1s
[CV] learning_rate=0.1, max_depth=2, min_child_weight=5, n_estimators=30, subsample=0.7 
[CV]  learning_rate=0.1, max_depth=2, min_child_weight=5, n_estimators=30, subsample=0.7, total=   0.2s
[CV] learning_rate=0.1, max_depth=2, min_child_weight=5, n_estimators=30, subsample=0.7 
[CV]  learning_rate=0.1, max_depth=2, min_child_weight=5, n_estimators=30, subsample=0.7, total=   0.2s
[CV] learning_rate=0.1, max_depth=2, min_child_weight=5, n_estimators=30, subsample=0.7 
[CV]  learning_rate=0.1, max_depth=2, min_child_weight=5, n_estimators=30, subsample=0.7, total=   0.2s
[CV] learning_rate=0.1, max_depth=2, min_child_weight=5, n_estimators=30, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=2, min_child_weight=5, n_estimators=30, subsample=0.5, total=   0.1s
[CV] learning_rate=0.1, max_depth=2, min_child_weight=5, n_estimators=30, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=2

[CV]  learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=30, subsample=0.8, total=   0.3s
[CV] learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=30, subsample=0.8 
[CV]  learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=30, subsample=0.8, total=   0.2s
[CV] learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=30, subsample=0.8 
[CV]  learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=30, subsample=0.8, total=   0.2s
[CV] learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.7 
[CV]  learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.7, total=   0.7s
[CV] learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.7 
[CV]  learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.7, total=   0.7s
[CV] learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.7 
[CV]  learning_rate=0.1, max_d

[CV]  learning_rate=0.1, max_depth=3, min_child_weight=2, n_estimators=100, subsample=0.5, total=   0.6s
[CV] learning_rate=0.1, max_depth=3, min_child_weight=2, n_estimators=100, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=3, min_child_weight=2, n_estimators=100, subsample=0.5, total=   0.6s
[CV] learning_rate=0.1, max_depth=3, min_child_weight=2, n_estimators=100, subsample=0.8 
[CV]  learning_rate=0.1, max_depth=3, min_child_weight=2, n_estimators=100, subsample=0.8, total=   0.7s
[CV] learning_rate=0.1, max_depth=3, min_child_weight=2, n_estimators=100, subsample=0.8 
[CV]  learning_rate=0.1, max_depth=3, min_child_weight=2, n_estimators=100, subsample=0.8, total=   0.7s
[CV] learning_rate=0.1, max_depth=3, min_child_weight=2, n_estimators=100, subsample=0.8 
[CV]  learning_rate=0.1, max_depth=3, min_child_weight=2, n_estimators=100, subsample=0.8, total=   0.7s
[CV] learning_rate=0.1, max_depth=3, min_child_weight=2, n_estimators=250, subsample=0.7 
[CV]  learning_rate=0.1, 

[CV]  learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=250, subsample=0.7, total=   1.6s
[CV] learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=250, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=250, subsample=0.5, total=   1.6s
[CV] learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=250, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=250, subsample=0.5, total=   1.5s
[CV] learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=250, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=250, subsample=0.5, total=   1.4s
[CV] learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=250, subsample=0.8 
[CV]  learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=250, subsample=0.8, total=   2.0s
[CV] learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=250, subsample=0.8 
[CV]  learning_rate=0.1, 

[CV]  learning_rate=0.1, max_depth=6, min_child_weight=2, n_estimators=5, subsample=0.7, total=   0.1s
[CV] learning_rate=0.1, max_depth=6, min_child_weight=2, n_estimators=5, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=6, min_child_weight=2, n_estimators=5, subsample=0.5, total=   0.1s
[CV] learning_rate=0.1, max_depth=6, min_child_weight=2, n_estimators=5, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=6, min_child_weight=2, n_estimators=5, subsample=0.5, total=   0.1s
[CV] learning_rate=0.1, max_depth=6, min_child_weight=2, n_estimators=5, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=6, min_child_weight=2, n_estimators=5, subsample=0.5, total=   0.1s
[CV] learning_rate=0.1, max_depth=6, min_child_weight=2, n_estimators=5, subsample=0.8 
[CV]  learning_rate=0.1, max_depth=6, min_child_weight=2, n_estimators=5, subsample=0.8, total=   0.1s
[CV] learning_rate=0.1, max_depth=6, min_child_weight=2, n_estimators=5, subsample=0.8 
[CV]  learning_rate=0.1, max_depth=6, min_chi

[CV]  learning_rate=0.1, max_depth=6, min_child_weight=5, n_estimators=30, subsample=0.7, total=   0.4s
[CV] learning_rate=0.1, max_depth=6, min_child_weight=5, n_estimators=30, subsample=0.7 
[CV]  learning_rate=0.1, max_depth=6, min_child_weight=5, n_estimators=30, subsample=0.7, total=   0.3s
[CV] learning_rate=0.1, max_depth=6, min_child_weight=5, n_estimators=30, subsample=0.7 
[CV]  learning_rate=0.1, max_depth=6, min_child_weight=5, n_estimators=30, subsample=0.7, total=   0.3s
[CV] learning_rate=0.1, max_depth=6, min_child_weight=5, n_estimators=30, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=6, min_child_weight=5, n_estimators=30, subsample=0.5, total=   0.3s
[CV] learning_rate=0.1, max_depth=6, min_child_weight=5, n_estimators=30, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=6, min_child_weight=5, n_estimators=30, subsample=0.5, total=   0.3s
[CV] learning_rate=0.1, max_depth=6, min_child_weight=5, n_estimators=30, subsample=0.5 
[CV]  learning_rate=0.1, max_depth=

[CV]  learning_rate=0.01, max_depth=2, min_child_weight=1, n_estimators=30, subsample=0.8, total=   0.2s
[CV] learning_rate=0.01, max_depth=2, min_child_weight=1, n_estimators=100, subsample=0.7 
[CV]  learning_rate=0.01, max_depth=2, min_child_weight=1, n_estimators=100, subsample=0.7, total=   0.5s
[CV] learning_rate=0.01, max_depth=2, min_child_weight=1, n_estimators=100, subsample=0.7 
[CV]  learning_rate=0.01, max_depth=2, min_child_weight=1, n_estimators=100, subsample=0.7, total=   0.5s
[CV] learning_rate=0.01, max_depth=2, min_child_weight=1, n_estimators=100, subsample=0.7 
[CV]  learning_rate=0.01, max_depth=2, min_child_weight=1, n_estimators=100, subsample=0.7, total=   0.5s
[CV] learning_rate=0.01, max_depth=2, min_child_weight=1, n_estimators=100, subsample=0.5 
[CV]  learning_rate=0.01, max_depth=2, min_child_weight=1, n_estimators=100, subsample=0.5, total=   0.4s
[CV] learning_rate=0.01, max_depth=2, min_child_weight=1, n_estimators=100, subsample=0.5 
[CV]  learning_r

[CV]  learning_rate=0.01, max_depth=2, min_child_weight=2, n_estimators=100, subsample=0.5, total=   0.4s
[CV] learning_rate=0.01, max_depth=2, min_child_weight=2, n_estimators=100, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=2, min_child_weight=2, n_estimators=100, subsample=0.8, total=   0.5s
[CV] learning_rate=0.01, max_depth=2, min_child_weight=2, n_estimators=100, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=2, min_child_weight=2, n_estimators=100, subsample=0.8, total=   0.5s
[CV] learning_rate=0.01, max_depth=2, min_child_weight=2, n_estimators=100, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=2, min_child_weight=2, n_estimators=100, subsample=0.8, total=   0.5s
[CV] learning_rate=0.01, max_depth=2, min_child_weight=2, n_estimators=250, subsample=0.7 
[CV]  learning_rate=0.01, max_depth=2, min_child_weight=2, n_estimators=250, subsample=0.7, total=   1.2s
[CV] learning_rate=0.01, max_depth=2, min_child_weight=2, n_estimators=250, subsample=0.7 
[CV]  learning_

[CV]  learning_rate=0.01, max_depth=2, min_child_weight=5, n_estimators=250, subsample=0.7, total=   1.1s
[CV] learning_rate=0.01, max_depth=2, min_child_weight=5, n_estimators=250, subsample=0.5 
[CV]  learning_rate=0.01, max_depth=2, min_child_weight=5, n_estimators=250, subsample=0.5, total=   1.0s
[CV] learning_rate=0.01, max_depth=2, min_child_weight=5, n_estimators=250, subsample=0.5 
[CV]  learning_rate=0.01, max_depth=2, min_child_weight=5, n_estimators=250, subsample=0.5, total=   1.0s
[CV] learning_rate=0.01, max_depth=2, min_child_weight=5, n_estimators=250, subsample=0.5 
[CV]  learning_rate=0.01, max_depth=2, min_child_weight=5, n_estimators=250, subsample=0.5, total=   1.0s
[CV] learning_rate=0.01, max_depth=2, min_child_weight=5, n_estimators=250, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=2, min_child_weight=5, n_estimators=250, subsample=0.8, total=   1.2s
[CV] learning_rate=0.01, max_depth=2, min_child_weight=5, n_estimators=250, subsample=0.8 
[CV]  learning_

[CV]  learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=250, subsample=0.8, total=   1.6s
[CV] learning_rate=0.01, max_depth=3, min_child_weight=2, n_estimators=5, subsample=0.7 
[CV]  learning_rate=0.01, max_depth=3, min_child_weight=2, n_estimators=5, subsample=0.7, total=   0.0s
[CV] learning_rate=0.01, max_depth=3, min_child_weight=2, n_estimators=5, subsample=0.7 
[CV]  learning_rate=0.01, max_depth=3, min_child_weight=2, n_estimators=5, subsample=0.7, total=   0.0s
[CV] learning_rate=0.01, max_depth=3, min_child_weight=2, n_estimators=5, subsample=0.7 
[CV]  learning_rate=0.01, max_depth=3, min_child_weight=2, n_estimators=5, subsample=0.7, total=   0.0s
[CV] learning_rate=0.01, max_depth=3, min_child_weight=2, n_estimators=5, subsample=0.5 
[CV]  learning_rate=0.01, max_depth=3, min_child_weight=2, n_estimators=5, subsample=0.5, total=   0.0s
[CV] learning_rate=0.01, max_depth=3, min_child_weight=2, n_estimators=5, subsample=0.5 
[CV]  learning_rate=0.01, max_dep

[CV]  learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=5, subsample=0.8, total=   0.0s
[CV] learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=30, subsample=0.7 
[CV]  learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=30, subsample=0.7, total=   0.2s
[CV] learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=30, subsample=0.7 
[CV]  learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=30, subsample=0.7, total=   0.2s
[CV] learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=30, subsample=0.7 
[CV]  learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=30, subsample=0.7, total=   0.2s
[CV] learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=30, subsample=0.5 
[CV]  learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=30, subsample=0.5, total=   0.2s
[CV] learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=30, subsample=0.5 
[CV]  learning_rate=0.01, 

[CV]  learning_rate=0.01, max_depth=6, min_child_weight=1, n_estimators=30, subsample=0.5, total=   0.3s
[CV] learning_rate=0.01, max_depth=6, min_child_weight=1, n_estimators=30, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=6, min_child_weight=1, n_estimators=30, subsample=0.8, total=   0.4s
[CV] learning_rate=0.01, max_depth=6, min_child_weight=1, n_estimators=30, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=6, min_child_weight=1, n_estimators=30, subsample=0.8, total=   0.4s
[CV] learning_rate=0.01, max_depth=6, min_child_weight=1, n_estimators=30, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=6, min_child_weight=1, n_estimators=30, subsample=0.8, total=   0.4s
[CV] learning_rate=0.01, max_depth=6, min_child_weight=1, n_estimators=100, subsample=0.7 
[CV]  learning_rate=0.01, max_depth=6, min_child_weight=1, n_estimators=100, subsample=0.7, total=   1.2s
[CV] learning_rate=0.01, max_depth=6, min_child_weight=1, n_estimators=100, subsample=0.7 
[CV]  learning_rate=0.

[CV]  learning_rate=0.01, max_depth=6, min_child_weight=2, n_estimators=100, subsample=0.7, total=   1.2s
[CV] learning_rate=0.01, max_depth=6, min_child_weight=2, n_estimators=100, subsample=0.5 
[CV]  learning_rate=0.01, max_depth=6, min_child_weight=2, n_estimators=100, subsample=0.5, total=   1.0s
[CV] learning_rate=0.01, max_depth=6, min_child_weight=2, n_estimators=100, subsample=0.5 
[CV]  learning_rate=0.01, max_depth=6, min_child_weight=2, n_estimators=100, subsample=0.5, total=   1.0s
[CV] learning_rate=0.01, max_depth=6, min_child_weight=2, n_estimators=100, subsample=0.5 
[CV]  learning_rate=0.01, max_depth=6, min_child_weight=2, n_estimators=100, subsample=0.5, total=   1.0s
[CV] learning_rate=0.01, max_depth=6, min_child_weight=2, n_estimators=100, subsample=0.8 
[CV]  learning_rate=0.01, max_depth=6, min_child_weight=2, n_estimators=100, subsample=0.8, total=   1.3s
[CV] learning_rate=0.01, max_depth=6, min_child_weight=2, n_estimators=100, subsample=0.8 
[CV]  learning_

[CV]  learning_rate=0.01, max_depth=6, min_child_weight=5, n_estimators=100, subsample=0.8, total=   1.2s
[CV] learning_rate=0.01, max_depth=6, min_child_weight=5, n_estimators=250, subsample=0.7 
[CV]  learning_rate=0.01, max_depth=6, min_child_weight=5, n_estimators=250, subsample=0.7, total=   2.7s
[CV] learning_rate=0.01, max_depth=6, min_child_weight=5, n_estimators=250, subsample=0.7 
[CV]  learning_rate=0.01, max_depth=6, min_child_weight=5, n_estimators=250, subsample=0.7, total=   3.1s
[CV] learning_rate=0.01, max_depth=6, min_child_weight=5, n_estimators=250, subsample=0.7 
[CV]  learning_rate=0.01, max_depth=6, min_child_weight=5, n_estimators=250, subsample=0.7, total=   2.9s
[CV] learning_rate=0.01, max_depth=6, min_child_weight=5, n_estimators=250, subsample=0.5 
[CV]  learning_rate=0.01, max_depth=6, min_child_weight=5, n_estimators=250, subsample=0.5, total=   2.6s
[CV] learning_rate=0.01, max_depth=6, min_child_weight=5, n_estimators=250, subsample=0.5 
[CV]  learning_

[Parallel(n_jobs=1)]: Done 972 out of 972 | elapsed: 11.6min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'learning_rate': [1, 0.1, 0.01], 'max_depth': [2, 3, 6], 'min_child_weight': [1, 2, 5], 'subsample': [0.7, 0.5, 0.8], 'n_estimators': [5, 30, 100, 250]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=2)

In [65]:
print(f"Testing Accuracy: {x_grid_search.best_score_*100}")
print(f"Optimal Parameters: {x_grid_search.best_params_}")

Testing Accuracy: 60.98459477561956
Optimal Parameters: {'learning_rate': 0.1, 'max_depth': 2, 'min_child_weight': 1, 'n_estimators': 5, 'subsample': 0.7}


In [66]:
# Fitting and training XGB
x_clf = xgb.XGBClassifier(learning_rate=.1, max_depth=2, min_child_weight=1, n_estimators=5, subsample=.7)
x_clf.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=2, min_child_weight=1, missing=None,
       n_estimators=5, n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=None, subsample=0.7, verbosity=1)

### Results from GridSearch Parameters for XGBoost

In [67]:
y_pred = x_clf.predict(X_test)

print_metrics(y_test, y_pred)

Precision Score: 0.625
Recall Score: 0.4166666666666667
Accuracy Score: 0.5652173913043478
F1 Score: 0.5


In [68]:
# Area under the curve
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print("AUC:", round(roc_auc, 4))

AUC: 0.572


In [69]:
# Classification Report
print(classification_report(y_test, y_pred))

# Confusion Matrix
y_pred = x_clf.predict(X_test)
print("Confusion Matrix \n-------------------------")
pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

              precision    recall  f1-score   support

       False       0.53      0.73      0.62       286
        True       0.62      0.42      0.50       312

   micro avg       0.57      0.57      0.57       598
   macro avg       0.58      0.57      0.56       598
weighted avg       0.58      0.57      0.56       598

Confusion Matrix 
-------------------------


Predicted,False,True,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,208,78,286
True,182,130,312
All,390,208,598
