In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE

In [None]:
# Load the modelling table. The file is read with default options, so any
# index column written on export comes back as "Unnamed: 0" (dropped later).
DATA_CSV = "machinelearning.csv"
clean = pd.read_csv(DATA_CSV)

In [None]:
# Hold out 30% of the rows for the final evaluation.
seed = 7
test_size = 0.30

Y = clean['MinorityDriver']
# X is a full copy of the table, so it still contains the 'MinorityDriver'
# label column; only the "Unnamed: 0" column is removed from the splits below.
X = clean.copy()

split_parts = train_test_split(X, Y, test_size=test_size, random_state=seed)
X_train, X_test = (part.drop(columns="Unnamed: 0") for part in split_parts[:2])
y_train, y_test = split_parts[2:]

In [None]:
# Baseline: 5-fold cross-validation of an XGBoost classifier with default
# hyper-parameters. Inside each fold the training part is balanced by
# undersampling the MinorityDriver == 0 rows; the validation part is left
# untouched so the metrics reflect the original class mix.
folds = KFold(n_splits=5, shuffle=True, random_state=seed)
rng = np.random.RandomState(seed)  # seeded so the undersampling is repeatable
for train_index, test_index in folds.split(X_train):
    current_train_x = X_train.iloc[train_index]
    current_test_x = X_train.iloc[test_index]

    # Keep every MinorityDriver == 1 row and draw an equally sized sample of
    # MinorityDriver == 0 rows WITHOUT replacement. (The string 'False' is
    # truthy, so passing it as `replace` silently sampled with replacement.)
    minority_idx = current_train_x.index[current_train_x['MinorityDriver'] == 1]
    white_idx = current_train_x.index[current_train_x['MinorityDriver'] == 0]
    n_white = min(len(minority_idx), len(white_idx))
    sampled_white_idx = rng.choice(white_idx, n_white, replace=False)
    balanced_idx = np.concatenate([minority_idx, sampled_white_idx])
    current_train_x = current_train_x.loc[balanced_idx]

    # Separate the label from the features for both parts of the fold.
    current_train_y = current_train_x['MinorityDriver']
    current_train_x = current_train_x.drop("MinorityDriver", axis=1)
    current_test_y = current_test_x['MinorityDriver']
    current_test_x = current_test_x.drop("MinorityDriver", axis=1)

    model = XGBClassifier(objective='binary:logistic')
    model.fit(current_train_x, current_train_y)

    # Hard labels feed accuracy/F1; ranking metrics (average precision and
    # ROC AUC) need the positive-class probability, not thresholded labels.
    y_pred = model.predict(current_test_x)
    y_score = model.predict_proba(current_test_x)[:, 1]
    predictions = y_pred

    average_precision = average_precision_score(current_test_y, y_score)
    accuracy = accuracy_score(current_test_y, predictions)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    print('Average precision-recall score: {0:0.2f}'.format(
          average_precision))
    auc = roc_auc_score(current_test_y, y_score)
    f1 = f1_score(current_test_y, predictions)
    print(f1)
    print(auc)
    print('EndResult')

In [None]:
# Grid search 1: coarse grid over tree depth and minimum child weight,
# scored by 3-fold cross-validated ROC AUC.
param_test1 = {
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 6, 2),
}
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.3, max_depth=5, min_child_weight=2,
                            objective='binary:logistic', eval_metric='error'),
    param_grid=param_test1, scoring='roc_auc', n_jobs=-1, cv=3)
# X_train still carries the 'MinorityDriver' label column here; it must not
# be given to the model as a feature, otherwise every candidate scores
# near-perfectly because the target leaks into the inputs.
gsearch1.fit(X_train.drop(columns='MinorityDriver', errors='ignore'), y_train)
print(gsearch1.best_params_, gsearch1.best_score_)
# Per-candidate summary from cv_results_.
pd.DataFrame(gsearch1.cv_results_)[
    ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]

In [None]:
# Grid search 2: finer grid over max_depth 6-8 and min_child_weight 4-6,
# scored by 3-fold cross-validated ROC AUC.
param_test2 = {
    'max_depth': [6, 7, 8],
    'min_child_weight': [4, 5, 6],
}
gsearch2 = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.3, max_depth=5, min_child_weight=2,
                            objective='binary:logistic'),
    param_grid=param_test2, scoring='roc_auc', n_jobs=-1, cv=3)
# X_train still carries the 'MinorityDriver' label column here; drop it so
# the target does not leak into the features.
gsearch2.fit(X_train.drop(columns='MinorityDriver', errors='ignore'), y_train)
print(gsearch2.best_params_, gsearch2.best_score_)
# Per-candidate summary from cv_results_.
pd.DataFrame(gsearch2.cv_results_)[
    ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]

In [None]:
# Grid search 2b: wider sweep of min_child_weight with max_depth fixed at 8,
# scored by 3-fold cross-validated ROC AUC.
# NOTE(review): this search uses learning_rate=0.1 while the neighbouring
# searches use 0.3 - confirm that is intentional.
param_test2b = {
    'min_child_weight': [2, 4, 6, 8, 10, 12],
}
gsearch2b = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1, max_depth=8, min_child_weight=2,
                            objective='binary:logistic'),
    param_grid=param_test2b, scoring='roc_auc', n_jobs=-1, cv=3)
# X_train still carries the 'MinorityDriver' label column here; drop it so
# the target does not leak into the features.
gsearch2b.fit(X_train.drop(columns='MinorityDriver', errors='ignore'), y_train)
print(gsearch2b.best_params_, gsearch2b.best_score_)
# Per-candidate summary from cv_results_.
pd.DataFrame(gsearch2b.cv_results_)[
    ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]

In [None]:
# Grid search 3: sweep gamma from 0.0 to 0.6 in steps of 0.1 with
# max_depth=8 and min_child_weight=4, scored by 3-fold ROC AUC.
param_test3 = {
    'gamma': [i / 10.0 for i in range(0, 7)],
}
gsearch3 = GridSearchCV(
    estimator=XGBClassifier(max_depth=8, min_child_weight=4, gamma=0,
                            objective='binary:logistic'),
    param_grid=param_test3, scoring='roc_auc', n_jobs=-1, cv=3)
# X_train still carries the 'MinorityDriver' label column here; drop it so
# the target does not leak into the features.
gsearch3.fit(X_train.drop(columns='MinorityDriver', errors='ignore'), y_train)
print(gsearch3.best_params_, gsearch3.best_score_)
# Per-candidate summary from cv_results_.
pd.DataFrame(gsearch3.cv_results_)[
    ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]

In [None]:
# Grid search 4: coarse sweep of the number of boosting rounds (50-500 in
# steps of 50), scored by 3-fold cross-validated ROC AUC.
# n_estimators=0 is left out: a model with zero boosting rounds has no trees
# and is not a meaningful candidate.
param_test4 = {
    'n_estimators': list(range(50, 501, 50)),
}
gsearch4 = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.3, max_depth=8, min_child_weight=4,
                            gamma=0.1, objective='binary:logistic'),
    param_grid=param_test4, scoring='roc_auc', n_jobs=-1, cv=3)
# X_train still carries the 'MinorityDriver' label column here; drop it so
# the target does not leak into the features.
gsearch4.fit(X_train.drop(columns='MinorityDriver', errors='ignore'), y_train)
print(gsearch4.best_params_, gsearch4.best_score_)
# Per-candidate summary from cv_results_.
pd.DataFrame(gsearch4.cv_results_)[
    ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]

In [None]:
# Grid search 4a: finer sweep of n_estimators between 100 and 150 in steps
# of 10, scored by 3-fold cross-validated ROC AUC.
param_test4a = {
    'n_estimators': [100, 110, 120, 130, 140, 150],
}
gsearch4a = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.3, max_depth=8, min_child_weight=4,
                            gamma=0.1, objective='binary:logistic'),
    param_grid=param_test4a, scoring='roc_auc', n_jobs=-1, cv=3)
# X_train still carries the 'MinorityDriver' label column here; drop it so
# the target does not leak into the features.
gsearch4a.fit(X_train.drop(columns='MinorityDriver', errors='ignore'), y_train)
print(gsearch4a.best_params_, gsearch4a.best_score_)
# Per-candidate summary from cv_results_.
pd.DataFrame(gsearch4a.cv_results_)[
    ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]

In [None]:
# Grid search 4b: unit-step sweep of n_estimators from 110 to 120,
# scored by 3-fold cross-validated ROC AUC.
param_test4b = {
    'n_estimators': [110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120],
}
gsearch4b = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.3, max_depth=8, min_child_weight=4,
                            gamma=0.1, objective='binary:logistic'),
    param_grid=param_test4b, scoring='roc_auc', n_jobs=-1, cv=3)
# X_train still carries the 'MinorityDriver' label column here; drop it so
# the target does not leak into the features.
gsearch4b.fit(X_train.drop(columns='MinorityDriver', errors='ignore'), y_train)
print(gsearch4b.best_params_, gsearch4b.best_score_)
# Per-candidate summary from cv_results_.
pd.DataFrame(gsearch4b.cv_results_)[
    ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]

In [None]:
# Grid search 5: coarse grid over row subsampling and per-tree column
# subsampling (0.6-0.9 each), scored by 3-fold cross-validated ROC AUC.
param_test5 = {
    'subsample': [i / 10.0 for i in range(6, 10)],
    'colsample_bytree': [i / 10.0 for i in range(6, 10)],
}
gsearch5 = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.3, n_estimators=117, max_depth=8,
                            min_child_weight=4, gamma=0.1, subsample=0.8,
                            colsample_bytree=0.8, objective='binary:logistic'),
    param_grid=param_test5, scoring='roc_auc', n_jobs=-1, cv=3)
# X_train still carries the 'MinorityDriver' label column here; drop it so
# the target does not leak into the features.
gsearch5.fit(X_train.drop(columns='MinorityDriver', errors='ignore'), y_train)
print(gsearch5.best_params_, gsearch5.best_score_)
# Per-candidate summary from cv_results_.
pd.DataFrame(gsearch5.cv_results_)[
    ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]

In [None]:
# Grid search 6: finer grid with subsample in {0.85, 0.90, 0.95} and
# colsample_bytree in {0.55, 0.60, 0.65}, scored by 3-fold ROC AUC.
param_test6 = {
    'subsample': [i / 100.0 for i in range(85, 100, 5)],
    'colsample_bytree': [i / 100.0 for i in range(55, 70, 5)],
}
gsearch6 = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.3, n_estimators=117, max_depth=8,
                            min_child_weight=4, gamma=0.1, subsample=0.9,
                            colsample_bytree=0.6, objective='binary:logistic'),
    param_grid=param_test6, scoring='roc_auc', n_jobs=-1, cv=3)
# X_train still carries the 'MinorityDriver' label column here; drop it so
# the target does not leak into the features.
gsearch6.fit(X_train.drop(columns='MinorityDriver', errors='ignore'), y_train)
print(gsearch6.best_params_, gsearch6.best_score_)
# Per-candidate summary from cv_results_.
pd.DataFrame(gsearch6.cv_results_)[
    ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]

In [None]:
# Grid search 7: sweep the L1 regularisation strength (reg_alpha) over
# several orders of magnitude, scored by 3-fold cross-validated ROC AUC.
param_test7 = {
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100],
}
gsearch7 = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.3, n_estimators=117, max_depth=8,
                            min_child_weight=4, gamma=0.1, subsample=0.95,
                            colsample_bytree=0.55, objective='binary:logistic'),
    param_grid=param_test7, scoring='roc_auc', n_jobs=-1, cv=3)
# X_train still carries the 'MinorityDriver' label column here; drop it so
# the target does not leak into the features.
gsearch7.fit(X_train.drop(columns='MinorityDriver', errors='ignore'), y_train)
print(gsearch7.best_params_, gsearch7.best_score_)
# Per-candidate summary from cv_results_.
pd.DataFrame(gsearch7.cv_results_)[
    ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]

In [None]:
# 5-fold cross-validation of the XGBoost classifier with hand-set
# hyper-parameters. Inside each fold the training part is balanced by
# undersampling the MinorityDriver == 0 rows; the validation part is left
# untouched so the metrics reflect the original class mix.
folds = KFold(n_splits=5, shuffle=True, random_state=seed)
rng = np.random.RandomState(seed)  # seeded so the undersampling is repeatable
for train_index, test_index in folds.split(X_train):
    current_train_x = X_train.iloc[train_index]
    current_test_x = X_train.iloc[test_index]

    # Keep every MinorityDriver == 1 row and draw an equally sized sample of
    # MinorityDriver == 0 rows WITHOUT replacement. (The string 'False' is
    # truthy, so passing it as `replace` silently sampled with replacement.)
    minority_idx = current_train_x.index[current_train_x['MinorityDriver'] == 1]
    white_idx = current_train_x.index[current_train_x['MinorityDriver'] == 0]
    n_white = min(len(minority_idx), len(white_idx))
    sampled_white_idx = rng.choice(white_idx, n_white, replace=False)
    balanced_idx = np.concatenate([minority_idx, sampled_white_idx])
    current_train_x = current_train_x.loc[balanced_idx]

    # Separate the label from the features for both parts of the fold.
    current_train_y = current_train_x['MinorityDriver']
    current_train_x = current_train_x.drop("MinorityDriver", axis=1)
    current_test_y = current_test_x['MinorityDriver']
    current_test_x = current_test_x.drop("MinorityDriver", axis=1)

    model = XGBClassifier(learning_rate=0.1,
                          n_estimators=300, max_depth=8, min_child_weight=4, gamma=0.1,
                          subsample=0.95, colsample_bytree=0.55, reg_alpha=1,
                          objective='binary:logistic', scale_pos_weight=1)
    model.fit(current_train_x, current_train_y)

    # Hard labels feed accuracy/F1; ranking metrics (average precision and
    # ROC AUC) need the positive-class probability, not thresholded labels.
    y_pred = model.predict(current_test_x)
    y_score = model.predict_proba(current_test_x)[:, 1]
    predictions = y_pred

    average_precision = average_precision_score(current_test_y, y_score)
    accuracy = accuracy_score(current_test_y, predictions)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    print('Average precision-recall score: {0:0.2f}'.format(
          average_precision))
    auc = roc_auc_score(current_test_y, y_score)
    f1 = f1_score(current_test_y, predictions)
    print(f1)
    print(auc)
    print('EndResult')

In [None]:
# Balance the full training set for the final fit: keep every
# MinorityDriver == 1 row and an equally sized random sample of
# MinorityDriver == 0 rows, then split off the label column.
rng = np.random.RandomState(seed)  # seeded so the sample is repeatable
minority_idx = X_train.index[X_train['MinorityDriver'] == 1]
white_idx = X_train.index[X_train['MinorityDriver'] == 0]
# Sample WITHOUT replacement (the string 'False' is truthy and used to enable
# replacement); cap the draw at the number of available rows.
n_white = min(len(minority_idx), len(white_idx))
sampled_white_idx = rng.choice(white_idx, n_white, replace=False)
X_train = X_train.loc[np.concatenate([minority_idx, sampled_white_idx])]

y_train = X_train['MinorityDriver']

# "Unnamed: 0" is already removed right after the train/test split, so a
# strict drop would raise KeyError here; errors="ignore" tolerates that.
X_train = X_train.drop(columns=["MinorityDriver", "Unnamed: 0"], errors="ignore")
X_test = X_test.drop(columns=["MinorityDriver", "Unnamed: 0"], errors="ignore")

In [None]:
# Fit the final classifier on the training data.
# NOTE(review): learning_rate is 0.3 here but 0.1 in the tuned
# cross-validation cell - confirm which value is intended.
final_params = dict(
    learning_rate=0.3,
    n_estimators=300,
    max_depth=8,
    min_child_weight=4,
    gamma=0.1,
    subsample=0.95,
    colsample_bytree=0.55,
    reg_alpha=1,
    objective='binary:logistic',
    scale_pos_weight=1,
)
model = XGBClassifier(**final_params)
model.fit(X_train, y_train)

In [None]:
# Score the held-out test set.
# `predict` already returns class labels, so no rounding is needed; they are
# kept in `predictions` for the accuracy/F1 cell.
y_pred = model.predict(X_test)
predictions = y_pred.astype(int)
# Average precision summarises the precision-recall curve, so it needs the
# positive-class probability rather than thresholded labels.
y_score = model.predict_proba(X_test)[:, 1]
average_precision = average_precision_score(y_test, y_score)

In [None]:
# Report held-out performance: accuracy (as a percentage), F1, and the
# average precision computed in the previous cell.
accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test, predictions)

accuracy_pct = accuracy * 100.0
print("Accuracy: %.2f%%" % accuracy_pct)
print(f1)
print('Average precision-recall score: {0:0.2f}'.format(average_precision))

In [None]:
import matplotlib.pyplot as plt
import xgboost as xgb

# Set the default figure size BEFORE plotting: rcParams only affect figures
# created afterwards, so setting it after plot_importance left the saved
# importance chart at the previous default size.
plt.rcParams['figure.figsize'] = [12, 12]
xgb.plot_importance(model)
plt.savefig('importance.png', bbox_inches='tight')

In [None]:
from xgboost import plot_tree

# Render a tree of the fitted model (plot_tree's default selection) and
# write the resulting figure to disk.
tree_axes = plot_tree(model)
tree_axes.figure.savefig('tree.png')