In [1]:
!pip install pandas sklearn xgboost hyperopt



In [2]:
import pickle
import warnings

import pandas as pd
import xgboost as xgb
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score, confusion_matrix, make_scorer
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    RandomizedSearchCV,
    RepeatedStratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.pipeline import Pipeline

# NOTE: this silences every warning category for the whole notebook.
warnings.filterwarnings('ignore')


## Generate baseline model

In [3]:
# Load the merged training set and drop `customerid` so it is not used as a feature.
df = pd.read_csv(
    r"https://raw.githubusercontent.com/medinaltbx/G6_DP3/master/data/input/merged_data/train/merged_train.csv",
    sep=';',
).drop(columns=["customerid"])

# Features are every remaining column except the target `good_bad_flag`.
X, y = df.drop(columns=["good_bad_flag"]), df['good_bad_flag']

# Hold out 20% for evaluation. The fixed seed makes the split, and therefore the
# accuracy printed below, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Native XGBoost containers for the same split.
# NOTE(review): not used in this cell; presumably consumed by a later cell — confirm.
D_train = xgb.DMatrix(X_train, label=y_train)
D_test = xgb.DMatrix(X_test, label=y_test)

# Baseline: XGBoost classifier with library-default hyperparameters.
baseline_model = xgb.XGBClassifier()
baseline_model.fit(X_train, y_train)

# Score the baseline on the held-out split.
y_pred = baseline_model.predict(X_test)
accuracy_baseline = accuracy_score(y_test, y_pred)
print("BASELINE ACCURACY: ",accuracy_baseline)

BASELINE ACCURACY:  0.7530487804878049


## Generate model with base variables + Grid Search

In [None]:
# Reload the merged training set and drop `customerid` so it is not used as a feature.
# (Plain raw string: the URL has no placeholders, so no f-prefix is needed.)
df = pd.read_csv(
    r"https://raw.githubusercontent.com/medinaltbx/G6_DP3/master/data/input/merged_data/train/merged_train.csv",
    sep=';',
).drop(columns=["customerid"])
X, y = df.drop(columns=["good_bad_flag"]), df['good_bad_flag']

# Hold out 20% for the final evaluation. The fixed seed makes the split, and the
# scores reported at the end of this cell, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Two-step pipeline: univariate feature selection ('fs') feeding the classifier ('clf').
# The step names are the prefixes used by the grid keys below.
pipe = Pipeline([('fs', SelectKBest()),('clf', xgb.XGBClassifier(objective='binary:logistic'))])

# Define our search space for grid search
# 3 * 3 * 7 * 2 * 3 = 378 candidate settings.
search_space = [
  {
    'clf__n_estimators': [100, 200, 300],
    'clf__learning_rate': [0.01, 0.1, 0.2],
    'clf__max_depth': range(3, 10),
    'clf__colsample_bytree': [i/10.0 for i in range(1, 3)],  # 0.1, 0.2
    'clf__gamma': [i/10.0 for i in range(3)],                # 0.0, 0.1, 0.2
    # chi2 only accepts non-negative feature values.
    # NOTE(review): assumes every column of X is non-negative — confirm.
    'fs__score_func': [chi2],
    'fs__k': [3],
  }
]
# Define cross validation
kfold = KFold(n_splits=3)
# AUC and accuracy as score
scoring = {'AUC':'roc_auc', 'Accuracy':make_scorer(accuracy_score)}
# Define grid search
# With several scorers, `refit` names the one used to pick and refit the best estimator.
grid = GridSearchCV(
  pipe,
  param_grid=search_space,
  cv=kfold,
  scoring=scoring,
  refit='AUC',
  verbose=10,
  n_jobs=-1
)
# Fit grid search
model = grid.fit(X_train, y_train)

# Evaluate the refit best pipeline on the held-out split.
# `confusion_matrix` comes from sklearn.metrics (imported in the notebook's import cell).
predict = model.predict(X_test)
print('Best AUC Score: {}'.format(model.best_score_))
print('Accuracy: {}'.format(accuracy_score(y_test, predict)))
print(confusion_matrix(y_test,predict))

Fitting 3 folds for each of 378 candidates, totalling 1134 fits
[CV 2/3; 1/378] START clf__colsample_bytree=0.1, clf__gamma=0.0, clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, fs__k=3, fs__score_func=<function chi2 at 0x7f5918f3a040>
[CV 2/3; 1/378] END clf__colsample_bytree=0.1, clf__gamma=0.0, clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=100, fs__k=3, fs__score_func=<function chi2 at 0x7f5918f3a040>; AUC: (test=0.617) Accuracy: (test=0.784) total time=   0.7s
[CV 2/3; 2/378] START clf__colsample_bytree=0.1, clf__gamma=0.0, clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=200, fs__k=3, fs__score_func=<function chi2 at 0x7f5918f3a040>
[CV 2/3; 2/378] END clf__colsample_bytree=0.1, clf__gamma=0.0, clf__learning_rate=0.01, clf__max_depth=3, clf__n_estimators=200, fs__k=3, fs__score_func=<function chi2 at 0x7f5918f3a040>; AUC: (test=0.618) Accuracy: (test=0.784) total time=   1.3s
[CV 1/3; 4/378] START clf__colsample_bytree=0.1, clf__gamma=0.