In [1]:
import numpy as np
import pickle as pkl

from sklearn.linear_model import Perceptron 
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

import matplotlib.pyplot as plt
%matplotlib inline

In [130]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score

In [73]:
from xgboost import XGBClassifier

## Load and prepare data

In [43]:
with open('points.pkl', 'rb') as f:
    data = pkl.load(f)
    
good_idxs = []
for i in range(len(data)):
    if np.array(data[i]).shape != (40, 2):
        pass
    else:
        good_idxs.append(i)

        
train_data = np.array([data[i] for i in range(len(data)) if i in good_idxs])
train_data.shape

(1447, 40, 2)

In [125]:
with open('generated_data.pkl', 'rb') as f:
    generated_data = np.array(pkl.load(f))
generated_data.shape

(2468, 40, 2)

In [126]:
train_true = train_data.reshape(-1, 80)
train_false = generated_data.reshape(-1, 80)

In [127]:
train_df = np.concatenate([train_true, train_false])

idxs = np.random.choice(train_df.shape[0], train_df.shape[0])
train_df = train_df[idxs]
train_df.shape

(3915, 80)

In [128]:
y_train = np.zeros(train_df.shape[0])
y_train[train_true.shape[0]:] = 1
y_train = y_train[idxs]
y_train.shape

(3915,)

In [129]:
train, X_test, train_y, y_test = train_test_split(train_df, y_train, test_size=0.1)

## Train a model and assess viability

In [10]:
xgb = XGBClassifier(n_estimators=300, max_depth=7, verbosity=1)

In [11]:
xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=7,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=300, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=1)

In [12]:
pred_test = xgb.predict(X_test)

In [13]:
(accuracy_score(y_test, pred_test),
recall_score(y_test, pred_test),
precision_score(y_test, pred_test),
roc_auc_score(y_test, xgb.predict_proba(X_test)[:, 1]))

(0.946843853820598, 0.9556962025316456, 0.94375, 0.9856156501726122)

In [14]:
%%time

xgb = XGBClassifier(n_estimators=300, max_depth=7, verbosity=1)
cross_val_score(xgb, train, train_y, cv=5, scoring='roc_auc')

Wall time: 17.6 s


array([0.9823412 , 0.98058974, 0.97213169, 0.98181617, 0.98110655])

## Optimize hyperparameters

In [58]:
%%time

parameters = {'max_depth':[5, 7, 10], 'learning_rate':[0.2, 0.3, 0.5], 'n_estimators':[100, 300, 500, 900], 'min_child_weight': [1, 3, 5, 7]}

xgb = XGBClassifier(verbosity=1)

clf = GridSearchCV(xgb, parameters, cv=5, scoring='roc_auc')
clf.fit(train, train_y)

Wall time: 53min 15s


GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None,

In [59]:
clf.cv_results_

{'mean_fit_time': array([ 1.70682406,  3.98786511,  5.82111096, 10.04012213,  1.55788803,
         3.78511882,  5.37498064,  8.65793705,  1.35071592,  3.74731493,
         5.81800776,  9.20665002,  1.60092487,  4.28789177,  5.44178925,
         8.66478701,  2.01296463,  4.6593926 ,  7.26237397, 10.23862882,
         1.69431801,  4.00617762,  5.71888556,  8.71746011,  1.65176167,
         7.90208421,  8.38489952, 18.44388933,  2.05787473,  4.73514457,
         5.7254025 ,  8.36648135,  2.53626275,  5.2863843 ,  7.73922243,
        11.82309303,  2.15176425,  4.62118878,  6.75728278,  9.81070857,
         3.10522623,  5.70139437,  5.39218102,  9.04771347,  2.40659122,
         4.9299737 ,  6.29820585,  8.14018798,  1.40738335,  3.8059031 ,
         5.29619408,  8.18670754,  1.62246981,  3.76149035,  4.69401593,
         7.81611896,  1.58723116,  3.57893081,  5.38537393,  8.4270226 ,
         1.9091784 ,  3.20861869,  4.38093901,  7.51794286,  1.90099478,
         4.00581846,  6.70359774, 

In [63]:
best_clf = clf.best_estimator_

In [61]:
clf.best_params_

{'learning_rate': 0.3,
 'max_depth': 7,
 'min_child_weight': 1,
 'n_estimators': 300}

In [62]:
clf.best_score_

0.9787936726798782

In [197]:
%%time

parameters = {'max_depth':[7], 'learning_rate':[0.3], 'n_estimators':[300], 'min_child_weight': [2]}

xgb = XGBClassifier(verbosity=1)

clf = GridSearchCV(xgb, parameters, cv=5, scoring='roc_auc')
clf.fit(train, train_y)

Wall time: 24.8 s


GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None,

In [198]:
best_clf = clf.best_estimator_

In [199]:
y_pred = best_clf.predict(X_test)

In [200]:
print("Accuracy:   {}\nRecall:     {}\nPrecision:  {}\nROC-AUC:    {}".format(accuracy_score(y_test, y_pred),
recall_score(y_test, y_pred),
precision_score(y_test, y_pred),
roc_auc_score(y_test, best_clf.predict_proba(X_test)[:, 1])))

Accuracy:   0.9719387755102041
Recall:     0.9767441860465116
Precision:  0.980544747081712
ROC-AUC:    0.9947645493462919


Not bad.............