In [49]:
import joblib
import xgboost
import multiprocessing
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [2]:
features = joblib.load("features.gz")
labels = joblib.load("labels.gz")

In [5]:
print(features.shape)
features

(111, 22)


array([[ 1.75000000e+06,  1.95190000e+05, -3.50438600e+06, ...,
         3.20168498e+00, -3.33573621e-01,  2.22222222e-01],
       [ 5.60000000e+06,  1.95190000e+05, -1.17534000e+05, ...,
         2.21128965e-01, -3.33573621e-01,  2.77777778e-01],
       [ 2.00000000e+05,  1.95190000e+05, -4.16700000e+03, ...,
         2.77457723e-01, -3.33573621e-01,  3.58974359e-01],
       ...,
       [ 3.00000000e+05,  1.95190000e+05, -1.17534000e+05, ...,
         2.77457723e-01, -3.33573621e-01,  0.00000000e+00],
       [ 7.50000000e+05,  1.95190000e+05, -1.17534000e+05, ...,
         2.77457723e-01, -3.33573621e-01,  3.68421053e-01],
       [ 6.50000000e+05,  1.95190000e+05, -1.17534000e+05, ...,
         2.77457723e-01, -3.33573621e-01,  5.34979424e-02]])

In [7]:
print(labels.shape)
labels

(111,)


0       True
1       True
2       True
3       True
4       True
       ...  
108    False
109    False
110    False
111    False
112    False
Name: poi, Length: 111, dtype: bool

In [9]:
df_features = pd.DataFrame(features)
df_labels = pd.DataFrame(labels)

In [23]:
df = pd.concat([df_features, df_labels], axis=1)
df.dropna(inplace=True, axis=0)
df["poi"] = df["poi"].astype(int)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,poi
0,1750000.0,195190.0,-3504386.0,108579.0,19794175.0,46950.0,18.0,42.0,4.0,40962500.0,...,-139856.5,420636.0,864.0,905.0,505050.0,22542539.0,3.201685,-0.333574,0.222222,1
1,5600000.0,195190.0,-117534.0,108579.0,19250000.0,29336.0,108.0,88.0,30.0,40962500.0,...,-139856.5,1111258.0,2042.0,3627.0,8682716.0,26093672.0,0.221129,-0.333574,0.277778,1
2,200000.0,195190.0,-4167.0,108579.0,1624396.0,22884.0,39.0,13.0,14.0,40962500.0,...,-139856.5,211844.0,91.0,225.0,2003885.0,2493616.0,0.277458,-0.333574,0.358974,1
3,800000.0,195190.0,-117534.0,108579.0,850010.0,118134.0,45.0,28.0,7.0,40962500.0,...,-139856.5,224305.0,599.0,1088.0,2652612.0,985032.0,0.227199,-0.333574,0.070751,1
4,1250000.0,195190.0,-262500.0,108579.0,850010.0,35818.0,144.0,199.0,25.0,40962500.0,...,-139856.5,240189.0,2188.0,2598.0,1639297.0,126027.0,0.228942,-0.333574,0.173611,1


In [40]:
test_dataframe = df.sample(frac=0.2, random_state=914)
train_dataframe = df.drop(test_dataframe.index)

In [41]:
train_labels = train_dataframe.pop("poi")

In [42]:
xgb = xgboost.XGBClassifier(
    learning_rate=0.01,  
    n_estimators=400, 
    random_state=17)

params = { 
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [5, 10, 15, 20]
}

folds = 10
param_comb = 5

skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=914)

random_search = RandomizedSearchCV(
    xgb, 
    param_distributions=params, 
    n_iter=param_comb, 
    scoring='roc_auc', 
    n_jobs=multiprocessing.cpu_count(), 
    cv=skf.split(train_dataframe, train_labels), 
    verbose=2, 
    random_state=914)
random_search.fit(train_dataframe, train_labels)
xgb = random_search.best_estimator_

Fitting 10 folds for each of 5 candidates, totalling 50 fits


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  50 out of  50 | elapsed:    3.4s finished


In [43]:
test_labels = test_dataframe.pop("poi")

In [45]:
test_score = xgb.score(test_dataframe, test_labels)
test_score

0.9090909090909091

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix