In [1]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_curve, plot_roc_curve, auc, roc_auc_score, classification_report, confusion_matrix, plot_confusion_matrix
from matplotlib import pyplot as plt
import xgboost as xgb
import pandas as pd
import numpy as np
import utils
import preprocessing

In [2]:
# Feature columns fed to the model; gaps are filled with each column's most
# frequent value.
feature_cols = [
    'ganancia_perdida_declarada_bolsa_argentina',
    'edad',
    'rol_familiar_registrado',
    'anios_estudiados',
]

imputer = SimpleImputer(strategy='most_frequent')
df = utils.get_data()

# SimpleImputer returns a bare ndarray. When the selected columns mix numeric
# and string data that array is object-typed, so wrapping it in a DataFrame
# without labels produced unnamed, all-object columns -- and pd.get_dummies
# then one-hot encodes *every* feature, numeric ones included.
# Restore the column names and row index, and let pandas re-infer the numeric
# dtypes so only genuinely categorical columns are treated as such.
# NOTE(review): presumably 'rol_familiar_registrado' is the only non-numeric
# column here -- confirm against utils.get_data().
# NOTE(review): the imputer is fitted on the full dataset before the
# train/test split, so test rows influence the imputed values.
df_imp = pd.DataFrame(
    imputer.fit_transform(df[feature_cols]),
    columns=feature_cols,
    index=df.index,
).infer_objects()


In [3]:
# Target column taken from the original (non-imputed) frame.
Y = df['tiene_alto_valor_adquisitivo']

# Feature matrix: dummy-encode the imputed features in a single step.
X = pd.get_dummies(df_imp)

# Hold out a test split; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=112,
)

In [4]:
# Rescale every feature to [0, 1]. The scaler learns its min/max from the
# training split only and is then applied to both splits.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X_train)

# Each split is rewrapped in a fresh DataFrame (default integer labels).
X_train, X_test = [
    pd.DataFrame(scaler.transform(split)) for split in (X_train, X_test)
]

In [41]:
# Hyper-parameter search space explored by the grid search.
params = {
    # Objective identifiers must match XGBoost's registered names exactly;
    # the previous 'binary:hirenge' and 'g:squarederror' entries were typos
    # of 'binary:hinge' and 'reg:squarederror' and are rejected by XGBoost
    # as unknown objectives.
    'objective': [
        'binary:logistic',
        'binary:logitraw',
        'binary:hinge',
        'reg:squarederror',
        'reg:squaredlogerror',
    ],
    'n_estimators': range(50, 400, 50),
    'max_depth': range(1, 8, 1),
    'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
    'booster': ['gbtree', 'gblinear', 'dart'],
}

# Base estimator; the search overrides the parameters listed above.
model = xgb.XGBClassifier()

In [42]:
# Exhaustive search over `params`, ranked by ROC AUC with 4-fold
# cross-validation, using every available core and verbose progress logging.
# NOTE(review): the grid above spans 5 * 7 * 7 * 6 * 3 = 4410 combinations,
# while the recorded fit log reports 100 candidates -- the saved output may
# come from a different search configuration; confirm before trusting it.
gscv = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='roc_auc',
    cv=4,
    n_jobs=-1,
    verbose=10,
)

In [None]:
%%time
gscv.fit(X_train, y_train)
score = roc_auc_score(y_test, gscv.predict_proba(X_test)[:,1])

Fitting 4 folds for each of 100 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   47.3s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed: 10.6min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 12.5min


In [None]:
# Display the parameter combination selected by the search.
gscv.best_params_

In [None]:
# Display the ROC AUC computed on the held-out test split.
score

In [None]:
# Per-class precision / recall / F1 on the test split, based on the hard
# class predictions of the fitted search.
y_pred = gscv.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Confusion matrix of the fitted search on the test split.
fig, ax = plt.subplots(figsize=(15, 7))
ax.grid(False)

# scikit-learn orders the matrix rows/columns by ascending class label, so a
# hard-coded ['1', '0'] attached the wrong name to each row and column. Taking
# the labels from the fitted estimator keeps the ticks aligned with the cells.
plot_confusion_matrix(
    gscv,
    X_test,
    y_test,
    cmap=plt.cm.Blues,
    display_labels=gscv.classes_,
    ax=ax,
)
plt.show()

In [None]:
# ROC curve of the fitted search on the held-out test split.
plot_roc_curve(
    estimator=gscv,
    X=X_test,
    y=y_test,
)