# Credit Card Default
August 15, 2019<br>Ngoc, Modeling Transformed Data

-----------------------------------------------------------------

In this notebook, we will perform modeling for transformed data.

----------------------------------------------

# Import

In [1]:
%matplotlib inline

import os
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
import matplotlib.pyplot as plt

from pathlib import Path
from sklearn.svm import SVC
from datetime import datetime
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from yellowbrick.classifier import ConfusionMatrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PowerTransformer, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import auc, f1_score, precision_score, recall_score, roc_curve, roc_auc_score

sns.set_style("whitegrid")
warnings.filterwarnings("ignore")

Change directory to get data source code:

In [2]:
main_path = os.getcwd()
parent_path = Path(os.getcwd()).parent
src_path = str(parent_path) + "/src/directories"
os.chdir(src_path)

More import:

In [3]:
import change_directory

Change back to notebook directory:

In [4]:
os.chdir(main_path)

# Helper functions

In [5]:
def plot_confusion_matrix(model, X_train, X_test, y_train, y_test):
    model_cm = ConfusionMatrix(model)
    model_cm.fit(X_train, y_train)
    model_cm.score(X_test, y_test)
    model_cm.poof()

def all_scores(y_true, y_pred):
    print(f'f-1 score: {f1_score(y_true, y_pred)}')
    print(f'recall: {recall_score(y_true, y_pred)}')
    print(f'precision: {precision_score(y_true, y_pred)}')

# ETL

## Load data

Change directory to get data:

In [6]:
cd = change_directory.ChangeDirectory()

In [7]:
cd.change_to_data_dir()

Load data:

In [8]:
ccd_df = pd.read_csv("processed/transformed_credit_card_default.csv")

Change back to notebook directory:

In [9]:
cd.change_to_notebook_dir()

## Sample data
Methods to tackle imbalance: over-sampling, under-sampling, and SMOTE.

In [10]:
ccd_df.drop("Unnamed: 0", axis=1, inplace=True)

In [11]:
ccd_ns = ccd_df.copy()
ccd_ou = ccd_df.copy()
ccd_smote = ccd_df.copy()

### Over-sampling

In [12]:
default_samples = ccd_ou[ccd_ou.DEFAULT == 1]
non_default_samples = ccd_ou[ccd_ou.DEFAULT == 0]

In [13]:
default_samples_train, default_samples_test = train_test_split(default_samples,
                                                               test_size=0.2,
                                                               random_state=111)
non_default_samples_train, non_default_samples_test = train_test_split(non_default_samples,
                                                                       test_size=0.2,
                                                                       random_state=111)

In [14]:
np.random.seed(42)
os_default_idx = np.random.choice(default_samples_train.index,
                                  size=non_default_samples_train.shape[0], replace=True)
os_default = default_samples_train.loc[os_default_idx, :]

In [15]:
ccd_over_sampled_train = pd.concat([os_default, non_default_samples_train], axis=0)
X_over_sampled_train = ccd_over_sampled_train.drop("DEFAULT", axis=1)
y_over_sampled_train = ccd_over_sampled_train.DEFAULT

ccd_over_sampled_test = pd.concat([default_samples_test, non_default_samples_test], axis=0)
X_over_sampled_test = ccd_over_sampled_test.drop("DEFAULT", axis=1)
y_over_sampled_test = ccd_over_sampled_test.DEFAULT

### Under-sampling

In [16]:
np.random.seed(42)
us_non_default_idx = np.random.choice(non_default_samples.index, size=default_samples.shape[0], replace=False)
us_non_default = non_default_samples.loc[us_non_default_idx, :]

In [17]:
ccd_under_sampled = pd.concat([us_non_default, default_samples], axis=0)
X_under_sampled = ccd_under_sampled.drop("DEFAULT", axis=1)
y_under_sampled = ccd_under_sampled.DEFAULT
X_under_sampled_train, X_under_sampled_test, y_under_sampled_train, y_under_sampled_test = train_test_split(X_under_sampled, y_under_sampled, test_size=0.2, random_state=111)

### SMOTE

In [18]:
X_smote = ccd_smote.drop("DEFAULT", axis=1)
y_smote = ccd_smote.DEFAULT
X_smote_train, X_smote_test, y_smote_train, y_smote_test = train_test_split(X_smote, y_smote,
                                                                            test_size=0.2, random_state=111)
sm = SMOTE(random_state=42)
X_sm, y_sm = sm.fit_resample(X_smote_train, y_smote_train)

## Data for modeling

In [19]:
# non-sampled
X_non_sampled = ccd_ns.drop("DEFAULT", axis=1)
y_non_sampled = ccd_ns.DEFAULT
non_sampled_X_train, non_sampled_X_test, non_sampled_y_train, non_sampled_y_test = train_test_split(X_non_sampled,
                                                                                                    y_non_sampled,
                                                                                                    test_size=0.2,
                                                                                                    random_state=111)

In [20]:
# over-sampled
over_sampled_X_train = X_over_sampled_train
over_sampled_y_train = y_over_sampled_train
over_sampled_X_test = X_over_sampled_test
over_sampled_y_test = y_over_sampled_test

In [21]:
# under-sampled
under_sampled_X_train = X_under_sampled_train
under_sampled_y_train = y_under_sampled_train
under_sampled_X_test = X_under_sampled_test
under_sampled_y_test = y_under_sampled_test

In [22]:
# SMOTE
smote_sampled_X_train = X_sm
smote_sampled_y_train = y_sm
smote_sampled_X_test = X_smote_test
smote_sampled_y_test = y_smote_test

# Model

## Non-sampled

### Logistic Regression

In [23]:
ns_lr = LogisticRegression()
param_grid = {"penalty": ["l1", "l2"],
              "C": [0.01, 0.1, 1, 10],
              "class_weight": ["balanced", None],
              "random_state": [42],
              "solver": ["liblinear"], 
              "n_jobs": [-1]}
ns_lr_gs = GridSearchCV(ns_lr, param_grid, cv=3, scoring="recall", n_jobs=-1)
ns_lr_gs.fit(non_sampled_X_train, non_sampled_y_train);
ns_lr_gs.best_params_

{'C': 10,
 'class_weight': 'balanced',
 'n_jobs': -1,
 'penalty': 'l1',
 'random_state': 42,
 'solver': 'liblinear'}

Evaluate the best logistic model:

In [24]:
best_ns_lr = LogisticRegression(**ns_lr_gs.best_params_)
best_ns_lr.fit(non_sampled_X_train, non_sampled_y_train)
y_preds_best_ns_lr = best_ns_lr.predict(non_sampled_X_test)
all_scores(non_sampled_y_test, y_preds_best_ns_lr)

f-1 score: 0.5164319248826291
recall: 0.5941358024691358
precision: 0.45670225385527874


### Decision Tree

In [25]:
ns_dt = DecisionTreeClassifier()
param_grid = {"criterion": ["gini", "entropy"],
              "max_depth": [None, 8, 10],
              "min_samples_split": [10, 25],
              "min_samples_leaf": [1, 5],
              "max_features": [None, "sqrt"],
              "class_weight": ["balanced", None],
              "random_state": [42]}
ns_dt_gs = GridSearchCV(ns_dt, param_grid, cv=3, scoring="recall", n_jobs=-1)
ns_dt_gs.fit(non_sampled_X_train, non_sampled_y_train);
ns_dt_gs.best_params_

{'class_weight': 'balanced',
 'criterion': 'entropy',
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 5,
 'min_samples_split': 25,
 'random_state': 42}

Evaluate the best decision tree model:

In [26]:
best_ns_dt = DecisionTreeClassifier(**ns_dt_gs.best_params_)
best_ns_dt.fit(non_sampled_X_train, non_sampled_y_train)
y_preds_best_ns_dt = best_ns_dt.predict(non_sampled_X_test)
all_scores(non_sampled_y_test, y_preds_best_ns_dt)

f-1 score: 0.48752969121140144
recall: 0.6334876543209876
precision: 0.39623552123552125


### Random Forest

Random Forest with the same hyperparameters as those of the best Decision Tree.

In [27]:
ns_rf = RandomForestClassifier(n_estimators=100, **ns_dt_gs.best_params_)
ns_rf.fit(non_sampled_X_train, non_sampled_y_train)
y_preds_best_ns_rf = ns_rf.predict(non_sampled_X_test)
all_scores(non_sampled_y_test, y_preds_best_ns_rf)

f-1 score: 0.5281173594132028
recall: 0.5833333333333334
precision: 0.4824505424377792


### XGBoost

In [28]:
ns_xgb = xgb.XGBRegressor()
param_grid = {"max_depth": [5, 8, 10],
              "learning_rate": [0.01, 0.1, 1, 10, 100],
              "objective": ["binary:hinge"],
              "n_estimators": [10, 50, 100, 150],
              "n_jobs": [-1],
              "random_state": [42]}
ns_xgb_gs = GridSearchCV(ns_xgb, param_grid, cv=3, scoring="recall", n_jobs=-1)
ns_xgb_gs.fit(non_sampled_X_train, non_sampled_y_train);
ns_xgb_gs.best_params_

{'learning_rate': 0.01,
 'max_depth': 5,
 'n_estimators': 10,
 'n_jobs': -1,
 'objective': 'binary:hinge',
 'random_state': 42}

Evaluate the best XGBoost model:

In [29]:
best_ns_xgb = xgb.XGBRegressor(**ns_xgb_gs.best_params_)
best_ns_xgb.fit(non_sampled_X_train, non_sampled_y_train)
y_preds_best_ns_xgb = best_ns_xgb.predict(non_sampled_X_test)
all_scores(non_sampled_y_test, y_preds_best_ns_xgb)

f-1 score: 0.35526315789473684
recall: 1.0
precision: 0.216


f1...<br>
f-1 score: 0.5077233042310276<br>
recall: 0.5833333333333334<br>
precision: 0.44946492271105826

### SVM

In [30]:
# ns_svm = SVC()
# param_grid = {"C": [0.1, 1, 10],
#               "kernel": ["linear", "rbf"],
#               "class_weight": ["balanced", None],
#               "random_state": [42]}
# ns_svm_gs = GridSearchCV(ns_svm, param_grid, cv=3, scoring="recall", n_jobs=-1)
# ns_svm_gs.fit(non_sampled_X_train, non_sampled_y_train);
# ns_svm_gs.best_params_

Evaluate the best SVM model:

In [31]:
# best_ns_svm = SVC(**ns_svm_gs.best_params_)
# best_ns_svm.fit(non_sampled_X_train, non_sampled_y_train)
# y_preds_best_ns_svm = best_ns_svm.predict(non_sampled_X_test)
# all_scores(non_sampled_y_test, y_preds_best_ns_svm)

## Over-sampled

### Logistic Regression

In [32]:
os_lr = LogisticRegression()
param_grid = {"penalty": ["l1", "l2"],
              "C": [0.01, 0.1, 1, 10],
              "class_weight": ["balanced", None],
              "random_state": [42],
              "solver": ["liblinear"], 
              "n_jobs": [-1]}
os_lr_gs = GridSearchCV(os_lr, param_grid, cv=3, scoring="recall", n_jobs=-1)
os_lr_gs.fit(over_sampled_X_train, over_sampled_y_train);
os_lr_gs.best_params_

{'C': 10,
 'class_weight': 'balanced',
 'n_jobs': -1,
 'penalty': 'l1',
 'random_state': 42,
 'solver': 'liblinear'}

Evaluate the best logistic model:

In [33]:
best_os_lr = LogisticRegression(**os_lr_gs.best_params_)
best_os_lr.fit(over_sampled_X_train, over_sampled_y_train)
y_preds_best_os_lr = best_os_lr.predict(over_sampled_X_test)
all_scores(over_sampled_y_test, y_preds_best_os_lr)

f-1 score: 0.5532612258275976
recall: 0.6355421686746988
precision: 0.48984329657574


### Decision Tree

In [34]:
os_dt = DecisionTreeClassifier()
param_grid = {"criterion": ["gini", "entropy"],
              "max_depth": [None, 8, 10],
              "min_samples_split": [10, 25],
              "min_samples_leaf": [1, 5],
              "max_features": [None, "sqrt"],
              "class_weight": ["balanced", None],
              "random_state": [42]}
os_dt_gs = GridSearchCV(os_dt, param_grid, cv=3, scoring="recall", n_jobs=-1)
os_dt_gs.fit(over_sampled_X_train, over_sampled_y_train);
os_dt_gs.best_params_

{'class_weight': 'balanced',
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': None,
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'random_state': 42}

Evaluate the best decision tree model:

In [35]:
best_os_dt = DecisionTreeClassifier(**os_dt_gs.best_params_)
best_os_dt.fit(over_sampled_X_train, over_sampled_y_train)
y_preds_best_os_dt = best_os_dt.predict(over_sampled_X_test)
all_scores(over_sampled_y_test, y_preds_best_os_dt)

f-1 score: 0.41006661732050337
recall: 0.4171686746987952
precision: 0.4032023289665211


### Random Forest

Random Forest with the same hyperparameters as those of the best Decision Tree.

In [36]:
os_rf = RandomForestClassifier(n_estimators=100, **os_dt_gs.best_params_)
os_rf.fit(over_sampled_X_train, over_sampled_y_train)
y_preds_best_os_rf = os_rf.predict(over_sampled_X_test)
all_scores(over_sampled_y_test, y_preds_best_os_rf)

f-1 score: 0.5109866436880655
recall: 0.44653614457831325
precision: 0.5971802618328298


### XGBoost

In [37]:
os_xgb = xgb.XGBRegressor()
param_grid = {"max_depth": [5, 8, 10],
              "learning_rate": [0.01, 0.1, 1, 10, 100],
              "objective": ["binary:hinge"],
              "n_estimators": [10, 50, 100, 150],
              "n_jobs": [-1],
              "random_state": [42]}
os_xgb_gs = GridSearchCV(os_xgb, param_grid, cv=3, scoring="recall", n_jobs=-1)
os_xgb_gs.fit(over_sampled_X_train, over_sampled_y_train);
os_xgb_gs.best_params_

{'learning_rate': 0.01,
 'max_depth': 5,
 'n_estimators': 10,
 'n_jobs': -1,
 'objective': 'binary:hinge',
 'random_state': 42}

Evaluate the best XGBoost model:

In [38]:
best_os_xgb = xgb.XGBRegressor(**os_xgb_gs.best_params_)
best_os_xgb.fit(over_sampled_X_train, over_sampled_y_train)
y_preds_best_os_xgb = best_os_xgb.predict(over_sampled_X_test)
all_scores(over_sampled_y_test, y_preds_best_os_xgb)

f-1 score: 0.3623959612498295
recall: 1.0
precision: 0.22129645059156808


f1...<br>
f-1 score: 0.5077233042310276<br>
recall: 0.5833333333333334<br>
precision: 0.44946492271105826

### SVM

In [39]:
# os_svm = SVC()
# param_grid = {"C": [0.1, 1, 10],
#               "kernel": ["linear", "rbf"],
#               "class_weight": ["balanced", None],
#               "random_state": [42]}
# os_svm_gs = GridSearchCV(os_svm, param_grid, cv=3, scoring="recall", n_jobs=-1)
# os_svm_gs.fit(over_sampled_X_train, over_sampled_y_train);
# os_svm_gs.best_params_

Evaluate the best SVM model:

In [40]:
# best_os_svm = SVC(**os_svm_gs.best_params_)
# best_os_svm.fit(over_sampled_X_train, over_sampled_y_train)
# y_preds_best_os_svm = best_os_svm.predict(over_sampled_X_test)
# all_scores(over_sampled_y_test, y_preds_best_os_svm)

## Under-sampled

### Logistic Regression

In [41]:
us_lr = LogisticRegression()
param_grid = {"penalty": ["l1", "l2"],
              "C": [0.01, 0.1, 1, 10],
              "class_weight": ["balanced", None],
              "random_state": [42],
              "solver": ["liblinear"], 
              "n_jobs": [-1]}
us_lr_gs = GridSearchCV(us_lr, param_grid, cv=3, scoring="recall", n_jobs=-1)
us_lr_gs.fit(under_sampled_X_train, under_sampled_y_train);
us_lr_gs.best_params_

{'C': 10,
 'class_weight': 'balanced',
 'n_jobs': -1,
 'penalty': 'l1',
 'random_state': 42,
 'solver': 'liblinear'}

Evaluate the best logistic model:

In [42]:
best_us_lr = LogisticRegression(**us_lr_gs.best_params_)
best_us_lr.fit(under_sampled_X_train, under_sampled_y_train)
y_preds_best_us_lr = best_us_lr.predict(under_sampled_X_test)
all_scores(under_sampled_y_test, y_preds_best_us_lr)

f-1 score: 0.6934097421203439
recall: 0.6274074074074074
precision: 0.7749313815187557


### Decision Tree

In [43]:
us_dt = DecisionTreeClassifier()
param_grid = {"criterion": ["gini", "entropy"],
              "max_depth": [None, 8, 10],
              "min_samples_split": [10, 25],
              "min_samples_leaf": [1, 5],
              "max_features": [None, "sqrt"],
              "class_weight": ["balanced", None],
              "random_state": [42]}
us_dt_gs = GridSearchCV(us_dt, param_grid, cv=3, scoring="recall", n_jobs=-1)
us_dt_gs.fit(under_sampled_X_train, under_sampled_y_train);
us_dt_gs.best_params_

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'min_samples_leaf': 5,
 'min_samples_split': 10,
 'random_state': 42}

Evaluate the best decision tree model:

In [44]:
best_us_dt = DecisionTreeClassifier(**us_dt_gs.best_params_)
best_us_dt.fit(under_sampled_X_train, under_sampled_y_train)
y_preds_best_us_dt = best_us_dt.predict(under_sampled_X_test)
all_scores(under_sampled_y_test, y_preds_best_us_dt)

f-1 score: 0.6741248646697943
recall: 0.6918518518518518
precision: 0.657283603096411


### Random Forest

Random Forest with the same hyperparameters as those of the best Decision Tree.

In [45]:
us_rf = RandomForestClassifier(n_estimators=100, **us_dt_gs.best_params_)
us_rf.fit(under_sampled_X_train, under_sampled_y_train)
y_preds_best_us_rf = us_rf.predict(under_sampled_X_test)
all_scores(under_sampled_y_test, y_preds_best_us_rf)

f-1 score: 0.7017543859649122
recall: 0.6518518518518519
precision: 0.7599309153713298


### XGBoost

In [46]:
us_xgb = xgb.XGBRegressor()
param_grid = {"max_depth": [5, 8, 10],
              "learning_rate": [0.01, 0.1, 1, 10, 100],
              "objective": ["binary:hinge"],
              "n_estimators": [10, 50, 100, 150],
              "n_jobs": [-1],
              "random_state": [42]}
us_xgb_gs = GridSearchCV(us_xgb, param_grid, cv=3, scoring="recall", n_jobs=-1)
us_xgb_gs.fit(under_sampled_X_train, under_sampled_y_train);
us_xgb_gs.best_params_

{'learning_rate': 0.01,
 'max_depth': 5,
 'n_estimators': 10,
 'n_jobs': -1,
 'objective': 'binary:hinge',
 'random_state': 42}

Evaluate the best XGBoost model:

In [47]:
best_us_xgb = xgb.XGBRegressor(**us_xgb_gs.best_params_)
best_us_xgb.fit(under_sampled_X_train, under_sampled_y_train)
y_preds_best_us_xgb = best_us_xgb.predict(under_sampled_X_test)
all_scores(under_sampled_y_test, y_preds_best_us_xgb)

f-1 score: 0.6741573033707865
recall: 1.0
precision: 0.5084745762711864


f1...<br>
f-1 score: 0.5077233042310276<br>
recall: 0.5833333333333334<br>
precision: 0.44946492271105826

### SVM

In [48]:
# us_svm = SVC()
# param_grid = {"C": [0.1, 1, 10],
#               "kernel": ["linear", "rbf"],
#               "class_weight": ["balanced", None],
#               "random_state": [42]}
# us_svm_gs = GridSearchCV(us_svm, param_grid, cv=3, scoring="recall", n_jobs=-1)
# us_svm_gs.fit(under_sampled_X_train, under_sampled_y_train);
# us_svm_gs.best_params_

Evaluate the best SVM model:

In [49]:
# best_us_svm = SVC(**us_svm_gs.best_params_)
# best_us_svm.fit(under_sampled_X_train, under_sampled_y_train)
# y_preds_best_us_svm = best_us_svm.predict(under_sampled_X_test)
# all_scores(under_sampled_y_test, y_preds_best_us_svm)

## SMOTE-sampled

### Logistic Regression

In [50]:
sm_lr = LogisticRegression()
param_grid = {"penalty": ["l1", "l2"],
              "C": [0.01, 0.1, 1, 10],
              "class_weight": ["balanced", None],
              "random_state": [42],
              "solver": ["liblinear"], 
              "n_jobs": [-1]}
sm_lr_gs = GridSearchCV(sm_lr, param_grid, cv=3, scoring="recall", n_jobs=-1)
sm_lr_gs.fit(smote_sampled_X_train, smote_sampled_y_train);
sm_lr_gs.best_params_

{'C': 10,
 'class_weight': 'balanced',
 'n_jobs': -1,
 'penalty': 'l1',
 'random_state': 42,
 'solver': 'liblinear'}

Evaluate the best logistic model:

In [51]:
best_sm_lr = LogisticRegression(**sm_lr_gs.best_params_)
best_sm_lr.fit(smote_sampled_X_train, smote_sampled_y_train)
y_preds_best_sm_lr = best_sm_lr.predict(smote_sampled_X_test)
all_scores(smote_sampled_y_test, y_preds_best_sm_lr)

f-1 score: 0.5175237471339666
recall: 0.6095679012345679
precision: 0.4496300512236767


### Decision Tree

In [52]:
sm_dt = DecisionTreeClassifier()
param_grid = {"criterion": ["gini", "entropy"],
              "max_depth": [None, 8, 10],
              "min_samples_split": [10, 25],
              "min_samples_leaf": [1, 5],
              "max_features": [None, "sqrt"],
              "class_weight": ["balanced", None],
              "random_state": [42]}
sm_dt_gs = GridSearchCV(sm_dt, param_grid, cv=3, scoring="recall", n_jobs=-1)
sm_dt_gs.fit(smote_sampled_X_train, smote_sampled_y_train);
sm_dt_gs.best_params_

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'random_state': 42}

Evaluate the best decision tree model:

In [53]:
best_sm_dt = DecisionTreeClassifier(**sm_dt_gs.best_params_)
best_sm_dt.fit(smote_sampled_X_train, smote_sampled_y_train)
y_preds_best_sm_dt = best_sm_dt.predict(smote_sampled_X_test)
all_scores(smote_sampled_y_test, y_preds_best_sm_dt)

f-1 score: 0.3954293628808864
recall: 0.44058641975308643
precision: 0.3586683417085427


### Random Forest

Random Forest with the same hyperparameters as those of the best Decision Tree.

In [54]:
sm_rf = RandomForestClassifier(n_estimators=100, **sm_dt_gs.best_params_)
sm_rf.fit(smote_sampled_X_train, smote_sampled_y_train)
y_preds_best_sm_rf = sm_rf.predict(smote_sampled_X_test)
all_scores(smote_sampled_y_test, y_preds_best_sm_rf)

f-1 score: 0.509548963835839
recall: 0.4837962962962963
precision: 0.5381974248927038


### XGBoost

In [55]:
sm_xgb = xgb.XGBRegressor()
param_grid = {"max_depth": [5, 8, 10],
              "learning_rate": [0.01, 0.1, 1, 10, 100],
              "objective": ["binary:hinge"],
              "n_estimators": [10, 50, 100, 150],
              "n_jobs": [-1],
              "random_state": [42]}
sm_xgb_gs = GridSearchCV(sm_xgb, param_grid, cv=3, scoring="recall", n_jobs=-1)
sm_xgb_gs.fit(smote_sampled_X_train, smote_sampled_y_train);
sm_xgb_gs.best_params_

{'learning_rate': 0.01,
 'max_depth': 5,
 'n_estimators': 10,
 'n_jobs': -1,
 'objective': 'binary:hinge',
 'random_state': 42}

Evaluate the best XGBoost model:

In [57]:
best_sm_xgb = xgb.XGBRegressor(**sm_xgb_gs.best_params_)
best_sm_xgb.fit(smote_sampled_X_train, smote_sampled_y_train)
y_preds_best_sm_xgb = best_sm_xgb.predict(smote_sampled_X_test)
all_scores(smote_sampled_y_test, y_preds_best_sm_xgb)

ValueError: feature_names mismatch: ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40', 'f41', 'f42', 'f43', 'f44', 'f45', 'f46', 'f47', 'f48', 'f49', 'f50', 'f51', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f58', 'f59', 'f60', 'f61', 'f62', 'f63', 'f64', 'f65', 'f66', 'f67', 'f68', 'f69', 'f70', 'f71', 'f72', 'f73', 'f74', 'f75', 'f76', 'f77', 'f78', 'f79', 'f80', 'f81', 'f82', 'f83', 'f84', 'f85', 'f86', 'f87'] ['AGE', 'BILL_AMT_APR', 'BILL_AMT_AUG', 'BILL_AMT_JUL', 'BILL_AMT_JUN', 'BILL_AMT_MAY', 'BILL_AMT_SEPT', 'EDUCATION_OTHERS', 'EDUCATION_UNKNOWN_1', 'EDUCATION_UNKNOWN_2', 'FEMALE', 'GRADUATE_SCHOOL', 'HIGH_SCHOOL', 'LIMIT_BAL', 'MARRIAGE_OTHERS', 'MARRIED', 'PAY_AMT_APR', 'PAY_AMT_AUG', 'PAY_AMT_JUL', 'PAY_AMT_JUN', 'PAY_AMT_MAY', 'PAY_AMT_SEPT', 'REPAYMENT_STATUS_APR_-1', 'REPAYMENT_STATUS_APR_-2', 'REPAYMENT_STATUS_APR_0', 'REPAYMENT_STATUS_APR_2', 'REPAYMENT_STATUS_APR_3', 'REPAYMENT_STATUS_APR_4', 'REPAYMENT_STATUS_APR_5', 'REPAYMENT_STATUS_APR_6', 'REPAYMENT_STATUS_APR_7', 'REPAYMENT_STATUS_APR_8', 'REPAYMENT_STATUS_AUG_-1', 'REPAYMENT_STATUS_AUG_-2', 'REPAYMENT_STATUS_AUG_0', 'REPAYMENT_STATUS_AUG_1', 'REPAYMENT_STATUS_AUG_2', 'REPAYMENT_STATUS_AUG_3', 'REPAYMENT_STATUS_AUG_4', 'REPAYMENT_STATUS_AUG_5', 'REPAYMENT_STATUS_AUG_6', 'REPAYMENT_STATUS_AUG_7', 'REPAYMENT_STATUS_AUG_8', 'REPAYMENT_STATUS_JUL_-1', 'REPAYMENT_STATUS_JUL_-2', 'REPAYMENT_STATUS_JUL_0', 'REPAYMENT_STATUS_JUL_1', 'REPAYMENT_STATUS_JUL_2', 'REPAYMENT_STATUS_JUL_3', 'REPAYMENT_STATUS_JUL_4', 'REPAYMENT_STATUS_JUL_5', 'REPAYMENT_STATUS_JUL_6', 'REPAYMENT_STATUS_JUL_7', 'REPAYMENT_STATUS_JUL_8', 'REPAYMENT_STATUS_JUN_-1', 'REPAYMENT_STATUS_JUN_-2', 'REPAYMENT_STATUS_JUN_0', 'REPAYMENT_STATUS_JUN_1', 'REPAYMENT_STATUS_JUN_2', 'REPAYMENT_STATUS_JUN_3', 'REPAYMENT_STATUS_JUN_4', 'REPAYMENT_STATUS_JUN_5', 'REPAYMENT_STATUS_JUN_6', 'REPAYMENT_STATUS_JUN_7', 'REPAYMENT_STATUS_JUN_8', 'REPAYMENT_STATUS_MAY_-1', 'REPAYMENT_STATUS_MAY_-2', 'REPAYMENT_STATUS_MAY_0', 'REPAYMENT_STATUS_MAY_2', 'REPAYMENT_STATUS_MAY_3', 'REPAYMENT_STATUS_MAY_4', 'REPAYMENT_STATUS_MAY_5', 'REPAYMENT_STATUS_MAY_6', 'REPAYMENT_STATUS_MAY_7', 'REPAYMENT_STATUS_MAY_8', 'REPAYMENT_STATUS_SEPT_-1', 'REPAYMENT_STATUS_SEPT_-2', 'REPAYMENT_STATUS_SEPT_0', 'REPAYMENT_STATUS_SEPT_1', 'REPAYMENT_STATUS_SEPT_2', 'REPAYMENT_STATUS_SEPT_3', 'REPAYMENT_STATUS_SEPT_4', 'REPAYMENT_STATUS_SEPT_5', 'REPAYMENT_STATUS_SEPT_6', 'REPAYMENT_STATUS_SEPT_7', 'REPAYMENT_STATUS_SEPT_8', 'SINGLE', 'UNIVERSITY']
expected f6, f57, f61, f37, f77, f9, f3, f59, f82, f70, f17, f85, f60, f68, f67, f66, f4, f34, f38, f5, f62, f15, f50, f42, f72, f78, f53, f86, f21, f2, f26, f76, f65, f63, f52, f56, f48, f27, f39, f81, f10, f18, f58, f74, f41, f46, f31, f69, f54, f45, f84, f83, f80, f23, f75, f35, f44, f47, f29, f40, f64, f55, f25, f19, f51, f16, f79, f71, f73, f1, f43, f13, f49, f11, f30, f24, f87, f20, f7, f28, f14, f32, f22, f0, f33, f36, f8, f12 in input data
training data did not have the following fields: REPAYMENT_STATUS_AUG_4, REPAYMENT_STATUS_JUL_8, REPAYMENT_STATUS_MAY_2, REPAYMENT_STATUS_APR_0, HIGH_SCHOOL, REPAYMENT_STATUS_JUN_2, BILL_AMT_JUL, LIMIT_BAL, REPAYMENT_STATUS_JUN_6, REPAYMENT_STATUS_MAY_3, MARRIED, BILL_AMT_AUG, REPAYMENT_STATUS_JUN_1, PAY_AMT_SEPT, SINGLE, EDUCATION_OTHERS, UNIVERSITY, REPAYMENT_STATUS_SEPT_4, REPAYMENT_STATUS_APR_5, REPAYMENT_STATUS_JUN_5, REPAYMENT_STATUS_SEPT_7, AGE, REPAYMENT_STATUS_JUL_5, REPAYMENT_STATUS_JUL_7, REPAYMENT_STATUS_AUG_-2, REPAYMENT_STATUS_JUN_3, REPAYMENT_STATUS_SEPT_2, REPAYMENT_STATUS_APR_2, REPAYMENT_STATUS_APR_6, REPAYMENT_STATUS_JUL_1, REPAYMENT_STATUS_JUN_-1, PAY_AMT_JUN, REPAYMENT_STATUS_MAY_0, REPAYMENT_STATUS_SEPT_5, REPAYMENT_STATUS_APR_7, REPAYMENT_STATUS_AUG_0, REPAYMENT_STATUS_JUN_4, REPAYMENT_STATUS_MAY_5, REPAYMENT_STATUS_SEPT_0, REPAYMENT_STATUS_JUL_3, REPAYMENT_STATUS_MAY_8, REPAYMENT_STATUS_JUN_8, REPAYMENT_STATUS_MAY_7, BILL_AMT_APR, REPAYMENT_STATUS_AUG_3, REPAYMENT_STATUS_JUL_2, REPAYMENT_STATUS_APR_3, EDUCATION_UNKNOWN_2, REPAYMENT_STATUS_APR_8, REPAYMENT_STATUS_JUL_6, EDUCATION_UNKNOWN_1, GRADUATE_SCHOOL, REPAYMENT_STATUS_AUG_8, REPAYMENT_STATUS_MAY_-1, REPAYMENT_STATUS_APR_4, REPAYMENT_STATUS_AUG_1, REPAYMENT_STATUS_JUL_-1, REPAYMENT_STATUS_SEPT_3, PAY_AMT_AUG, REPAYMENT_STATUS_MAY_-2, PAY_AMT_JUL, REPAYMENT_STATUS_AUG_5, REPAYMENT_STATUS_MAY_4, FEMALE, REPAYMENT_STATUS_APR_-2, REPAYMENT_STATUS_AUG_6, BILL_AMT_SEPT, REPAYMENT_STATUS_JUN_-2, REPAYMENT_STATUS_AUG_-1, REPAYMENT_STATUS_SEPT_-1, REPAYMENT_STATUS_SEPT_6, PAY_AMT_APR, REPAYMENT_STATUS_JUL_4, REPAYMENT_STATUS_AUG_2, PAY_AMT_MAY, REPAYMENT_STATUS_JUN_7, REPAYMENT_STATUS_APR_-1, REPAYMENT_STATUS_JUL_-2, REPAYMENT_STATUS_SEPT_1, MARRIAGE_OTHERS, REPAYMENT_STATUS_JUN_0, REPAYMENT_STATUS_AUG_7, BILL_AMT_MAY, REPAYMENT_STATUS_SEPT_8, REPAYMENT_STATUS_JUL_0, REPAYMENT_STATUS_MAY_6, BILL_AMT_JUN, REPAYMENT_STATUS_SEPT_-2

f1...<br>
f-1 score: 0.5077233042310276<br>
recall: 0.5833333333333334<br>
precision: 0.44946492271105826

### SVM

In [None]:
# sm_svm = SVC()
# param_grid = {"C": [0.1, 1, 10],
#               "kernel": ["linear", "rbf"],
#               "class_weight": ["balanced", None],
#               "random_state": [42]}
# sm_svm_gs = GridSearchCV(sm_svm, param_grid, cv=3, scoring="recall", n_jobs=-1)
# sm_svm_gs.fit(smote_sampled_X_train, smote_sampled_y_train);
# sm_svm_gs.best_params_

Evaluate the best SVM model:

In [None]:
# best_sm_svm = SVC(**sm_svm_gs.best_params_)
# best_sm_svm.fit(smote_sampled_X_train, smote_sampled_y_train)
# y_preds_best_sm_svm = best_sm_svm.predict(smote_sampled_X_test)
# all_scores(smote_sampled_y_test, y_preds_best_sm_svm)