In [1]:
import os

import joblib
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTENC
# from sklearn.model_selection import train_test_split
# from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import (make_scorer,
                             roc_auc_score,
                             roc_curve,
                             f1_score,
                             recall_score,
                             precision_score,
                             precision_recall_curve,
                             classification_report,
                             plot_confusion_matrix,
                             plot_roc_curve)
import matplotlib.pyplot as plt
%matplotlib inline

# Shared random seed: passed as `random_state` to the SMOTENC resampler and the
# random forest below so that repeated runs use the same seed.
seed = 36

In [2]:
# Load the training data without 'duration': the column leaks the outcome and
# has an 'unrealistic' effect on the target variable.
train = pd.read_csv('data/train.csv').drop(columns='duration')
train.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,36,management,married,university.degree,no,no,no,cellular,nov,mon,1,999,0,nonexistent,-0.1,93.2,-42.0,4.191,5195.8,no
1,49,blue-collar,married,basic.4y,unknown,yes,no,telephone,may,fri,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,41,entrepreneur,single,high.school,no,yes,no,cellular,jul,thu,3,999,0,nonexistent,1.4,93.918,-42.7,4.962,5228.1,no
3,41,entrepreneur,divorced,basic.9y,no,yes,no,cellular,jul,thu,1,999,0,nonexistent,1.4,93.918,-42.7,4.958,5228.1,no
4,33,admin.,single,university.degree,no,no,no,cellular,nov,thu,2,999,1,failure,-0.1,93.2,-42.0,4.076,5195.8,no


In [3]:
# Oversample with SMOTENC, which needs the positional indices of the
# categorical columns.
#
# The positions are derived from the column dtypes instead of being hard-coded,
# so they stay correct if the column layout changes (e.g. an extra index column
# saved in the CSV, or a column being added/dropped upstream). Every
# non-numeric column is treated as categorical.
# NOTE(review): for a frame laid out as age, job, ..., day_of_week, campaign,
# pdays, previous, poutcome, ... this yields the previously hard-coded
# [1, 2, 3, 4, 5, 6, 7, 8, 9, 13] - confirm against train.dtypes.
X_train_raw = train.drop(columns='y')
cat_locations = [
    position
    for position, dtype in enumerate(X_train_raw.dtypes)
    if not pd.api.types.is_numeric_dtype(dtype)
]
over = SMOTENC(categorical_features=cat_locations, random_state=seed)

X_resampled, y_resampled = over.fit_resample(X_train_raw, train.y)
# Re-attach the target so that it is one-hot encoded together with the features.
X_resampled['y'] = y_resampled

In [None]:
# One-hot encode every categorical column (including the target 'y') and keep
# only the 'y_yes' indicator for the target by discarding 'y_no'.
df = pd.get_dummies(X_resampled).drop(columns='y_no')
df.shape

In [None]:
# Separate the binary target ('y_yes') from the predictor columns.
X = df.drop(columns='y_yes')
y = df['y_yes']

# Hold-out split currently disabled:
# x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=36, stratify=y)

In [None]:
# Columns kept for modelling; 'cons_conf_idx' is deliberately left out.
features = [
    # one-hot encoded categorical indicators
    'job_retired',
    'job_student',
    'default_unknown',
    'contact_cellular',
    'contact_telephone',
    'month_dec',
    'month_mar',
    'month_may',
    'month_oct',
    'month_sep',
    'poutcome_success',
    # numeric columns
    'pdays',
    'previous',
    'emp_var_rate',
    'euribor3m',
    'nr_employed',
    'cons_price_idx',
#     'cons_conf_idx',
]
# Restrict the design matrix to the selected columns, in the order listed above.
X = X[features]
X.shape

In [None]:
def tp_tn_fn_fp(true_value, predicted_value):
    """Count confusion-matrix cells for paired binary labels.

    Walks ``true_value`` and ``predicted_value`` in lockstep (stopping at the
    shorter one, as ``zip`` does) and classifies every pair:

    * equal labels summing to 2  -> true positive
    * equal labels otherwise     -> true negative
    * differing, true label == 1 -> false negative
    * differing otherwise        -> false positive

    Returns:
        tuple: ``(tp, tn, fn, fp)`` as plain integers.
    """
    tally = {'tp': 0, 'tn': 0, 'fn': 0, 'fp': 0}
    for actual, guess in zip(true_value, predicted_value):
        if actual == guess:
            cell = 'tp' if actual + guess == 2 else 'tn'
        else:
            cell = 'fn' if actual == 1 else 'fp'
        tally[cell] += 1
    return tally['tp'], tally['tn'], tally['fn'], tally['fp']

def lift_score(true_value, predicted_value):
    """Lift of the positive class: precision divided by the positive base rate.

    ``lift = (tp / (tp + fp)) / ((tp + fn) / (tp + tn + fp + fn))``

    Args:
        true_value: iterable of true binary labels (1 = positive).
        predicted_value: iterable of predicted binary labels, aligned with
            ``true_value``.

    Returns:
        float: the lift. ``0.0`` is returned when the lift is undefined, i.e.
        when nothing was predicted positive, when there are no actual
        positives, or when the input is empty. Previously these cases raised
        ``ZeroDivisionError``, which aborts scoring when a candidate model
        predicts only the negative class.
    """
    tp, tn, fn, fp = tp_tn_fn_fp(true_value, predicted_value)
    predicted_positive = tp + fp
    actual_positive = tp + fn
    # actual_positive > 0 also guarantees a non-zero total below.
    if predicted_positive == 0 or actual_positive == 0:
        return 0.0
    precision = tp / predicted_positive
    base_rate = actual_positive / (tp + tn + fp + fn)
    return precision / base_rate

# Wrap lift_score with make_scorer so it can be supplied as a `scoring=`
# argument (see the commented-out option in the grid search below).
lift_scorer = make_scorer(lift_score)

In [None]:
# Candidate estimators and their hyper-parameter grids, kept as parallel lists
# (only a random forest is configured at the moment).
random_forest = RandomForestClassifier(random_state=seed)
random_forest_grid = dict(
    n_estimators=[100, 200, 300],
    criterion=['gini', 'entropy'],
    max_depth=[30, 50, None],
)

models = [random_forest]
param_grid = [random_forest_grid]

# 5-fold grid search scored by ROC AUC, using all available cores.
# `scoring=lift_scorer` is the alternative that was tried here.
# NOTE(review): X/y were oversampled before this cross-validation, so the
# validation folds contain resampled rows; the CV score is likely optimistic.
clf = GridSearchCV(
    estimator=models[0],
    param_grid=param_grid[0],
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
clf.fit(X, y)

print(clf.best_score_)
print(clf.best_params_)

In [None]:
# Preview the first rows of the feature matrix that was used for fitting.
X.head()

In [None]:
# In-sample report: X/y are the same resampled data the grid search was fitted
# on, so these numbers describe fit quality rather than generalisation.
train_predictions = clf.predict(X)
print(classification_report(y, train_predictions))

0.9329230510749482

{'criterion': 'entropy', 'max_depth': 30, 'n_estimators': 300}

              precision    recall  f1-score   support

           0       0.87      0.97      0.92     25584
           1       0.96      0.86      0.91     25584

    accuracy                           0.92     51168
   macro avg       0.92      0.92      0.91     51168
weighted avg       0.92      0.92      0.91     51168

In [None]:
# Persist the fitted grid-search object (not just the best estimator).
# The target directory is created first so that the dump does not fail with
# FileNotFoundError on a fresh checkout; `os` and `joblib` are imported in the
# import cell at the top of the notebook.
model_path = 'saved_models/random_forest_feature_selected.joblib'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(clf, model_path)

In [None]:
test = pd.read_csv('data/test.csv')

# drop 'duration' column, as for the training data, due to its 'unrealistic'
# effect on the target variable
test = test.drop(columns='duration')

# One-hot encode and keep only the 'y_yes' indicator for the target.
test = pd.get_dummies(test)
test = test.drop(columns='y_no')

y_holdout = test.y_yes
X_holdout = test.drop(columns='y_yes')

# The dummies are built from the test set alone, so a category level that never
# occurs in it (e.g. a rare month) produces no column. `reindex` selects the
# training features in the same order and fills any such missing indicator with
# 0 instead of raising KeyError the way `.loc[:, features]` would.
X_holdout = X_holdout.reindex(columns=features, fill_value=0)
X_holdout.shape

In [None]:
# Lift of the tuned model on the hold-out set.
holdout_predictions = clf.predict(X_holdout)
lift_score(y_holdout, holdout_predictions)

In [None]:
# Per-class precision / recall / F1 on the hold-out set.
holdout_predictions = clf.predict(X_holdout)
print(classification_report(y_holdout, holdout_predictions))

In [None]:
# Hold-out confusion matrix; normalize='true' normalises over the true labels.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2; newer versions need ConfusionMatrixDisplay.from_estimator.
confusion_plot_options = dict(
    display_labels=['no', 'yes'],
    cmap=plt.cm.Blues,
    normalize='true',
)
plot_confusion_matrix(clf, X_holdout, y_holdout, **confusion_plot_options);