In [3]:
# -*- coding: utf-8 -*-
import click
import logging
from dotenv import find_dotenv, load_dotenv
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import json
import pathlib 
from pathlib import Path

In [5]:
df_train = pd.read_csv(pathlib.Path("C:/Users/Paulius/p160m138-2019r-lab3/p160m138-2019r-lab3/data/interim/bank_train.csv"))
df_test = pd.read_csv(pathlib.Path("C:/Users/Paulius/p160m138-2019r-lab3/p160m138-2019r-lab3/data/interim/bank_test.csv"))

In [6]:
list(df_train.columns)

['age',
 'job',
 'marital',
 'education',
 'default',
 'balance',
 'housing',
 'loan',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome',
 'target']

In [7]:
df_train.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
target        int64
dtype: object

In [8]:
numeric_features = [
    'age',
    'balance',
    'day',
    'campaign',
    'pdays',
    'previous',
]

In [9]:
categorical_features = [
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'month',
    'campaign',
    'poutcome'
]

In [10]:
numeric_transformer_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor_pipe = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer_pipe, numeric_features),
        ('cat', categorical_transformer_pipe, categorical_features)])

In [11]:
X_train = df_train.drop('target', axis=1)
y_train = df_train['target']

X_test = df_test.drop('target', axis=1)
y_test = df_test['target']

In [12]:
clf = Pipeline(steps=[
    ('preprocessor', preprocessor_pipe),
    ('classifier', RandomForestClassifier(n_jobs=-1, n_estimators=100))])

clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [13]:
print("model score: {:.3f}".format(clf.score(X_test, y_test)))

model score: 0.897


In [14]:
print("model accuracy: {:.3f}".format(metrics.accuracy_score(y_test, clf.predict(X_test))))

print("model precision: {:.3f}".format(metrics.precision_score(y_test, clf.predict(X_test))))

print("model recall: {:.3f}".format(metrics.recall_score(y_test, clf.predict(X_test))))

print("model F1: {:.3f}".format(metrics.f1_score(y_test, clf.predict(X_test))))

print("model AuROC: {:.3f}".format(metrics.roc_auc_score(y_test, clf.predict(X_test))))

model accuracy: 0.897
model precision: 0.674
model recall: 0.238
model F1: 0.352
model AuROC: 0.611


In [15]:
print(metrics.classification_report(y_test, clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.91      0.98      0.94      7985
           1       0.67      0.24      0.35      1058

    accuracy                           0.90      9043
   macro avg       0.79      0.61      0.65      9043
weighted avg       0.88      0.90      0.87      9043



In [16]:
param_grid = {
    'classifier__n_estimators': [10, 30, 100],
}

grid_search = GridSearchCV(clf, param_grid, cv=4, iid=False, scoring='recall', n_jobs=-1)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                    

In [17]:
print("model accuracy: {:.3f}".format(metrics.accuracy_score(y_test, grid_search.predict(X_test))))

print("model precision: {:.3f}".format(metrics.precision_score(y_test, grid_search.predict(X_test))))

print("model recall: {:.3f}".format(metrics.recall_score(y_test, grid_search.predict(X_test))))

print("model F1: {:.3f}".format(metrics.f1_score(y_test, grid_search.predict(X_test))))

print("model AuROC: {:.3f}".format(metrics.roc_auc_score(y_test, grid_search.predict(X_test))))

model accuracy: 0.896
model precision: 0.664
model recall: 0.228
model F1: 0.339
model AuROC: 0.606


In [18]:
cv_results = {k: v.tolist() if isinstance(v, np.ndarray) else v for k, v in grid_search.cv_results_.items()}
cv_results

{'mean_fit_time': [7.5675660371780396, 15.904978811740875, 39.508933305740356],
 'std_fit_time': [1.85769966118303, 0.2893660104601841, 0.9576448051955023],
 'mean_score_time': [5.208703458309174, 2.090374708175659, 0.5662997364997864],
 'std_score_time': [0.38417710599105154,
  0.6254345155424147,
  0.38651957822850697],
 'param_classifier__n_estimators': [10, 30, 100],
 'params': [{'classifier__n_estimators': 10},
  {'classifier__n_estimators': 30},
  {'classifier__n_estimators': 100}],
 'split0_test_score': [0.20888468809073724,
  0.19659735349716445,
  0.20982986767485823],
 'split1_test_score': [0.19281663516068054,
  0.20888468809073724,
  0.20793950850661624],
 'split2_test_score': [0.1833648393194707,
  0.1937618147448015,
  0.2051039697542533],
 'split3_test_score': [0.1759697256385998,
  0.18448438978240303,
  0.1750236518448439],
 'mean_test_score': [0.19025897205237208,
  0.19593206152877654,
  0.19947424944514291],
 'std_test_score': [0.012300107395159007,
  0.008717400813

In [19]:
# json.dumps converts an object into JSON string, while json.dump writes it to a file
print(json.dumps(cv_results, indent=4))

{
    "mean_fit_time": [
        7.5675660371780396,
        15.904978811740875,
        39.508933305740356
    ],
    "std_fit_time": [
        1.85769966118303,
        0.2893660104601841,
        0.9576448051955023
    ],
    "mean_score_time": [
        5.208703458309174,
        2.090374708175659,
        0.5662997364997864
    ],
    "std_score_time": [
        0.38417710599105154,
        0.6254345155424147,
        0.38651957822850697
    ],
    "param_classifier__n_estimators": [
        10,
        30,
        100
    ],
    "params": [
        {
            "classifier__n_estimators": 10
        },
        {
            "classifier__n_estimators": 30
        },
        {
            "classifier__n_estimators": 100
        }
    ],
    "split0_test_score": [
        0.20888468809073724,
        0.19659735349716445,
        0.20982986767485823
    ],
    "split1_test_score": [
        0.19281663516068054,
        0.20888468809073724,
        0.20793950850661624
    ],
    "spl