In [159]:
# -*- coding: utf-8 -*-
import click
import logging
from dotenv import find_dotenv, load_dotenv
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import json
import pathlib 
from pathlib import Path

In [87]:
df_train = pd.read_csv(pathlib.Path("C:/Users/Paulius/p160m138-2019r-lab3/p160m138-2019r-lab3/data/interim/bank_train.csv"))
df_test = pd.read_csv(pathlib.Path("C:/Users/Paulius/p160m138-2019r-lab3/p160m138-2019r-lab3/data/interim/bank_test.csv"))

In [88]:
list(df_train.columns)

['age',
 'job',
 'marital',
 'education',
 'default',
 'balance',
 'housing',
 'loan',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome',
 'target']

In [89]:
df_train.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
target        int64
dtype: object

In [90]:
numeric_features = [
    'age',
    'balance',
    'day',
    'campaign',
    'pdays',
    'previous',
]

In [91]:
categorical_features = [
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'month',
    'campaign',
    'poutcome'
]

In [92]:
numeric_transformer_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor_pipe = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer_pipe, numeric_features),
        ('cat', categorical_transformer_pipe, categorical_features)])

In [93]:
X_train = df_train.drop('target', axis=1)
y_train = df_train['target']

X_test = df_test.drop('target', axis=1)
y_test = df_test['target']

In [109]:
clf = Pipeline(steps=[
    ('preprocessor', preprocessor_pipe),
    ('classifier', RandomForestClassifier(n_jobs=-1, n_estimators=100))])

clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [110]:
print("model score: {:.3f}".format(clf.score(X_test, y_test)))

model score: 0.897


In [111]:
print("model accuracy: {:.3f}".format(metrics.accuracy_score(y_test, clf.predict(X_test))))

print("model precision: {:.3f}".format(metrics.precision_score(y_test, clf.predict(X_test))))

print("model recall: {:.3f}".format(metrics.recall_score(y_test, clf.predict(X_test))))

print("model F1: {:.3f}".format(metrics.f1_score(y_test, clf.predict(X_test))))

print("model AuROC: {:.3f}".format(metrics.roc_auc_score(y_test, clf.predict(X_test))))

model accuracy: 0.897
model precision: 0.674
model recall: 0.233
model F1: 0.346
model AuROC: 0.609


In [112]:
print(metrics.classification_report(y_test, clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.91      0.99      0.94      7985
           1       0.67      0.23      0.35      1058

    accuracy                           0.90      9043
   macro avg       0.79      0.61      0.64      9043
weighted avg       0.88      0.90      0.87      9043



In [113]:
param_grid = {
    'classifier__n_estimators': [10, 30, 100],
}

grid_search = GridSearchCV(clf, param_grid, cv=4, iid=False, scoring='recall', n_jobs=-1)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                    

In [114]:
print("model accuracy: {:.3f}".format(metrics.accuracy_score(y_test, grid_search.predict(X_test))))

print("model precision: {:.3f}".format(metrics.precision_score(y_test, grid_search.predict(X_test))))

print("model recall: {:.3f}".format(metrics.recall_score(y_test, grid_search.predict(X_test))))

print("model F1: {:.3f}".format(metrics.f1_score(y_test, grid_search.predict(X_test))))

print("model AuROC: {:.3f}".format(metrics.roc_auc_score(y_test, grid_search.predict(X_test))))

model accuracy: 0.897
model precision: 0.673
model recall: 0.230
model F1: 0.342
model AuROC: 0.607


In [115]:
cv_results = {k: v.tolist() if isinstance(v, np.ndarray) else v for k, v in grid_search.cv_results_.items()}
cv_results

{'mean_fit_time': [3.7253000140190125, 7.2048909068107605, 16.981665909290314],
 'std_fit_time': [0.669750416922757, 0.049128142804261525, 0.291984223879611],
 'mean_score_time': [3.570048213005066,
  0.19624316692352295,
  0.2787806987762451],
 'std_score_time': [0.6641879251700253,
  0.008273285454890745,
  0.17175448851807368],
 'param_classifier__n_estimators': [10, 30, 100],
 'params': [{'classifier__n_estimators': 10},
  {'classifier__n_estimators': 30},
  {'classifier__n_estimators': 100}],
 'split0_test_score': [0.2003780718336484,
  0.20132325141776938,
  0.21361058601134217],
 'split1_test_score': [0.18525519848771266,
  0.21455576559546313,
  0.21644612476370512],
 'split2_test_score': [0.1880907372400756,
  0.2060491493383743,
  0.21172022684310018],
 'split3_test_score': [0.16745506149479658,
  0.1750236518448439,
  0.1825922421948912],
 'mean_test_score': [0.18529476726405833,
  0.19923795454911267,
  0.20609229495325967],
 'std_test_score': [0.011764142472654976,
  0.014

In [116]:
# json.dumps converts an object into JSON string, while json.dump writes it to a file
print(json.dumps(cv_results, indent=4))

{
    "mean_fit_time": [
        3.7253000140190125,
        7.2048909068107605,
        16.981665909290314
    ],
    "std_fit_time": [
        0.669750416922757,
        0.049128142804261525,
        0.291984223879611
    ],
    "mean_score_time": [
        3.570048213005066,
        0.19624316692352295,
        0.2787806987762451
    ],
    "std_score_time": [
        0.6641879251700253,
        0.008273285454890745,
        0.17175448851807368
    ],
    "param_classifier__n_estimators": [
        10,
        30,
        100
    ],
    "params": [
        {
            "classifier__n_estimators": 10
        },
        {
            "classifier__n_estimators": 30
        },
        {
            "classifier__n_estimators": 100
        }
    ],
    "split0_test_score": [
        0.2003780718336484,
        0.20132325141776938,
        0.21361058601134217
    ],
    "split1_test_score": [
        0.18525519848771266,
        0.21455576559546313,
        0.21644612476370512
    ],
    

In [118]:
import json

with open('C:/Users/Paulius/p160m138-2019r-lab3/p160m138-2019r-lab3/notebooks/Notebook/p160m138-2019r-lab3/p160m138-2019r-lab3/json' + 'best_params.json', 'w') as outfile:  
   
    json.dump(grid_search.best_params_, outfile, indent=4)

In [119]:
param_grid = {
        'classifier__n_estimators': [10, 30, 100, 200],
        'classifier__max_depth': [None, 10, 20, 30]
    }

In [122]:
grid_search = GridSearchCV(clf, param_grid, cv=5, iid=False, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train, y_train) 

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                    

In [134]:
params_res = {
        'classifier__n_estimators': grid_search.best_params_['classifier__n_estimators'],
        'classifier__max_depth': grid_search.best_params_['classifier__max_depth']
    }

In [135]:
with open('C:/Users/Paulius/p160m138-2019r-lab3/p160m138-2019r-lab3/notebooks/Notebook/p160m138-2019r-lab3/p160m138-2019r-lab3/json' + 'best_params.json', 'w') as outfile:
    json.dump(grid_search.best_params_, outfile, indent=4)

In [144]:
converted_dict = dict()
for key in grid_search.cv_results_.keys():
        val = grid_search.cv_results_[key]
        converted_val = val
        if isinstance(val, np.ndarray):
            converted_val = val.tolist()
        converted_dict[key] = converted_val

In [145]:
with open('C:/Users/Paulius/p160m138-2019r-lab3/p160m138-2019r-lab3/notebooks/Notebook/p160m138-2019r-lab3/p160m138-2019r-lab3/json' + 'best_params.json', 'w') as outfile:
    json.dump(converted_dict, outfile, indent=4)

In [160]:
metrics_test = get_metrics(X_test, y_test, clf)
metrics_train = get_metrics(X_train, y_train, clf)
metrics_dict = {
       'metrics_test': metrics_test,
       'metrics_train': metrics_train
               }

   

NameError: name 'get_metrics' is not defined