## Supervised learning pipelines
Predict whether a new cohort of loan applicants are likely to default on their loans.

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy  as np
import os

from sklearn.preprocessing   import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble        import RandomForestClassifier
from sklearn.metrics         import accuracy_score
from pathlib                 import Path

In [3]:
# set root directory
path_root = Path("C:/Users/giann/data-science-core")
os.chdir(path_root)
print(f'- Root directory = {os.getcwd()}')

- Root directory = C:\Users\giann\data-science-core


In [4]:
# load data
path_dataset = path_root / 'dataset/credit.csv'
data      = pd.read_csv(path_dataset) 
data.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,'<0',6,'critical/other existing credit',buy_radio_tv,1169,'no known savings','>=7',4,'male single',none,...,'real estate',67,none,own,2,skilled,1,yes,yes,good
1,'0<=X<200',48,'existing paid',buy_radio_tv,5951,'<100','1<=X<4',2,'female div/dep/mar',none,...,'real estate',22,none,own,1,skilled,1,none,yes,bad
2,'no checking',12,'critical/other existing credit',education,2096,'<100','4<=X<7',2,'male single',none,...,'real estate',49,none,own,1,'unskilled resident',2,none,yes,good
3,'<0',42,'existing paid',buy_furniture_equipment,7882,'<100','4<=X<7',2,'male single',guarantor,...,'life insurance',45,none,'for free',1,skilled,2,none,yes,good
4,'<0',24,'delayed previously',buy_new_car,4870,'<100','1<=X<4',3,'male single',none,...,'no known property',53,none,'for free',2,skilled,2,none,yes,bad


### Feature engineering
Many features are in string format, which is a problem for your classifiers. You hence decide to encode the string columns numerically using `LabelEncoder()`. The dataset credit is also preloaded, as is a list of all column names whose data types are string, stored in `non_numeric_columns`.

In [5]:
# replace string columns with integer values
non_numeric_columns = ['checking_status', 'credit_history' , 'purpose'      , 'savings_status'    , 
                       'employment'     , 'personal_status', 'other_parties', 'property_magnitude', 
                       'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker']

# Create a label encoder for each column. Encode the values
for column in non_numeric_columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])

data.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,1,6,1,4,1169,4,3,4,3,2,...,2,67,1,1,2,3,1,1,1,good
1,0,48,3,4,5951,2,0,2,0,2,...,2,22,1,1,1,3,1,0,1,bad
2,3,12,1,6,2096,2,1,2,3,2,...,2,49,1,1,1,2,2,0,1,good
3,1,42,3,2,7882,2,1,2,3,1,...,0,45,1,0,1,3,2,0,1,good
4,1,24,2,3,4870,2,0,3,3,2,...,1,53,1,0,2,3,2,0,1,bad


In [6]:
# convert attributes to categorical type
categorical_attr = ['class']
data[categorical_attr] = data[categorical_attr].apply(lambda x : x.astype('category'))
# Inspect the data types of the columns of the data frame
print(data.dtypes)

checking_status              int32
duration                     int64
credit_history               int32
purpose                      int32
credit_amount                int64
savings_status               int32
employment                   int32
installment_commitment       int64
personal_status              int32
other_parties                int32
residence_since              int64
property_magnitude           int32
age                          int64
other_payment_plans          int32
housing                      int32
existing_credits             int64
job                          int32
num_dependents               int64
own_telephone                int32
foreign_worker               int32
class                     category
dtype: object


### Make pipeline

In [8]:
X = data.drop('class', axis = 1)
y = data['class']
# Split the data into train and test, with 20% as test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1, stratify = y)

In [11]:
# Create a random forest classifier, fixing the seed to 2
rf_model = RandomForestClassifier(random_state = 2, n_estimators = 100).fit(X_train, y_train)

In [12]:
# Use it to predict the labels of the test data
rf_predictions = rf_model.predict(X_test)
# Assess the accuracy of both classifiers
accuracies = dict()
accuracies['rf'] = accuracy_score(y_test, rf_predictions)

In [14]:
# Although you should always assess accuracy on a training-test split, 
# it is better to refit the winning classifier to the whole data
production_model = RandomForestClassifier(random_state = 2, n_estimators = 100).fit(X, y)
rf_predictions   = production_model.predict(X_test)
accuracies['rf_prod'] = accuracy_score(y_test, rf_predictions)

### Grid search CV for model complexity

In [15]:
# Set a range for n_estimators from 10 to 40 in steps of 10
param_grid = {'n_estimators': list(range(10, 41, 10))}
# Optimize for a RandomForestClassifier using GridSearchCV
grid = GridSearchCV(RandomForestClassifier(), param_grid, cv=3)
grid.fit(X_train, y_train)  
grid.best_params_

{'n_estimators': 30}

In [16]:
# Assess the accuracy of both classifiers
best_model     = grid.best_estimator_
rf_predictions = best_model.predict(X_test)
accuracies['rf_tuned'] = accuracy_score(y_test, rf_predictions)

In [17]:
accuracies

{'rf': 0.79, 'rf_prod': 1.0, 'rf_tuned': 0.775}