# Classification model - Saving Pickle File

In [1]:
import os
import re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

### Load the data

In [3]:
data = pd.read_csv('../data/loan_data.csv')
print(f"Shape of the data is: {data.shape}")
data.head()

Shape of the data is: (614, 13)


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


### Check how many `null/Nan` values are in each column

In [4]:
data.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

### View the possible labels for the columns that have some null values

In [5]:
# I'm intentionally not looking at "LoanAmount", because I don't want to print out every possible loan value
contains_null = ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Loan_Amount_Term', 'Credit_History']

for col in contains_null:
    print(f"List of unique labels for {col}:::{set(data[col])}")

List of unique labels for Gender:::{nan, 'Female', 'Male'}
List of unique labels for Married:::{nan, 'Yes', 'No'}
List of unique labels for Dependents:::{nan, '2', '3+', '0', '1'}
List of unique labels for Self_Employed:::{nan, 'Yes', 'No'}
List of unique labels for Loan_Amount_Term:::{nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 12.0, 36.0, 300.0, 180.0, 60.0, 84.0, 480.0, 360.0, 240.0, 120.0}
List of unique labels for Credit_History:::{0.0, 1.0, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan}


### Clean up null values

For the sake of simplicity, I'm going to make a few assumptions:

- `Dependents`: Assumption that there are no dependents (0: 345 | 1: 102 | 2: 101 | 3+: 51)
- `Self_Employed`: Assumption that the applicant is not self-employed (No: 500 | Yes: 82)
- `Credit_History`: Assumption that the person has a credit history (True: 475 | False: 89)
- `Married`: If nothing specified, applicant is not married
- `Gender`: Assuming the gender is Male for the missing values (Male: 489 | Female: 112)

In [7]:
data['Dependents'] = data['Dependents'].fillna('0')
data['Self_Employed'] = data['Self_Employed'].fillna('No')
data['Credit_History'] = data['Credit_History'].fillna(1)
data['Married'] = data['Married'].fillna('No')
data['Gender'] = data['Gender'].fillna('Male')

### View cleaned up values

In [8]:
label_cols = ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Education', 'Property_Area', 'Loan_Status']

for col in label_cols:
    print(f"List of unique labels for {col} ::: {set(data[col])}")

List of unique labels for Gender ::: {'Female', 'Male'}
List of unique labels for Married ::: {'Yes', 'No'}
List of unique labels for Dependents ::: {'1', '0', '3+', '2'}
List of unique labels for Self_Employed ::: {'Yes', 'No'}
List of unique labels for Education ::: {'Graduate', 'Not Graduate'}
List of unique labels for Property_Area ::: {'Semiurban', 'Rural', 'Urban'}
List of unique labels for Loan_Status ::: {'N', 'Y'}


### Encode categorical fields
We have a lot of `string` labels that we encounter in `Gender`, `Married`, `Education`, `Self_Employed` & `Property_Area` columns.

In [9]:
# create dictionaries to map fields to numeric values
gender_values = {'Female' : 0, 'Male' : 1} 
married_values = {'No' : 0, 'Yes' : 1}
education_values = {'Graduate' : 0, 'Not Graduate' : 1}
employed_values = {'No' : 0, 'Yes' : 1}
dependent_values = {'3+': 3, '0': 0, '2': 2, '1': 1}
target_values = {'Y':1, 'N':0}

# replace values in each column according to the dictionaries above
data.replace({'Gender': gender_values, 'Married': married_values, 'Education': education_values, \
                'Self_Employed': employed_values, 'Dependents': dependent_values, 'Loan_Status': target_values}, inplace=True)

# Get dummy variables for nominal property column
clean_data = pd.get_dummies(data, columns=["Property_Area"])

clean_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
0,LP001002,1,0,0,0,0,5849,0.0,,360.0,1.0,1,0,0,1
1,LP001003,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,1,0,0
2,LP001005,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,1,0,0,1
3,LP001006,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,1,0,0,1
4,LP001008,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,1,0,0,1


### Train/Test Split

In [10]:
# store the target variable in y and everything else goes in X
y = clean_data['Loan_Status']

# we're also dropping Load_ID because it adds no value to the prediction
X = clean_data.drop(['Loan_ID', 'Loan_Status'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=3)

X_train.shape

(460, 13)

### Fill the nulls for the continuous variables with the mean values of that column

In [11]:
X_train['Loan_Amount_Term'] = X_train['Loan_Amount_Term'].fillna(X_train['Loan_Amount_Term'].mean())
X_train['LoanAmount'] = X_train['LoanAmount'].fillna(X_train['LoanAmount'].mean())
X_test['Loan_Amount_Term'] = X_test['Loan_Amount_Term'].fillna(X_test['Loan_Amount_Term'].mean())
X_test['LoanAmount'] = X_test['LoanAmount'].fillna(X_test['LoanAmount'].mean())


# View the datatypes of all columns
X_train.dtypes

Gender                       int64
Married                      int64
Dependents                   int64
Education                    int64
Self_Employed                int64
ApplicantIncome              int64
CoapplicantIncome          float64
LoanAmount                 float64
Loan_Amount_Term           float64
Credit_History             float64
Property_Area_Rural          uint8
Property_Area_Semiurban      uint8
Property_Area_Urban          uint8
dtype: object

### Confirm that we no longer have any nulls

In [12]:
X_train.isnull().sum()

Gender                     0
Married                    0
Dependents                 0
Education                  0
Self_Employed              0
ApplicantIncome            0
CoapplicantIncome          0
LoanAmount                 0
Loan_Amount_Term           0
Credit_History             0
Property_Area_Rural        0
Property_Area_Semiurban    0
Property_Area_Urban        0
dtype: int64

### Create the classifier model and the parameter grid for GridSearch

In [13]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()

param_grid = {"n_estimators" : [10, 20, 50, 100],
             "max_depth" : [None, 6, 8, 10],
             "max_leaf_nodes": [None, 5, 10, 20], 
             "min_impurity_split": [0.1, 0.2, 0.3, 0.4]}

### Initialize the GridSearch to tune my hyperparameters

In [14]:
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(model, param_grid=param_grid, cv=3, verbose=2)

### Execute the tuning

In [15]:
grid.fit(X_train, y_train)

Fitting 3 folds for each of 256 candidates, totalling 768 fits
[CV] max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=10 
[CV]  max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=10, total=   0.0s
[CV] max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=10 
[CV]  max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=10, total=   0.0s
[CV] max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=10 
[CV]  max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=10, total=   0.0s
[CV] max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=20 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV]  max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=20, total=   0.1s
[CV] max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=20 
[CV]  max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=20, total=   0.1s
[CV] max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=20 
[CV]  max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=20, total=   0.1s
[CV] max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=50 
[CV]  max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=50, total=   0.1s
[CV] max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=50 
[CV]  max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=50, total=   0.1s
[CV] max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=50 
[CV]  max_depth=None, max_leaf_nodes=None, min_impurity_split=0.1, n_estimators=50, total=   0.

[Parallel(n_jobs=1)]: Done 768 out of 768 | elapsed:  1.4min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

### View the results

In [16]:
print(f"Best parameters: {grid.best_params_}")
print(f"Test set score: {grid.score(X_test, y_test)}")

Best parameters: {'max_depth': 6, 'max_leaf_nodes': 10, 'min_impurity_split': 0.3, 'n_estimators': 10}
Test set score: 0.8506493506493507


# Saving the model

In [17]:
import pickle

with open('RF_model.pkl','wb') as f:
    pickle.dump(grid, f)

### Loading the saved model

In [18]:
with open('RF_model.pkl', 'rb') as f:
    model = pickle.load(f)

### Test the saved model

In [19]:
print(f"Test set score: {model.score(X_test, y_test)}")

Test set score: 0.8506493506493507


In [20]:
model

GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando