In [13]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, recall_score,precision_score
from sklearn.ensemble import RandomForestClassifier
import joblib
#%matplotlib inline

In [2]:
# Read the loan dataset (path is relative to the notebook's working directory)
# and preview the first five rows.
loans = pd.read_csv(filepath_or_buffer='loan_data.csv')
loans.head(n=5)

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [20]:
# Categorical columns that must be one-hot encoded before modelling.
cat_feats = ['purpose']

# Replace each categorical column with indicator columns; drop_first removes
# the first level so the indicators are not perfectly collinear.
final_data = pd.get_dummies(loans, columns=cat_feats, drop_first=True)

# Target is the 'not.fully.paid' flag; every remaining column is a predictor.
y = final_data['not.fully.paid']
X = final_data.drop(columns='not.fully.paid')

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
# NOTE(review): the split is not stratified on y — if the target is imbalanced,
# consider passing stratify=y; confirm against the class counts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# Number of features tried at each tree split: floor(sqrt(number of predictors)).
m_features = int(np.sqrt(X.shape[1]))

# Fit the random forest on the training split (fit() returns the estimator).
rfc = RandomForestClassifier(
    n_estimators=100,
    max_features=m_features,
    random_state=42,
).fit(X_train, y_train)

# Predicted labels for both splits. Despite the X_ prefix these hold predicted
# target values; the names are kept because later cells reference them.
X_train_pred = rfc.predict(X_train)
X_test_pred = rfc.predict(X_test)

In [None]:
# Print the confusion matrix and per-class report for each split.
# Each section is labelled with its split so train and test output can be told
# apart, and the matrix/report are printed on their own so that print()'s
# separator space does not shift their first row out of alignment.
for split_name, y_true, y_pred in (
    ('train', y_train, X_train_pred),
    ('test', y_test, X_test_pred),
):
    print(f'RFC ({split_name}) confusion matrix:')
    print(confusion_matrix(y_true, y_pred))
    print(f'RFC ({split_name}) classification report:')
    print(classification_report(y_true, y_pred))
    print()

In [21]:
# Score the predictions on both splits. With their default arguments these
# scorers evaluate the positive label (1, i.e. loans that were not fully paid).

# Held-out test split
test_f1_score = f1_score(y_test, X_test_pred)
test_precision = precision_score(y_test, X_test_pred)
test_recall = recall_score(y_test, X_test_pred)

# Training split (same metrics, used to gauge the train/test gap)
train_f1_score = f1_score(y_train, X_train_pred)
train_precision = precision_score(y_train, X_train_pred)
train_recall = recall_score(y_train, X_train_pred)

# Side-by-side evaluation table; the value lists follow the order of 'Metric'.
performance_table = pd.DataFrame({
    'Metric': ['F1_score', 'Precision', 'Recall'],
    'training data': [train_f1_score, train_precision, train_recall],
    'test data': [test_f1_score, test_precision, test_recall],
})

print(performance_table)

     Metric  training data  test data
0  F1_score            1.0   0.050000
1  Precison            1.0   0.533333
2    Recall            1.0   0.026230


# Analysis

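The results show **overfitting**: every training metric is a perfect 1.0, while on the test data recall is only about 0.03 and the F1 score about 0.05.
That means the model learns the specifics of the training data very well but does not generalize to new data.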


In [23]:
# Importance score of each predictor, as reported by the fitted forest.
importance = rfc.feature_importances_

# Pair every column name with its score and rank from most to least important,
# renumbering the rows so the index reflects the rank.
importance_df = (
    pd.DataFrame({'Variable': X.columns, 'Importance': importance})
    .sort_values(by='Importance', ascending=False)
    .reset_index(drop=True)
)

In [24]:
# Display the feature-importance table (sorted by descending importance).
importance_df

Unnamed: 0,Variable,Importance
0,installment,0.116039
1,days.with.cr.line,0.114783
2,revol.util,0.112184
3,revol.bal,0.112074
4,log.annual.inc,0.110376
5,dti,0.110292
6,int.rate,0.102555
7,fico,0.07873
8,inq.last.6mths,0.053876
9,purpose_debt_consolidation,0.015534


In [26]:
# Persist the fitted forest to disk with joblib so it can be reloaded later.
# NOTE(review): the recorded output of this cell lists
# 'random_forest_classifier_model.pkl', which differs from the filename used
# here — the output looks stale; re-run the cell to refresh it.
joblib.dump(value=rfc, filename='random_forest_model.pkl')

['random_forest_classifier_model.pkl']