In [1]:
# Import Modules
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

import hvplot.pandas

# Needed for decision tree visualization
import pydotplus
from IPython.display import Image

In [2]:
train_df=pd.read_csv('./train.csv')

In [3]:
train_df.describe()

Unnamed: 0,UNIQUEID,DISBURSED_AMOUNT,ASSET_COST,LTV,BRANCH_ID,SUPPLIER_ID,MANUFACTURER_ID,CURRENT_PINCODE_ID,STATE_ID,EMPLOYEE_CODE_ID,...,SEC_OVERDUE_ACCTS,SEC_CURRENT_BALANCE,SEC_SANCTIONED_AMOUNT,SEC_DISBURSED_AMOUNT,PRIMARY_INSTAL_AMT,SEC_INSTAL_AMT,NEW_ACCTS_IN_LAST_SIX_MONTHS,DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS,NO_OF_INQUIRIES,LOAN_DEFAULT
count,233154.0,233154.0,233154.0,233154.0,233154.0,233154.0,233154.0,233154.0,233154.0,233154.0,...,233154.0,233154.0,233154.0,233154.0,233154.0,233154.0,233154.0,233154.0,233154.0,233154.0
mean,535917.573376,54356.993528,75865.07,74.74653,72.936094,19638.635035,69.028054,3396.880247,7.262243,1549.477148,...,0.007244,5427.793,7295.923,7179.998,13105.48,323.2684,0.381833,0.097481,0.206615,0.217071
std,68315.693711,12971.314171,18944.78,11.456636,69.834995,3491.949566,22.141304,2238.147502,4.48223,975.261278,...,0.111079,170237.0,183156.0,182592.5,151367.9,15553.69,0.955107,0.384439,0.706498,0.412252
min,417428.0,13320.0,37000.0,10.03,1.0,10524.0,45.0,1.0,1.0,1.0,...,0.0,-574647.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,476786.25,47145.0,65717.0,68.88,14.0,16535.0,48.0,1511.0,4.0,713.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,535978.5,53803.0,70946.0,76.8,61.0,20333.0,86.0,2970.0,6.0,1451.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,595039.75,60413.0,79201.75,83.67,130.0,23000.0,86.0,5677.0,10.0,2362.0,...,0.0,0.0,0.0,0.0,1999.0,0.0,0.0,0.0,0.0,0.0
max,671084.0,990572.0,1628992.0,95.0,261.0,24803.0,156.0,7345.0,22.0,3795.0,...,8.0,36032850.0,30000000.0,30000000.0,25642810.0,4170901.0,35.0,20.0,36.0,1.0


In [4]:
#CHANGING DATE COLUMNS TO A USABLE FORMAT
train_df['DATE_OF_BIRTH'] = pd.to_datetime(train_df['DATE_OF_BIRTH'], format='%d-%m-%Y')
train_df['DISBURSAL_DATE'] = pd.to_datetime(train_df['DISBURSAL_DATE'], format='%d-%m-%Y')
train_df['AGE_AT_DISBURSAL']= pd.to_numeric(train_df['DISBURSAL_DATE']-train_df['DATE_OF_BIRTH'])/(1e9*60*60*24*365)
train_df[train_df['DISBURSED_AMOUNT']==990572.000000]

Unnamed: 0,UNIQUEID,DISBURSED_AMOUNT,ASSET_COST,LTV,BRANCH_ID,SUPPLIER_ID,MANUFACTURER_ID,CURRENT_PINCODE_ID,DATE_OF_BIRTH,EMPLOYMENT_TYPE,...,SEC_DISBURSED_AMOUNT,PRIMARY_INSTAL_AMT,SEC_INSTAL_AMT,NEW_ACCTS_IN_LAST_SIX_MONTHS,DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS,AVERAGE_ACCT_AGE,CREDIT_HISTORY_LENGTH,NO_OF_INQUIRIES,LOAN_DEFAULT,AGE_AT_DISBURSAL
210252,440173,990572,1628992,61.39,138,24062,152,3368,1990-06-16,,...,0,6224,0,0,0,5yrs 10mon,5yrs 10mon,0,0,28.186301


In [5]:
pd.DataFrame(train_df['DISBURSAL_DATE']).hvplot.hist()

In [6]:
defaults=train_df[train_df['LOAN_DEFAULT']==1]
defaults['PRI_CURRENT_BALANCE'].mean()

116892.87601509553

In [7]:
#DROPPING EXTRA COLUMNS FROM CH
train_df = train_df.drop(['DATE_OF_BIRTH', 'DISBURSAL_DATE'], axis=1)

In [8]:
#CHANGING THE FORMAT OF CREDIT HISTORY LENGTH INTO A USABLE FORMAT. CHANGING FROM 0 YRS 0 MON TO A SINGLE NUMERICAL
#NUMBER THAT REPRESENTS THE TOTAL AMOUNT OF MONTHS 

train_df['CH_Years'] = train_df['CREDIT_HISTORY_LENGTH'].str.extract(r'(\d+)yrs').astype(int)
train_df['CH_Months'] = train_df['CREDIT_HISTORY_LENGTH'].str.extract(r'(\d+)mon').astype(int)
train_df['Credit_History_Total_Months'] = train_df['CH_Years'] * 12 + train_df['CH_Months']


In [9]:
#DROPPING EXTRA COLUMNS FROM CH
train_df = train_df.drop(['CH_Years', 'CH_Months'], axis=1)

In [10]:
#CHANGING FORMAT FOR AVERAGE AGE INTO A USUABLE FORMAT.
train_df['Acc_Years'] = train_df['AVERAGE_ACCT_AGE'].str.extract(r'(\d+)yrs').astype(int)
train_df['Acc_Months'] = train_df['AVERAGE_ACCT_AGE'].str.extract(r'(\d+)mon').astype(int)
train_df['Average_Acc_Age_Total_Months'] = train_df['Acc_Years'] * 12 + train_df['Acc_Months']
#DROPPING EXTRA COLUMNS FROM ACC
train_df = train_df.drop(['Acc_Years', 'Acc_Months'], axis=1)
#DROPPING ORIGINAL AVERAGE_ACCT_AGE AND CREDIT_HISTORY_LENGTH
train_df = train_df.drop(['AVERAGE_ACCT_AGE', 'CREDIT_HISTORY_LENGTH'], axis=1)

In [11]:
X=train_df.drop(['LOAN_DEFAULT','PERFORM_CNS_SCORE_DESCRIPTION','CURRENT_PINCODE_ID','SEC_NO_OF_ACCTS',
       'SEC_ACTIVE_ACCTS', 'SEC_OVERDUE_ACCTS', 'SEC_CURRENT_BALANCE',
       'SEC_SANCTIONED_AMOUNT', 'SEC_DISBURSED_AMOUNT', 'PRIMARY_INSTAL_AMT',
       'SEC_INSTAL_AMT', 'PRI_DISBURSED_AMOUNT'],axis=1)
y=train_df['LOAN_DEFAULT']

In [12]:
X.columns

Index(['UNIQUEID', 'DISBURSED_AMOUNT', 'ASSET_COST', 'LTV', 'BRANCH_ID',
       'SUPPLIER_ID', 'MANUFACTURER_ID', 'EMPLOYMENT_TYPE', 'STATE_ID',
       'EMPLOYEE_CODE_ID', 'MOBILENO_AVL_FLAG', 'AADHAR_FLAG', 'PAN_FLAG',
       'VOTERID_FLAG', 'DRIVING_FLAG', 'PASSPORT_FLAG', 'PERFORM_CNS_SCORE',
       'PRI_NO_OF_ACCTS', 'PRI_ACTIVE_ACCTS', 'PRI_OVERDUE_ACCTS',
       'PRI_CURRENT_BALANCE', 'PRI_SANCTIONED_AMOUNT',
       'NEW_ACCTS_IN_LAST_SIX_MONTHS', 'DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS',
       'NO_OF_INQUIRIES', 'AGE_AT_DISBURSAL', 'Credit_History_Total_Months',
       'Average_Acc_Age_Total_Months'],
      dtype='object')

In [13]:
y.describe()

count    233154.000000
mean          0.217071
std           0.412252
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: LOAN_DEFAULT, dtype: float64

In [14]:
X = pd.get_dummies(X)

X.describe()

Unnamed: 0,UNIQUEID,DISBURSED_AMOUNT,ASSET_COST,LTV,BRANCH_ID,SUPPLIER_ID,MANUFACTURER_ID,STATE_ID,EMPLOYEE_CODE_ID,MOBILENO_AVL_FLAG,...,PRI_ACTIVE_ACCTS,PRI_OVERDUE_ACCTS,PRI_CURRENT_BALANCE,PRI_SANCTIONED_AMOUNT,NEW_ACCTS_IN_LAST_SIX_MONTHS,DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS,NO_OF_INQUIRIES,AGE_AT_DISBURSAL,Credit_History_Total_Months,Average_Acc_Age_Total_Months
count,233154.0,233154.0,233154.0,233154.0,233154.0,233154.0,233154.0,233154.0,233154.0,233154.0,...,233154.0,233154.0,233154.0,233154.0,233154.0,233154.0,233154.0,233154.0,233154.0,233154.0
mean,535917.573376,54356.993528,75865.07,74.74653,72.936094,19638.635035,69.028054,7.262243,1549.477148,1.0,...,1.039896,0.156549,165900.1,218503.9,0.381833,0.097481,0.206615,34.493769,16.252404,8.915764
std,68315.693711,12971.314171,18944.78,11.456636,69.834995,3491.949566,22.141304,4.48223,975.261278,0.0,...,1.941496,0.548787,942273.6,2374794.0,0.955107,0.384439,0.706498,9.831424,28.581255,15.106416
min,417428.0,13320.0,37000.0,10.03,1.0,10524.0,45.0,1.0,1.0,1.0,...,0.0,0.0,-6678296.0,0.0,0.0,0.0,0.0,18.016438,0.0,0.0
25%,476786.25,47145.0,65717.0,68.88,14.0,16535.0,48.0,4.0,713.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.372603,0.0,0.0
50%,535978.5,53803.0,70946.0,76.8,61.0,20333.0,86.0,6.0,1451.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32.69589,0.0,0.0
75%,595039.75,60413.0,79201.75,83.67,130.0,23000.0,86.0,10.0,2362.0,1.0,...,1.0,0.0,35006.5,62500.0,0.0,0.0,0.0,41.427397,24.0,13.0
max,671084.0,990572.0,1628992.0,95.0,261.0,24803.0,156.0,22.0,3795.0,1.0,...,144.0,25.0,96524920.0,1000000000.0,35.0,20.0,36.0,69.172603,468.0,369.0


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y)
y_train.value_counts()

LOAN_DEFAULT
0    137145
1     37720
Name: count, dtype: int64

In [16]:
# Creating StandardScaler instance
scaler = StandardScaler()
# Fitting Standard Scaler
X_scaler = scaler.fit(X_train)
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [17]:
# borrowed some code from: https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

base_model = DecisionTreeClassifier()
base_model.fit(X_train_scaled, y_train)
base_accuracy = evaluate(base_model, X_test_scaled, y_test)
# Making predictions using the testing data
predictions = base_model.predict(X_test_scaled)
# Displaying classification report
print(classification_report(y_test, predictions))

from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search
param_grid = {'criterion':['gini','entropy'],'max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150]}
# Create a based model
rf = DecisionTreeClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
                          cv = 3, n_jobs = -1, verbose = 3, scoring = 'recall')
grid_search = grid_search.fit(X_train_scaled, y_train)
print(grid_search.best_params_)
best_grid = grid_search.best_estimator_

predictions = best_grid.predict(X_test_scaled)
print(classification_report(y_test, predictions))

Model Performance
Average Error: 0.3341 degrees.
Accuracy = -inf%.
              precision    recall  f1-score   support

           0       0.79      0.78      0.78     45398
           1       0.26      0.28      0.27     12891

    accuracy                           0.67     58289
   macro avg       0.53      0.53      0.53     58289
weighted avg       0.67      0.67      0.67     58289

Fitting 3 folds for each of 36 candidates, totalling 108 fits
{'criterion': 'gini', 'max_depth': 40}
              precision    recall  f1-score   support

           0       0.79      0.78      0.78     45398
           1       0.26      0.28      0.27     12891

    accuracy                           0.67     58289
   macro avg       0.53      0.53      0.53     58289
weighted avg       0.67      0.67      0.67     58289



In [19]:
best_grid.feature_importances_

array([0.11957724, 0.08269672, 0.08485975, 0.10791239, 0.04661274,
       0.07787964, 0.01912434, 0.03187859, 0.08835301, 0.        ,
       0.00495288, 0.00553765, 0.00631055, 0.00286477, 0.00030075,
       0.03435314, 0.01681421, 0.00864918, 0.00520425, 0.0249618 ,
       0.02507852, 0.00699632, 0.00569117, 0.01052657, 0.11887216,
       0.02684385, 0.02271178, 0.00635711, 0.00807892])

In [20]:
# Create DOT data
#dot_data = tree.export_graphviz(model, out_file=None, feature_names=X.columns, class_names=['0', '1'], filled=False)

# Draw graph
#graph = pydotplus.graph_from_dot_data(dot_data)

# Show graph
#Image(graph.create_png())

In [21]:
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)
rf_model = rf_model.fit(X_train_scaled, y_train)
predictions = rf_model.predict(X_test_scaled)

In [22]:
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [23]:
print(cm_df)
print(acc_score)

          Predicted 0  Predicted 1
Actual 0        44982          416
Actual 1        12521          370
0.7780541783183791
