In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
train = pd.read_csv(Path('Resources/2019loans.csv'))
test = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [3]:
#train

In [4]:
del train['Unnamed: 0']

In [5]:
# Convert categorical data to numeric and separate target feature for training data
X_train= train.drop('loan_status', axis=1)

In [6]:
X_train_dummies = pd.get_dummies(X_train)
print(X_train_dummies.columns)
X_train_dummies

Index(['index', 'loan_amnt', 'int_rate', 'installment', 'annual_inc', 'dti',
       'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal',
       'total_acc', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_amnt', 'collections_12_mths_ex_med', 'policy_code',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
       'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
       'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
       'mort_acc', 'mths_since_recent_bc', 'mths_since_

Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,57107,13375.0,0.1797,483.34,223000.0,29.99,0.0,0.0,15.0,0.0,...,0,1,0,1,1,0,1,0,1,0
1,141451,21000.0,0.1308,478.68,123000.0,11.26,2.0,0.0,16.0,0.0,...,0,1,0,1,1,0,1,0,1,0
2,321143,20000.0,0.1240,448.95,197000.0,11.28,0.0,0.0,12.0,0.0,...,0,1,0,1,1,0,1,0,1,0
3,11778,3000.0,0.1240,100.22,45000.0,18.08,0.0,0.0,12.0,1.0,...,0,1,0,1,1,0,1,0,1,0
4,169382,30000.0,0.1612,1056.49,133000.0,27.77,0.0,2.0,13.0,0.0,...,0,1,0,1,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12175,354912,19975.0,0.2565,801.09,28000.0,28.42,0.0,0.0,15.0,0.0,...,0,1,0,1,1,0,1,0,1,0
12176,354944,15000.0,0.1774,540.34,50000.0,23.43,4.0,0.0,16.0,0.0,...,1,1,0,1,1,0,1,0,1,0
12177,354973,3600.0,0.1862,131.28,60000.0,28.80,0.0,1.0,14.0,0.0,...,0,1,0,1,1,0,1,0,1,0
12178,355002,15000.0,0.0881,475.68,62000.0,11.44,0.0,0.0,5.0,0.0,...,0,1,0,1,0,1,1,0,1,0


In [7]:
train['debt_settlement_flag'].value_counts()

N    12175
Y        5
Name: debt_settlement_flag, dtype: int64

In [8]:
y_train_label = LabelEncoder().fit_transform(train['loan_status'])
y_train_label

array([1, 1, 1, ..., 0, 0, 0])

In [9]:
del test['Unnamed: 0']

In [10]:
test['debt_settlement_flag'].value_counts()

N    4702
Name: debt_settlement_flag, dtype: int64

In [11]:
#test

In [12]:
# Convert categorical data to numeric and separate target feature for testing data
X_test = test.drop('loan_status', axis=1)

In [13]:
X_test_dummies = pd.get_dummies(X_test)
print(X_test_dummies.columns)
X_test_dummies

Index(['index', 'loan_amnt', 'int_rate', 'installment', 'annual_inc', 'dti',
       'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal',
       'total_acc', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_amnt', 'collections_12_mths_ex_med', 'policy_code',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
       'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
       'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
       'mort_acc', 'mths_since_recent_bc', 'mths_since_

Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,67991,40000.0,0.0819,814.70,140000.0,19.75,0.0,1.0,18.0,0.0,...,0,0,1,0,1,1,0,1,0,1
1,25429,6000.0,0.1524,208.70,55000.0,11.52,2.0,0.0,8.0,0.0,...,0,0,1,0,1,1,0,1,0,1
2,38496,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,0.0,...,0,0,1,0,1,1,0,1,0,1
3,19667,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,0.0,...,0,0,1,0,1,1,0,1,0,1
4,37505,3600.0,0.1240,120.27,50000.0,16.08,0.0,3.0,6.0,0.0,...,0,0,1,0,1,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4697,77282,30000.0,0.1240,673.42,140480.0,15.74,0.0,0.0,20.0,0.0,...,1,0,1,1,0,1,0,1,0,1
4698,77291,24000.0,0.0756,747.22,50000.0,26.81,0.0,0.0,9.0,0.0,...,0,0,1,0,1,1,0,1,0,1
4699,77292,10000.0,0.2305,387.36,33000.0,38.51,0.0,2.0,7.0,0.0,...,0,1,1,1,0,1,0,1,0,1
4700,77297,8000.0,0.1862,205.86,38000.0,16.36,0.0,1.0,8.0,1.0,...,1,0,1,0,1,1,0,1,0,1


In [14]:
y_test_label = LabelEncoder().fit_transform(test['loan_status'])
y_test_label

array([1, 1, 1, ..., 0, 0, 0])

In [15]:
# add missing dummy variables to testing set
X_test_dummies = X_test_dummies.reindex(columns = X_train_dummies.columns, fill_value=0)
X_test_dummies

Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,67991,40000.0,0.0819,814.70,140000.0,19.75,0.0,1.0,18.0,0.0,...,0,1,0,1,1,0,1,0,1,0
1,25429,6000.0,0.1524,208.70,55000.0,11.52,2.0,0.0,8.0,0.0,...,0,1,0,1,1,0,1,0,1,0
2,38496,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,0.0,...,0,1,0,1,1,0,1,0,1,0
3,19667,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,0.0,...,0,1,0,1,1,0,1,0,1,0
4,37505,3600.0,0.1240,120.27,50000.0,16.08,0.0,3.0,6.0,0.0,...,0,1,0,1,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4697,77282,30000.0,0.1240,673.42,140480.0,15.74,0.0,0.0,20.0,0.0,...,0,1,1,0,1,0,1,0,1,0
4698,77291,24000.0,0.0756,747.22,50000.0,26.81,0.0,0.0,9.0,0.0,...,0,1,0,1,1,0,1,0,1,0
4699,77292,10000.0,0.2305,387.36,33000.0,38.51,0.0,2.0,7.0,0.0,...,1,1,1,0,1,0,1,0,1,0
4700,77297,8000.0,0.1862,205.86,38000.0,16.36,0.0,1.0,8.0,1.0,...,0,1,0,1,1,0,1,0,1,0


## Prediction for unscaled data

#### As our data is categorical, then random forest should be our first choice.  The output of the random forest is the class selected 

#### by most trees. For regression tasks, the mean or average prediction of the individual trees is returned.Random forests generally

#### outperform decision trees. Random Forest Regression is a supervised learning algorithm that uses ensemble learning method for

#### regression. Ensemble learning method is a technique that combines predictions from multiple machine learning algorithms to make

#### a more accurate prediction than a single model. I predict, Random Forest Model with yield a better result compared to Logistic 

#### regression. The major limitation of Logistic Regression is the assumption of linearity between the dependent variable and the 

#### independent variables. It not only provides a measure of how appropriate a predictor(coefficient size)is, but also its 

#### direction of association (positive or negative).

### Logistic Regression model on the unscaled data

In [16]:
# Train the Logistic Regression model on the unscaled data and print the model score
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train_dummies, y_train_label)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [17]:
print(f"Training Data Score: {classifier.score(X_train_dummies, y_train_label)}")
print(f"Testing Data Score: {classifier.score(X_test_dummies, y_test_label)}")

Training Data Score: 0.6575533661740558
Testing Data Score: 0.5204168438962143


In [18]:
predictions = classifier.predict(X_test_dummies)
pd.DataFrame({"Prediction": predictions, "Actual": y_test_label})

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,1,1
3,0,1
4,1,1
...,...,...
4697,1,0
4698,1,0
4699,1,0
4700,1,0


In [19]:
from sklearn.metrics import confusion_matrix, classification_report

y_true = y_test_label
y_pred = classifier.predict(X_test_dummies)
confusion_matrix(y_true, y_pred)

array([[ 502, 1849],
       [ 406, 1945]], dtype=int64)

In [20]:
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
accuracy = (tp + tn) / (tp + fp + tn + fn) # (111 + 128) / (111 + 5 + 128 + 6)
print(f"Accuracy: {accuracy}")

Accuracy: 0.5204168438962143


In [21]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.55      0.21      0.31      2351
           1       0.51      0.83      0.63      2351

    accuracy                           0.52      4702
   macro avg       0.53      0.52      0.47      4702
weighted avg       0.53      0.52      0.47      4702



### Random Forest Classifier model  on the unscaled data

In [22]:
# Train a Random Forest Classifier model and print the model score
clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train_dummies, y_train_label)
print(f'Training Score: {clf.score(X_train_dummies, y_train_label)}')
print(f'Testing Score: {clf.score(X_test_dummies, y_test_label)}')

Training Score: 1.0
Testing Score: 0.6631220757124627


In [23]:
from sklearn.metrics import confusion_matrix, classification_report

y_true = y_test_label
y_pred = clf.predict(X_test_dummies)
confusion_matrix(y_true, y_pred)

array([[1067, 1284],
       [ 300, 2051]], dtype=int64)

In [24]:
pd.DataFrame({"Prediction": y_pred, "Actual": y_test_label})

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
...,...,...
4697,0,0
4698,0,0
4699,0,0
4700,1,0


In [25]:
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
accuracy = (tp + tn) / (tp + fp + tn + fn) # (111 + 128) / (111 + 5 + 128 + 6)
print(f"Accuracy: {accuracy}")

Accuracy: 0.6631220757124627


In [26]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.45      0.57      2351
           1       0.61      0.87      0.72      2351

    accuracy                           0.66      4702
   macro avg       0.70      0.66      0.65      4702
weighted avg       0.70      0.66      0.65      4702



### with 66% accuracy, Random Forest Model gave a better result than logistic regression with accuracy of 52% on scaled data

## Prediction for scaled data. 

####  Feature Scaling is required for correct prediction and results.  Regression Coefficients are directly influenced by scale of 

####  Features. The feature with high magnitude will weigh lot more than features having low magnitude even if they are more crucial 

####  in determining the output. Logistic Regression algorithms are very sensitive to the Feature Scaling, while “Tree-Based” 

#### Algorithm like Random Forest  Regression are insensitive to the Feature scaling. Having said that I predict Logistic Regression 

####  would give a better accuracy  on the scaled data. 



### Logistic Regression model on the scaled data

In [27]:
# Scale the data
# Train the Logistic Regression model on the scaled data and print the model score
scaler = StandardScaler().fit(X_train_dummies)
X_train_scaled = scaler.transform(X_train_dummies)
X_test_scaled = scaler.transform(X_test_dummies)
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train_label)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test_label)}")

Training Data Score: 0.6358784893267652
Testing Data Score: 0.5061675882603147


In [28]:
from sklearn.metrics import confusion_matrix, classification_report

y_true = y_test_label
y_pred = classifier.predict(X_test_scaled)
confusion_matrix(y_true, y_pred)

array([[ 525, 1826],
       [ 496, 1855]], dtype=int64)

In [29]:
pd.DataFrame({"Prediction": y_pred, "Actual": y_test_label})

Unnamed: 0,Prediction,Actual
0,0,1
1,1,1
2,1,1
3,0,1
4,1,1
...,...,...
4697,0,0
4698,1,0
4699,1,0
4700,1,0


In [30]:
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
accuracy = (tp + tn) / (tp + fp + tn + fn) # (111 + 128) / (111 + 5 + 128 + 6)
print(f"Accuracy: {accuracy}")

Accuracy: 0.5061675882603147


In [31]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.51      0.22      0.31      2351
           1       0.50      0.79      0.62      2351

    accuracy                           0.51      4702
   macro avg       0.51      0.51      0.46      4702
weighted avg       0.51      0.51      0.46      4702



### Random Forest Classifier model on the scaled data

In [32]:
# Train a Random Forest Classifier model on the scaled data and print the model score
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train_scaled, y_train_label)
print(f'Training Score: {clf.score(X_train_scaled, y_train_label)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test_label)}')

Training Score: 1.0
Testing Score: 0.6635474266269672


In [33]:
from sklearn.metrics import confusion_matrix, classification_report

y_true = y_test_label
y_pred = clf.predict(X_test_scaled)
confusion_matrix(y_true, y_pred)

array([[1069, 1282],
       [ 300, 2051]], dtype=int64)

In [34]:
pd.DataFrame({"Prediction": y_pred, "Actual": y_test_label})

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
...,...,...
4697,0,0
4698,0,0
4699,0,0
4700,1,0


In [35]:
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
accuracy = (tp + tn) / (tp + fp + tn + fn) # (111 + 128) / (111 + 5 + 128 + 6)
print(f"Accuracy: {accuracy}")

Accuracy: 0.6635474266269672


In [36]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.45      0.57      2351
           1       0.62      0.87      0.72      2351

    accuracy                           0.66      4702
   macro avg       0.70      0.66      0.65      4702
weighted avg       0.70      0.66      0.65      4702



### with the same 66% accuracy as unscaled result, Random forest Model gave a better result than Logistic regression model with accuracy of 50.6% on scaled data. Here we can see that Logistic regression model accuracy came down by 1.4% from unscaled result. Going back to my predictions, logistic regression being very sensitive to scaling, its accuracy score got affected with scaling the data. 