In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix


In [4]:
# Locations of the two loan datasets, relative to the notebook's working directory.
TRAIN_CSV = Path('Resources/2019loans.csv')
TEST_CSV = Path('Resources/2020Q1loans.csv')

# The 2019 file is used for training and the 2020 Q1 file for testing.
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

In [5]:
# Training data: pull the "target" label out of the frame, then one-hot encode
# whatever categorical feature columns remain.

y_train = train_df["target"]
X_train = pd.get_dummies(train_df.drop(columns=["target"]))
X_train.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,7000.0,0.1894,256.38,75000.0,28.62,0.0,2.0,20.0,0.0,40414.0,...,0,1,1,0,1,0,1,0,1,0
1,40000.0,0.1614,975.71,102000.0,11.72,2.0,0.0,10.0,0.0,43531.0,...,0,1,0,1,1,0,1,0,1,0
2,11000.0,0.2055,294.81,45000.0,37.25,1.0,3.0,23.0,0.0,8242.0,...,1,1,0,1,1,0,1,0,1,0
3,4000.0,0.1612,140.87,38000.0,42.89,1.0,0.0,7.0,0.0,12767.0,...,0,1,0,1,0,1,1,0,1,0
4,14000.0,0.1797,505.93,43000.0,22.16,1.0,0.0,22.0,0.0,11182.0,...,0,1,0,1,1,0,1,0,1,0


In [6]:
# Testing data: pull the "target" label out of the frame, then one-hot encode
# whatever categorical feature columns remain.
# get_dummies only creates columns for category values present in this frame,
# so the resulting columns can differ from those of the training features.

y_test = test_df["target"]
X_test = pd.get_dummies(test_df.drop(columns=["target"]))
X_test.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,40000.0,0.1033,856.4,128700.0,12.47,0.0,1.0,8.0,0.0,38113.0,...,1,0,1,0,1,1,0,0,1,1
1,24450.0,0.143,572.72,44574.0,15.05,0.0,1.0,6.0,0.0,1665.0,...,0,0,1,0,1,1,0,1,0,1
2,13500.0,0.143,316.23,60000.0,28.72,0.0,0.0,8.0,0.0,13857.0,...,0,0,1,0,1,1,0,0,1,1
3,10625.0,0.1774,268.31,60000.0,15.7,0.0,4.0,17.0,0.0,6216.0,...,0,1,1,0,1,1,0,1,0,1
4,6375.0,0.1862,232.46,60000.0,35.5,0.0,0.0,13.0,0.0,12681.0,...,1,0,1,0,1,1,0,1,0,1


In [7]:
# Align the testing features with the training features.
#
# pd.get_dummies is applied to each dataset independently, so a category value
# that appears in only one of them yields a dummy column in only that frame.
# The models are fitted on X_train, so X_test must expose exactly the same
# columns in exactly the same order.

missing_in_test = [column for column in X_train.columns if column not in X_test.columns]
for column in missing_in_test:
    print(f'adding column {column}')

extra_in_test = [column for column in X_test.columns if column not in X_train.columns]
for column in extra_in_test:
    print(f'dropping column {column}')

# reindex fills the missing dummies with 0, discards columns the models never
# saw during training, and puts the columns in X_train's order in one step.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

adding column debt_settlement_flag_Y


In [8]:
# Baseline: Logistic Regression fitted on the raw (unscaled) features.
# NOTE: the recorded output for this cell shows lbfgs stopping at the
# iteration limit, so the scores below come from a fit that did not converge.

logisticRegr = LogisticRegression(solver='lbfgs', max_iter=200, random_state=0)
logisticRegr.fit(X_train, y_train)

# score() reports mean accuracy on the given features/labels.
print('Logistic Regression train score: ', logisticRegr.score(X_train, y_train))
print('Logistic Regression test score: ', logisticRegr.score(X_test, y_test))

Logistic Regression train score:  0.666256157635468
Logistic Regression test score:  0.5289238621863037


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
# Baseline: Random Forest fitted on the raw (unscaled) features.
# fit() returns the estimator itself, so construction and training are chained.

randomForestClass = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# score() reports mean accuracy on the given features/labels.
print("RandomForestClassifier train score: ", randomForestClass.score(X_train, y_train))
print("RandomForestClassifier test score: ", randomForestClass.score(X_test, y_test))


RandomForestClassifier train score:  1.0
RandomForestClassifier test score:  0.6333475116971502


In [10]:
# Standardize the features.
# The scaler is fitted on the training features only and then applied to both
# sets, so the testing features are transformed with the training statistics.

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Logistic Regression fitted on the standardized features.
# NOTE: the recorded output for this cell still shows lbfgs stopping at the
# iteration limit, so the scores below come from a fit that did not converge.

logisticRegr_scaled = LogisticRegression(solver='lbfgs', max_iter=200, random_state=0)
logisticRegr_scaled.fit(X_train_scaled, y_train)
model = logisticRegr_scaled  # fit() returns the estimator, so `model` is the same object

# score() reports mean accuracy on the given features/labels.
print("LogisticRegression scaled train score: ", logisticRegr_scaled.score(X_train_scaled, y_train))
print("LogisticRegression scaled test score: ", logisticRegr_scaled.score(X_test_scaled, y_test))

LogisticRegression scaled train score:  0.7111658456486043
LogisticRegression scaled test score:  0.7598894087622289


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
# Random Forest fitted on the standardized features.

randomForestClass_scaled = RandomForestClassifier(random_state=0)
randomForestClass_scaled.fit(X_train_scaled, y_train)
model = randomForestClass_scaled  # fit() returns the estimator, so `model` is the same object

# score() reports mean accuracy on the given features/labels.
print("RandomForestClassifier scaled train score: ", randomForestClass_scaled.score(X_train_scaled, y_train))
print("RandomForestClassifier scaled test score: ", randomForestClass_scaled.score(X_test_scaled, y_test))


RandomForestClassifier scaled train score:  1.0
RandomForestClassifier scaled test score:  0.6344108889834114


In [13]:
# Confusion matrix for the Random Forest trained on the scaled features
# (rows are the true labels, columns are the predicted labels).
# The estimator is referenced by its own name rather than through the generic
# `model` variable: `model` is rebound by more than one training cell, so using
# it here would evaluate whichever classifier happened to be fitted last.
predictions = randomForestClass_scaled.predict(X_test_scaled)
confusion_matrix(y_test, predictions)

array([[1947,  404],
       [1315, 1036]], dtype=int64)