In [26]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the two source CSV files into train_df and test_df.

train_df = pd.read_csv(filepath_or_buffer=Path('Resources/2019loans.csv'))
test_df = pd.read_csv(filepath_or_buffer=Path('Resources/2020Q1loans.csv'))

In [3]:
# Convert categorical data to numeric and separate target feature for training data

In [4]:
# Tally how often each debt_settlement_flag value occurs in the training data.
train_df.loc[:, 'debt_settlement_flag'].value_counts()

N    12175
Y        5
Name: debt_settlement_flag, dtype: int64

In [5]:
# Work on an independent (deep) copy so train_df itself stays untouched.
copy_train_df = train_df.copy(deep=True)

In [6]:
# Build the training feature frame by removing the target (`loan_status`) and
# the two row-identifier columns. Deriving it from `train_df` (which `drop`
# leaves unmodified) rather than from `copy_train_df` itself keeps this cell
# idempotent: re-running it no longer raises KeyError for already-dropped columns.
copy_train_df = train_df.drop(columns=['Unnamed: 0', 'loan_status', 'index'])

In [7]:
# One-hot encode the categorical columns of the training features.
X_train = pd.get_dummies(data=copy_train_df)

In [8]:
# Convert categorical data to numeric and separate target feature for testing data

In [9]:
# Tally how often each debt_settlement_flag value occurs in the testing data.
test_df.loc[:, 'debt_settlement_flag'].value_counts()

N    4702
Name: debt_settlement_flag, dtype: int64

In [10]:
# Work on an independent (deep) copy so test_df itself stays untouched.
copy_test_df = test_df.copy(deep=True)

In [12]:
 # Preview the first five rows of the copied testing frame.
 copy_test_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,67991,40000.0,0.0819,814.7,MORTGAGE,140000.0,Not Verified,low_risk,n,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,25429,25429,6000.0,0.1524,208.7,RENT,55000.0,Not Verified,low_risk,n,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,38496,38496,3600.0,0.1695,128.27,RENT,42000.0,Not Verified,low_risk,n,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,19667,19667,20000.0,0.1524,478.33,RENT,100000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,37505,37505,3600.0,0.124,120.27,RENT,50000.0,Not Verified,low_risk,n,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N


In [13]:
# Build the testing feature frame by removing the target (`loan_status`) and
# the two row-identifier columns. Deriving it from `test_df` (which `drop`
# leaves unmodified) rather than from `copy_test_df` itself keeps this cell
# idempotent: re-running it no longer raises KeyError for already-dropped columns.
copy_test_df = test_df.drop(columns=['loan_status', 'index', 'Unnamed: 0'])

In [14]:
# One-hot encode the categorical columns of the testing features.
X_test = pd.get_dummies(data=copy_test_df)

In [15]:
# add missing dummy variables to testing set

In [16]:
# Align the testing design matrix with the training one instead of hard-coding
# a single column. Any column present in X_train but absent from X_test (here
# `debt_settlement_flag_Y`, because the testing data contains no 'Y' rows) is
# created and filled with 0, columns are put in X_train's order, and any column
# that exists only in X_test is dropped so both frames expose the same features.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [17]:
# Per-column check: does any testing feature contain a missing value?
X_test.isna().any()

loan_amnt                     False
int_rate                      False
installment                   False
annual_inc                    False
dti                           False
                              ...  
application_type_Joint App    False
hardship_flag_N               False
hardship_flag_Y               False
debt_settlement_flag_N        False
debt_settlement_flag_Y        False
Length: 92, dtype: bool

In [18]:
# Extract the target column (loan_status) from the original frames into y_train / y_test

In [19]:
# Training target: the loan_status column of the original training frame.
y_train = train_df.loc[:, 'loan_status']

In [23]:
# Testing target: the loan_status column of the original testing frame.
y_test = test_df.loc[:, 'loan_status']

In [24]:
# Train the Logistic Regression model on the unscaled data and print the model score

In [25]:
# Logistic regression fitted on the unscaled features.
# NOTE(review): max_iter is very large, presumably so lbfgs can converge on
# unscaled inputs — confirm it is still needed.
classifier = LogisticRegression(solver='lbfgs', max_iter=200000).fit(X_train, y_train)
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Testing Data Score: 0.568481497235219


In [None]:
# Train a Random Forest Classifier model on the unscaled data and print the model score

In [27]:
# Random forest fitted on the unscaled features; random_state is fixed at 1.
clf = RandomForestClassifier(random_state=1)
clf.fit(X_train, y_train)
print(f"Testing Score:  {clf.score(X_test, y_test)}")

Testing Score:  0.6544023819651212


In [None]:
# Scale the data

In [28]:
# Fit the scaler on the training features only, then apply the same
# transformation to both the training and the testing features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Train the Logistic Regression model on the scaled data and print the model score

In [29]:
# Logistic regression (same settings as the unscaled run) fitted on the
# scaled features, scored on the scaled testing features.
classifier_scaled = LogisticRegression(solver='lbfgs', max_iter=200000).fit(
    X_train_scaled, y_train
)
print(f"Testing Data Score: {classifier_scaled.score(X_test_scaled, y_test)}")

Testing Data Score: 0.7681837515950659


In [None]:
# Train a Random Forest Classifier model on the scaled data and print the model score

In [30]:
# Random forest (random_state fixed at 1) fitted on the scaled features,
# scored on the scaled testing features.
clf_scaled = RandomForestClassifier(random_state=1)
clf_scaled.fit(X_train_scaled, y_train)
print(f"Testing Score:  {clf_scaled.score(X_test_scaled, y_test)}")

Testing Score:  0.6548277328796257
