In [19]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler

In [2]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [3]:
train_df['debt_settlement_flag'].value_counts()

N    12175
Y        5
Name: debt_settlement_flag, dtype: int64

In [4]:
test_df['debt_settlement_flag'].value_counts()

N    4702
Name: debt_settlement_flag, dtype: int64

In [5]:
dummy1=pd.get_dummies(train_df)

In [6]:
dummy1.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,loan_status_low_risk,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,57107,57107,13375.0,0.1797,483.34,223000.0,29.99,0.0,0.0,15.0,...,1,1,0,1,1,0,1,0,1,0
1,141451,141451,21000.0,0.1308,478.68,123000.0,11.26,2.0,0.0,16.0,...,1,1,0,1,1,0,1,0,1,0
2,321143,321143,20000.0,0.124,448.95,197000.0,11.28,0.0,0.0,12.0,...,1,1,0,1,1,0,1,0,1,0
3,11778,11778,3000.0,0.124,100.22,45000.0,18.08,0.0,0.0,12.0,...,1,1,0,1,1,0,1,0,1,0
4,169382,169382,30000.0,0.1612,1056.49,133000.0,27.77,0.0,2.0,13.0,...,1,1,0,1,1,0,1,0,1,0


In [7]:
#add  column
testing_data = test_df.copy()

In [8]:
testing_data.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,67991,67991,40000.0,0.0819,814.7,MORTGAGE,140000.0,Not Verified,low_risk,n,...,97.7,0.0,0.0,0.0,527975.0,70914.0,74600.0,99475.0,N,N
1,25429,25429,6000.0,0.1524,208.7,RENT,55000.0,Not Verified,low_risk,n,...,66.7,0.0,0.0,0.0,34628.0,23460.0,5900.0,23628.0,N,N
2,38496,38496,3600.0,0.1695,128.27,RENT,42000.0,Not Verified,low_risk,n,...,100.0,0.0,0.0,0.0,23100.0,19183.0,7300.0,15000.0,N,N
3,19667,19667,20000.0,0.1524,478.33,RENT,100000.0,Not Verified,low_risk,n,...,100.0,50.0,0.0,0.0,56481.0,43817.0,13800.0,35981.0,N,N
4,37505,37505,3600.0,0.124,120.27,RENT,50000.0,Not Verified,low_risk,n,...,100.0,25.0,0.0,0.0,45977.0,32448.0,21000.0,24977.0,N,N


In [9]:
#drop columns
testing_data = testing_data.drop(['loan_status', 'index', 'Unnamed: 0'], axis=1)

In [10]:
testing_dum = pd.get_dummies(testing_data)

In [11]:
testing_dum.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,40000.0,0.0819,814.7,140000.0,19.75,0.0,1.0,18.0,0.0,9471.0,...,0,0,1,0,1,1,0,1,0,1
1,6000.0,0.1524,208.7,55000.0,11.52,2.0,0.0,8.0,0.0,1280.0,...,0,0,1,0,1,1,0,1,0,1
2,3600.0,0.1695,128.27,42000.0,6.74,0.0,0.0,6.0,0.0,4757.0,...,0,0,1,0,1,1,0,1,0,1
3,20000.0,0.1524,478.33,100000.0,12.13,0.0,2.0,7.0,0.0,12731.0,...,0,0,1,0,1,1,0,1,0,1
4,3600.0,0.124,120.27,50000.0,16.08,0.0,3.0,6.0,0.0,10413.0,...,0,0,1,0,1,1,0,1,0,1


In [12]:
#add column to equal 0
testing_dum['debt_settlement_flag_Y']=0

In [13]:
#check for nulls
testing_dum.isnull().any()

loan_amnt                     False
int_rate                      False
installment                   False
annual_inc                    False
dti                           False
                              ...  
application_type_Joint App    False
hardship_flag_N               False
hardship_flag_Y               False
debt_settlement_flag_N        False
debt_settlement_flag_Y        False
Length: 92, dtype: bool

In [14]:
training_data = train_df.copy()

In [15]:
training_data = training_data.drop(['Unnamed: 0', 'loan_status', 'index' ], axis=1)

In [16]:
training_dum = pd.get_dummies(training_data)

In [18]:
training_dum.columns

Index(['loan_amnt', 'int_rate', 'installment', 'annual_inc', 'dti',
       'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal',
       'total_acc', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_amnt', 'collections_12_mths_ex_med', 'policy_code',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
       'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
       'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
       'mort_acc', 'mths_since_recent_bc', 'mths_since_recent_in

In [20]:
#build scaler
scaler = StandardScaler().fit(training_dum)
X_train_scaled = scaler.transform(training_dum)
X_test_scaled = scaler.transform(testing_dum)

In [21]:
X_test_scaled[0]

array([ 2.20755943, -1.12001617,  1.0371484 ,  0.2974932 , -0.11879708,
       -0.32002561,  0.53213327,  0.86437267, -0.35198632, -0.35939018,
        1.6408751 ,  2.25599638,  2.25595752,  0.67642899,  0.67658556,
        0.75678349,  0.27544777, -0.15098128,  0.        ,  0.        ,
        0.45353021, -0.13312632,  0.        , -0.00906138, -0.13971415,
        1.29387623, -0.04305549,  0.85646683,  0.12620873,  0.01389393,
       -0.36898244,  0.33413335, -0.87759252, -0.22528046, -0.66929854,
       -0.5361904 , -1.73531986,  1.1753957 , -0.90827924,  0.10235081,
       -0.53462945, -0.61674152,  0.51199143,  2.65250191, -1.34596659,
       -0.07108199, -0.00906138,  1.13730366,  0.16180251, -0.70933534,
       -0.81409859,  1.01813477,  0.70776118, -0.87468909, -0.35942048,
        0.7884271 ,  0.04318247,  0.22223451,  0.58239031,  1.32738286,
        0.44378311,  1.0904904 ,  0.04140276,  0.86688248,  0.        ,
       -0.00906138, -0.1488685 , -0.18256126,  0.35884239, -0.97

In [23]:
y_train = train_df['loan_status']

In [24]:
y_test = test_df['loan_status']

In [25]:
# Logistic regression classifier model

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train_scaled, y_train)
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

Testing Data Score: 0.767333049766057


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
# Randon Forest Classifier Model
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=1).fit(X_train_scaled, y_train)
print(f"Testing Score:  {clf.score(X_test_scaled, y_test)}")

Testing Score:  0.6548277328796257


In [None]:
# Convert categorical data to numeric and separate target feature for training data

In [None]:
# Convert categorical data to numeric and separate target feature for testing data

In [None]:
# add missing dummy variables to testing set

In [None]:
# Train the Logistic Regression model on the unscaled data and print the model score

In [None]:
# Scale the data

In [None]:
# Train the Logistic Regression model on the scaled data and print the model score

In [None]:
# Train a Random Forest Classifier model and print the model score

###### 

In [None]:
# Train a Random Forest Classifier model on the scaled data and print the model score