# Import data

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer 

In [2]:
data = pd.read_csv('10K_Lending_Club_Loans.csv', encoding='latin')
data.head(30)

Unnamed: 0,loan_id,loan_amnt,funded_amnt,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,...,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,mths_since_last_major_derog,policy_code,is_bad
0,100001,4000,4000,60 months,7.29%,79.76,A,A4,Time Warner Cable,10+ years,...,,15.0,0.0,12087,12.1,44.0,f,,1,0
1,100002,16000,16000,60 months,18.25%,408.48,F,F1,Ottawa University,< 1 year,...,,4.0,0.0,10114,64.0,5.0,f,,1,0
2,100003,8700,8700,36 months,7.88%,272.15,A,A5,Kennedy Wilson,4 years,...,,4.0,0.0,81,0.6,8.0,f,,1,0
3,100004,18000,18000,60 months,11.49%,395.78,B,B4,TOWN OF PLATTEKILL,10+ years,...,,6.0,0.0,10030,37.1,23.0,f,,1,0
4,100005,16000,16000,36 months,11.83%,530.15,B,B3,Belmont Correctional,10+ years,...,,8.0,0.0,10740,40.4,21.0,f,,1,0
5,100006,3000,3000,60 months,15.58%,72.29,D,D3,BAE Systems,4 years,...,,6.0,0.0,1715,26.4,25.0,f,,1,0
6,100007,14000,8725,60 months,7.51%,174.88,A,A4,Peninsula Counseling Center,10+ years,...,,18.0,0.0,5466,11.1,29.0,f,,1,0
7,100008,3975,3975,60 months,17.58%,100.04,D,D4,Health Plan of Nevada,6 years,...,,9.0,0.0,10354,95.9,10.0,f,,1,0
8,100009,25000,25000,36 months,15.58%,873.76,D,D3,John Deere,2 years,...,,11.0,0.0,19662,59.2,27.0,f,,1,0
9,100010,10000,10000,36 months,8.00%,313.37,A,A3,,< 1 year,...,,11.0,0.0,19998,18.3,23.0,f,,1,0


# Define catagorical and numerical data

In [5]:
y = data.pop('is_bad')
data.pop('url')
data.pop('pymnt_plan')
data.pop('policy_code')
data.pop('initial_list_status')
data.pop('mths_since_last_major_derog')
data.pop('loan_id')

0       100001
1       100002
2       100003
3       100004
4       100005
         ...  
9995    109996
9996    109997
9997    109998
9998    109999
9999    110000
Name: loan_id, Length: 10000, dtype: int64

In [6]:
num_features = list(data.select_dtypes(include = "number").columns)
num_features

['loan_amnt',
 'funded_amnt',
 'installment',
 'annual_inc',
 'dti',
 'delinq_2yrs',
 'inq_last_6mths',
 'mths_since_last_delinq',
 'mths_since_last_record',
 'open_acc',
 'pub_rec',
 'revol_bal',
 'revol_util',
 'total_acc']

In [7]:
cat_features = list(data.select_dtypes(include = 'object').columns)
cat_features

['term',
 'int_rate',
 'grade',
 'sub_grade',
 'emp_title',
 'emp_length',
 'home_ownership',
 'verification_status',
 'desc',
 'purpose',
 'title',
 'zip_code',
 'addr_state',
 'earliest_cr_line']

In [8]:
text_features = [feature for feature in cat_features if len(set(data[feature])) > 100]
[cat_features.remove(feature) for feature in text_features]
text_features

['int_rate', 'emp_title', 'desc', 'title', 'zip_code', 'earliest_cr_line']

In [9]:
num_df = data[num_features].fillna(0)
num_df

Unnamed: 0,loan_amnt,funded_amnt,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc
0,4000,4000,79.76,50000.0,10.87,0.0,0.0,0.0,0.0,15.0,0.0,12087,12.1,44.0
1,16000,16000,408.48,39216.0,9.15,0.0,2.0,0.0,0.0,4.0,0.0,10114,64.0,5.0
2,8700,8700,272.15,65000.0,11.24,0.0,0.0,0.0,0.0,4.0,0.0,81,0.6,8.0
3,18000,18000,395.78,57500.0,6.18,1.0,0.0,16.0,0.0,6.0,0.0,10030,37.1,23.0
4,16000,16000,530.15,50004.0,19.03,0.0,4.0,0.0,0.0,8.0,0.0,10740,40.4,21.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,8700,8700,268.32,66250.0,9.40,0.0,1.0,0.0,0.0,8.0,0.0,3656,24.1,10.0
9996,8000,8000,265.68,26000.0,20.49,0.0,1.0,79.0,0.0,8.0,0.0,6709,58.9,12.0
9997,16000,16000,545.67,47831.0,24.13,0.0,0.0,0.0,111.0,9.0,1.0,11346,60.7,17.0
9998,3600,3600,83.04,70000.0,16.18,2.0,2.0,16.0,0.0,9.0,0.0,17157,50.9,27.0


In [10]:
cat_df = data[cat_features].fillna('missing')
cat_df.head(30)

Unnamed: 0,term,grade,sub_grade,emp_length,home_ownership,verification_status,purpose,addr_state
0,60 months,A,A4,10+ years,MORTGAGE,not verified,medical,TX
1,60 months,F,F1,< 1 year,RENT,not verified,debt_consolidation,KS
2,36 months,A,A5,4 years,RENT,not verified,credit_card,CA
3,60 months,B,B4,10+ years,MORTGAGE,not verified,debt_consolidation,NY
4,36 months,B,B3,10+ years,MORTGAGE,VERIFIED - income,debt_consolidation,OH
5,60 months,D,D3,4 years,RENT,VERIFIED - income,other,DC
6,60 months,A,A4,10+ years,MORTGAGE,not verified,credit_card,NY
7,60 months,D,D4,6 years,MORTGAGE,VERIFIED - income source,debt_consolidation,NV
8,36 months,D,D3,2 years,MORTGAGE,VERIFIED - income,debt_consolidation,IL
9,36 months,A,A3,< 1 year,RENT,not verified,car,CA


In [11]:
one_hot = OneHotEncoder(handle_unknown="ignore")
one_hot

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='ignore', sparse=True)

In [12]:
cat_df = pd.DataFrame(data=one_hot.fit_transform(cat_df).toarray(), columns=one_hot.get_feature_names(), index=cat_df.index)
cat_df.head(25)

Unnamed: 0,x0_ 36 months,x0_ 60 months,x1_A,x1_B,x1_C,x1_D,x1_E,x1_F,x1_G,x2_A1,...,x7_SD,x7_TN,x7_TX,x7_UT,x7_VA,x7_VT,x7_WA,x7_WI,x7_WV,x7_WY
0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Merge data sets

In [13]:
df_merge = num_df.merge(cat_df, left_index=True, right_index=True)
# numeric is main, catagorical is secondary
df_merge.head(25)

Unnamed: 0,loan_amnt,funded_amnt,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,...,x7_SD,x7_TN,x7_TX,x7_UT,x7_VA,x7_VT,x7_WA,x7_WI,x7_WV,x7_WY
0,4000,4000,79.76,50000.0,10.87,0.0,0.0,0.0,0.0,15.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,16000,16000,408.48,39216.0,9.15,0.0,2.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,8700,8700,272.15,65000.0,11.24,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,18000,18000,395.78,57500.0,6.18,1.0,0.0,16.0,0.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,16000,16000,530.15,50004.0,19.03,0.0,4.0,0.0,0.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,3000,3000,72.29,47028.0,7.83,2.0,1.0,19.0,0.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,14000,8725,174.88,126000.0,14.28,0.0,0.0,0.0,0.0,18.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,3975,3975,100.04,42000.0,10.29,0.0,0.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,25000,25000,873.76,50000.0,15.36,0.0,2.0,0.0,0.0,11.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,10000,10000,313.37,40000.0,6.48,0.0,1.0,0.0,0.0,11.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
X_train, X_test, y_train, y_test = train_test_split(df_merge, y, test_size = 0.2) 
# capital letters represent the data set, lowercase represents columns
X_train, X_test, y_train, y_test

(      loan_amnt  funded_amnt  installment  annual_inc    dti  delinq_2yrs  \
 1428       8000         8000       167.04     24000.0  19.90          0.0   
 4006       3000         3000       107.70     60000.0  24.48          0.0   
 3183      24000        24000       782.78     85000.0  13.65          0.0   
 6377      25000        25000       673.11     72000.0  19.12          0.0   
 6452       5000         5000       102.56    105000.0  17.74          0.0   
 ...         ...          ...          ...         ...    ...          ...   
 1731       5000         5000       164.98     41000.0  10.36          1.0   
 4545       5000         5000       171.31     69000.0   9.34          4.0   
 7743       8000         8000       271.45     23000.0   7.62          0.0   
 5497      17400        17400       577.85     82000.0  12.47          0.0   
 4177       7000         7000       219.04     84000.0  13.14          0.0   
 
       inq_last_6mths  mths_since_last_delinq  mths_since_last

# Logistic regression

In [15]:
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
logistic_regression.coef_

array([[ 3.52842043e-05,  1.02069989e-04, -4.11943890e-03,
        -1.91794489e-05, -1.90409621e-02,  3.18287425e-05,
        -1.39044059e-03, -1.92891493e-03,  5.40490588e-03,
        -1.05782735e-02,  3.35667269e-05,  8.77964432e-07,
        -7.46312807e-04, -2.23204699e-02, -1.97309284e-03,
         2.83784672e-04, -1.46869338e-03, -9.64167612e-04,
         6.00329594e-05,  1.34656939e-04,  2.83212545e-04,
         1.75373410e-04,  9.02769737e-05, -2.15410763e-04,
        -2.85225298e-04, -2.52279133e-04, -4.18984504e-04,
        -2.96793681e-04, -2.09204574e-04, -1.56421745e-04,
        -2.80003656e-04, -1.87477087e-04, -1.31060549e-04,
         3.22020212e-05, -7.60932832e-05,  1.32051434e-06,
         5.16779958e-05,  5.09257113e-05,  2.08192690e-05,
         3.73106693e-06,  4.54287229e-05,  1.45148120e-05,
         5.01630685e-05,  4.39311746e-05,  6.99933644e-05,
         8.49734650e-05,  4.00684179e-05,  4.42461230e-05,
         3.52216189e-05,  3.46377843e-05, -2.05236949e-0

In [17]:
logistic_regression.intercept_

array([-0.00168931])

In [18]:
for prediction in logistic_regression.predict_proba(X_test):
    print(prediction)

[0.82288673 0.17711327]
[0.76061372 0.23938628]
[0.82867833 0.17132167]
[0.88393061 0.11606939]
[0.91496591 0.08503409]
[0.8404867 0.1595133]
[0.92567144 0.07432856]
[0.89293267 0.10706733]
[0.81760702 0.18239298]
[0.96756747 0.03243253]
[0.84365205 0.15634795]
[0.8885902 0.1114098]
[0.74407655 0.25592345]
[0.85663067 0.14336933]
[0.91674003 0.08325997]
[0.76650765 0.23349235]
[0.62882611 0.37117389]
[0.79595694 0.20404306]
[0.87640027 0.12359973]
[0.64831322 0.35168678]
[0.88521193 0.11478807]
[0.73161367 0.26838633]
[0.90785207 0.09214793]
[0.78861695 0.21138305]
[0.92510511 0.07489489]
[0.88485932 0.11514068]
[0.76782261 0.23217739]
[0.9421551 0.0578449]
[0.74740145 0.25259855]
[0.87611902 0.12388098]
[0.86472938 0.13527062]
[0.7860761 0.2139239]
[0.90953298 0.09046702]
[0.84119932 0.15880068]
[0.84062342 0.15937658]
[0.82979977 0.17020023]
[0.81672274 0.18327726]
[0.82143189 0.17856811]
[0.89000201 0.10999799]
[0.72070137 0.27929863]
[0.88143948 0.11856052]
[0.97239866 0.02760134]


[0.84526661 0.15473339]
[0.86143082 0.13856918]
[0.66170473 0.33829527]
[0.81990621 0.18009379]
[0.80481729 0.19518271]
[0.8378417 0.1621583]
[0.81662097 0.18337903]
[0.89107112 0.10892888]
[0.8465115 0.1534885]
[0.91892051 0.08107949]
[0.78394041 0.21605959]
[0.79422347 0.20577653]
[0.74755491 0.25244509]
[0.98883285 0.01116715]
[0.73756119 0.26243881]
[0.81531505 0.18468495]
[0.95723137 0.04276863]
[0.90912313 0.09087687]
[0.80301099 0.19698901]
[0.97315804 0.02684196]
[0.91579171 0.08420829]
[0.94072887 0.05927113]
[0.91830706 0.08169294]
[0.77344893 0.22655107]
[0.84357143 0.15642857]
[0.90601888 0.09398112]
[0.82938334 0.17061666]
[0.86263596 0.13736404]
[0.91611519 0.08388481]
[0.7807091 0.2192909]
[0.97580593 0.02419407]
[0.90361029 0.09638971]
[0.78428725 0.21571275]
[0.82118987 0.17881013]
[0.88575069 0.11424931]
[0.85893304 0.14106696]
[0.77721275 0.22278725]
[0.79556065 0.20443935]
[0.84474068 0.15525932]
[0.83870549 0.16129451]
[0.9852483 0.0147517]
[0.97644957 0.02355043]


[0.9101527 0.0898473]
[0.86351032 0.13648968]
[0.75744268 0.24255732]
[0.85628017 0.14371983]
[0.95383089 0.04616911]
[0.87813614 0.12186386]
[0.95195523 0.04804477]
[0.95115787 0.04884213]
[0.74362446 0.25637554]
[0.77880314 0.22119686]
[0.8360155 0.1639845]
[0.9962338 0.0037662]
[0.95593367 0.04406633]
[0.6732397 0.3267603]
[0.88248116 0.11751884]
[0.77972626 0.22027374]
[0.90355186 0.09644814]
[0.95073532 0.04926468]
[0.76563037 0.23436963]
[0.95307501 0.04692499]
[0.88663714 0.11336286]
[0.8258871 0.1741129]
[0.6776423 0.3223577]
[0.78081617 0.21918383]
[0.78245624 0.21754376]
[0.85813096 0.14186904]
[0.9095684 0.0904316]
[0.98337648 0.01662352]
[0.84737464 0.15262536]
[0.6807135 0.3192865]
[0.84405997 0.15594003]
[0.92303561 0.07696439]
[0.92359566 0.07640434]
[0.93371648 0.06628352]
[0.95199533 0.04800467]
[0.7478922 0.2521078]
[0.87173688 0.12826312]
[0.77865091 0.22134909]
[0.66572334 0.33427666]
[0.71979664 0.28020336]
[0.89854207 0.10145793]
[0.85355047 0.14644953]
[0.9268724