In [2]:
import numpy as np
import pandas as pd

In [3]:
ld=pd.read_csv("loans data.csv")

In [4]:
ld.head()

Unnamed: 0,ID,Amount.Requested,Amount.Funded.By.Investors,Interest.Rate,Loan.Length,Loan.Purpose,Debt.To.Income.Ratio,State,Home.Ownership,Monthly.Income,FICO.Range,Open.CREDIT.Lines,Revolving.CREDIT.Balance,Inquiries.in.the.Last.6.Months,Employment.Length
0,81174.0,20000,20000,8.90%,36 months,debt_consolidation,14.90%,SC,MORTGAGE,6541.67,735-739,14,14272,2.0,< 1 year
1,99592.0,19200,19200,12.12%,36 months,debt_consolidation,28.36%,TX,MORTGAGE,4583.33,715-719,12,11140,1.0,2 years
2,80059.0,35000,35000,21.98%,60 months,debt_consolidation,23.81%,CA,MORTGAGE,11500.0,690-694,14,21977,1.0,2 years
3,15825.0,10000,9975,9.99%,36 months,debt_consolidation,14.30%,KS,MORTGAGE,3833.33,695-699,10,9346,0.0,5 years
4,33182.0,12000,12000,11.71%,36 months,credit_card,18.78%,NJ,RENT,3195.0,695-699,11,14469,0.0,9 years


In [5]:
ld.dtypes

ID                                float64
Amount.Requested                   object
Amount.Funded.By.Investors         object
Interest.Rate                      object
Loan.Length                        object
Loan.Purpose                       object
Debt.To.Income.Ratio               object
State                              object
Home.Ownership                     object
Monthly.Income                    float64
FICO.Range                         object
Open.CREDIT.Lines                  object
Revolving.CREDIT.Balance           object
Inquiries.in.the.Last.6.Months    float64
Employment.Length                  object
dtype: object

In [None]:
## Data prep:
# ID,Amount.Funded.By.Investors: Drop
#Interest.Rate,Debt.To.Income.Ratio: remove % sign and convert to numeric
#Amount.Requested,Open.CREDIT.Lines,Revolving.CREDIT.Balance: convert it to numeric
# FICO.Range:  replace it by numeric column which is average of range
# Employment.Length: convert to number
# Loan.Length,Loan.Purpose,State,Home.Ownership: dummies

In [6]:
ld.drop(['ID','Amount.Funded.By.Investors'],axis=1,inplace=True)

In [7]:
for col in ['Interest.Rate','Debt.To.Income.Ratio']:
    ld[col]=ld[col].str.replace("%","")

In [8]:
ld.select_dtypes(['object']).columns

Index(['Amount.Requested', 'Interest.Rate', 'Loan.Length', 'Loan.Purpose',
       'Debt.To.Income.Ratio', 'State', 'Home.Ownership', 'FICO.Range',
       'Open.CREDIT.Lines', 'Revolving.CREDIT.Balance', 'Employment.Length'],
      dtype='object')

In [9]:
for col in ['Amount.Requested','Interest.Rate','Debt.To.Income.Ratio',
            'Open.CREDIT.Lines','Revolving.CREDIT.Balance']:
    ld[col]=pd.to_numeric(ld[col],errors='coerce')

In [10]:
ld.dtypes

Amount.Requested                  float64
Interest.Rate                     float64
Loan.Length                        object
Loan.Purpose                       object
Debt.To.Income.Ratio              float64
State                              object
Home.Ownership                     object
Monthly.Income                    float64
FICO.Range                         object
Open.CREDIT.Lines                 float64
Revolving.CREDIT.Balance          float64
Inquiries.in.the.Last.6.Months    float64
Employment.Length                  object
dtype: object

In [11]:
k=ld['FICO.Range'].str.split("-",expand=True).astype(float)
ld['fico']=0.5*(k[0]+k[1])
del ld['FICO.Range']
ld.dtypes

Amount.Requested                  float64
Interest.Rate                     float64
Loan.Length                        object
Loan.Purpose                       object
Debt.To.Income.Ratio              float64
State                              object
Home.Ownership                     object
Monthly.Income                    float64
Open.CREDIT.Lines                 float64
Revolving.CREDIT.Balance          float64
Inquiries.in.the.Last.6.Months    float64
Employment.Length                  object
fico                              float64
dtype: object

In [13]:
ld['fico']

0       737.0
1       717.0
2       692.0
3       697.0
4       697.0
        ...  
2495    707.0
2496    742.0
2497    682.0
2498    677.0
2499    672.0
Name: fico, Length: 2500, dtype: float64

In [12]:
ld['Employment.Length'].value_counts()

10+ years    653
< 1 year     249
2 years      243
3 years      235
5 years      202
4 years      191
1 year       177
6 years      163
7 years      127
8 years      108
9 years       72
.              2
Name: Employment.Length, dtype: int64

In [None]:
#errors=corece : it replace any missing value with NAN or which value is is not possible to convert for them it coerce that values

In [14]:
ld['Employment.Length']=ld['Employment.Length'].str.replace('years',"")
ld['Employment.Length']=ld['Employment.Length'].str.replace('year',"")
ld['Employment.Length']=np.where(ld['Employment.Length'].str[:2]=="10",10,ld['Employment.Length'])
ld['Employment.Length']=np.where(ld['Employment.Length'].str[0]=="<",0,ld['Employment.Length'])
ld['Employment.Length']=pd.to_numeric(ld['Employment.Length'],errors='coerce')

In [15]:
ld['Employment.Length']

0        0.0
1        2.0
2        2.0
3        5.0
4        9.0
        ... 
2495     8.0
2496    10.0
2497    10.0
2498     5.0
2499    10.0
Name: Employment.Length, Length: 2500, dtype: float64

In [16]:
cat_cols=ld.select_dtypes(['object']).columns

In [17]:
cat_cols

Index(['Loan.Length', 'Loan.Purpose', 'State', 'Home.Ownership'], dtype='object')

In [18]:
## we can use pandas getdummies function and manual method to create dummies
# but in this case we are using manual method to ignore lower frequencies variable
for col in cat_cols:
    freqs=ld[col].value_counts()
    k=freqs.index[freqs>20][:-1]
    for cat in k:
        name=col+'_'+cat
        ld[name]=(ld[col]==cat).astype(int)
    del ld[col]
    print(col)


Loan.Length
Loan.Purpose
State
Home.Ownership


In [19]:
ld.dtypes

Amount.Requested                   float64
Interest.Rate                      float64
Debt.To.Income.Ratio               float64
Monthly.Income                     float64
Open.CREDIT.Lines                  float64
Revolving.CREDIT.Balance           float64
Inquiries.in.the.Last.6.Months     float64
Employment.Length                  float64
fico                               float64
Loan.Length_36 months                int32
Loan.Purpose_debt_consolidation      int32
Loan.Purpose_credit_card             int32
Loan.Purpose_other                   int32
Loan.Purpose_home_improvement        int32
Loan.Purpose_major_purchase          int32
Loan.Purpose_small_business          int32
Loan.Purpose_car                     int32
Loan.Purpose_wedding                 int32
Loan.Purpose_medical                 int32
Loan.Purpose_moving                  int32
State_CA                             int32
State_NY                             int32
State_TX                             int32
State_FL   

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
ld_train,ld_test=train_test_split(ld,test_size=0.2,random_state=2)

In [22]:
ld_train.isnull().sum()

Amount.Requested                    5
Interest.Rate                       0
Debt.To.Income.Ratio                1
Monthly.Income                      3
Open.CREDIT.Lines                   7
Revolving.CREDIT.Balance            4
Inquiries.in.the.Last.6.Months      3
Employment.Length                  68
fico                                0
Loan.Length_36 months               0
Loan.Purpose_debt_consolidation     0
Loan.Purpose_credit_card            0
Loan.Purpose_other                  0
Loan.Purpose_home_improvement       0
Loan.Purpose_major_purchase         0
Loan.Purpose_small_business         0
Loan.Purpose_car                    0
Loan.Purpose_wedding                0
Loan.Purpose_medical                0
Loan.Purpose_moving                 0
State_CA                            0
State_NY                            0
State_TX                            0
State_FL                            0
State_IL                            0
State_GA                            0
State_PA    

In [23]:
## replace na values by mean
for col in ld_train.columns:
    if ld_train[col].isnull().sum()>0:
        ld_train.loc[ld_train[col].isnull(),col]=ld_train[col].mean()

for col in ld_test.columns:
    if ld_test[col].isnull().sum()>0:
        ld_test.loc[ld_test[col].isnull(),col]=ld_train[col].mean()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [24]:
# seperate the predictor and response. y-train-->response, x_train=predictor
x_train=ld_train.drop('Interest.Rate',axis=1)
y_train=ld_train['Interest.Rate']

In [25]:
from sklearn.linear_model import LinearRegression

In [26]:
lm=LinearRegression()

In [27]:
lm.fit(x_train,y_train)

LinearRegression()

In [29]:
lm.coef_

array([ 1.59059286e-04, -1.07930820e-03, -1.69117262e-05, -2.82757819e-02,
       -5.76482784e-06,  3.12674913e-01,  1.63365845e-02, -8.77620360e-02,
       -3.18831460e+00, -7.22469773e-01, -8.23643158e-01, -2.04778652e-02,
       -8.80070638e-01, -4.68619194e-01, -8.25934775e-02, -5.39514017e-01,
       -9.90991495e-01, -6.02580092e-01,  8.39558555e-01, -4.70816058e-01,
       -4.13716336e-01,  2.17299255e-01, -3.66654406e-01, -8.18678712e-01,
       -2.02295370e-01, -5.96572942e-01, -5.92819403e-01, -2.23066635e-01,
       -3.78898570e-01, -8.38219355e-02, -5.32476912e-02, -3.86482396e-01,
       -5.59831742e-01, -3.03593431e-01,  5.81282728e-02,  6.54090424e-02,
       -2.60693164e-01,  2.09119339e-02, -1.48945197e-01, -4.78060090e-01,
       -4.23477762e-01,  1.99575471e-01, -2.43834338e-01, -2.61042630e-02,
       -1.04608993e-01, -8.55763668e-03, -8.90429283e-01, -3.88422632e-01,
       -2.32968218e-01])

In [30]:
lm.intercept_
# beta zero is intercept

77.07213296580457

In [31]:
list(zip(x_train.columns,lm.coef_))

[('Amount.Requested', 0.00015905928579314354),
 ('Debt.To.Income.Ratio', -0.0010793082005023213),
 ('Monthly.Income', -1.6911726214773715e-05),
 ('Open.CREDIT.Lines', -0.028275781850735107),
 ('Revolving.CREDIT.Balance', -5.7648278431465805e-06),
 ('Inquiries.in.the.Last.6.Months', 0.3126749130602594),
 ('Employment.Length', 0.01633658450806541),
 ('fico', -0.08776203604546573),
 ('Loan.Length_36 months', -3.1883146036023713),
 ('Loan.Purpose_debt_consolidation', -0.7224697730607057),
 ('Loan.Purpose_credit_card', -0.8236431581879268),
 ('Loan.Purpose_other', -0.020477865204266532),
 ('Loan.Purpose_home_improvement', -0.8800706376573194),
 ('Loan.Purpose_major_purchase', -0.4686191937334272),
 ('Loan.Purpose_small_business', -0.0825934774582125),
 ('Loan.Purpose_car', -0.5395140165856502),
 ('Loan.Purpose_wedding', -0.9909914948711225),
 ('Loan.Purpose_medical', -0.6025800916376355),
 ('Loan.Purpose_moving', 0.8395585554658108),
 ('State_CA', -0.47081605787532094),
 ('State_NY', -0.413

In [32]:
## Performance on test
x_test=ld_test.drop('Interest.Rate',axis=1)

In [33]:
predicted_ir=lm.predict(x_test)

In [34]:
predicted_ir

array([13.54161591,  9.08585546, 14.27481912, 13.84101559,  7.05132327,
       19.65215563, 13.35168699, 16.14292115, 11.58633624,  7.28721433,
       13.07191708, 21.45154056, 15.24655957, 16.53099029, 18.37585535,
       15.29486625, 13.51287106, 15.85097164, 13.97718879, 14.4747684 ,
       14.44590658, 14.02142775, 14.47910987, 13.22387302,  9.58805934,
        6.76409074, 19.01604862, 10.01746242,  9.85921855, 16.3330338 ,
       16.09424183, 15.34691013, 12.51688001, 15.80770005, 14.98804752,
       17.83123193,  6.77862414,  7.48424136, 15.67748383, 11.13120689,
       15.00156029, 14.7188922 , 10.38141212, 13.2199913 , 15.54957095,
       18.50671927, 13.1732654 , 19.68116888, 10.63320172, 15.2118001 ,
       15.23683658, 10.19660874, 12.00616426, 19.07206059, 16.85599657,
        5.11473269, 18.33913281,  8.01969534, 11.21352647,  8.84255518,
        8.48098371,  9.96270159,  3.53199925, 12.30254252,  9.97044881,
        6.1621252 ,  7.08565801, 17.45505222, 17.76810379, 14.80

In [35]:
from sklearn.metrics import mean_absolute_error

In [36]:
mean_absolute_error(ld_test['Interest.Rate'],predicted_ir)

1.5472250646584094

In [37]:
## Regularization methods:
##L2 penalty- Ridge, L1 penalty- lasso
from sklearn.linear_model import Ridge,Lasso

In [38]:
## best value of lambda using kfold
from sklearn.model_selection import KFold

In [39]:
lambdas=np.linspace(1,100,100)

In [40]:
lambdas

array([  1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.,
        12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,  20.,  21.,  22.,
        23.,  24.,  25.,  26.,  27.,  28.,  29.,  30.,  31.,  32.,  33.,
        34.,  35.,  36.,  37.,  38.,  39.,  40.,  41.,  42.,  43.,  44.,
        45.,  46.,  47.,  48.,  49.,  50.,  51.,  52.,  53.,  54.,  55.,
        56.,  57.,  58.,  59.,  60.,  61.,  62.,  63.,  64.,  65.,  66.,
        67.,  68.,  69.,  70.,  71.,  72.,  73.,  74.,  75.,  76.,  77.,
        78.,  79.,  80.,  81.,  82.,  83.,  84.,  85.,  86.,  87.,  88.,
        89.,  90.,  91.,  92.,  93.,  94.,  95.,  96.,  97.,  98.,  99.,
       100.])

In [41]:
## kfold works on indices of ld_train, it fold on the basis of indices hence ids are shuffled
ld_train

Unnamed: 0,Amount.Requested,Interest.Rate,Debt.To.Income.Ratio,Monthly.Income,Open.CREDIT.Lines,Revolving.CREDIT.Balance,Inquiries.in.the.Last.6.Months,Employment.Length,fico,Loan.Length_36 months,...,State_MO,State_NV,State_OR,State_SC,State_WI,State_KY,State_LA,State_OK,Home.Ownership_MORTGAGE,Home.Ownership_RENT
914,3250.0,12.73,21.44,5000.00,12.0,20564.0,0.0,3.0,677.0,1,...,0,0,0,0,0,0,0,0,1,0
510,16000.0,14.09,7.78,4166.67,6.0,4240.0,3.0,2.0,737.0,0,...,0,0,0,0,0,0,0,0,0,1
2073,10000.0,5.42,1.69,8000.00,6.0,8974.0,0.0,3.0,797.0,1,...,0,0,0,0,0,0,0,0,1,0
1494,3600.0,13.99,10.00,5750.00,8.0,10766.0,2.0,4.0,672.0,1,...,0,0,0,0,0,0,0,0,0,1
765,14000.0,14.65,3.47,8750.00,6.0,9035.0,1.0,10.0,682.0,1,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
674,2500.0,15.95,24.07,1500.00,10.0,2684.0,3.0,1.0,667.0,1,...,0,0,0,0,0,0,0,0,1,0
2408,10000.0,22.47,34.26,6200.00,36.0,28850.0,1.0,3.0,677.0,1,...,0,0,0,0,0,0,0,0,1,0
1099,30000.0,12.12,10.97,5833.33,8.0,10787.0,0.0,10.0,752.0,0,...,0,0,0,0,0,0,0,0,1,0
2347,12000.0,6.62,1.43,9666.67,5.0,4693.0,0.0,1.0,752.0,1,...,0,0,0,0,0,0,0,0,0,1


In [42]:
## so we have to reset index
ld_train.reset_index(inplace=True,drop=True)

In [43]:
## Recreate x_train and y_train
x_train=ld_train.drop('Interest.Rate',axis=1)
y_train=ld_train['Interest.Rate']

In [44]:
mae_list=[]
for a in lambdas:
    ridge=Ridge(fit_intercept=True,alpha=a)
    
    kf=KFold(n_splits=10)
    
    xval_err=0
    
    for train,test in kf.split(x_train):
        ridge.fit(x_train.loc[train],y_train[train])
        p=ridge.predict(x_train.loc[test])
        xval_err +=mean_absolute_error(y_train[test],p)
        
    mae_10cv=xval_err/10
    print(a,":",mae_10cv)
    mae_list.extend([mae_10cv])

1.0 : 1.6267097009333444
2.0 : 1.6257681508581001
3.0 : 1.6249774457021062
4.0 : 1.624273990841311
5.0 : 1.6236580809736758
6.0 : 1.6231680074751111
7.0 : 1.6227536210372695
8.0 : 1.6223613718190328
9.0 : 1.6220103214511337
10.0 : 1.6217013835746734
11.0 : 1.6214259315268353
12.0 : 1.6211733725677373
13.0 : 1.6209392676797734
14.0 : 1.6207220334034957
15.0 : 1.6205351675578417
16.0 : 1.6203605610695768
17.0 : 1.6202085720988397
18.0 : 1.620079081765517
19.0 : 1.6199685932002938
20.0 : 1.619879194258542
21.0 : 1.619807968589694
22.0 : 1.619746776629397
23.0 : 1.6196987333296586
24.0 : 1.6196579584929598
25.0 : 1.6196280638888247
26.0 : 1.6196052025318637
27.0 : 1.6195952251220729
28.0 : 1.6195910063448504
29.0 : 1.6195981361056746
30.0 : 1.6196119924686225
31.0 : 1.6196360388810689
32.0 : 1.6196636562737659
33.0 : 1.6196954630624263
34.0 : 1.619730774153357
35.0 : 1.6197861094807422
36.0 : 1.6198637289799982
37.0 : 1.6199515829877285
38.0 : 1.6200404730228268
39.0 : 1.620132542632756
40

In [45]:
best_alpha=lambdas[mae_list==min(mae_list)]
print('Alpha with min 10cv error is:',best_alpha)

Alpha with min 10cv error is: [28.]


In [34]:
## so here we get 28 so we can get finer value around 28, then we take this lambda value and run on entire model and 
# error comes 1.52

In [47]:
ridge=Ridge(fit_intercept=True,alpha=best_alpha)
ridge.fit(x_train,y_train)
p_test=ridge.predict(x_test)

mae_ridge=mean_absolute_error(ld_test['Interest.Rate'],p_test)

mae_ridge

1.5286735956414594

In [36]:
## we get slightly better performance than previous one- 1.54

In [37]:
## Using Lasso:

In [48]:
lambdas=np.linspace(1,100,100)
mae_list=[]
for a in lambdas:
    lasso=Lasso(fit_intercept=True,alpha=a)
    kf=KFold(n_splits=10)
    xval_err=0
    
    for train,test in kf.split(x_train):
        lasso.fit(x_train.loc[train],y_train[train])
        p=lasso.predict(x_train.loc[test])
        xval_err += mean_absolute_error(y_train[test],p)
        
    mae_10cv=xval_err/10
#    print(a,":",mae_10cv)
    mae_list.extend([mae_10cv])
best_alpha=lambdas[mae_list==min(mae_list)]
print('Alpha with min 10cv error is :',best_alpha)

Alpha with min 10cv error is : [1.]


In [49]:
## best value come near to 1 so we should expand on neighbourhood
lambdas=np.linspace(0.5,10,100)
mae_list=[]
for a in lambdas:
    lasso=Lasso(fit_intercept=True,alpha=a)
    kf=KFold(n_splits=10)
    xval_err=0
    
    for train,test in kf.split(x_train):
        lasso.fit(x_train.loc[train],y_train[train])
        p=lasso.predict(x_train.loc[test])
        xval_err += mean_absolute_error(y_train[test],p)
        
    mae_10cv=xval_err/10
#    print(a,":",mae_10cv)
    mae_list.extend([mae_10cv])
best_alpha=lambdas[mae_list==min(mae_list)]
print('Alpha with min 10cv error is :',best_alpha)

Alpha with min 10cv error is : [0.5]


In [40]:
## best value come near to 0.5 so we should expand on neighbourhood
lambdas=np.linspace(0.001,1,100)
mae_list=[]
for a in lambdas:
    lasso=Lasso(fit_intercept=True,alpha=a)
    kf=KFold(n_splits=10)
    xval_err=0
    
    for train,test in kf.split(x_train):
        lasso.fit(x_train.loc[train],y_train[train])
        p=lasso.predict(x_train.loc[test])
        xval_err += mean_absolute_error(y_train[test],p)
        
    mae_10cv=xval_err/10
#    print(a,":",mae_10cv)
    mae_list.extend([mae_10cv])
best_alpha=lambdas[mae_list==min(mae_list)]
print('Alpha with min 10cv error is :',best_alpha)

Alpha with min 10cv error is : [0.01109091]


In [50]:
## now value-0.011 is at midddle so we can progress
lasso=Lasso(fit_intercept=True,alpha=best_alpha)
lasso.fit(x_train,y_train)
p_test=lasso.predict(x_test)

mae_lasso=mean_absolute_error(ld_test['Interest.Rate'],p_test)
mae_lasso

1.847085015890833

In [51]:
# here also we get 1.52 which similar to ridge value but if we look at coeff many variable coeff become zero
list(zip(x_train.columns,lasso.coef_))

[('Amount.Requested', 0.00022515960016130724),
 ('Debt.To.Income.Ratio', -0.0),
 ('Monthly.Income', -4.091840370825148e-05),
 ('Open.CREDIT.Lines', -0.01099620150554398),
 ('Revolving.CREDIT.Balance', -1.168942420928686e-05),
 ('Inquiries.in.the.Last.6.Months', 0.02780027643057032),
 ('Employment.Length', 0.0),
 ('fico', -0.08875326259542494),
 ('Loan.Length_36 months', -0.0),
 ('Loan.Purpose_debt_consolidation', -0.0),
 ('Loan.Purpose_credit_card', -0.0),
 ('Loan.Purpose_other', 0.0),
 ('Loan.Purpose_home_improvement', 0.0),
 ('Loan.Purpose_major_purchase', 0.0),
 ('Loan.Purpose_small_business', 0.0),
 ('Loan.Purpose_car', 0.0),
 ('Loan.Purpose_wedding', -0.0),
 ('Loan.Purpose_medical', 0.0),
 ('Loan.Purpose_moving', 0.0),
 ('State_CA', -0.0),
 ('State_NY', 0.0),
 ('State_TX', 0.0),
 ('State_FL', -0.0),
 ('State_IL', -0.0),
 ('State_GA', 0.0),
 ('State_PA', -0.0),
 ('State_NJ', -0.0),
 ('State_VA', 0.0),
 ('State_MA', -0.0),
 ('State_OH', 0.0),
 ('State_MD', 0.0),
 ('State_NC', -0.0),