In [1]:
import logging
import csv
import os
import numpy as np
import pandas as pd
import seaborn
import matplotlib.pyplot as plt
import scipy.stats     
%matplotlib inline
from sklearn.linear_model import (LinearRegression, Ridge, 
                                  Lasso, RandomizedLasso)
from sklearn.feature_selection import RFE, f_regression
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Base directory for locating input data.
# '__file__' is a quoted literal (not the module attribute), so realpath()
# resolves it against the current working directory and the parent of that
# path -- the working directory itself -- is kept.
fileDir = os.path.split(os.path.realpath('__file__'))[0]

In [3]:
# Empty placeholder frame. The CSV-loading cell rebinds this name to the
# contents of LoanData.csv when that file is found under CleanedData.
dfLoanData = pd.DataFrame()

In [4]:
# Load the cleaned loan data: walk <fileDir>/CleanedData (including
# sub-directories) and read every LoanData.csv encountered; if several exist,
# the last one visited wins.
# The search root is built with os.path.join so separators are portable, and
# a missing file fails here with a clear message instead of leaving
# dfLoanData empty for a later cell to trip over.
loan_csv_found = False
for directory, subdirectory, filenames in os.walk(os.path.join(fileDir, 'CleanedData')):
    for filename in filenames:
        if filename == 'LoanData.csv':
            print("Reading from a file: " + filename + '....')
            dfLoanData = pd.read_csv(os.path.join(directory, filename), encoding = 'ISO-8859-1')
            loan_csv_found = True

if not loan_csv_found:
    raise IOError("LoanData.csv not found under " + os.path.join(fileDir, 'CleanedData'))

Reading from a file: LoanData.csv....


In [5]:
# Work on an independent (deep) copy so the frame loaded from disk stays intact.
df = dfLoanData.copy(deep=True)

In [6]:
# Columns of LoanData kept for the feature-ranking analysis.
# 'int_rate' is split off later as the regression target.
columns = [
    'term',
    'int_rate',
    'grade',
    'emp_length',
    'purpose',
    'addr_state',
    'dti',
    'inq_last_6mths',
    'pub_rec',
    'issue_year',
    'cr_line_history',
    'Credit_Score_Code',
    'loan_status_binary',
    'home_ownership_binary',
    'verification_status_binary',
    'application_type_binary',
    'loan_amnt_category_code',
    'annual_inc_category_code',
]

In [7]:
# Keep only the columns named in `columns` (in that order); every other
# LoanData column is dropped from df. Raises KeyError if one is missing.
df = df[columns]

In [8]:
# Cast the label-valued columns to pandas' categorical dtype so they can be
# replaced by integer category codes afterwards.
for label_column in ('grade', 'purpose', 'addr_state'):
    df[label_column] = df[label_column].astype('category')

In [9]:
# Replace every categorical column with its integer category code.
cat_columns = df.select_dtypes(include=['category']).columns
df[cat_columns] = df[cat_columns].apply(lambda series: series.cat.codes, axis=0)

In [10]:
# Pairwise correlations between all columns of df.
cor = df.corr()
# Zero out the diagonal and the upper triangle so each pair appears once,
# then flatten to a Series indexed by (row, column).
cor = pd.DataFrame(np.tril(cor, k=-1), index=cor.index, columns=cor.columns).stack()
# Display the pairs whose correlation magnitude exceeds 0.55.
cor[cor.abs() > 0.55]

grade  int_rate    0.955217
dtype: float64

In [11]:
def get_list(coefs, names = None, sort = False):
    """Pair each coefficient with its feature name.

    Parameters
    ----------
    coefs : sequence of numbers
        Coefficient (or score) per feature.
    names : sequence of str, optional
        Feature names aligned with ``coefs``. When omitted, placeholder
        names ``X0, X1, ...`` are generated.
    sort : bool, default False
        When True, order the pairs by decreasing absolute coefficient.

    Returns
    -------
    list of (coef, name) tuples
        Always a concrete list, so the result can be indexed, measured with
        ``len`` and iterated more than once.
    """
    # `is None` instead of `== None`: comparing a NumPy array or pandas Index
    # with `==` is element-wise and makes the `if` raise ValueError.
    if names is None:
        names = ["X%s" % x for x in range(len(coefs))]
    # Materialise the zip: on Python 3 it is a one-shot iterator, which
    # would make the unsorted result differ in type from the sorted one.
    pairs = list(zip(coefs, names))
    if sort:
        pairs.sort(key = lambda pair: -abs(pair[0]))
    return pairs

In [12]:
def find_ranks(ranks, names, order=1):
    """Min-max scale raw feature scores and key them by feature name.

    ``ranks`` is multiplied by ``order`` (pass ``-1`` when a smaller raw
    value means a better feature), rescaled with ``MinMaxScaler``, and each
    scaled value is rounded to two decimals.

    Returns a dict mapping each entry of ``names`` to its scaled score.
    """
    # MinMaxScaler works column-wise, so present the scores as one column.
    score_column = order * np.array([ranks]).T
    scaled_scores = MinMaxScaler().fit_transform(score_column).T[0]
    return {name: round(score, 2) for name, score in zip(names, scaled_scores)}

In [13]:
# Split off the regression target. pop() removes 'int_rate' from df in place,
# so X (the same object as df) holds only predictor columns afterwards.
# Because df is mutated, this cell cannot be re-run without rebuilding df.
Y = df.pop('int_rate')
X = df
names = X.columns.tolist()
# Collects one {feature: score} dict per ranking method.
ranks = dict()

In [14]:
# Ordinary linear regression (sensitive to multicollinearity): features are
# scored by the absolute size of their fitted coefficients.
# NOTE(review): the `normalize` argument was removed from LinearRegression in
# scikit-learn 1.2 -- confirm the version this notebook is pinned to.
lr = LinearRegression(normalize=True).fit(X, Y)
ranks["Linear regression"] = find_ranks(ranks=np.absolute(lr.coef_), names=names)

In [15]:
# Lasso: linear regression with an L1 penalty, an extra constraint that guards
# against overfitting when many variables are present.
# The penalty forces weak features to have zero coefficients.
lasso = Lasso(alpha=.05).fit(X, Y)
ranks["Lasso"] = find_ranks(ranks=np.absolute(lasso.coef_), names=names)


In [16]:
# Ridge: linear regression with an L2 penalty. Unlike Lasso's sparse
# solutions, it spreads the coefficient values out more evenly.
ridge = Ridge(alpha=7).fit(X, Y)
ranks["Ridge"] = find_ranks(ranks=np.absolute(ridge.coef_), names=names)

In [17]:
# Random forest -- captures non-linear relationships; features are scored by
# the forest's impurity-based feature_importances_.
# The forest is built from random bootstrap samples, so random_state is fixed
# to make the RF ranking reproducible across kernel restarts.
rf = RandomForestRegressor(random_state=42)
rf.fit(X,Y)
ranks["RF"] = find_ranks(rf.feature_importances_, names)

In [18]:
# Recursive feature elimination: repeatedly fit the linear model and discard
# features until only 6 remain. The survivors all share ranking 1, so they
# receive equal scores.
# order=-1 flips the sign before scaling because a lower RFE ranking is
# better; the 6 selected features therefore end up with a score of 1.0.
rfe = RFE(estimator=lr, n_features_to_select=6).fit(X, Y)
ranks["RFE"] = find_ranks(ranks=rfe.ranking_, names=names, order=-1)

In [19]:
# One row per feature, one column per ranking method, plus the best
# (maximum) scaled score each feature achieved under any method.
rank_df = pd.DataFrame(ranks)
rank_df = rank_df.assign(**{'Max-Rank': rank_df.max(axis=1)})

In [20]:
# Show features from highest to lowest best score.
# DataFrame.sort() was removed in pandas 0.20; sort_values() is the
# replacement and, like the old call, returns a new sorted frame.
rank_df.sort_values("Max-Rank", ascending = False)

Unnamed: 0,Lasso,Linear regression,RF,RFE,Ridge,Max-Rank
Credit_Score_Code,0.03,0.03,0.0,1.0,0.03,1.0
grade,1.0,1.0,1.0,1.0,1.0,1.0
issue_year,0.04,0.05,0.03,1.0,0.05,1.0
inq_last_6mths,0.01,0.03,0.0,1.0,0.03,1.0
verification_status_binary,0.0,0.06,0.0,1.0,0.06,1.0
application_type_binary,0.0,0.02,0.0,1.0,0.02,1.0
annual_inc_category_code,0.01,0.01,0.0,0.91,0.01,0.91
loan_amnt_category_code,0.0,0.01,0.01,0.82,0.01,0.82
pub_rec,0.0,0.01,0.0,0.73,0.01,0.73
purpose,0.0,0.01,0.0,0.64,0.01,0.64
