In [1]:
import csv
import os
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
#defining filedirectory
fileDir = os.path.dirname(os.path.realpath('__file__'))

In [3]:
#defining data frames
dfLoanData = pd.DataFrame()

In [4]:
#reading clean-data from csv
for directory, subdirectory, filenames in  os.walk(fileDir + '/CleanedData'):
    for filename in filenames:
        if filename == 'LoanData.csv':
            print("Reading from a file: " + filename + '....')
            dfLoanData = pd.read_csv(os.path.join(directory, filename), encoding = 'ISO-8859-1')

Reading from a file: LoanData.csv....


In [5]:
#creating copies of data frames
df = dfLoanData.copy()

In [6]:
columns = ['term', 'int_rate', 'grade', 'emp_length', 'purpose', 'addr_state', 'dti', 'inq_last_6mths', 
           'pub_rec', 'issue_year', 'cr_line_history', 'Credit_Score_Code','loan_status_binary', 
           'home_ownership_binary', 'verification_status_binary', 
           'application_type_binary', 'loan_amnt_category_code', 'annual_inc_category_code']

In [7]:
df = df[columns]

In [8]:
df['grade'] = (df['grade']).astype('category')
df['purpose'] = (df['purpose']).astype('category')
df['addr_state'] = (df['addr_state']).astype('category')

In [9]:
#converting all the category columns to int by taking category-code
cat_columns = df.select_dtypes(['category']).columns
df[cat_columns] = df[cat_columns].apply(lambda x: x.cat.codes)

# Prediction

In [10]:
#dividing the DF into two for test & train data
from sklearn.model_selection import train_test_split
train_loans, test_loans = train_test_split(df, test_size = 0.3)

In [11]:
#inputs required for classification
train_y = train_loans['int_rate']
train_X = train_loans.drop(['int_rate'], axis = 1)

test_y = test_loans['int_rate']
test_X = test_loans.drop(['int_rate'], axis = 1)

In [12]:
#preprocessingthe data : scale between 0 and 1
from sklearn import preprocessing

train_X = preprocessing.minmax_scale(train_X) 
test_X = preprocessing.minmax_scale(test_X)

In [13]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error

def build_error_metric(model, y_train, y_train_predicted, y_test, y_test_predicted, error_df):
    
    rms_train = mean_squared_error(y_train, y_train_predicted)
    rms_test = mean_squared_error(y_test, y_test_predicted)
    
    rmse_train = mean_squared_error(y_train, y_train_predicted)**0.5
    rmse_test = mean_squared_error(y_test, y_test_predicted)**0.5
    
    mae_train = mean_absolute_error(y_train, y_train_predicted)
    mae_test = mean_absolute_error(y_test, y_test_predicted)
    
    mape_train = median_absolute_error(y_train, y_train_predicted)
    mape_test = median_absolute_error(y_test, y_test_predicted)
    
    df = pd.DataFrame({'Model':[model],
                             'rmse_train':[rmse_train], 
                             'rmse_test': [rmse_test],
                             'rms_train':[rms_train], 
                             'rms_test': [rms_test],
                             'mae_train': [mae_train],
                             'mae_test':[mae_test],
                             'mape_train':[mape_train],
                             'mape_test':[mape_test]})
    
    error_df = pd.concat([error_df, df])
    return error_df

# Linear regression

In [14]:
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

In [15]:
#Linear Regression
def TrainLinearRegression(train_X, train_y, test_X, test_y, error_df):
    print("Linear Regression---------")
    print("Working on Training Data")
    model = LinearRegression()
    model.fit(train_X, train_y)
    
    y_train_predicted = model.predict(train_X)
    
    print("Working on Testing Data")
    model.fit(test_X, test_y)
    
    y_test_predicted = model.predict(test_X)
    
    error_df = build_error_metric('Linear Regression', train_y, y_train_predicted, test_y, y_test_predicted, error_df)
    
    return error_df

# Random forest

In [16]:
from sklearn.ensemble import RandomForestRegressor

In [17]:
#Random forest
def TrainRandomForest(train_X, train_y, test_X, test_y, error_df):
    print("Random forest---------")
    print("Working on Training Data")
    model = RandomForestRegressor(n_estimators=20)
    model.fit(train_X, train_y)
    
    y_train_predicted = model.predict(train_X)
    
    print("Working on Testing Data")
    model.fit(test_X, test_y)
    
    y_test_predicted = model.predict(test_X)
    
    error_df = build_error_metric('Random forest', train_y, y_train_predicted, test_y, y_test_predicted, error_df)
    
    return error_df

# KNN

In [18]:
from sklearn.neighbors import KNeighborsRegressor

In [19]:
#KNN
def TrainKNN(train_X, train_y, test_X, test_y, error_df):
    print("KNN---------")
    print("Working on Training Data")
    model = KNeighborsRegressor(n_neighbors=3)
    model.fit(train_X, train_y)
    
    y_train_predicted = model.predict(train_X)
    
    print("Working on Testing Data")
    model.fit(test_X, test_y)
    
    y_test_predicted = model.predict(test_X)
    
    error_df = build_error_metric('KNN', train_y, y_train_predicted, test_y, y_test_predicted, error_df)
    
    return error_df

# Neural network

In [20]:
from sklearn.neural_network import MLPRegressor

In [21]:
#Neural network
def TrainNeuralNetwork(train_X, train_y, test_X, test_y, error_df):
    print("Neural network---------")
    print("Working on Training Data")
    model = MLPRegressor()
    model.fit(train_X, train_y)
    
    y_train_predicted = model.predict(train_X)
    
    print("Working on Testing Data")
    model.fit(test_X, test_y)
    
    y_test_predicted = model.predict(test_X)
    
    error_df = build_error_metric('Neural network', train_y, y_train_predicted, test_y, y_test_predicted, error_df)
    
    return error_df

In [22]:
error_df = pd.DataFrame({'rmse_train':[], 
                             'rmse_test': [],
                             'rms_train':[], 
                             'rms_test': [],
                             'mae_train': [],
                             'mae_test':[],
                             'mape_train':[],
                             'mape_test':[]})

In [23]:
error_df = TrainLinearRegression(train_X, train_y, test_X, test_y, error_df)

Linear Regression---------
Working on Training Data
Working on Testing Data


In [24]:
error_df

Unnamed: 0,Model,mae_test,mae_train,mape_test,mape_train,rms_test,rms_train,rmse_test,rmse_train
0,Linear Regression,1.035386,1.036106,0.881522,0.883259,1.733265,1.73956,1.316535,1.318924


In [25]:
error_df = TrainRandomForest(train_X, train_y, test_X, test_y, error_df)

Random forest---------
Working on Training Data
Working on Testing Data


In [26]:
error_df

Unnamed: 0,Model,mae_test,mae_train,mape_test,mape_train,rms_test,rms_train,rmse_test,rmse_train
0,Linear Regression,1.035386,1.036106,0.881522,0.883259,1.733265,1.73956,1.316535,1.318924
0,Random forest,0.334688,0.333544,0.2715,0.27,0.189525,0.188969,0.435344,0.434706


In [27]:
error_df = TrainNeuralNetwork(train_X, train_y, test_X, test_y, error_df)

Neural network---------
Working on Training Data
Working on Testing Data


In [28]:
error_df

Unnamed: 0,Model,mae_test,mae_train,mape_test,mape_train,rms_test,rms_train,rmse_test,rmse_train
0,Linear Regression,1.035386,1.036106,0.881522,0.883259,1.733265,1.73956,1.316535,1.318924
0,Random forest,0.334688,0.333544,0.2715,0.27,0.189525,0.188969,0.435344,0.434706
0,Neural network,0.883463,0.863857,0.799292,0.785976,1.168594,1.135427,1.081015,1.065564


In [29]:
error_df = TrainKNN(train_X, train_y, test_X, test_y, error_df)

KNN---------
Working on Training Data
Working on Testing Data


In [30]:
error_df

Unnamed: 0,Model,mae_test,mae_train,mape_test,mape_train,rms_test,rms_train,rmse_test,rmse_train
0,Linear Regression,1.035386,1.036106,0.881522,0.883259,1.733265,1.73956,1.316535,1.318924
0,Random forest,0.334688,0.333544,0.2715,0.27,0.189525,0.188969,0.435344,0.434706
0,Neural network,0.883463,0.863857,0.799292,0.785976,1.168594,1.135427,1.081015,1.065564
0,KNN,0.827008,0.773428,0.666667,0.64,1.176684,1.016687,1.084751,1.008309
