# Data Modeling using Regression Technique

### Importing Dependencies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
sns.set(style="darkgrid")
import cufflinks as cf

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.metrics import *
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.feature_selection import RFE
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.model_selection import KFold
# libraries for regularization
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV

### Reading the Cleaned Data

In [2]:
df = pd.read_csv('data_clean_regression.csv')

### Splitting data into Training and Testing Set:

In [3]:
def split_data(df) :   
    X = df.loc[: ,df.columns != 'int_rate']
    Y = df['int_rate']
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=10 )
    return X_train, X_test, y_train, y_test

**Following are the columns selected as the best features from SFS feature selection since it gives us the least rmse value comparitively**

In [4]:
df_new = df[['loan_amnt', 'funded_amnt_inv','installment','loan_status','inq_last_6mths','pub_rec','revol_util','open_il_24m',
 'open_rv_24m','all_util', 'inq_fi','acc_open_past_24mths', 'term', 'sub_grade', 'application_type','int_rate']]

In [5]:
X_train, X_test, y_train, y_test = split_data(df_new) # splitting the new selected features

### Scaling data for modelling: 

In [6]:
def get_scaled_data(X_train, X_test): 
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    return X_train, X_test

In [7]:
X_train, X_test = get_scaled_data(X_train, X_test)

### Defining function to calculate scores:

In [8]:
def compute_metrics(model,predicted_val,true_val):
    RSq = r2_score(true_val,predicted_val)
    print('R Squared: ' + str(RSq))
    MAE = mean_absolute_error(true_val,predicted_val)
    print('MAE: ' + str(MAE))
    RMS = np.sqrt(mean_squared_error(true_val,predicted_val))
    print('RMS: ' + str(RMS))
    MAPE = np.mean(np.abs((true_val - predicted_val) / true_val)) * 100
    print('MAPE: ' + str(MAPE))
    rmse = np.sqrt(mean_squared_error(predicted_val,true_val))
    print('RMSE: ' + str(rmse))
    print('\n')

### Base Model: Linear Regression

In [9]:
def train_linear_model(X_train, X_test, y_train, y_test):
    global lm


    lm = LinearRegression()
    
    lm.fit(X_train,y_train)
    
    reg_pred_train = lm.predict(X_train)
    reg_pred_test = lm.predict(X_test)
    print('Linear Regression Results:')
    print('-Training Metrics-')
    compute_metrics(lm,reg_pred_train,y_train)
    print('-Testing Metrics-')
    compute_metrics(lm,reg_pred_test,y_test)

In [10]:
train_linear_model(X_train, X_test, y_train, y_test)

Linear Regression Results:
-Training Metrics-
R Squared: 0.9582869051102821
MAE: 0.7076410290941456
RMS: 0.9866410485182598
MAPE: 5.263060864961952
RMSE: 0.9866410485182598


-Testing Metrics-
R Squared: 0.958331537898327
MAE: 0.7082942393937717
RMS: 0.987189365531264
MAPE: 5.2731995329344965
RMSE: 0.987189365531264




**According to our Base model we are getting a decent accuracy with a R-square of 0.95 for both traing and testing data using features selected by using Sequenrial Forward Selection**

### Checking alpha values for Lasso Regression:

In [17]:
lasso = Lasso()
parameters = {'alpha':[1e-15,1e-10,1e-8,1e-4,1e-3,1e-2,1,5,10,20]}
lasso_regressor = GridSearchCV(lasso,parameters,scoring='neg_mean_squared_error',cv=5)
lasso_regressor.fit(X_train, y_train)
print(lasso_regressor.best_params_)
print(lasso_regressor.best_score_)


Objective did not converge. You might want to increase the number of iterations. Duality gap: 241442.19062427664, tolerance: 3165.13719024015


Objective did not converge. You might want to increase the number of iterations. Duality gap: 354122.92919582396, tolerance: 3164.8035129708805


Objective did not converge. You might want to increase the number of iterations. Duality gap: 316219.56353540474, tolerance: 3165.5834662356597



{'alpha': 1e-08}
-1.0394868620400075


In [20]:
def train_lasso_model(X_train, X_test, y_train, y_test):
    global ls


    ls = Lasso(alpha=1e-08,normalize=True)
    
    ls.fit(X_train,y_train)
    
    ls_pred_train = ls.predict(X_train)
    ls_pred_test = ls.predict(X_test)
    print('Linear Regression Results:')
    print('-Training Metrics-')
    compute_metrics(ls,ls_pred_train,y_train)
    print('-Testing Metrics-')
    compute_metrics(ls,ls_pred_test,y_test)

In [None]:
train_lasso_model(X_train, X_test, y_train, y_test)

### Checking alpha value for Ridge Regression:

In [18]:
ridge = Ridge()
parameters = {'alpha':[1e-15,1e-10,1e-8,1e-4,1e-3,1e-2,1,5,10,20]}
ridge_regressor = GridSearchCV(ridge,parameters,scoring='neg_mean_squared_error',cv=5)
ridge_regressor.fit(X_train, y_train)
print(ridge_regressor.best_params_)
print(ridge_regressor.best_score_)

{'alpha': 1}
-1.039486862027001


In [18]:
def train_ridge_model(X_train, X_test, y_train, y_test):
    global ridgereg


    ridgereg = Ridge(alpha=2,normalize=True,max_iter=1e5 )
    
    ridgereg.fit(X_train,y_train)
    
    rid_pred_train = ridgereg.predict(X_train)
    rid_pred_test = ridgereg.predict(X_test)
    print('Ridge Regression Results:')
    print('-Training Metrics-')
    compute_metrics(ridgereg,rid_pred_train,y_train)
    print('-Testing Metrics-')
    compute_metrics(ridgereg,rid_pred_test,y_test)

In [19]:
train_ridge_model(X_train, X_test, y_train, y_test)

Ridge Regression Results:
-Training Metrics-
R Squared: 0.5810784139454304
MAE: 2.413464876753904
RMS: 3.1267225291223406
MAPE: 21.32695556672998
RMSE: 3.1267225291223406


-Testing Metrics-
R Squared: 0.5812887170355248
MAE: 2.416097748099795
RMS: 3.129349460389965
MAPE: 21.367408291249035
RMSE: 3.129349460389965




### Gradient Boosting Regressor

In [12]:
def train_gbr_model(X_train, X_test, y_train, y_test):
    global gbr

    params = {'n_estimators': 50, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.01, 'loss': 'ls'}
    gbr = GradientBoostingRegressor(**params)
    
    gbr.fit(X_train,y_train)
    
    gbr_pred_train = gbr.predict(X_train)
    gbr_pred_test = gbr.predict(X_test)
    
    print('Gradient Boosting Regressor Results:')
    print('-Training Metrics-')
    compute_metrics(gbr,gbr_pred_train,y_train)
    print('-Testing Metrics-')
    compute_metrics(gbr,gbr_pred_test,y_test)

In [13]:
train_gbr_model(X_train, X_test, y_train, y_test)

Random Forest Regressor Results:
-Training Metrics-
R Squared: 0.606766623604821
MAE: 2.3480857017714354
RMS: 3.029340976966813
MAPE: 21.04333938576813
RMSE: 3.029340976966813


-Testing Metrics-
R Squared: 0.6068544950674457
MAE: 2.3504982523685913
RMS: 3.032308544000489
MAPE: 21.083831434716917
RMSE: 3.032308544000489




In [17]:
def grid_Search_GBR(X_train, X_test, y_train, y_test):
            param_grid_rf = {'n_estimators': [50,70,90], 'max_depth': [5, 10, 50], 'min_samples_split': [2,6,8],
          'learning_rate': [0.01,0.001], 'loss': ['ls']}
            grid_rf = GridSearchCV(GradientBoostingRegressor(), param_grid=param_grid_rf, n_jobs=-1, cv=3, refit=True)
            grid_rf.fit(X_train, y_train) 

In [None]:
grid_Search_GBR(X_train, X_test, y_train, y_test)

In [None]:
params = {'n_estimators': 50, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.01, 'loss': 'ls'}
gbr = GradientBoostingRegressor(**params)
    
gbr.fit(X_train,y_train)
    
gbr_pred_train = gbr.predict(X_train)
gbr_pred_test = gbr.predict(X_test)
    
print('Gradient Boosting Regressor Results After Hyperparamter Tuning:')
print('-Training Metrics-')
compute_metrics(gbr,gbr_pred_train,y_train)
print('-Testing Metrics-')
compute_metrics(gbr,gbr_pred_test,y_test)

### Random Forest Regressor:

In [16]:
def train_rf_model(X_train, X_test, y_train, y_test):
    global rf

    rf = RandomForestRegressor()
    
    rf.fit(X_train,y_train)
    
    rf_pred_train = rf.predict(X_train)
    rf_pred_test = rf.predict(X_test)
    
    print('Random Forest Regressor Results:')
    print('-Training Metrics-')
    compute_metrics(rf,rf_pred_train,y_train)
    print('-Testing Metrics-')
    compute_metrics(rf,rf_pred_test,y_test)

In [61]:
train_rf_model(X_train, X_test, y_train, y_test)


The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.



Random Forest Regressor Results:
-Training Metrics-
R Squared: 0.9985273885319703
MAE: 0.07206066870706884
RMS: 0.18538178623194024
MAPE: 0.4758223210544743
RMSE: 0.18538178623194024


-Testing Metrics-
R Squared: 0.9919923118243674
MAE: 0.18292295020181207
RMS: 0.4327631855233944
MAPE: 1.2107842576197914
RMSE: 0.4327631855233944




### Hyper-parater tuning using Grid Search CV for Random Forest Regressor:

In [62]:
def grid_Search_RF(X_train, X_test, y_train, y_test):
            param_grid_rf = {'n_estimators': [5, 20, 70], 'max_depth': [5, 10, 50], 'oob_score': [True, False]}
            grid_rf = GridSearchCV(RandomForestRegressor(), param_grid=param_grid_rf, n_jobs=-1, cv=3, refit=True)
            grid_rf.fit(X_train, y_train)
            print(grid_rf.best_params_)

In [63]:
print('Grid Search CV started...')
grid_Search_RF(X_train, X_test, y_train, y_test)
print('Done')

Grid Search CV started...
{'max_depth': 50, 'n_estimators': 70, 'oob_score': True}
Done


In [94]:
# Use the forest's predict method on the test data
scaled_rf = RandomForestRegressor(n_estimators=70, max_depth=50, oob_score=True, n_jobs=-1)
scaled_rf.fit(X_train, y_train)
rf_pred_train = scaled_rf.predict(X_train)
rf_pred_test = scaled_rf.predict(X_test)

print('Random Forest Regressor Results After Hyperparamter Tuning:')
print('-Training Metrics-')
compute_metrics(scaled_rf,rf_pred_train,y_train)
print('-Testing Metrics-')
compute_metrics(scaled_rf,rf_pred_test,y_test)

Random Forest Regressor Results After Hyperparamter Tuning:
-Training Metrics-
R Squared: 0.9989821486106442
MAE: 0.06497871110881892
RMS: 0.1541221860004354
MAPE: 0.4288666918910649
RMSE: 0.1541221860004354


-Testing Metrics-
R Squared: 0.992811034611263
MAE: 0.17273787543187788
RMS: 0.41004349704410886
MAPE: 1.142819084368319
RMSE: 0.41004349704410886




**The best model after model selection for predicting the interest rate is Random Forest Regressor**