In [26]:
# Data Processing
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Modeling
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn import svm
from random import randint
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

# Evaluation
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import RocCurveDisplay

In [27]:
#Load in LendingClub Data 
interest = pd.read_csv("LC_train_HW2.csv")
#movies = movies.sample(frac=0.1,random_state = 123,ignore_index=True) #sampling from the data for large datasets

interest.head()

Unnamed: 0,all_util,annual_inc,application_type,chargeoff_within_12_mths,collections_12_mths_ex_med,delinq_2yrs,dti,emp_length,fico_range_high,fico_range_low,...,mo_sin_old_rev_tl_op,mort_acc,mths_since_last_record,mths_since_recent_inq,open_acc,pub_rec_bankruptcies,term,total_bal_ex_mort,verification_status,int_rate
0,80.0,36000.0,Individual,0,0,0,26.33,,674,670,...,35,0,,0.0,5,0,36 months,34683,Verified,18.62
1,61.0,45000.0,Individual,0,0,0,38.51,1 year,734,730,...,103,0,,0.0,14,0,60 months,63873,Not Verified,16.08
2,31.0,53040.0,Individual,0,0,0,25.2,< 1 year,809,805,...,145,4,,0.0,9,0,36 months,24452,Verified,7.56
3,87.0,125000.0,Individual,0,0,0,27.87,10+ years,684,680,...,230,4,,19.0,14,0,36 months,141033,Verified,17.3
4,75.0,73000.0,Individual,0,0,0,35.12,10+ years,759,755,...,126,1,,13.0,19,0,36 months,160302,Not Verified,10.81


In [28]:
# no non-relevant variables to remove

In [29]:
#Counting the numbers of missing values for each column
interest.isna().sum()

all_util                         16
annual_inc                        0
application_type                  0
chargeoff_within_12_mths          0
collections_12_mths_ex_med        0
delinq_2yrs                       0
dti                             186
emp_length                     8669
fico_range_high                   0
fico_range_low                    0
home_ownership                    0
inq_last_12m                      0
loan_amnt                         0
mo_sin_old_il_acct             2204
mo_sin_old_rev_tl_op              0
mort_acc                          0
mths_since_last_record        90875
mths_since_recent_inq          9975
open_acc                          0
pub_rec_bankruptcies              0
term                              0
total_bal_ex_mort                 0
verification_status               0
int_rate                          0
dtype: int64

In [30]:
#Replacing missing values with the mean
interest['all_util'] = interest['all_util'].fillna(interest['all_util'].mean())
interest['dti'] = interest['dti'].fillna(interest['dti'].mean())
interest['emp_length'] = interest['emp_length'].fillna("Blank") # categorical variable, using mean would be skewed because there is a max employment year of 10+, i will see if it being left blank is an indicator with dummy variables later on in model.
interest['mo_sin_old_il_acct'] = interest['mo_sin_old_il_acct'].fillna(interest['mo_sin_old_il_acct'].mean())
interest['mths_since_last_record'] = interest['mths_since_last_record'].fillna(interest['mths_since_last_record'].mean())
interest['mths_since_recent_inq'] = interest['mths_since_recent_inq'].fillna(interest['mths_since_recent_inq'].mean())

In [31]:
#Creating dummy variables
#Put the name of all categorical variables of interet in an object
cat_cols = ['application_type','emp_length','home_ownership','term','verification_status']
ohe = OneHotEncoder() #Initialize onehotencoder

#Apply onehotencoder to the categorical columns of the dataframe
ohe_features = ohe.fit_transform(interest[cat_cols]).toarray()

#Get catergory names of the dummies and re-arrange transformed variables as a data frame with those names as column names
ohe_labels = ohe.get_feature_names_out(cat_cols)
ohe_features = pd.DataFrame(ohe_features,columns=ohe_labels)
ohe_features

Unnamed: 0,application_type_Individual,application_type_Joint App,emp_length_1 year,emp_length_10+ years,emp_length_2 years,emp_length_3 years,emp_length_4 years,emp_length_5 years,emp_length_6 years,emp_length_7 years,...,emp_length_9 years,emp_length_< 1 year,emp_length_Blank,home_ownership_MORTGAGE,home_ownership_OWN,home_ownership_RENT,term_ 36 months,term_ 60 months,verification_status_Not Verified,verification_status_Verified
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
99996,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
99997,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
99998,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0


In [32]:
#Create a temporary data frame with all the columns except the categorical one(s)
interest_nocat =interest.loc[:,~interest.columns.isin(cat_cols)]

#Attach the dummy data frame to the temp data frame to create the final dataset
encoded_data = pd.concat([interest_nocat,ohe_features],axis=1)
encoded_data

Unnamed: 0,all_util,annual_inc,chargeoff_within_12_mths,collections_12_mths_ex_med,delinq_2yrs,dti,fico_range_high,fico_range_low,inq_last_12m,loan_amnt,...,emp_length_9 years,emp_length_< 1 year,emp_length_Blank,home_ownership_MORTGAGE,home_ownership_OWN,home_ownership_RENT,term_ 36 months,term_ 60 months,verification_status_Not Verified,verification_status_Verified
0,80.0,36000.0,0,0,0,26.33,674,670,5,5000,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1,61.0,45000.0,0,0,0,38.51,734,730,1,15000,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
2,31.0,53040.0,0,0,0,25.20,809,805,1,4550,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
3,87.0,125000.0,0,0,0,27.87,684,680,0,4000,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4,75.0,73000.0,0,0,0,35.12,759,755,0,15075,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,75.0,38500.0,0,0,0,51.75,664,660,1,18000,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
99996,59.0,165000.0,0,0,3,12.90,714,710,0,15000,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
99997,54.0,49999.0,0,0,0,32.67,664,660,0,23000,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
99998,54.0,62800.0,0,0,0,33.86,759,755,3,19000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0


In [33]:
#Features data: select all columns except 'int_rate'
X = encoded_data.loc[:, encoded_data.columns!='int_rate']

#Response/target data
Y = encoded_data[["int_rate"]]

# Split data into training and test set
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.90,    # hold out 10% of data for testing
                                                    random_state=123) # set for reproducibility

In [34]:
#Scaling data to mean of 0 and standard deviation of 1 (i.e., standardization)
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

#Model 1 SVM

In [35]:
#Initialize and fit the SVM regression algorithm to the training data 
# use C of 0.1, 'rbf' kernel, and 'scale' for gamma
SVMreg = svm.SVR(C=0.1,
                 kernel='rbf',
                 gamma='scale')

#Change the shape of label column to array
SVMreg.fit(X_train_s, np.ravel(Y_train))

In [36]:
# Make predictions using the model and the testing data
Y_pred = SVMreg.predict(X_test_s)
print("RMSE:",mean_squared_error(Y_test, Y_pred, squared=False))
print("MAE:",mean_absolute_error(Y_test, Y_pred))

RMSE: 4.64323908106167
MAE: 3.4675373993059186


In [37]:
# The parameter grid that we plan to use for the grid search.
param_grid = {
    'C': [2.5,3.5,4.5],
    'kernel': ['linear','rbf']}

# Use grid search to find the best hyperparameters
grid_search = GridSearchCV(SVMreg,
                           param_grid,
                           cv=5, n_jobs=-1)

# Fit the grid search object to the data
grid_search.fit(X_train_s, np.ravel(Y_train))

In [38]:
# Obtain and save the values of the best set of hyperparamters
#(full details of cross validation are saved as cv_results_)
best_svm = grid_search.best_estimator_

# Print the best hyperparameters
print('Best hyperparameters:',  grid_search.best_params_)

Best hyperparameters: {'C': 3.5, 'kernel': 'rbf'}


In [39]:
# Generate predictions with the best model
Y_pred_tuned = best_svm.predict(X_test_s) 

print("RMSE:",mean_squared_error(Y_test, Y_pred_tuned, squared=False))
print("MAE:",mean_absolute_error(Y_test, Y_pred_tuned))

RMSE: 4.420681898908094
MAE: 3.2747144563366684


In [40]:
# MOdel 2 Random Forest

In [41]:
#Initialize and fit the algorithm to the training data
RFreg = RandomForestRegressor(random_state=0)
RFreg.fit(X_train, np.ravel(Y_train))

In [42]:
# Make predictions using the model and the testing data
Y_pred_rf = RFreg.predict(X_test_s)
print("RMSE:",mean_squared_error(Y_test, Y_pred_rf, squared=False))
print("MAE:",mean_absolute_error(Y_test, Y_pred_rf))



RMSE: 6.01249559595125
MAE: 5.2822080736842025


In [49]:
# Using a randomized grid search to find the best set of parameters
# Setting ranges for parameters of interest
# max_features: number of variables to consider when looking for the best split (default=sqrt(p)) (equivalent of mtry in R)
# n_estimators: number of trees in the forest (default=100) (equivalent of ntree in R)
param_dist = {'max_features': range(1,9),
              'n_estimators': range(50,1000)}

# Use random search to find the best hyperparameters
# n_iter: number of parameter settings that are randomly sampled
# cv: number of folds for cross validation (if none, the default value of 5 will be used)
rand_search = RandomizedSearchCV(RFreg, 
                                 param_distributions = param_dist,
                                 n_iter=10,
                                 cv=5,
                                 random_state=0,
                                 verbose=False)

# Fit the random search object to the data
rand_search.fit(X_train, np.ravel(Y_train))

In [50]:
# Obtain and save the values of the best set of hyperparameters
#(full details of cross validation are saved as cv_results_)
best_rf = rand_search.best_estimator_

# Print the best hyperparameters
print('Best hyperparameters:',  rand_search.best_params_)

Best hyperparameters: {'n_estimators': 927, 'max_features': 8}


In [51]:
# Generate predictions with the best model
Y_pred_tuned_rf = best_rf.predict(X_test_s) 

print("RMSE:",mean_squared_error(Y_test, Y_pred_tuned_rf, squared=False))
print("MAE:",mean_absolute_error(Y_test, Y_pred_tuned_rf))



RMSE: 5.585260633190541
MAE: 4.855303269062593


In [52]:
#Testing

In [83]:
#Load Test Data
test_interest_new = pd.read_csv("LC_test_HW2.csv")

In [84]:
#Counting the numbers of missing values for each column
test_interest.isna().sum()

ID                            0
all_util                      0
annual_inc                    0
application_type              0
chargeoff_within_12_mths      0
collections_12_mths_ex_med    0
delinq_2yrs                   0
dti                           0
emp_length                    0
fico_range_high               0
fico_range_low                0
home_ownership                0
inq_last_12m                  0
loan_amnt                     0
mo_sin_old_il_acct            0
mo_sin_old_rev_tl_op          0
mort_acc                      0
mths_since_last_record        0
mths_since_recent_inq         0
open_acc                      0
pub_rec_bankruptcies          0
term                          0
total_bal_ex_mort             0
verification_status           0
dtype: int64

In [85]:
#Replacing missing values with the mean
#Replacing missing values with the mean
test_interest['all_util'] = test_interest['all_util'].fillna(test_interest['all_util'].mean())
test_interest['dti'] = test_interest['dti'].fillna(test_interest['dti'].mean())
test_interest['emp_length'] = test_interest['emp_length'].fillna("Blank") # categorical variable, using mean would be skewed because there is a max employment year of 10+, i will see if it being left blank is an indicator with dummy variables later on in model.
test_interest['mo_sin_old_il_acct'] = test_interest['mo_sin_old_il_acct'].fillna(test_interest['mo_sin_old_il_acct'].mean())
test_interest['mths_since_last_record'] = test_interest['mths_since_last_record'].fillna(test_interest['mths_since_last_record'].mean())
test_interest['mths_since_recent_inq'] = test_interest['mths_since_recent_inq'].fillna(test_interest['mths_since_recent_inq'].mean())

In [86]:
#Creating dummy variables
#Put the name of all categorical variables of interet in an object
cat_cols_test = ['application_type','emp_length','home_ownership','term','verification_status']
ohe_test = OneHotEncoder() #Initialize onehotencoder

#Apply onehotencoder to the categorical columns of the dataframe
ohe_features_test = ohe_test.fit_transform(test_interest[cat_cols_test]).toarray()

#Get catergory names of the dummies and re-arrange transformed variables as a data frame with those names as column names
ohe_labels_test = ohe_test.get_feature_names_out(cat_cols_test)
ohe_features_test = pd.DataFrame(ohe_features,columns=ohe_labels_test)
ohe_features_test

Unnamed: 0,application_type_Individual,application_type_Joint App,emp_length_1 year,emp_length_10+ years,emp_length_2 years,emp_length_3 years,emp_length_4 years,emp_length_5 years,emp_length_6 years,emp_length_7 years,...,emp_length_9 years,emp_length_< 1 year,emp_length_Blank,home_ownership_MORTGAGE,home_ownership_OWN,home_ownership_RENT,term_ 36 months,term_ 60 months,verification_status_Not Verified,verification_status_Verified
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
99996,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
99997,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
99998,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0


In [87]:
#Create a temporary data frame with all the columns except the categorical one(s)
#interest_nocat_test =test_interest.loc[:,~test_interest.columns.isin(cat_cols_test,'ID')]
interest_nocat_test = test_interest.loc[:, ~test_interest.columns.isin(cat_cols_test + ['ID'])]

#Attach the dummy data frame to the temp data frame to create the final dataset
encoded_data_test = pd.concat([interest_nocat_test,ohe_features_test],axis=1)
encoded_data_test


Unnamed: 0,all_util,annual_inc,chargeoff_within_12_mths,collections_12_mths_ex_med,delinq_2yrs,dti,fico_range_high,fico_range_low,inq_last_12m,loan_amnt,...,emp_length_9 years,emp_length_< 1 year,emp_length_Blank,home_ownership_MORTGAGE,home_ownership_OWN,home_ownership_RENT,term_ 36 months,term_ 60 months,verification_status_Not Verified,verification_status_Verified
0,75.0,135000.0,0.0,0.0,0.0,19.32,704.0,700.0,5.0,15000.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1,32.0,90000.0,0.0,0.0,0.0,15.29,714.0,710.0,2.0,20000.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
2,34.0,90000.0,0.0,0.0,0.0,37.96,779.0,775.0,0.0,30000.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
3,87.0,90000.0,0.0,0.0,0.0,24.85,699.0,695.0,7.0,25000.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4,59.0,78000.0,0.0,0.0,1.0,16.86,699.0,695.0,4.0,1200.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
99996,,,,,,,,,,,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
99997,,,,,,,,,,,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
99998,,,,,,,,,,,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0


In [88]:
#Features data: select all columns except 'int_rate'
X = encoded_data.loc[:, encoded_data.columns!='int_rate']

#Response/target data
Y = encoded_data[["int_rate"]]

# Split data into training and test set
X_train_data, X_test_data, Y_train_data, Y_test_data = train_test_split(X,
                                                    Y,
                                                    test_size=0.50,    # test on 50% of the data
                                                    random_state=123) # set for reproducibility

In [90]:
#Scaling data to mean of 0 and standard deviation of 1 (i.e., standardization)
scaler_test = StandardScaler().fit(X_train_data)
X_train_s_test = scaler.transform(X_train_data)


In [91]:
#Initialize and fit the SVM regression algorithm to the training data 
# use C of 0.1, 'rbf' kernel, and 'scale' for gamma
SVMreg_test = svm.SVR(C=3.5,
                 kernel='rbf',
                 gamma='scale')

#Change the shape of label column to array
SVMreg_test.fit(X_train_s_test, np.ravel(Y_train_data))

In [92]:
# Make predictions using the model and the testing data
Y_pred_test_data = SVMreg_test.predict(X_train_s_test)
print("RMSE:",mean_squared_error(Y_test_data, Y_pred_test_data, squared=False))
print("MAE:",mean_absolute_error(Y_test_data, Y_pred_test_data))

RMSE: 6.238640996331494
MAE: 4.936056316026701


In [93]:
#Save predicted values as a .csv file
pd.DataFrame(Y_pred_test_data).to_csv("Predictions.csv")

#Nader, these are assumptions i made. I made train 5% of data as it took too long. We can up it now that it works. Using 75% to train, i let grid tuning run for 4.5 hours with no results. we can increase to 20% and see how long it takes?

#Andrew. needs to figureout what to do with the Test data prof provided... look at code you did with Aneesh...