In [26]:
from io import StringIO
import requests
import json
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import *
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

In [19]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
df1 = pd.read_csv('HistoricalInputFiles/historical_data1_Q12007.txt', low_memory=False,sep="|", header = None)
df2 = pd.read_csv('HistoricalInputFiles/historical_data1_Q22007.txt', low_memory=False,sep="|", header = None)


def preprocess_selection(data_1):

    data_1.columns = ['fico','dt_first_pi','flag_fthb','dt_matr','cd_msa',"mi_pct",'cnt_units',
                      'occpy_sts','cltv','dti','orig_upb','ltv','int_rt','channel','ppmt_pnlty',
                      'prod_type','st', 'prop_type','zipcode','id_loan','loan_purpose', 
                      'orig_loan_term','cnt_borr','seller_name','servicer_name', 'flag_sc']

    data_1.fico = data_1.fico.replace(r'\s+', np.nan, regex=True).astype('float64')
    data_1.update(data_1[['fico']].fillna(data_1['fico'].mode()[0]))


    for i, row in data_1.iterrows():
        data_1.set_value(i,'dt_first_pi',((int(str(row['dt_first_pi'])[:4]) - 1990)*12*30 + (int(str(row['dt_first_pi'])[4:6])*30)))

    data_1.flag_fthb = data_1.flag_fthb.replace(r'\s+', np.nan, regex=True)
    data_1['flag_fthb'] = (data_1[['flag_fthb']].fillna(3))
    data_1.flag_fthb.replace('N', 0 , inplace = True)
    data_1.flag_fthb.replace('Y', 0 , inplace = True)
    data_1.flag_fthb = data_1.flag_fthb.astype('category')

    for i, row in data_1.iterrows():
        data_1.set_value(i,'dt_matr',((int(str(row['dt_matr'])[:4]) - 1990)*12*30 + (int(str(row['dt_matr'])[4:6])*30)))

    data_1.mi_pct = data_1.mi_pct.replace(r'\s+', np.nan, regex=True)
    data_1.mi_pct = data_1.mi_pct.astype('float64')
    mean = data_1.mi_pct.mean()
    data_1['mi_pct'] = (data_1[['mi_pct']].fillna(mean))

    data_1.cnt_units = data_1.cnt_units.replace(r'\s+', np.nan, regex=True)
    data_1.update(data_1[['cnt_units']].fillna(0))

    data_1.occpy_sts = data_1.occpy_sts.replace(r'\s+', np.nan, regex=True)
    data_1['occpy_sts'] = (data_1[['occpy_sts']].fillna(0))
    data_1.occpy_sts.replace('O', 1 , inplace = True)
    data_1.occpy_sts.replace('I', 2 , inplace = True)
    data_1.occpy_sts.replace('S', 3 , inplace = True)
    data_1.occpy_sts = data_1.occpy_sts.astype('category')

    data_1.cltv = data_1.cltv.replace(r'\s+', np.nan, regex=True).astype('float64')
    data_1.update(data_1[['cltv']].fillna(data_1['cltv'].mean()))

    data_1.dti = data_1.dti.replace(r'\s+', np.nan, regex=True).astype('float64')
    data_1.update(data_1[['dti']].fillna(65))

    data_1.ltv = data_1.ltv.replace(r'\s+', np.nan, regex=True)
    data_1.ltv = data_1.ltv.astype('float64')
    mean = data_1.ltv.mean()
    data_1['ltv'] = (data_1[['ltv']].fillna(mean))
    
    data_1.int_rt = data_1.int_rt.replace(r'\s+', np.nan, regex=True)
    data_1.int_rt = data_1.int_rt.astype('float64')
    mean_int_rt = data_1.int_rt.mean()
    data_1['int_rt'] = data_1[['int_rt']].fillna(mean_int_rt)
    data_1.int_rt = data_1.int_rt.astype('float64')

    data_1.channel = data_1.channel.replace(r'\s+', np.nan, regex=True)
    data_1['channel'] = (data_1[['channel']].fillna(0))
    data_1.channel.replace('R', 1 , inplace = True)
    data_1.channel.replace('B', 2 , inplace = True)
    data_1.channel.replace('C', 3 , inplace = True)
    data_1.channel.replace('T', 4 , inplace = True)
    data_1.channel = data_1.channel.astype('category')

    data_1.ppmt_pnlty = data_1.ppmt_pnlty.replace(r'\s+', np.nan, regex=True)
    data_1['ppmt_pnlty'] = (data_1[['ppmt_pnlty']].fillna(2))
    data_1.ppmt_pnlty.replace('Y', 1 , inplace = True)
    data_1.ppmt_pnlty.replace('N', 0 , inplace = True)
    data_1.ppmt_pnlty = data_1.ppmt_pnlty.astype('category')

    data_1.prop_type = data_1.prop_type.replace(r'\s+', np.nan, regex=True)
    data_1['prop_type'] = (data_1[['prop_type']].fillna(0))
    data_1.prop_type.replace('P', 1 , inplace = True)
    data_1.prop_type.replace('SF', 1 , inplace = True)
    data_1.prop_type.replace('CO', 2 , inplace = True)
    data_1.prop_type.replace('PU', 3 , inplace = True)
    data_1.prop_type.replace('MH', 4 , inplace = True)
    data_1.prop_type.replace('LH', 5 , inplace = True)
    data_1.prop_type.replace('CP', 6 , inplace = True)


    data_1.zipcode = data_1.zipcode.replace(r'\s+', np.nan, regex=True)
    data_1['zipcode'] = (data_1[['zipcode']].fillna(0))
    data_1.zipcode.replace(np.nan, 0 , inplace = True)

    data_1.loan_purpose = data_1.loan_purpose.replace(r'\s+', np.nan, regex=True)
    data_1['loan_purpose'] = (data_1[['loan_purpose']].fillna(0))
    data_1.loan_purpose.replace('P', 1 , inplace = True)
    data_1.loan_purpose.replace('C', 2 , inplace = True)
    data_1.loan_purpose.replace('N', 3 , inplace = True)

    data_1.cnt_borr = data_1.cnt_borr.replace(r'\s+', np.nan, regex=True)
    data_1['cnt_borr'] = (data_1[['cnt_borr']].fillna(0))
    data_1.cnt_borr = data_1.cnt_borr.astype('category')
    
    data_1.drop(data_1.columns[[4,7,8,9,14,15,16,17,18,19,22,23,24,25]], axis=1, inplace=True)
    
    data_1.to_csv()

preprocess_selection(df1)
preprocess_selection(df2)



df1.shape
df2.shape



(345374, 12)

In [21]:
global sfs_fwd
rf = LinearRegression(n_estimators=15)
sfs_fwd = SFS(rf, k_features = 25, forward=True, scoring='neg_mean_squared_error', n_jobs=-1)
#Feature Selection on Current Quarter Data
sfs_fwd = sfs_fwd.fit(df1.drop('int_rt',axis=1),df1['int_rt'])
    
print('----Selected Features from Forward Search----')
print(sfs_fwd.k_feature_names_)
    
X_train_sfs = sfs_fwd.transform(df1.drop('int_rt',axis=1))
X_test_sfs = sfs_fwd.transform(df2.drop('int_rt',axis=1))
    
y_train_sfs = df1['int_rt']
y_test_sfs = df2['int_rt']
    
rf.fit(X_train_sfs,y_train_sfs)
    
fwd_pred_train = rf.predict(X_train_sfs)
fwd_pred_test = rf.predict(X_test_sfs)
    
print('-Training Metrics-')
RSq = r2_score(y_train_sfs,fwd_pred_train)
print('R Squared: ' + str(RSq))
MAE = mean_absolute_error(y_train_sfs,fwd_pred_train)
print('MAE: ' + str(MAE))
RMS = np.sqrt(mean_squared_error(y_train_sfs,fwd_pred_train))
print('RMS: ' + str(RMS))
MAPE = np.mean(np.abs((y_train_sfs - fwd_pred_train) / y_train_sfs)) * 100
print('MAPE: ' + str(MAPE))
print('-Testing Metrics-')
RSq = r2_score(y_train_sfs,fwd_pred_train)
print('R Squared: ' + str(RSq))
MAE = mean_absolute_error(y_train_sfs,fwd_pred_train)
print('MAE: ' + str(MAE))
RMS = np.sqrt(mean_squared_error(y_train_sfs,fwd_pred_train))
print('RMS: ' + str(RMS))
MAPE = np.mean(np.abs((y_train_sfs - fwd_pred_train) / y_train_sfs)) * 100
print('MAPE: ' + str(MAPE))

----Selected Features from FWD Search----
('fico', 'dt_first_pi', 'flag_fthb', 'dt_matr', 'mi_pct', 'cnt_units', 'orig_upb', 'ltv', 'channel', 'loan_purpose', 'orig_loan_term')
-Training Metrics-
R Squared: 0.8456364123444997
MAE: 0.1023524021511848
RMS: 0.14170788122554664
MAPE: 1.648424219686985
-Testing Metrics-
R Squared: 0.8456364123444997
MAE: 0.1023524021511848
RMS: 0.14170788122554664
MAPE: 1.648424219686985


In [25]:
global sfs_bwd
rf = LinearRegression()
sfs_bwd = SFS(rf, k_features = 30, forward=False, scoring='neg_mean_squared_error', n_jobs=-1)
#Feature Selection on Current Quarter Data
sfs_bwd = sfs_bwd.fit(df1.drop('int_rt',axis=1).values,df1['int_rt'].values)
    
print('----Selected Features from Backward Search----')
print(sfs_bwd.k_feature_names_)
    
X_train_sfs = sfs_bwd.transform(df1.drop('int_rt',axis=1))
X_test_sfs = sfs_bwd.transform(df2.drop('int_rt',axis=1))
    
y_train_sfs = df1['int_rt']
y_test_sfs = df2['int_rt']
    
rf.fit(X_train_sfs,y_train_sfs)
    
bwd_pred_train = rf.predict(X_train_sfs)
bwd_pred_test = rf.predict(X_test_sfs)
    
print('-Training Metrics-')
RSq = r2_score(y_train_sfs,bwd_pred_train)
print('R Squared: ' + str(RSq))
MAE = mean_absolute_error(y_train_sfs,bwd_pred_train)
print('MAE: ' + str(MAE))
RMS = np.sqrt(mean_squared_error(y_train_sfs,bwd_pred_train))
print('RMS: ' + str(RMS))
MAPE = np.mean(np.abs((y_train_sfs - bwd_pred_train) / y_train_sfs)) * 100
print('MAPE: ' + str(MAPE))
print('-Testing Metrics-')
RSq = r2_score(y_train_sfs,bwd_pred_train)
print('R Squared: ' + str(RSq))
MAE = mean_absolute_error(y_train_sfs,bwd_pred_train)
print('MAE: ' + str(MAE))
RMS = np.sqrt(mean_squared_error(y_train_sfs,bwd_pred_train))
print('RMS: ' + str(RMS))
MAPE = np.mean(np.abs((y_train_sfs - bwd_pred_train) / y_train_sfs)) * 100
print('MAPE: ' + str(MAPE))

----Selected Features from FWD Search----
('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10')
-Training Metrics-
R Squared: 0.14677608406841725
MAE: 0.2495571461400936
RMS: 0.3331601760564025
MAPE: 4.0141743310397935
-Testing Metrics-
R Squared: 0.14677608406841725
MAE: 0.2495571461400936
RMS: 0.3331601760564025
MAPE: 4.0141743310397935


In [28]:
global efs_exh
rf = LinearRegression()
efs_exh = EFS(rf, min_features=10, max_features=11, scoring='neg_mean_squared_error', n_jobs=-1)
#Feature Selection on Current Quarter Data
efs_exh = efs_exh.fit(df1.drop('int_rt',axis=1).values,df1['int_rt'].values)
    
print('----Selected Features from Exhaustive Search----')
# print(efs_exh.k_feature_names_)
    
X_train_efs = efs_exh.transform(df1.drop('int_rt',axis=1))
X_test_efs = efs_exh.transform(df2.drop('int_rt',axis=1))
    
y_train_efs = df1['int_rt']
y_test_efs = df2['int_rt']
    
rf.fit(X_train_efs,y_train_efs)
    
exh_pred_train = rf.predict(X_train_efs)
exh_pred_test = rf.predict(X_test_efs)
    
print('-Training Metrics-')
RSq = r2_score(y_train_efs,exh_pred_train)
print('R Squared: ' + str(RSq))
MAE = mean_absolute_error(y_train_efs,exh_pred_train)
print('MAE: ' + str(MAE))
RMS = np.sqrt(mean_squared_error(y_train_efs,exh_pred_train))
print('RMS: ' + str(RMS))
MAPE = np.mean(np.abs((y_train_efs - exh_pred_train) / y_train_efs)) * 100
print('MAPE: ' + str(MAPE))
print('-Testing Metrics-')
RSq = r2_score(y_train_efs,exh_pred_train)
print('R Squared: ' + str(RSq))
MAE = mean_absolute_error(y_train_efs,exh_pred_train)
print('MAE: ' + str(MAE))
RMS = np.sqrt(mean_squared_error(y_train_efs,exh_pred_train))
print('RMS: ' + str(RMS))
MAPE = np.mean(np.abs((y_train_efs - exh_pred_train) / y_train_efs)) * 100
print('MAPE: ' + str(MAPE))

Features: 12/12

----Selected Features from Exhaustive Search----
-Training Metrics-
R Squared: 0.8300393154859349
MAE: 0.10549713961889888
RMS: 0.14869480940205662
MAPE: 1.6987067994948768
-Testing Metrics-
R Squared: 0.8300393154859349
MAE: 0.10549713961889888
RMS: 0.14869480940205662
MAPE: 1.6987067994948768
