In [1]:

import os
import shutil
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from math import sqrt
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn import linear_model
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import metrics
from sklearn.ensemble import ExtraTreesRegressor

In [2]:
%matplotlib inline
# Show up to 45 columns and 70 rows when a DataFrame is rendered.
pd.set_option("display.max_columns", 45)
pd.set_option("display.max_rows", 70)

In [26]:
def preprocessing(data):
    """Clean a raw loan frame and integer-encode its categorical columns.

    The frame is modified in place and the same object is returned, so the
    existing ``df = preprocessing(df)`` call pattern keeps working.

    Steps:
      1. Drop every column that is not part of the returned frame.
      2. Replace sentinel codes and fill missing values in the columns that
         remain.
      3. Replace the text labels of the remaining categorical columns with
         integer codes.

    Every step assigns its result back to the column explicitly. The chained
    ``data[col].method(..., inplace=True)`` form only mutates a temporary
    object when pandas Copy-on-Write is active, which would silently leave
    the sentinels and missing values in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the retained columns cleaned below plus every column
        listed in ``unused_columns``.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, reduced to the retained columns (original column
        order preserved).

    Raises
    ------
    KeyError
        If a required column is missing. Because the unused columns are
        removed from the caller's frame, calling this function a second
        time on the same frame raises ``KeyError``.
    """
    # Columns that never reach the returned frame. They are removed before
    # any cleaning, so no imputation or encoding is spent on them.
    unused_columns = [
        'SUPER CONFORMING FLAG',
        'MATURITY DATE',
        'PRODUCT TYPE',
        'LOAN SEQUENCE NUMBER',
        'FIRST PAYMENT DATE',
        'ORIGINAL LOAN-TO-VALUE',
        'FIRST TIME HOMEBUYER FLAG',
        'METROPOLITAN STATISTICAL AREA',
        'NUMBER OF UNITS',
        'ORIGINAL DEBT-TO-INCOME (DTI) RATIO',
        'PREPAYMENT PENALTY MORTGAGE FLAG',
        'NUMBER OF BORROWERS',
        'SELLER NAME',
        'SERVICER NAME',
    ]
    data.drop(unused_columns, axis=1, inplace=True)

    # Missing postal codes get a fixed default.
    # NOTE(review): the reason for choosing 85200.0 is not visible here.
    data['POSTAL CODE'] = data['POSTAL CODE'].fillna(85200.0)

    # 9999 is treated as "no credit score": blank it out, then impute with the
    # mean of the remaining scores (the mean is taken after the sentinel is
    # removed, so 9999 does not inflate it).
    credit_score = data['CREDIT SCORE'].replace(to_replace=9999, value=np.nan)
    data['CREDIT SCORE'] = credit_score.fillna(credit_score.mean())

    # 999 is treated as "not available" in both columns and replaced by a
    # fixed default.
    data['MORTGAGE INSURANCE PERCENTAGE'] = (
        data['MORTGAGE INSURANCE PERCENTAGE'].replace(to_replace=999, value=0)
    )
    data['ORIGINAL COMBINED LOAN-TO-VALU'] = (
        data['ORIGINAL COMBINED LOAN-TO-VALU'].replace(to_replace=999, value=80)
    )

    # NOTE(review): these two replacements match only the *integers* 99 and 9.
    # The code table below also carries a "99" text entry for PROPERTY TYPE,
    # which suggests the sentinel may arrive as a string -- confirm against
    # the raw files before relying on this imputation.
    data['PROPERTY TYPE'] = data['PROPERTY TYPE'].replace(to_replace=99, value='SF')
    data['LOAN PURPOSE'] = data['LOAN PURPOSE'].replace(to_replace=9, value='C')

    # Label -> integer code for each remaining categorical column. Labels not
    # listed here are left unchanged by DataFrame.replace.
    category_codes = {
        "OCCUPANCY STATUS": {"P": 1, "S": 2, "I": 3},
        "CHANNE": {"T": 1, "R": 2, "C": 3, "B": 4},
        "PROPERTY TYPE": {"SF": 1, "PU": 2, "CO": 3, "MH": 4, "CP": 5, "99": 99},
        "LOAN PURPOSE": {"C": 1, "P": 2, "N": 3},
        "PROPERTY STATE": {
            "RI": 0, "OK": 1, "NY": 2, "MO": 3, "MN": 4, "IL": 5, "KY": 6,
            "WA": 7, "TX": 8, "FL": 9, "CA": 10, "IN": 11, "NJ": 12, "ID": 13,
            "TN": 14, "KS": 15, "MI": 16, "IA": 17, "MT": 18, "GA": 19,
            "OH": 20, "OR": 21, "ME": 22, "CT": 23, "WV": 24, "NH": 25,
            "VA": 26, "NC": 27, "AZ": 28, "NE": 29, "MD": 30, "MA": 31,
            "UT": 32, "CO": 33, "ND": 34, "PA": 35, "SC": 36, "DE": 37,
            "SD": 38, "WI": 39, "AL": 40, "AK": 41, "VT": 42, "LA": 43,
            "AR": 44, "NM": 45, "HI": 46, "DC": 47, "MS": 48, "NV": 49,
            "GU": 50, "WY": 51, "PR": 52, "VI": 53,
        },
    }
    # Frame-level (not chained) in-place replace: the nested dict applies each
    # inner mapping only to its own column.
    data.replace(category_codes, inplace=True)
    return data
    

In [164]:

# Read the Q4 2016 file: pipe-separated with no header row. This frame is
# used further down as the training set.
data = pd.read_csv(
    "datapart2/historical_data1_Q42016.txt",
    delimiter="|",
    header=None,
)

# The file carries no header, so the columns are named here in file order.
data.columns = [
    "CREDIT SCORE",
    "FIRST PAYMENT DATE",
    "FIRST TIME HOMEBUYER FLAG",
    "MATURITY DATE",
    "METROPOLITAN STATISTICAL AREA",
    "MORTGAGE INSURANCE PERCENTAGE",
    "NUMBER OF UNITS",
    "OCCUPANCY STATUS",
    "ORIGINAL COMBINED LOAN-TO-VALU",
    "ORIGINAL DEBT-TO-INCOME (DTI) RATIO",
    "ORIGINAL UPB",
    "ORIGINAL LOAN-TO-VALUE",
    "ORIGINAL INTEREST RATE",
    "CHANNE",
    "PREPAYMENT PENALTY MORTGAGE FLAG",
    "PRODUCT TYPE",
    "PROPERTY STATE",
    "PROPERTY TYPE",
    "POSTAL CODE",
    "LOAN SEQUENCE NUMBER",
    "LOAN PURPOSE",
    "ORIGINAL LOAN TERM",
    "NUMBER OF BORROWERS",
    "SELLER NAME",
    "SERVICER NAME",
    "SUPER CONFORMING FLAG",
    # "Pre-HARP LOAN SEQUENCE NUMBER",  (disabled: not assigned to this file)
]

# Preview the first rows.
data.head()

Unnamed: 0,CREDIT SCORE,FIRST PAYMENT DATE,FIRST TIME HOMEBUYER FLAG,MATURITY DATE,METROPOLITAN STATISTICAL AREA,MORTGAGE INSURANCE PERCENTAGE,NUMBER OF UNITS,OCCUPANCY STATUS,ORIGINAL COMBINED LOAN-TO-VALU,ORIGINAL DEBT-TO-INCOME (DTI) RATIO,ORIGINAL UPB,ORIGINAL LOAN-TO-VALUE,ORIGINAL INTEREST RATE,CHANNE,PREPAYMENT PENALTY MORTGAGE FLAG,PRODUCT TYPE,PROPERTY STATE,PROPERTY TYPE,POSTAL CODE,LOAN SEQUENCE NUMBER,LOAN PURPOSE,ORIGINAL LOAN TERM,NUMBER OF BORROWERS,SELLER NAME,SERVICER NAME,SUPER CONFORMING FLAG
0,811,201701,9,203612,,0,1,P,80,34,210000,80,3.5,R,N,FRM,PA,SF,17700,F116Q4000001,C,240,2,Other sellers,Other servicers,
1,790,201701,9,204612,12580.0,0,1,P,69,30,517000,69,3.625,C,N,FRM,MD,SF,21000,F116Q4000002,N,360,2,Other sellers,LAKEVIEWLOANSERVICIN,Y
2,690,201612,9,204611,,25,1,P,90,22,225000,90,3.625,R,N,FRM,IL,SF,62800,F116Q4000003,N,360,1,Other sellers,Other servicers,
3,718,201703,9,204702,,0,1,P,80,30,60000,80,4.375,R,N,FRM,IL,SF,62400,F116Q4000004,N,360,1,Other sellers,Other servicers,
4,776,201612,9,204611,30460.0,0,1,P,80,46,152000,80,3.375,R,N,FRM,KY,SF,40300,F116Q4000005,N,360,1,Other sellers,"PNCBANK,NATL",


In [165]:
# Clean and encode the training frame. preprocessing() mutates its argument
# and drops columns, so re-running this cell without reloading the raw file
# raises KeyError.
data = preprocessing(data)

>


In [166]:
# Read the Q1 2017 file: pipe-separated with no header row. This frame is
# used further down as the held-out test set.
val = pd.read_csv(
    "datapart2/historical_data1_Q12017.txt",
    delimiter="|",
    header=None,
)

# The file carries no header, so the columns are named here in file order.
val.columns = [
    "CREDIT SCORE",
    "FIRST PAYMENT DATE",
    "FIRST TIME HOMEBUYER FLAG",
    "MATURITY DATE",
    "METROPOLITAN STATISTICAL AREA",
    "MORTGAGE INSURANCE PERCENTAGE",
    "NUMBER OF UNITS",
    "OCCUPANCY STATUS",
    "ORIGINAL COMBINED LOAN-TO-VALU",
    "ORIGINAL DEBT-TO-INCOME (DTI) RATIO",
    "ORIGINAL UPB",
    "ORIGINAL LOAN-TO-VALUE",
    "ORIGINAL INTEREST RATE",
    "CHANNE",
    "PREPAYMENT PENALTY MORTGAGE FLAG",
    "PRODUCT TYPE",
    "PROPERTY STATE",
    "PROPERTY TYPE",
    "POSTAL CODE",
    "LOAN SEQUENCE NUMBER",
    "LOAN PURPOSE",
    "ORIGINAL LOAN TERM",
    "NUMBER OF BORROWERS",
    "SELLER NAME",
    "SERVICER NAME",
    "SUPER CONFORMING FLAG",
    # "Pre-HARP LOAN SEQUENCE NUMBER",  (disabled: not assigned to this file)
]

# Preview the first rows.
val.head()

Unnamed: 0,CREDIT SCORE,FIRST PAYMENT DATE,FIRST TIME HOMEBUYER FLAG,MATURITY DATE,METROPOLITAN STATISTICAL AREA,MORTGAGE INSURANCE PERCENTAGE,NUMBER OF UNITS,OCCUPANCY STATUS,ORIGINAL COMBINED LOAN-TO-VALU,ORIGINAL DEBT-TO-INCOME (DTI) RATIO,ORIGINAL UPB,ORIGINAL LOAN-TO-VALUE,ORIGINAL INTEREST RATE,CHANNE,PREPAYMENT PENALTY MORTGAGE FLAG,PRODUCT TYPE,PROPERTY STATE,PROPERTY TYPE,POSTAL CODE,LOAN SEQUENCE NUMBER,LOAN PURPOSE,ORIGINAL LOAN TERM,NUMBER OF BORROWERS,SELLER NAME,SERVICER NAME,SUPER CONFORMING FLAG
0,714,201705,9,204704,43580.0,25,1,P,87,42,125000,87,4.375,R,N,FRM,IA,SF,51000,F117Q1000001,N,360,1,Other sellers,Other servicers,
1,809,201705,9,204704,,0,1,P,75,38,195000,75,4.25,R,N,FRM,PA,SF,17900,F117Q1000002,N,360,1,Other sellers,SPECIALIZEDLOANSERVI,
2,745,201703,9,203202,33340.0,0,1,P,73,38,218000,73,3.125,R,N,FRM,WI,SF,53200,F117Q1000003,N,180,1,Other sellers,"PNCBANK,NATL",
3,717,201704,9,203203,,0,1,P,80,33,45000,80,4.25,R,N,FRM,IL,SF,60900,F117Q1000004,C,180,1,Other sellers,Other servicers,
4,809,201704,9,204703,,0,1,P,80,39,297000,80,4.125,R,N,FRM,PA,SF,17800,F117Q1000005,C,360,1,Other sellers,Other servicers,


In [167]:
# Clean and encode the validation frame. preprocessing() mutates its argument
# and drops columns, so re-running this cell without reloading the raw file
# raises KeyError.
val = preprocessing(val)

>


In [168]:
# Separate the regression target (interest rate) from the training features.
train_y = data['ORIGINAL INTEREST RATE']
train_x = data.drop('ORIGINAL INTEREST RATE', axis=1)

In [169]:
# Separate the regression target (interest rate) from the held-out features.
test_y = val['ORIGINAL INTEREST RATE']
test_x = val.drop('ORIGINAL INTEREST RATE', axis=1)

In [170]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error between two sequences.

    Both inputs are converted to NumPy arrays first, so list-likes and pandas
    objects are compared by position. The result is
    ``mean(|y_true - y_pred| / |y_true|) * 100``; a zero in ``y_true`` makes
    the corresponding term (and therefore the result) infinite or NaN.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = (actual - predicted) / actual
    return np.abs(relative_error).mean() * 100

In [171]:
#gradient boosting
gbm_reg = GradientBoostingRegressor(alpha=0.85, learning_rate=0.1, loss="huber", max_depth=10, max_features=0.7, min_samples_leaf=18, min_samples_split=19, n_estimators=100, subsample=0.4,verbose=2)
gbm_reg.fit(train_x,train_y)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           0.1000           0.0135            5.56m
         2           0.0879           0.0124            5.74m
         3           0.0777           0.0100            5.75m
         4           0.0694           0.0080            5.70m
         5           0.0629           0.0063            5.65m
         6           0.0576           0.0052            5.57m
         7           0.0537           0.0045            5.52m
         8           0.0495           0.0038            5.46m
         9           0.0465           0.0030            5.35m
        10           0.0438           0.0024            5.27m
        11           0.0419           0.0020            5.16m
        12           0.0404           0.0016            5.10m
        13           0.0387           0.0013            5.03m
        14           0.0378           0.0011            4.96m
        15           0.0369           0.0009            4.89m
       

GradientBoostingRegressor(alpha=0.85, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='huber', max_depth=10,
             max_features=0.7, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=18, min_samples_split=19,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             n_iter_no_change=None, presort='auto', random_state=None,
             subsample=0.4, tol=0.0001, validation_fraction=0.1, verbose=2,
             warm_start=False)

In [172]:
# Predict on the held-out set, then report fit quality on train and test.
pred_y = gbm_reg.predict(test_x)

# Model score on the training and held-out data; the held-out "Score" line
# prints the same value as the "R2" line in the recorded output.
print("Score for train: %s" % gbm_reg.score(train_x, train_y))
print("Score: %s" % gbm_reg.score(test_x, test_y))

# Error metrics on the held-out predictions.
print("RMS: %s" % sqrt(metrics.mean_squared_error(test_y, pred_y)))
print("MAPE: %s" % mean_absolute_percentage_error(test_y, pred_y))
print("R2: %s" % metrics.r2_score(test_y, pred_y))
print("MAE: %s" % metrics.mean_absolute_error(test_y, pred_y))

Score for train: 0.7128624496918107
Score: -0.5578520208502975
RMS: 0.5821022665406379
MAPE: 12.362882645628169
R2: -0.5578520208502975
MAE: 0.5300463390198438
