# Default Values for Insurance Homeowner's Quoting

## Load Libs and Data

In [12]:
import os
import pprint

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.tree import DecisionTreeRegressor


Reading CSV: ./data/home_insurance_final.csv
   RISK_RATED_AREA_B  NCD_GRANTED_YEARS_B  RISK_RATED_AREA_C  \
0               19.0                  7.0                6.0   
1               25.0                  6.0                9.0   
2                0.0                  0.0               12.0   
3                0.0                  0.0               14.0   
4                5.0                  7.0               10.0   
5                0.0                  0.0                8.0   
6                1.0                  7.0                6.0   
7                0.0                  0.0                6.0   
8                0.0                  3.0                0.0   
9                5.0                  7.0                1.0   

   SUM_INSURED_CONTENTS  NCD_GRANTED_YEARS_C  SPEC_SUM_INSURED  \
0               50000.0                  7.0            7500.0   
1               50000.0                  7.0               0.0   
2               50000.0                  7.0        

## Test Train Split

In [35]:
# Target columns the regressor is asked to predict.
y_names = ['RISK_RATED_AREA_B', 'NCD_GRANTED_YEARS_B', 'RISK_RATED_AREA_C', 'NCD_GRANTED_YEARS_C',  'ROOF_CONSTRUCTION',
           'WALL_CONSTRUCTION', 'LISTED', 'OWNERSHIP_TYPE']

# NOTE(review): this list is not referenced anywhere else in this cell, so these
# columns (if present in df) remain part of the feature set below.
# Presumably they are categorical targets meant for a separate classifier -- TODO confirm.
cat_y_names = ['Gen_APPR_ALARM', 'Gen_FLOODING', 'Gen_NEIGH_WATCH', 'Gen_SAFE_INSTALLED', 'Gen_SEC_DISC_REQ',
               'Gen_SUBSIDENCE', 'Gen_APPR_LOCKS']

# Every column that is not a regression target is used as a feature.
x_cols = [col for col in df.columns if col not in y_names]

# Split the data itself, not the lists of column names: passing x_cols / y_names
# to train_test_split would try to split two label lists of different lengths.
# X and y are also read by the fitting cell that follows, so define them here
# instead of relying on leftover kernel state.
X = df[x_cols]
y = df[y_names]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=4)

## Fit the Regressor  - Build the Model

In [36]:
# Report how many target and feature columns are about to be fitted.
print(f"About to fit default values for {len(y.columns)} y-values and {len(X.columns)} X-values...")

# Maximum depth allowed for each tree in the forest.
max_depth = 30

# Multi-output wrapper around a random forest; seeded so reruns are repeatable.
regr_multirf = MultiOutputRegressor(
    RandomForestRegressor(
        n_estimators=100,
        max_depth=max_depth,
        random_state=0,
    )
)
regr_multirf.fit(X_train, y_train)

# Score on the same rows used for fitting (compare with the held-out score later).
print('Training score:', regr_multirf.score(X_train, y_train))

About to fit default values for 8 y-values and 55 X-values...
Training score: 0.855182788114699


### Score Log
11/18/19 (1): 
Training score: 0.7753403511281722

Realized that I was running a regressor on both continuous and categorical y-features. Fixed this in (2).

11/18/19 (2): About to fit default values for 8 y-values and 55 X-values...
Training score: 0.855182788114699

Removed the rounding of the continuous variables.

11/18/19 (3): About to fit default values for 8 y-values and 55 X-values...


## Make Predictions

In [34]:
# Predict all target columns for the held-out rows, then report the model's
# score on those same rows.
y_multirf = regr_multirf.predict(X_test)
print(
    'Prediction score:',
    regr_multirf.score(X_test, y_test),
)

Prediction score: 0.14530676800801465


### Score Log
11/18/19 (1): Prediction score:  0.0412226210299137

Fixed the continuous/categorical target mix-up (see the training Score Log above).

11/18/19 (2): Prediction score: 0.14530676800801465

Not as big an improvement as I expected. Removed the rounding of the values, now that the categorical targets have been removed.

11/18/19 (3): Prediction score: 

In [28]:
# Show the raw prediction array returned by regr_multirf.predict(X_test).
print(y_multirf)

[[ 7.94579487  6.22655628  7.64529675 ... 15.00000448  3.
   7.4       ]
 [12.2284979   4.62287946  9.21240715 ... 15.          3.
   7.73870766]
 [16.87988278  5.63905105  8.78838026 ... 14.99803279  3.
   7.16959752]
 ...
 [ 6.12350481  4.88011796  3.83563332 ... 15.00005658  3.
   4.90265112]
 [10.4897401   6.2864506   8.8657619  ... 14.99740541  3.
   6.92275831]
 [ 7.44857642  6.44304132  6.52583613 ... 14.64        3.
   4.91202387]]


In [29]:
# Save the held-out features and the model's predictions for them as CSV files.

# to_csv does not create missing parent directories, so make sure the output
# folder exists; otherwise this cell fails on a fresh checkout.
os.makedirs('./results', exist_ok=True)

# Held-out feature rows. index=False drops the original row labels, so rows in
# the two files can only be matched up by position.
df_X_test = pd.DataFrame(X_test)
pprint.pprint(df_X_test)
df_X_test.to_csv('./results/df_X_test_cont.csv', index=False)

# NOTE: despite its name, df_y_test holds the *predicted* targets (y_multirf),
# not the true y_test values. The variable and file names are kept unchanged so
# anything that already reads them keeps working.
df_y_test = pd.DataFrame(y_multirf, columns=y_names)
pprint.pprint(df_y_test)
df_y_test.to_csv('./results/df_y_test_cont.csv', index=False)

        SUM_INSURED_CONTENTS  SPEC_SUM_INSURED  SPEC_ITEM_PREM  BEDROOMS  \
92533                50000.0               0.0             0.0       5.0   
111689               50000.0               0.0             0.0       2.0   
124907               50000.0               0.0             0.0       3.0   
130082               50000.0               0.0             0.0       4.0   
92090                50000.0            2500.0            13.0       3.0   
37369                50000.0               0.0             0.0       3.0   
127579               50000.0               0.0             0.0       4.0   
185103                   0.0               0.0             0.0       3.0   
123058               50000.0               0.0             0.0       3.0   
161798               50000.0               0.0             0.0       2.0   
2784                 50000.0               0.0             0.0       3.0   
36672                50000.0               0.0             0.0       1.0   
103798      

       RISK_RATED_AREA_B  NCD_GRANTED_YEARS_B  RISK_RATED_AREA_C  \
0               7.945795             6.226556           7.645297   
1              12.228498             4.622879           9.212407   
2              16.879883             5.639051           8.788380   
3              15.363273             6.422333           9.925146   
4              12.800000             3.328643           9.129417   
5              10.555679             5.923763           6.639124   
6              10.875476             6.235707           5.897500   
7               6.905500             5.950406           5.000000   
8              10.794129             6.340226           9.773694   
9               0.000000             0.000000           7.098947   
10             13.300855             6.446310          11.308190   
11              0.000000             0.000000           6.651740   
12             10.154127             6.097860          11.828101   
13             29.487500             5.771014   