# Random Forest

### Data Loading

In [1]:
import pandas as pd
import numpy as np

# Load the four CSV inputs in a fixed order: train/test feature matrices,
# the training labels, and the original test table.
trn_imputed_dum_X, tst_imputed_dum_X, trn_origin_Y, tst_origin_X = map(
    pd.read_csv,
    (
        './Dataset/trn_imputed_dum_X.csv',
        './Dataset/tst_imputed_dum_X.csv',
        './Dataset/trn_origin_Y.csv',
        './Dataset/tst_origin_X.csv',
    ),
)

# Quick sanity check: training features and labels should have the same row count.
print(trn_imputed_dum_X.shape, trn_origin_Y.shape, sep='\n')

(79853, 17)
(79853, 1)


### 1) Hyperparameter optimization

In [16]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

# Grid of forest sizes to evaluate: 500 to 1000 trees in steps of 100.
param = {'n_estimators': list(range(500, 1001, 100))}

def clf_RF(trn_X, trn_Y, svc_param, random_state=42, n_jobs=8, cv=10, scoring='roc_auc'):
    """Grid-search a random forest classifier and return the fitted search object.

    Parameters
    ----------
    trn_X : training feature matrix, passed unchanged to ``GridSearchCV.fit``.
    trn_Y : training labels, passed unchanged to ``GridSearchCV.fit``.
    svc_param : parameter grid handed to ``GridSearchCV`` (the name is kept for
        backward compatibility; it configures the random forest, not an SVC).
    random_state : seed forwarded to ``RandomForestClassifier`` so repeated runs
        build the same forests. Pass ``None`` to restore unseeded behaviour.
    n_jobs : number of parallel jobs used by the grid search.
    cv : cross-validation setting forwarded to ``GridSearchCV``.
    scoring : metric name forwarded to ``GridSearchCV``.

    Returns
    -------
    The fitted ``GridSearchCV`` instance (callers read ``best_score_`` and
    ``best_estimator_`` from it).
    """
    # Seed the forest: without it every run yields different trees and scores.
    forest = RandomForestClassifier(random_state=random_state)
    search = GridSearchCV(forest, svc_param, n_jobs=n_jobs, cv=cv, scoring=scoring)
    search.fit(trn_X, trn_Y)
    return search

### 2) Training Model

In [17]:
# Flatten the single-column label frame into a 1-D array before fitting.
trn_labels = trn_origin_Y.values.ravel()
clf_model = clf_RF(trn_imputed_dum_X, trn_labels, param)

In [18]:
# Best cross-validated score found by the grid search (scored with 'roc_auc').
clf_model.best_score_

0.8319960273029442


### 3) Prediction

In [9]:
# Use the best estimator selected by the grid search for both outputs.
best_rf = clf_model.best_estimator_
clf_yhat = best_rf.predict(tst_imputed_dum_X)
clf_prob = best_rf.predict_proba(tst_imputed_dum_X)

# Second probability column is taken as the renewal probability.
# NOTE(review): assumes the renewal class is the second entry of classes_ - confirm.
clf_prob_renewal = clf_prob[:, 1]
display(clf_prob_renewal)

array([1.    , 0.9975, 0.89  , ..., 0.9   , 0.6575, 0.9875])

### 4) Export to output file

In [12]:
# Build the submission table: 'id' and 'premium' from the original test set,
# the predicted renewal probability, and zero-filled placeholder columns.
n_pred = len(clf_prob_renewal)
if len(tst_origin_X) != n_pred:
    # pd.concat(axis=1) aligns on the index and would silently pad the shorter
    # side with NaN, so fail loudly instead of writing a corrupt file.
    raise ValueError(
        f'Row count mismatch: {len(tst_origin_X)} test rows vs {n_pred} predictions'
    )

# Positional index so the column-wise concat below pairs rows one-to-one.
tst_output = tst_origin_X[['id', 'premium']].reset_index(drop=True)

clf_prob_df = pd.DataFrame({'renewal': clf_prob_renewal})
pd_incentives = pd.DataFrame({'incentives': np.zeros(n_pred, dtype=np.float32)})
pd_improve = pd.DataFrame({'improvement': np.zeros(n_pred, dtype=np.float32)})

# Column order in the output file: id, premium, renewal, improvement, incentives.
tst_output = pd.concat([tst_output, clf_prob_df, pd_improve, pd_incentives], axis=1)

tst_output.to_csv('./Dataset/tst_renewal_predicted_RF.csv', index=False)