# Frequency Projection

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

from sklearn.metrics import roc_curve, auc
from sklearn.metrics import r2_score,mean_squared_error

from sklearn.datasets import make_classification

import matplotlib.pyplot as plt

import data_cleansing as dc
import data_utils as du

## load the data and apply cleaning steps


In [2]:
data_train_raw, data_test_raw = dc.load_file()

data_test = data_test_raw.copy()
data_train = data_train_raw.copy()


## Setup the dataset 
Set the target

In [3]:
target_cap = 1
data_train['ClaimNb'] = data_train['ClaimNb'].where(data_train['ClaimNb']<target_cap, target_cap)
data_test['ClaimNb'] = data_test['ClaimNb'].where(data_test['ClaimNb']<target_cap, target_cap)

target_name='ClaimNb'


## Apply lable encoding to the factors

In [4]:

label_encode_factors = ['Area',
                        'VehPower',
                        'VehBrand',
                        'VehGas',
                        'Region',
                        'DrivAgeBand',
                        'DensityBand',
                        'VehAgeBand']
# encode training data and save encoders used
data_train_encoded, encoders = du.preprocess_labelencode(data_train, label_encode_factors)
# apply trained encoders to test set
data_test_encoded = du.preprocess_labelencode_apply(encoders, data_test, label_encode_factors)

# Setup the Random Forest model

## Split target from factors

In [5]:
#sets used to train (resampled sets)
x_train = data_train_encoded[label_encode_factors].copy()
x_train_weights = data_train_encoded['Exposure']
y_train = data_train_encoded[target_name].copy()

#sets used to predict (and evaluate prediction)
x_test = data_test_encoded[label_encode_factors].copy()
y_test = data_test_encoded[target_name].copy()

x_train.head()

Unnamed: 0_level_0,Area,VehPower,VehBrand,VehGas,Region,DrivAgeBand,DensityBand,VehAgeBand
RecordID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2,1,3,3,0,14,3,6,0
4,2,5,3,1,17,4,3,0
7,1,3,3,0,2,3,6,0
9,2,0,10,1,17,2,1,4
10,4,2,3,1,7,2,4,10


## Build the model and predict

use exposure as sample weights

In [6]:
%%time
clf_rf = RandomForestClassifier(min_samples_leaf=10, 
                                n_estimators=100,
                                max_depth=10,
                                max_features='auto',
                                criterion='gini',    #mse or mae
                                verbose=True,
                                oob_score=True,
                                n_jobs=3)
#fit values
clf_rf = clf_rf.fit(x_train.values,
                    y_train.values,
                    sample_weight=x_train_weights)    


[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   17.9s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:   42.2s finished


Wall time: 51.9 s


In [7]:
#predict values
y_train_predicted_rf = clf_rf.predict(x_train.values)
y_test_predicted_rf = clf_rf.predict(x_test.values)  

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    2.2s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    5.5s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.9s
[Parallel(n_jobs=3)]: Done 100 out of 100 | elapsed:    2.8s finished


## Merge results back to data set

In [8]:
target_name_predicted = target_name + '_predicted'

x_train_out = x_train.copy()
x_test_out = x_test.copy()

x_train_out[target_name_predicted] = y_train_predicted_rf
x_train_out[target_name] = y_train
x_test_out[target_name_predicted] = y_test_predicted_rf
x_test_out[target_name] = y_test

# Test the results 

In [9]:
# total claim results
train_results_actual= x_train_out[target_name].sum()
train_results_predict = x_train_out[target_name_predicted].sum()

test_results_actual= x_test_out[target_name].sum()
test_results_predict = x_test_out[target_name_predicted].sum()

# gini results
train_results_gini = du.gini(x_train_out[target_name].values, x_train_out[target_name_predicted].values)
test_results_gini = du.gini(x_test_out[target_name].values, x_test_out[target_name_predicted].values)
# mean squared error
train_results_mse = np.sqrt(mean_squared_error(x_train_out[target_name].values, x_train_out[target_name_predicted].values))
test_results_mse = np.sqrt(mean_squared_error(x_test_out[target_name].values, x_test_out[target_name_predicted].values))

print('Total Claims: ', 'Tr Actual {:,.5f}'.format(train_results_actual),
      'Te Actual {:,.5f}'.format(test_results_actual))
print('Total Claims: ', 'Tr Predic {:,.5f}'.format(train_results_predict),
      'Te Predic {:,.5f}'.format(test_results_predict))

print('Results: ',
      'Tr_G {:,.5f}'.format(train_results_gini),
      'Te_G {:,.5f}'.format(test_results_gini),
      'Tr_rmse {:,.5f}'.format(train_results_mse),
      'Te_rmse {:,.5f}'.format(test_results_mse))

print('Factor Importance:\n', pd.DataFrame(zip(clf_rf.feature_importances_, x_train_out.columns)).sort_values(0, ascending=False))

Total Claims:  Tr Actual 23,116.00000 Te Actual 9,662.00000
Total Claims:  Tr Predic 0.00000 Te Predic 0.00000
Results:  Tr_G 0.02032 Te_G 0.01902 Tr_rmse 0.24210 Te_rmse 0.23909
Factor Importance:
           0            1
7  0.231153   VehAgeBand
2  0.162214     VehBrand
4  0.144388       Region
1  0.140592     VehPower
5  0.123493  DrivAgeBand
3  0.070633       VehGas
6  0.065668  DensityBand
0  0.061859         Area


### Output the results

In [11]:
data_train_out = data_train_raw.join(x_train_out[target_name_predicted])
data_test_out = data_test_raw.join(x_test_out[target_name_predicted])

data_train_out.to_pickle('freq_train.pkl')
data_test_out.to_pickle('freq_test.pkl')

print(len(data_test_out), len(data_train_out))

169028 394400


# Review the Results in Charts

## Plot of distribution of results

In [None]:
#%% plot distribution charts
#x_train_to_predict[[target_name, target_name_predicted]].plot(kind='kde', figsize=(20,10))
#x_test[[target_name, target_name_predicted]].plot(kind='kde', figsize=(20,10))


## Plot of mean claim numbers split by factor level

### Plot fit of data

In [None]:
fig = du.plot_factors(x_test_out, target_name, target_name_predicted, x_train_out)
fig.show()