```
Encoding categorical variables

1. Try out different schemes for encoding categorical variables.
2. Cross-validate each scheme to see how well it performs on this dataset.
3. Understand why one scheme works better than another, and whether different schemes can be combined.
4. Build a library of datasets and models, each carefully cross-validated, and check whether combining them improves the overall score.
```

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import os,sys

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cross_validation import KFold, train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.externals import joblib

from scipy.stats.mstats import gmean

import xgboost as xgb
import category_encoders as ce

import warnings
# Silence every warning for the rest of the session (this also hides any
# deprecation notices raised by the libraries imported above).
warnings.filterwarnings('ignore')

# Project root; '~' is expanded to the current user's home directory.
basepath = os.path.expanduser('~/Desktop/src/AllState_Claims_Severity/')
# Put the project's src/ directory on the import path for the local imports below.
sys.path.append(os.path.join(basepath, 'src'))

# Seed NumPy's global random number generator.
np.random.seed(2016)

# Project-local modules. NOTE(review): load_data, get_categorical_features and
# cv_xgboost used later are presumably provided by these star imports -- confirm.
from data import *
from utils import *

In [2]:
# Load the data through the project helper, which returns three objects --
# presumably the train frame, the test frame and the sample submission (TODO confirm).
train, test, sample_sub = load_data()

In [3]:
# Stack train and test row-wise into a single frame (train rows first).
# The original row labels are kept, so index values may repeat across the two parts.
data = pd.concat((train, test))

**Binary Encoding**

In [9]:
# Ask the project helper which of the frame's column labels are categorical
# (presumably returns a list of column names -- TODO confirm).
categorical_columns = get_categorical_features(data.columns)

In [13]:
# Binary-encode the categorical columns. This step has to run: the next cell
# saves `data` under the name 'binary_encoded_data', so with the encoder
# commented out a fresh Restart & Run All would write the raw, un-encoded frame.
encoder = ce.BinaryEncoder(cols=categorical_columns)
encoder.fit(data)

# Replace the categorical columns with their binary-coded representation.
data = encoder.transform(data)

In [14]:
# Save the processed dataframe to disk so it can be reloaded without recomputing.
# joblib.dump returns the list of file paths it wrote, which is what the cell displays.
joblib.dump(data, os.path.join(basepath, 'data/processed/binary_encoded_data'))

['/home/abhishek/Desktop/src/AllState_Claims_Severity/data/processed/binary_encoded_data',
 '/home/abhishek/Desktop/src/AllState_Claims_Severity/data/processed/binary_encoded_data_01.npy',
 '/home/abhishek/Desktop/src/AllState_Claims_Severity/data/processed/binary_encoded_data_02.npy',
 '/home/abhishek/Desktop/src/AllState_Claims_Severity/data/processed/binary_encoded_data_03.npy',
 '/home/abhishek/Desktop/src/AllState_Claims_Severity/data/processed/binary_encoded_data_04.npy',
 '/home/abhishek/Desktop/src/AllState_Claims_Severity/data/processed/binary_encoded_data_05.npy',
 '/home/abhishek/Desktop/src/AllState_Claims_Severity/data/processed/binary_encoded_data_06.npy',
 '/home/abhishek/Desktop/src/AllState_Claims_Severity/data/processed/binary_encoded_data_07.npy']

In [2]:
# load the processed data from disk
binary_data = joblib.load(os.path.join(basepath, 'data/processed/binary_encoded_data'))
# NOTE(review): 'n_train' is never written in the visible part of this notebook;
# presumably it is the number of training rows, saved elsewhere -- confirm.
n_train     = joblib.load(os.path.join(basepath, 'data/processed/n_train'))

In [3]:
# Split the stacked frame back by position: the leading n_train rows form the
# training set, every row after them the test set.
train, test = binary_data.iloc[:n_train], binary_data.iloc[n_train:]

In [4]:
# boolean mask selecting the rows whose loss is below 2e4, i.e. dropping very high losses
mask  = train['loss'].lt(2e4)
train = train.loc[mask]

In [5]:
# Model inputs: every column except 'id' and the target 'loss'.
features = train.columns.drop(['id', 'loss'])

In [6]:
# Hold out 33% of the filtered training rows; the fixed random_state keeps the
# split reproducible. X_test / y_test are a slice of `train`, not the `test` frame.
X_train, X_test, y_train, y_test = train_test_split(train[features], train.loss, test_size=0.33, random_state=12397)

**Grid search XGBoost**

In [12]:
def mae(y, y0):
    """Custom xgboost evaluation metric: mean absolute error after exponentiation.

    Both the predictions and the labels are passed through ``np.exp`` before
    scoring, which undoes the log transform applied to the training targets.

    y  -- array of predictions
    y0 -- object exposing ``get_label()``, from which the labels are read
          (this function is handed to xgboost as ``feval``)

    Returns a ``(name, value)`` tuple; the name is always 'error'.
    """
    labels = y0.get_label()
    predicted_loss = np.exp(y)
    actual_loss = np.exp(labels)

    return 'error', mean_absolute_error(predicted_loss, actual_loss)

In [9]:
def custom_grid_search(X, y, n_rounds=100, subsamples=(0.7, 1.), colsamples=(.6, .8)):
    """Cross-validate xgboost over a small subsample / colsample_bytree grid.

    The target is log-transformed, then every grid combination is scored with
    5-fold ``xgb.cv`` using the custom ``mae`` metric.

    Parameters
    ----------
    X : feature matrix accepted by ``xgb.DMatrix``
    y : array-like of strictly positive targets
    n_rounds : int, maximum number of boosting rounds; the learning rate is
        set to ``2 / n_rounds``
    subsamples : iterable of ``subsample`` values to try
    colsamples : iterable of ``colsample_bytree`` values to try

    Returns
    -------
    list of ``(subsample, colsample, score)`` tuples in grid order
    (``subsamples`` is the outer loop). ``score`` is the one-element Series
    holding the last row of the 'test-error-mean' column returned by ``xgb.cv``.

    Raises
    ------
    ValueError
        If ``n_rounds`` is not positive, or if ``y`` contains a value <= 0,
        whose logarithm would turn into a -inf / NaN training label.
    """
    from itertools import product

    if n_rounds < 1:
        raise ValueError('n_rounds must be a positive integer')
    if np.any(np.asarray(y) <= 0):
        raise ValueError('y must be strictly positive to be log-transformed')

    # log transformation
    y = np.log(y)

    # named `dtrain` so the local does not look like the xgb.DMatrix class
    dtrain      = xgb.DMatrix(data=X, label=y)
    state_space = []

    for subsample, colsample in product(subsamples, colsamples):
        params = {}

        params['subsample']        = subsample
        params['colsample_bytree'] = colsample
        params['max_depth']        = 15            # not sure about this
        params['objective']        = 'reg:linear'
        params['eta']              = 2. / n_rounds  # float literal: never integer division
        params['nthread']          = 4
        params['gamma']            = 1

        xgboostmodelCV   = xgb.cv(params, dtrain=dtrain, num_boost_round=n_rounds,
                                  nfold=5, feval=mae, early_stopping_rounds=20, seed=21387)

        validationScores = pd.DataFrame(xgboostmodelCV)
        # keep the final row of the mean test error as this combination's score
        state_space.append((subsample, colsample, validationScores['test-error-mean'].tail(1)))

    return state_space

In [10]:
# Run the grid search on the training split only; the hold-out split is not touched here.
state_space = custom_grid_search(X_train, y_train)

Will train until cv error hasn't decreased in 20 rounds.
Will train until cv error hasn't decreased in 20 rounds.
Will train until cv error hasn't decreased in 20 rounds.
Will train until cv error hasn't decreased in 20 rounds.


In [11]:
# Display the (subsample, colsample_bytree, CV score) tuple recorded for each grid point.
state_space

[(0.7, 0.6, 99    2125.888135
  Name: test-error-mean, dtype: float64), (0.7, 0.8, 99    2123.726269
  Name: test-error-mean, dtype: float64), (1.0, 0.6, 99    2123.04043
  Name: test-error-mean, dtype: float64), (1.0, 0.8, 99    2121.036133
  Name: test-error-mean, dtype: float64)]

**Cross Validation XGBoost**

In [7]:
# Cross-validate on the training split with the project helper; its result is averaged in the next cell.
# NOTE(review): cv_xgboost is not defined in this notebook (presumably it comes from `utils`);
# the per-fold MAE lines it prints suggest one score per fold -- confirm.
scores = cv_xgboost(X_train, y_train)

Fold: 0


Will train until eval error hasn't decreased in 50 rounds.
[0]	train-error:2978.137207	eval-error:2984.551270
[1]	train-error:2976.633057	eval-error:2983.049072
[2]	train-error:2974.465576	eval-error:2980.884766
[3]	train-error:2971.430664	eval-error:2977.853271
[4]	train-error:2967.288574	eval-error:2973.717529
[5]	train-error:2961.771484	eval-error:2968.212891
[6]	train-error:2954.590332	eval-error:2961.048340
[7]	train-error:2945.444580	eval-error:2951.927979
[8]	train-error:2934.037598	eval-error:2940.550537
[9]	train-error:2920.061768	eval-error:2926.611328
[10]	train-error:2903.245850	eval-error:2909.843018
[11]	train-error:2883.334717	eval-error:2889.994385
[12]	train-error:2860.126953	eval-error:2866.872070
[13]	train-error:2833.520508	eval-error:2840.338379
[14]	train-error:2803.383789	eval-error:2810.301514
[15]	train-error:2769.646729	eval-error:2776.698242
[16]	train-error:2732.390137	eval-error:2739.570312
[17]	train-error:2691.791016	eval-error:2699.119629
[18]	train-erro

best ite: 292
best score: 1124.43103
MAE: 1124.436581282042
Fold: 1


Will train until eval error hasn't decreased in 50 rounds.
[0]	train-error:2973.353027	eval-error:2994.117920
[1]	train-error:2971.849365	eval-error:2992.611328
[2]	train-error:2969.682861	eval-error:2990.439941
[3]	train-error:2966.648926	eval-error:2987.400635
[4]	train-error:2962.509766	eval-error:2983.250244
[5]	train-error:2956.996338	eval-error:2977.721436
[6]	train-error:2949.820801	eval-error:2970.522705
[7]	train-error:2940.680664	eval-error:2961.349854
[8]	train-error:2929.281982	eval-error:2949.901855
[9]	train-error:2915.316650	eval-error:2935.878662
[10]	train-error:2898.517822	eval-error:2918.998779
[11]	train-error:2878.652344	eval-error:2899.049561
[12]	train-error:2855.472656	eval-error:2875.742432
[13]	train-error:2828.893555	eval-error:2849.048340
[14]	train-error:2798.785156	eval-error:2818.778076
[15]	train-error:2765.124756	eval-error:2784.941162
[16]	train-error:2727.979004	eval-error:2747.625488
[17]	train-error:2687.456299	eval-error:2706.891357
[18]	train-erro

best ite: 299
best score: 1122.817871
MAE: 1122.8179988967415
Fold: 2


Will train until eval error hasn't decreased in 50 rounds.
[0]	train-error:2989.333984	eval-error:2962.155029
[1]	train-error:2987.828369	eval-error:2960.650391
[2]	train-error:2985.658691	eval-error:2958.482666
[3]	train-error:2982.619141	eval-error:2955.445068
[4]	train-error:2978.470459	eval-error:2951.300537
[5]	train-error:2972.944824	eval-error:2945.779785
[6]	train-error:2965.749268	eval-error:2938.591553
[7]	train-error:2956.583496	eval-error:2929.434326
[8]	train-error:2945.147461	eval-error:2918.016113
[9]	train-error:2931.137939	eval-error:2904.028564
[10]	train-error:2914.282471	eval-error:2887.200195
[11]	train-error:2894.332520	eval-error:2867.282715
[12]	train-error:2871.061279	eval-error:2844.064697
[13]	train-error:2844.359619	eval-error:2817.420166
[14]	train-error:2814.133545	eval-error:2787.250244
[15]	train-error:2780.308594	eval-error:2753.470703
[16]	train-error:2742.947998	eval-error:2716.192139
[17]	train-error:2702.203369	eval-error:2675.564453
[18]	train-erro

best ite: 297
best score: 1126.38916
MAE: 1126.4281523755892


[298]	train-error:1036.466309	eval-error:1126.456909
[299]	train-error:1036.220947	eval-error:1126.428223


In [8]:
# Average of the scores returned by cv_xgboost.
print('Mean cv score: {}'.format(np.mean(scores)))

Mean cv score: 1124.5609108514575


In [10]:
# number of trees (boosting rounds)
n_rounds = 300

# xgboost configuration; the learning rate is tied to the number of rounds
params = {
    'max_depth':        6,
    'objective':        'reg:linear',
    'eta':              20 / n_rounds,
    'nthread':          4,
    'gamma':            1,
    'min_child_weight': 2,
    'subsample':        1.0,
    'colsample_bytree': 0.8,
}

# the same settings as a list of (name, value) pairs, the form handed to xgb.train below
plst = [(name, value) for name, value in params.items()]

In [13]:
# Targets are log-transformed; the custom metric exponentiates again before scoring.
Dtrain = xgb.DMatrix(X_train, np.log(y_train))
Dval   = xgb.DMatrix(X_test, np.log(y_test))

# define a watch list to observe the change in error for training and holdout data
watchlist  = [ (Dtrain, 'train'),(Dval, 'eval')]

# The watch list must be passed as `evals`: without it xgboost has nothing to
# evaluate, so the custom metric is never called and no error is reported.
model = xgb.train(plst,
                  Dtrain,
                  n_rounds,
                  evals=watchlist,
                  feval=mae,  # custom evaluation function
                 )

In [14]:
# The model was fitted on log(loss), so exponentiate its output before scoring
# against the untransformed hold-out losses.
yhat = np.exp(model.predict(Dval))
print('MAE on unseen set ', mean_absolute_error(y_test, yhat))

MAE on unseen set  1128.30978223


**Full Training**

In [21]:
# Full training matrix: every row of `train` (already restricted to loss < 2e4 by the
# mask cell) with log-transformed targets.
DTRAIN = xgb.DMatrix(train[features], np.log(train.loss))
# Test matrix: features only, no label is passed.
DTEST  = xgb.DMatrix(test[features])

In [22]:
# train on full dataset

# Same parameter list and number of rounds as the hold-out run; `model` is rebound here.
# NOTE(review): no `evals` is passed, so there appears to be nothing for `feval`
# to be evaluated on -- the argument looks inert in this call; confirm.
model = xgb.train(plst, 
                  DTRAIN, 
                  n_rounds,
                  feval=mae  # custom evaluation function
                 )

In [23]:
# Score the test matrix and undo the log transform in a single step.
predictions = np.exp(model.predict(DTEST))

In [24]:
# Read the sample submission, overwrite its 'loss' column with the predictions
# and write the result out as the submission file (without the index column).
# NOTE(review): the array is assigned by position, so this assumes the rows of `test`
# are in the same order as the sample submission -- confirm.
sample_sub = pd.read_csv(os.path.join(basepath, 'data/raw/sample_submission.csv'))
sample_sub['loss'] = predictions
sample_sub.to_csv(os.path.join(basepath, 'submissions/xgboost_remove_outliers.csv'), index=False)

In [25]:
# Quick look at the predicted losses (already back on the original scale).
predictions

array([ 1533.6932373 ,  1929.88842773,  7788.94482422, ...,  2343.55810547,
        1012.81256104,  3036.37963867], dtype=float32)