```
1. Try out different ways of encoding a categorical variable.
2. How can we use the target variable to our advantage?
```

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import os,sys

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cross_validation import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.externals import joblib

from scipy.stats.mstats import gmean

import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

# project root under the user's home directory; its `src` folder is appended to the
# import path (presumably so the `data` and `utils` modules imported below resolve)
basepath = os.path.expanduser('~/Desktop/src/AllState_Claims_Severity/')
sys.path.append(os.path.join(basepath, 'src'))

# seed NumPy's global random number generator
np.random.seed(2016)

from data import *
from utils import *

In [2]:
# load the dataset: load_data() is a star-imported project helper that hands back
# the training frame, the test frame and the sample-submission frame, in that order
train, test, sample_sub = load_data()

In [3]:
# concat train and test dataframes (train rows first) so the feature engineering
# below is applied to both in one pass
data = pd.concat((train, test))

**Use the target variable to calculate the mean loss per level of each categorical variable, and use those means as new features.**

In [4]:
# split the column names into a categorical and a numerical group
# (both helpers are star-imported; their selection rules are not visible in this notebook)
categorical_features = get_categorical_features(data.columns)
numerical_features   = get_numerical_features(data.columns)

In [5]:
def mean_by_target(data, categorical_features):
    """Add a mean-of-target column for every categorical feature.

    For each ``col`` in ``categorical_features`` a new column named
    ``col + '_mean_by_target'`` is written into ``data``. Every row receives
    the mean of ``loss`` taken over all rows that share its level of ``col``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``loss`` column and every column listed in
        ``categorical_features``. The frame is modified in place.
    categorical_features : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, now carrying the extra columns.

    Notes
    -----
    * Rows whose ``loss`` is NaN do not contribute to the mean but still
      receive the value of their level; a level without any non-NaN ``loss``
      is encoded as NaN.
    * A row's own ``loss`` is part of the mean it is encoded with, so on the
      rows used to compute the means the new columns leak the target.
    """
    for col in categorical_features:
        # the built-in 'mean' aggregation avoids calling a Python lambda once per group
        data[col + '_mean_by_target'] = data.groupby(col)['loss'].transform('mean')
    return data

# add one `<col>_mean_by_target` column per categorical feature
data = mean_by_target(data, categorical_features)

In [6]:
# label encoding of the categorical columns through the star-imported
# label_encoding helper (presumably one integer code per level — TODO confirm)
data = label_encoding(data, categorical_features)

In [3]:
# save the processed data to disk, together with the number of training rows,
# so the combined frame can be split back into train / test after reloading
joblib.dump(len(train), os.path.join(basepath, 'data/processed/n_train'))
joblib.dump(data, os.path.join(basepath, 'data/processed/processed_data.pkl'))

['/home/abhishek/Desktop/src/AllState_Claims_Severity/data/processed/n_train']

In [2]:
# load data from disk: the processed train+test frame and the training-row count
data    = joblib.load(os.path.join(basepath, 'data/processed/processed_data.pkl'))
n_train = joblib.load(os.path.join(basepath, 'data/processed/n_train')) 

In [16]:
# feature columns: everything from column position 116 onwards, minus the row id
# and the target
# NOTE(review): 116 is a hard-coded offset — presumably it skips the label-encoded
# categorical columns; confirm against the actual column order of `data`
features = data.columns[116:].drop(['id', 'loss'])

# the first n_train rows are the training rows, the remainder the test rows
train_   = data[:n_train][features]
test_    = data[n_train:][features]

y        = np.log(data[:n_train].loss) # take it into log domain

In [17]:
# hold out 33% of the training rows for validation; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(train_, y, test_size=0.33, random_state=1239137)

In [18]:
# (rows, columns) of the training split and of the hold-out split, one per line
print(X_train.shape, X_test.shape, sep='\n')

(126173, 130)
(62145, 130)


In [19]:
# cross-validate xgboost on the training split; np.exp undoes the log applied to the
# target, so the helper receives the loss on its original scale
# NOTE(review): cv_xgboost is a star-imported helper — whether it transforms the
# target again internally is not visible here
scores = cv_xgboost(X_train, np.exp(y_train))

Fold: 0


Will train until eval error hasn't decreased in 50 rounds.
[0]	train-error:3040.405762	eval-error:3047.234131
[1]	train-error:3039.929199	eval-error:3046.757568
[2]	train-error:3039.360596	eval-error:3046.189209
[3]	train-error:3038.687012	eval-error:3045.515137
[4]	train-error:3037.892822	eval-error:3044.720947
[5]	train-error:3036.961426	eval-error:3043.789795
[6]	train-error:3035.876953	eval-error:3042.704834
[7]	train-error:3034.620605	eval-error:3041.448486
[8]	train-error:3033.170166	eval-error:3039.998047
[9]	train-error:3031.509033	eval-error:3038.336670
[10]	train-error:3029.609131	eval-error:3036.436523
[11]	train-error:3027.452148	eval-error:3034.280273
[12]	train-error:3025.010986	eval-error:3031.838623
[13]	train-error:3022.256836	eval-error:3029.083008
[14]	train-error:3019.177734	eval-error:3026.003174
[15]	train-error:3015.738281	eval-error:3022.562256
[16]	train-error:3011.912109	eval-error:3018.736328
[17]	train-error:3007.670898	eval-error:3014.493408
[18]	train-erro

best ite: 599
best score: 1153.340576
MAE: 1153.3406374835456
Fold: 1


Will train until eval error hasn't decreased in 50 rounds.
[0]	train-error:3045.769287	eval-error:3036.507568
[1]	train-error:3045.292725	eval-error:3036.031006
[2]	train-error:3044.724121	eval-error:3035.462402
[3]	train-error:3044.050537	eval-error:3034.788330
[4]	train-error:3043.256348	eval-error:3033.993896
[5]	train-error:3042.324951	eval-error:3033.061523
[6]	train-error:3041.239746	eval-error:3031.976807
[7]	train-error:3039.983154	eval-error:3030.719238
[8]	train-error:3038.532227	eval-error:3029.266602
[9]	train-error:3036.869141	eval-error:3027.603027
[10]	train-error:3034.968506	eval-error:3025.700195
[11]	train-error:3032.810547	eval-error:3023.539062
[12]	train-error:3030.367676	eval-error:3021.094482
[13]	train-error:3027.612305	eval-error:3018.336914
[14]	train-error:3024.528564	eval-error:3015.250732
[15]	train-error:3021.088867	eval-error:3011.810303
[16]	train-error:3017.263428	eval-error:3007.982178
[17]	train-error:3013.020996	eval-error:3003.735352
[18]	train-erro

best ite: 598
best score: 1163.233643
MAE: 1163.2363080434498
Fold: 2


Will train until eval error hasn't decreased in 50 rounds.
[0]	train-error:3041.870850	eval-error:3044.303955
[1]	train-error:3041.394531	eval-error:3043.827637
[2]	train-error:3040.826660	eval-error:3043.260254
[3]	train-error:3040.153320	eval-error:3042.586914
[4]	train-error:3039.358887	eval-error:3041.793457
[5]	train-error:3038.428223	eval-error:3040.863525
[6]	train-error:3037.343262	eval-error:3039.779541
[7]	train-error:3036.084717	eval-error:3038.522461
[8]	train-error:3034.635254	eval-error:3037.074219
[9]	train-error:3032.973145	eval-error:3035.413574
[10]	train-error:3031.070312	eval-error:3033.512207
[11]	train-error:3028.911377	eval-error:3031.354736
[12]	train-error:3026.471924	eval-error:3028.917236
[13]	train-error:3023.719238	eval-error:3026.168213
[14]	train-error:3020.635742	eval-error:3023.087158
[15]	train-error:3017.190918	eval-error:3019.647217
[16]	train-error:3013.359131	eval-error:3015.821289
[17]	train-error:3009.119873	eval-error:3011.589355
[18]	train-erro

best ite: 599
best score: 1159.208374
MAE: 1159.2084078980145


[599]	train-error:1124.299438	eval-error:1159.208374


In [20]:
# show the value returned by cv_xgboost
scores

[1153.3406374835456, 1163.2363080434498, 1159.2084078980145]

In [21]:
# average of the cross-validation scores
np.mean(scores)

1158.5951178083367

In [22]:
def mae(y, y0):
    """Custom evaluation function for xgboost: MAE after exponentiating both sides.

    Parameters
    ----------
    y : array-like
        Values to score; exponentiated before the error is computed.
    y0 : object exposing ``get_label()``
        Source of the reference labels (for example an ``xgb.DMatrix``); the
        labels are exponentiated as well.

    Returns
    -------
    tuple
        ``('error', value)`` where ``value`` is the mean absolute error.
    """
    labels = y0.get_label()
    score = mean_absolute_error(np.exp(y), np.exp(labels))
    return 'error', score

In [25]:
# xgboost booster settings
params = {
    'max_depth':        8,
    'objective':        'reg:linear',
    'eta':              0.03,
    'nthread':          4,
    'gamma':            4,
    'min_child_weight': 7,
    'subsample':        0.8,
    'colsample_bytree': 0.4,
}

# number of boosting rounds handed to xgb.train
n_rounds = 600

# the settings as a list of (name, value) pairs
plst = [(name, value) for name, value in params.items()]

In [26]:
# wrap the training split and the hold-out split (targets are in the log domain)
Dtrain = xgb.DMatrix(X_train, y_train)
Dval   = xgb.DMatrix(X_test, y_test)

# define a watch list to observe the change in error for training and holdout data
watchlist = [(Dtrain, 'train'), (Dval, 'eval')]

model = xgb.train(plst,
                  Dtrain,
                  n_rounds,
                  evals=watchlist,   # without this the watch list and feval are never used
                  feval=mae,         # custom evaluation function
                  verbose_eval=50    # report every 50 rounds instead of every round
                 )

In [27]:
# the model was fitted on the log of the loss, so predictions and targets are
# exponentiated before the MAE on the hold-out split is computed
yhat = np.exp(model.predict(Dval))
print('MAE on unseen set ', mean_absolute_error(np.exp(y_test), yhat))

MAE on unseen set  1138.80979027


In [32]:
# full training matrix (log-domain target) and the test matrix;
# missing values in the test features are replaced by the sentinel -99999 first
DTRAIN = xgb.DMatrix(train_, y)
DTEST  = xgb.DMatrix(test_.fillna(-99999))

In [33]:
# train on full dataset
# NOTE(review): no `evals` is passed here, so `feval` is presumably never invoked
# during this run — confirm against the xgboost version in use

model = xgb.train(plst, 
                  DTRAIN, 
                  n_rounds,
                  feval=mae  # custom evaluation function
                 )

In [34]:
# predict on the test matrix and exponentiate the raw model output in one step
predictions = np.exp(model.predict(DTEST))

In [37]:
# write the predicted losses into the sample-submission frame and save it as CSV
# (without the index column)
sample_sub['loss'] = predictions
sample_sub.to_csv(os.path.join(basepath, 'submissions/xgboost_mean_by_target.csv'), index=False)