```
Feature Interaction among categorical variables.
```

In [20]:
%matplotlib inline

import numpy as np
import pandas as pd
import os, sys

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cross_validation import KFold, train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

import warnings
warnings.filterwarnings('ignore')

# Project root, resolved under the current user's home directory.
# NOTE(review): this location is specific to one machine's layout — adjust before running elsewhere.
basepath = os.path.expanduser('~/Desktop/src/AllState_Claims_Severity/')
# Put the project's `src` directory on the import path.
sys.path.append(os.path.join(basepath, 'src'))

# Seed NumPy's global random number generator.
np.random.seed(2016)

In [184]:
# Read the raw train and test CSV files (in that order) from the project's data directory.
train, test = (
    pd.read_csv(os.path.join(basepath, csv_path))
    for csv_path in ('data/raw/train.csv', 'data/raw/test.csv')
)

In [185]:
# Stack the train rows on top of the test rows; the original row labels are kept.
data = pd.concat([train, test], axis=0)

In [186]:
# Target is the natural log of the loss column.
y = np.log(train['loss'])

In [187]:
# Every column whose name contains 'cat' is treated as categorical.
cat_variables = [name for name in data.columns if 'cat' in name]

# Replace each categorical column with integer codes; sort=True numbers the
# codes following the sorted order of the column's distinct values.
for feat in cat_variables:
    codes = pd.factorize(data[feat], sort=True)[0]
    data[feat] = codes

In [196]:
def round_down(number):
    """Floor ``number`` to a multiple of the power of ten of its leading digit.

    Examples: 1234 -> 1000, 56 -> 50, 7 -> 7, 1234.0 -> 1000.0.

    Parameters
    ----------
    number : int or float (Python or NumPy scalar)
        Value to bucket.

    Returns
    -------
    The largest multiple of ``10 ** (digits - 1)`` that is <= ``number``,
    where ``digits`` is the number of digits in the integer part of
    ``abs(number)``. NaN and infinities are returned unchanged.
    """
    # Non-finite values have no digit count; hand them back untouched.
    if not np.isfinite(number):
        return number

    # Count digits of the integer magnitude only. Using len(str(number))
    # directly would also count a minus sign or a fractional part such as
    # '.0', which inflates the power of ten and gives wrong buckets.
    n = len(str(abs(int(number))))
    return number - (number % np.power(10, n - 1))
    
# Number of rows sharing each cat116 level, broadcast back onto every row.
data['cat116_count'] = data.groupby('cat116')['cat116'].transform(len)
# Coarsen the counts with round_down so levels of similar frequency share a value.
data['cat116_grouped'] = data['cat116_count'].map(round_down)

In [197]:
# Split the combined frame back into its parts: the leading len(train) rows,
# then the remainder. Both slices are built in one statement, so each is
# computed from the length of the train frame as it was before reassignment.
train, test = data.iloc[:len(train)], data.iloc[len(train):]

In [190]:
# Mean loss for every (cat1, cat2) combination.
train.groupby(['cat1', 'cat2']).loss.mean()

cat1  cat2
0     0       2778.058601
      1       4087.370464
1     0       1739.453456
      1       2349.008869
Name: loss, dtype: float64

In [179]:
# Mean loss laid out as a grid: cat2 levels as rows, cat1 levels as columns.
pd.pivot_table(train, values='loss', index='cat2', columns='cat1', aggfunc='mean')

cat1,0,1
cat2,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2778.058601,1739.453456
1,4087.370464,2349.008869


In [82]:
# Median loss per cat80 level (rows) for each cat1/cat79/cat81/cat82
# combination (columns); combinations with no rows are filled with 0.
train.pivot_table(
    values='loss',
    index='cat80',
    columns=['cat1', 'cat79', 'cat81', 'cat82'],
    aggfunc='median',
    fill_value=0,
)

cat1,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
cat79,0,0,0,0,0,0,0,0,0,0,...,3,3,3,3,3,3,3,3,3,3
cat81,0,0,1,1,1,1,2,2,2,2,...,1,1,2,2,2,2,3,3,3,3
cat82,0,1,0,1,2,3,0,1,2,3,...,2,3,0,1,2,3,0,1,2,3
cat80,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4
0,1853.36,1160.38,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2293.04,0.0,5514.29,4957.43,0.0,1829.285,4975.285,6366.275,0.0,2292.48,...,4390.945,3633.39,5073.995,5771.98,4282.3,4190.385,4833.68,4029.5,4084.7,2526.51
2,0.0,0.0,5720.33,3667.96,0.0,0.0,2681.27,5669.79,4606.54,2536.06,...,4586.25,0.0,0.0,4242.02,4057.37,2293.44,0.0,5784.0,9275.78,3319.255
3,763.755,0.0,2922.49,2641.81,1535.31,2099.9,2374.17,2318.41,2255.66,2580.33,...,1908.39,1696.895,1973.64,1628.94,0.0,0.0,1391.605,1626.825,4081.395,1590.1


In [121]:
def get_cardinality(data, cat_variables):
    """Return ``(column, distinct-value count)`` pairs, largest count first.

    Parameters
    ----------
    data : DataFrame holding the columns named in ``cat_variables``.
    cat_variables : iterable of column names to inspect.

    Returns
    -------
    list of (str, int) tuples sorted by count in descending order; columns
    with equal counts keep the order they had in ``cat_variables``.
    """
    counts = [(name, data[name].nunique()) for name in cat_variables]
    counts.sort(key=lambda pair: pair[1], reverse=True)
    return counts

In [122]:
# Distinct-value count of every categorical column over the combined frame.
cardinality = get_cardinality(data=data, cat_variables=cat_variables)

In [123]:
# Show the (column, distinct-value count) pairs, highest count first.
cardinality

[('cat116', 326),
 ('cat110', 131),
 ('cat109', 84),
 ('cat113', 61),
 ('cat112', 51),
 ('cat115', 23),
 ('cat105', 20),
 ('cat107', 20),
 ('cat101', 19),
 ('cat114', 19),
 ('cat104', 17),
 ('cat106', 17),
 ('cat99', 16),
 ('cat111', 16),
 ('cat100', 15),
 ('cat103', 13),
 ('cat108', 11),
 ('cat102', 9),
 ('cat89', 8),
 ('cat91', 8),
 ('cat96', 8),
 ('cat90', 7),
 ('cat92', 7),
 ('cat94', 7),
 ('cat97', 7),
 ('cat93', 5),
 ('cat95', 5),
 ('cat98', 5),
 ('cat77', 4),
 ('cat78', 4),
 ('cat79', 4),
 ('cat80', 4),
 ('cat81', 4),
 ('cat82', 4),
 ('cat83', 4),
 ('cat84', 4),
 ('cat85', 4),
 ('cat86', 4),
 ('cat87', 4),
 ('cat88', 4),
 ('cat73', 3),
 ('cat74', 3),
 ('cat75', 3),
 ('cat76', 3),
 ('cat1', 2),
 ('cat2', 2),
 ('cat3', 2),
 ('cat4', 2),
 ('cat5', 2),
 ('cat6', 2),
 ('cat7', 2),
 ('cat8', 2),
 ('cat9', 2),
 ('cat10', 2),
 ('cat11', 2),
 ('cat12', 2),
 ('cat13', 2),
 ('cat14', 2),
 ('cat15', 2),
 ('cat16', 2),
 ('cat17', 2),
 ('cat18', 2),
 ('cat19', 2),
 ('cat20', 2),
 ('cat21', 2)

In [209]:
# Hold out 20% of the training rows; the split is drawn over row positions.
itrain, itest = train_test_split(range(len(train)), test_size=.2, random_state=12121)

# Apply the same positional split to the three selected features and to the target.
X_train, X_test = (
    train[['cat116_grouped', 'cat80', 'cat101']].iloc[rows] for rows in (itrain, itest)
)
y_train, y_test = y.iloc[itrain], y.iloc[itest]

In [210]:
# 5-fold shuffled cross-validation over X_train / y_train.
kf = KFold(len(X_train), n_folds=5, shuffle=True, random_state=1231)

for index, (itr, ite) in enumerate(kf):
    print('Fold: {}'.format(index))

    # Positional row selection: itr rows fit the model, ite rows score it.
    Xtr, Xte = X_train.iloc[itr], X_train.iloc[ite]
    ytr, yte = y_train.iloc[itr], y_train.iloc[ite]

    est = RandomForestRegressor(n_jobs=-1, random_state=1231831)
    est.fit(Xtr, ytr)
    yhat = est.predict(Xte)

    # Targets and predictions are exponentiated before the error is computed.
    fold_mae = mean_absolute_error(np.exp(yte), np.exp(yhat))
    print('MAE score: {}'.format(fold_mae))

Fold: 0
MAE score: 1496.4724940509932
Fold: 1
MAE score: 1493.8944984381035
Fold: 2
MAE score: 1480.3191135496788
Fold: 3
MAE score: 1499.1284696852772
Fold: 4
MAE score: 1498.1042143429124


In [211]:
# Fit on all of X_train and score once on X_test / y_test.
est = RandomForestRegressor(n_jobs=-1, random_state=1231319)
est.fit(X_train, y_train)
pred = est.predict(X_test)

# Targets and predictions are exponentiated before the error is computed.
holdout_mae = mean_absolute_error(np.exp(y_test), np.exp(pred))
print('Mean Absolute Error: {0}'.format(holdout_mae))

Mean Absolute Error: 1483.0095065311789
