* Find the features that take on multiple values.
* One-hot encode them and see which of them has the greatest impact on overall performance.
* Include those variables alongside all the other variables.

In [1]:
%matplotlib inline

import numpy  as np
import pandas as pd
import os, sys

import warnings
# NOTE: silencing every warning also hides deprecation notices from pandas / scikit-learn.
warnings.filterwarnings('ignore')

# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# sklearn.cross_validation was removed in 0.20. Fall back only for old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

# Project root; data files and submissions are resolved relative to it.
basepath = os.path.expanduser('~/Desktop/src/AllState_Claims_Severity/')
sys.path.append(os.path.join(basepath, 'src'))

# Seed NumPy's global RNG so runs are repeatable.
np.random.seed(2016)

In [2]:
# Categorical columns studied in this notebook. One shared list is used for both
# reads so the train and test column selections cannot drift apart.
# (read_csv ignores the order of `usecols`; columns come back in file order.)
CAT_COLUMNS = [
    'cat2', 'cat4', 'cat6', 'cat10', 'cat11', 'cat23', 'cat36',
    'cat57', 'cat74', 'cat75', 'cat97', 'cat100', 'cat101',
    'cat104', 'cat105', 'cat106', 'cat109', 'cat110', 'cat112',
]

# Train carries the target column `loss`; test does not.
train      = pd.read_csv(os.path.join(basepath, 'data/raw/train.csv'), usecols=['id'] + CAT_COLUMNS + ['loss'])
test       = pd.read_csv(os.path.join(basepath, 'data/raw/test.csv'), usecols=['id'] + CAT_COLUMNS)

sample_sub = pd.read_csv(os.path.join(basepath, 'data/raw/sample_submission.csv'))

In [3]:
# Columns fed to the label encoder and to the final model.
# The remaining loaded categoricals (cat2, cat4, cat6, cat10, cat11, cat23,
# cat36, cat57, cat74, cat75, cat97) are currently left out.
features = [
    'cat100',
    'cat101',
    'cat104',
    'cat105',
    'cat106',
    'cat109',
    'cat110',
    'cat112',
]

In [4]:
def encode_categorical_features(train, test,  feature):
    """Integer-encode one categorical column consistently across train and test.

    A single ``LabelEncoder`` is fitted on the concatenation of the column from
    both frames, so a level that appears in only one of them still gets a code
    and the same level maps to the same integer in both outputs.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that both contain the column ``feature``. Neither is modified.
    feature : str
        Name of the column to encode.

    Returns
    -------
    tuple of pandas.Series
        ``(train_encoded, test_encoded)``; each keeps the index of its source
        frame and is named ``feature``.
    """
    encoder = LabelEncoder()
    encoder.fit(pd.concat((train[feature], test[feature])))

    # Build the encoded Series directly rather than deep-copying both whole
    # frames on every call just to overwrite a single column.
    train_encoded = pd.Series(encoder.transform(train[feature]), index=train.index, name=feature)
    test_encoded  = pd.Series(encoder.transform(test[feature]), index=test.index, name=feature)

    return train_encoded, test_encoded

def label_encoding(train, test, features, encoding_func):
    """Encode every column in ``features`` and write the result back in place.

    ``encoding_func(train, test, column)`` is called once per column and must
    return a ``(train_values, test_values)`` pair, which then overwrites that
    column in ``train`` and ``test`` respectively.

    Returns the same (mutated) ``train`` and ``test`` objects that were passed in.
    """
    for column in features:
        encoded_train, encoded_test = encoding_func(train, test, column)
        train[column] = encoded_train
        test[column] = encoded_test

    return train, test

**Label Encoding**

In [6]:
# Overwrite each column listed in `features` with integer codes (both frames are
# modified in place), using one encoder per column fitted on train + test values.
train, test = label_encoding(
    train,
    test,
    features=features,
    encoding_func=encode_categorical_features,
)

**One Hot Encoding**

In [46]:

# All levels of cat104 seen in either frame, computed once and sorted, so the
# train and test dummy matrices share identical columns in a deterministic order.
# (`Series.astype('category', categories=...)` has been removed from pandas, and a
# `set` gives no ordering guarantee, so build the Categorical explicitly instead.)
cat104_levels = np.union1d(train['cat104'].unique(), test['cat104'].unique())

# Wrap in a Series that keeps the source index so the rows still line up with `train` / `test`.
train_vec = pd.get_dummies(pd.Series(pd.Categorical(train['cat104'], categories=cat104_levels), index=train.index))
test_vec  = pd.get_dummies(pd.Series(pd.Categorical(test['cat104'], categories=cat104_levels), index=test.index))

In [47]:
# Split row positions 80/20 with a fixed seed; the 20% part is a hold-out taken
# from the training data (it is not the competition test set).
itrain, itest = train_test_split(range(train_vec.shape[0]), random_state=40, test_size=0.2)

# One-hot features for each part of the split.
X_train, X_test = train_vec.iloc[itrain], train_vec.iloc[itest]

# Matching target values.
y_train, y_test = train['loss'].iloc[itrain], train['loss'].iloc[itest]

In [49]:
# Single-step pipeline: a small random forest (10 trees, depth capped at 10)
# with a fixed seed, using all available cores.
pipeline = Pipeline(steps=[
    (
        'model',
        RandomForestRegressor(
            n_estimators=10,
            max_depth=10,
            random_state=11,
            n_jobs=-1,
        ),
    ),
])

In [50]:
# Fit on the 80% split of the one-hot encoded cat104 feature.
pipeline.fit(X_train, y=y_train)

Pipeline(steps=[('model', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=-1, oob_score=False, random_state=11,
           verbose=0, warm_start=False))])

**Public Leaderboard Score: 1760.47142**

In [51]:
# Score the fitted pipeline on the held-out 20% of the training rows.
ypred = pipeline.predict(X_test)
print('MAE on unseen examples: %f' % mean_absolute_error(y_test, ypred))

MAE on unseen examples: 1958.628273


**Training**

In [28]:
# Refit the same pipeline object on every training row, this time on the
# columns listed in `features`, then predict the test set from those columns.
pipeline.fit(train[features], y=train['loss'])
predictions = pipeline.predict(test[features])

**Submissions**

In [30]:
# Overwrite the `loss` column of the sample submission with the predictions.
# NOTE(review): assumes the rows of test.csv are in the same order as sample_submission.csv - confirm.
sample_sub['loss'] = predictions
sample_sub.to_csv(
    path_or_buf=os.path.join(basepath, 'submissions/basic_benchmark.csv'),
    index=False,
)