In [2]:
%matplotlib inline

import numpy  as np
import pandas as pd
import os, sys

import warnings
warnings.filterwarnings('ignore')

# sklearn.cross_validation was removed in scikit-learn 0.20; prefer its
# replacement and fall back to the old module only on legacy installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline

# Project root; every data and submission path in this notebook is joined onto it.
basepath = os.path.expanduser('~/Desktop/src/AllState_Claims_Severity/')

# Put the project's own src/ directory on the module search path.
src_dir = os.path.join(basepath, 'src')
sys.path.append(src_dir)

# Seed NumPy's global random generator.
np.random.seed(seed=2016)

In [7]:
# Load only the columns this benchmark uses: the row id, the four `cat*`
# columns later listed in `features`, and (train only) the `loss` target.
train = pd.read_csv(
    os.path.join(basepath, 'data/raw/train.csv'),
    usecols=['id', 'cat2', 'cat6', 'cat10', 'cat4', 'loss']
)
test = pd.read_csv(
    os.path.join(basepath, 'data/raw/test.csv'),
    usecols=['id', 'cat2', 'cat4', 'cat6', 'cat10']
)

# The sample submission file is read in full.
sample_sub = pd.read_csv(os.path.join(basepath, 'data/raw/sample_submission.csv'))

In [8]:
# Model inputs: the columns cat2, cat4, cat6 and cat10, in that order.
features = list(map('cat{}'.format, (2, 4, 6, 10)))

In [9]:
def encode_categorical_features(train, test, feature):
    """Integer-encode one column consistently across two DataFrames.

    A single ``LabelEncoder`` is fitted on the values of ``feature`` from
    both frames stacked together, so a level that occurs in only one of
    them still receives a code and a given level maps to the same integer
    in both outputs.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that both contain the column ``feature``. Neither frame is
        modified.
    feature : str
        Name of the column to encode.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        The encoded train and test columns. Each is named ``feature`` and
        carries the index of the frame it was derived from.
    """
    train_values = train[feature]
    test_values = test[feature]

    encoder = LabelEncoder()
    encoder.fit(pd.concat((train_values, test_values)))

    # Only this one column is returned, so wrap the codes in a new Series
    # instead of deep-copying both entire frames on every call.
    train_codes = pd.Series(encoder.transform(train_values),
                            index=train.index, name=feature)
    test_codes = pd.Series(encoder.transform(test_values),
                           index=test.index, name=feature)
    return train_codes, test_codes

def one_hot_encoding(train, test, feature):
    """One-hot encode one column consistently across two DataFrames.

    A single ``OneHotEncoder`` is fitted on the values of ``feature`` from
    both frames stacked together, so both outputs share the same set and
    order of indicator columns.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that both contain the column ``feature``. Neither frame is
        modified.
    feature : str
        Name of the column to encode.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Dense indicator columns for train and test, named
        ``<feature>_0 .. <feature>_{k-1}`` in the encoder's output order and
        aligned to the index of the frame they were derived from. The result
        spans several columns, so attach it with ``pd.concat(..., axis=1)``
        or ``DataFrame.join`` rather than assigning it to a single column.
    """
    # OneHotEncoder works on 2-D (n_samples, n_features) input; a bare
    # Series is 1-D, so reshape each column into a single-feature matrix.
    train_values = np.asarray(train[feature]).reshape(-1, 1)
    test_values = np.asarray(test[feature]).reshape(-1, 1)

    encoder = OneHotEncoder()
    encoder.fit(np.vstack((train_values, test_values)))

    def _as_frame(values, index):
        # With default settings the encoder returns a sparse matrix;
        # densify it so it can live in an ordinary DataFrame.
        dense = encoder.transform(values).toarray()
        columns = ['%s_%d' % (feature, k) for k in range(dense.shape[1])]
        return pd.DataFrame(dense, index=index, columns=columns)

    return _as_frame(train_values, train.index), _as_frame(test_values, test.index)

In [10]:
# Overwrite each of the four columns, in both frames, with the integer codes
# returned by encode_categorical_features.
for column in ('cat2', 'cat4', 'cat6', 'cat10'):
    train[column], test[column] = encode_categorical_features(train, test, column)

# Disabled alternative: one_hot_encoding(train, test, <column>) for cat2 / cat4.

In [11]:
# Split row positions 80/20; the 20% part is held out of fitting and used
# only to score the model. It comes from `train`, not from the `test` frame.
row_positions = range(len(train))
itrain, itest = train_test_split(row_positions, test_size=0.2, random_state=40)

fit_rows = train.iloc[itrain]
holdout_rows = train.iloc[itest]

X_train, y_train = fit_rows[features], fit_rows.loss
X_test, y_test = holdout_rows[features], holdout_rows.loss

In [25]:
# Single-step pipeline wrapping a random forest regressor.
# NOTE(review): the saved output of the fit cell below reports
# n_estimators=100, max_depth=10, n_jobs=1, which does not match the settings
# here -- re-run the notebook so the recorded output and MAE reflect this code.
forest = RandomForestRegressor(n_estimators=10, n_jobs=-1, random_state=11)
pipeline = Pipeline(steps=[('model', forest)])

In [26]:
# Fit on the 80% training part; the fitted pipeline is the cell's displayed value.
pipeline.fit(X=X_train, y=y_train)

Pipeline(steps=[('model', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=False, random_state=11,
           verbose=0, warm_start=False))])

In [27]:
# Score the fitted pipeline on the held-out 20% of `train`.
ypred = pipeline.predict(X_test)
holdout_mae = mean_absolute_error(y_test, ypred)
print('MAE on unseen examples: %f' % holdout_mae)

MAE on unseen examples: 1783.172463


**Training**

In [28]:
# Refit the same pipeline on every row of `train`, then predict the rows of `test`.
full_X, full_y = train[features], train.loss
pipeline.fit(full_X, full_y)
predictions = pipeline.predict(test[features])

**Submissions**

In [30]:
# Put the predictions into the `loss` column of the sample-submission frame
# (assigned by position) and write the frame out without the index column.
# NOTE(review): this presumes sample_submission.csv lists rows in the same
# order as test.csv -- confirm.
submission_path = os.path.join(basepath, 'submissions/basic_benchmark.csv')
sample_sub['loss'] = predictions
sample_sub.to_csv(submission_path, index=False)