* Identify the features that take on multiple values.
* One-hot encode them and see which of them has the greatest impact on overall performance.
* Include those variables alongside all the other variables.

In [17]:
%matplotlib inline

import numpy  as np
import pandas as pd
import os, sys

import warnings
warnings.filterwarnings('ignore')

from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib

basepath = os.path.expanduser('~/Desktop/src/AllState_Claims_Severity/')
sys.path.append(os.path.join(basepath, 'src'))

from data import *

np.random.seed(2016)

In [2]:
# Load the raw training set, test set and sample submission from
# `data/raw/` under the project root.
train      = pd.read_csv(os.path.join(basepath, 'data/raw/train.csv'))
test       = pd.read_csv(os.path.join(basepath, 'data/raw/test.csv'))
sample_sub = pd.read_csv(os.path.join(basepath, 'data/raw/sample_submission.csv'))

In [3]:
def encode_categorical_features(train, test,  feature):
    """Label-encode one column consistently across the train and test frames.

    A single ``LabelEncoder`` is fitted on the values of ``feature`` taken
    from both frames, so both encoded columns share one value-to-code mapping
    and a value that appears in only one frame still receives a code.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that both contain the column ``feature``. Neither frame is
        modified.
    feature : column label
        Name of the column to encode.

    Returns
    -------
    tuple of pandas.Series
        ``(train_encoded, test_encoded)``; each Series carries the index of
        its source frame and is named ``feature``.
    """
    train_values = train[feature]
    test_values  = test[feature]

    encoder = LabelEncoder()
    encoder.fit(pd.concat((train_values, test_values)))

    # Build the encoded columns directly instead of copying both full frames
    # on every call: only one column is ever needed, and `transform` already
    # returns fresh arrays, so the inputs stay untouched either way.
    train_encoded = pd.Series(encoder.transform(train_values), index=train.index, name=feature)
    test_encoded  = pd.Series(encoder.transform(test_values), index=test.index, name=feature)

    return train_encoded, test_encoded

def label_encoding(train, test, features, encoding_func):
    """Replace each listed column in both frames with its encoded version.

    For every column in ``features``, ``encoding_func(train, test, column)``
    is called and must return a ``(train_column, test_column)`` pair, which is
    written back over the original column.

    Note that ``train`` and ``test`` are modified in place; the same two
    objects are returned for convenience.
    """
    for column in features:
        encoded_train, encoded_test = encoding_func(train, test, column)
        train[column] = encoded_train
        test[column]  = encoded_test

    return train, test

** Multi valued features **

In [4]:
# Collect the multi-valued features of the two frames.
# NOTE(review): `get_multi_valued_features` is not defined in this notebook -
# presumably provided by `from data import *`; its exact selection rule lives there.
mv_features = get_multi_valued_features(train, test)

** Label Encoding **

In [6]:
# Label-encode every column in `features`; `label_encoding` overwrites the
# columns of `train` and `test` in place.
# NOTE(review): `features` is not defined in any visible cell - presumably it
# comes from `from data import *` or from a since-deleted cell. Confirm, or
# this raises NameError on a fresh kernel. The execution counts around this
# cell are also out of order, so verify it with Restart & Run All.
train, test = label_encoding(train, test, features, encode_categorical_features)

** One Hot Encoding **

In [5]:
# One-hot encode the columns in `features` for both frames.
# NOTE(review): `one_hot_encode_features` and `features` are not defined in
# this notebook - presumably they come from `from data import *`; confirm.
train_vec, test_vec = one_hot_encode_features(train, test, features)

** Train Test Split **

In [6]:
# Split the row positions of `train` (not the rows themselves) so the same
# positions can be reused to slice any row-aligned object. 20% of the
# positions are held out; `random_state` fixes the shuffle.
itrain, itest = train_test_split(range(len(train)), test_size=0.2, random_state=40)

In [None]:
# Slice the one-hot encoded matrix and the `loss` target with the same
# row positions so features and targets stay aligned.
X_train, X_test = train_vec.iloc[itrain], train_vec.iloc[itest]

y_train, y_test = train.loss.iloc[itrain], train.loss.iloc[itest]

** Multi-valued categorical variable feature selection **

In [24]:
def evaluate_features(train, test, y, itrain, itest, features):
    """Score each feature in isolation with a small random forest.

    For every feature, the column is one-hot encoded on its own, a
    10-tree ``RandomForestRegressor`` is fitted on the ``itrain`` rows and
    the mean absolute error is measured on the ``itest`` rows. The resulting
    ``(feature, score)`` pairs are sorted by ascending error, written with
    ``joblib.dump`` to ``data/processed/multi-valued-features/mv_feat_scores``
    under ``basepath``, and returned.

    Parameters
    ----------
    train, test : frames passed straight to ``one_hot_encode_features``.
    y : pandas.Series
        Target values, positionally aligned with ``train``.
    itrain, itest : sequences of int
        Row positions used for fitting and for scoring, respectively.
    features : iterable
        Feature names to evaluate one at a time.

    Returns
    -------
    numpy.ndarray
        The sorted ``(feature, score)`` table that was persisted.
        NOTE(review): when feature names are strings, ``np.array`` coerces
        the whole table - scores included - to a string dtype; readers of
        the dump need to cast the score column back to float.
    """
    output_path = os.path.join(basepath, 'data/processed/multi-valued-features/mv_feat_scores')

    # Fail fast: make sure the destination exists *before* the expensive
    # training loop, rather than discovering a missing folder at dump time.
    output_dir = os.path.dirname(output_path)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # The target slices do not depend on the feature being evaluated.
    y_train = y.iloc[itrain]
    y_test  = y.iloc[itest]

    evaluation = []

    for feat in features:
        # one hot encode variable; the encoded test frame is not needed here
        train_vec, test_vec = one_hot_encode_features(train, test, [feat])
        del test_vec

        X_train = train_vec.iloc[itrain]
        X_test  = train_vec.iloc[itest]

        # release the full encoded frame before training
        del train_vec

        # train model
        pipeline = Pipeline([
            ('model', RandomForestRegressor(n_estimators=10, n_jobs=-1, random_state=11))
        ])

        pipeline.fit(X_train, y_train)
        ypreds = pipeline.predict(X_test)

        score = mean_absolute_error(y_test, ypreds)
        evaluation.append((feat, score))

    # lowest error (most informative feature on its own) first
    evaluation = np.array(sorted(evaluation, key=lambda x: x[1]))
    joblib.dump(evaluation, output_path)

    return evaluation

In [25]:
# Score every multi-valued feature on its own; the function persists the
# sorted scores under data/processed/multi-valued-features/mv_feat_scores.
evaluate_features(train, test, train.loss, itrain, itest, mv_features)

** Public Leaderboard Score: 1760.47142 **

In [12]:
# Mean absolute error on the 20% hold-out rows.
# NOTE(review): no module-level `pipeline` is created or fitted in any visible
# cell (the one inside `evaluate_features` is local to that function). This
# cell appears to rely on leftover kernel state - confirm where the fitted
# model comes from, otherwise it raises NameError on a fresh kernel.
ypred = pipeline.predict(X_test)
print('MAE on unseen examples: %f'%(mean_absolute_error(y_test, ypred)))

MAE on unseen examples: 1763.876480


** Training **

In [28]:
# Refit on the full training set and predict the competition test set.
# NOTE(review): this fits on the `features` columns of `train` (label-encoded
# earlier in the notebook), whereas the hold-out cell scored the model on rows
# of the one-hot matrix `train_vec` - presumably a different feature space;
# confirm the two are meant to match. `pipeline` and `features` are also not
# defined in any visible cell.
pipeline.fit(train[features], train.loss)
predictions = pipeline.predict(test[features])

** Submissions **

In [30]:
# Overwrite the `loss` column of the sample submission with the predictions
# and save it (without the index column) under submissions/.
sample_sub['loss'] = predictions
sample_sub.to_csv(os.path.join(basepath, 'submissions/basic_benchmark.csv'), index=False)