* Identify the features that take multiple values.
* One-hot encode them and see which of them have the greatest impact on overall performance.
* Include those variables alongside all the other variables.

In [2]:
%matplotlib inline

import numpy  as np
import pandas as pd
import os, sys

import warnings
warnings.filterwarnings('ignore')

from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib

import xgboost as xgb

basepath = os.path.expanduser('~/Desktop/src/AllState_Claims_Severity/')
sys.path.append(os.path.join(basepath, 'src'))

from data import *

np.random.seed(2016)

In [3]:
# Read the three raw CSV files (train, test, sample submission) found under
# <basepath>/data/raw, in that order.
train, test, sample_sub = (
    pd.read_csv(os.path.join(basepath, csv_path))
    for csv_path in ('data/raw/train.csv',
                     'data/raw/test.csv',
                     'data/raw/sample_submission.csv')
)

In [4]:
def encode_categorical_features(train, test, feature):
    """Label-encode a single column consistently across train and test.

    The encoder is fitted on the concatenation of the column from both frames,
    so every value present in either frame is known to the encoder before
    `transform` is called.

    Neither input frame is modified. Only the requested column is read, which
    avoids copying the full frames on every call.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that both contain the column `feature`.
    feature : str
        Name of the column to encode.

    Returns
    -------
    (pandas.Series, pandas.Series)
        Encoded train and test columns. Each keeps the index of its source
        frame and is named `feature`.
    """
    encoder = LabelEncoder()
    encoder.fit(pd.concat((train[feature], test[feature])))

    train_encoded = pd.Series(encoder.transform(train[feature]),
                              index=train.index, name=feature)
    test_encoded  = pd.Series(encoder.transform(test[feature]),
                              index=test.index, name=feature)

    return train_encoded, test_encoded

def label_encoding(train, test, features, encoder=None):
    """Label-encode several columns of `train` and `test` in place.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames to encode. WARNING: the listed columns are overwritten in these
        very objects; no copy is made.
    features : iterable of str
        Column names to encode.
    encoder : callable, optional
        Function called as `encoder(train, test, feature)` that returns the
        encoded `(train_column, test_column)` pair. Defaults to
        `encode_categorical_features`, so existing three-argument calls behave
        as before while callers that pass the encoder explicitly also work.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The same `train` and `test` objects that were passed in.
    """
    if encoder is None:
        encoder = encode_categorical_features

    for feat in features:
        train[feat], test[feat] = encoder(train, test, feat)

    return train, test

**Multi-valued features**

In [4]:
# `get_multi_valued_features` is not defined in this notebook; presumably it
# comes from `from data import *` — its exact selection rule should be checked there.
mv_features = get_multi_valued_features(train, test)

**Binary-valued features**

In [3]:
# `get_binary_valued_features` is not defined in this notebook; presumably it
# comes from `from data import *`. The result is the column list scored by
# `evaluate_features` further down.
bv_features = get_binary_valued_features(train, test)

**Label Encoding**

In [6]:
# NOTE(review): `features` is only assigned in a later cell
# (`features = train.columns[1:-1]`), so this cell raises NameError on a fresh
# top-to-bottom run — confirm which column list was intended here.
# NOTE(review): the call passes the per-column encoder as a fourth argument, so
# whichever `label_encoding` is in scope must accept it — verify.
train, test = label_encoding(train, test, features, encode_categorical_features)

**One Hot Encoding**

In [5]:
# NOTE(review): `features` is only assigned in a later cell, so this fails on a
# fresh top-to-bottom run. `train_vec` / `test_vec` are reassigned further down
# by a second `one_hot_encode_features` call on an explicit column list.
train_vec, test_vec = one_hot_encode_features(train, test, features)

**Train Test Split**

In [5]:
# Hold out 20% of the training rows for validation. The split is made over row
# positions rather than the frame itself, so the same positions can later be
# applied with `.iloc` to any row-aligned frame or Series. `random_state` fixes
# the shuffle so the split is repeatable.
itrain, itest = train_test_split(range(len(train)), test_size=0.2, random_state=40)

**Multi-valued categorical variable feature selection**

In [29]:
def evaluate_features(train, test, y, itrain, itest, cols, folderpath):
    """Score each column in `cols` on its own and persist the ranking.

    For every column, the column is label-encoded, a small random forest is
    fitted on that single feature using the `itrain` rows, and the mean
    absolute error is measured on the `itest` rows. Results are printed as they
    are produced, sorted by ascending MAE, and dumped with joblib.

    Unlike encoding through `label_encoding`, this function does NOT modify the
    caller's `train` / `test` frames: each column is encoded into a fresh
    Series, so later cells still see the original category values.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing every column in `cols`. `test` is only used so the
        encoder sees the values from both frames.
    y : pandas.Series
        Target values, row-aligned with `train`.
    itrain, itest : sequence of int
        Row positions (as used by `.iloc`) for fitting and for scoring.
    cols : iterable of str
        Columns to evaluate one at a time.
    folderpath : str
        Output file path relative to `<basepath>/data/processed/`. Despite the
        name, this is the file the results are written to, not a directory.

    Returns
    -------
    numpy.ndarray
        The `(column, score)` pairs sorted by ascending MAE, exactly as written
        to disk. Because each pair mixes a string with a float, NumPy stores
        the whole array (scores included) as strings.
    """
    # The target split does not depend on the feature, so compute it once.
    y_train = y.iloc[itrain]
    y_test  = y.iloc[itest]

    evaluation = []

    for col in cols:
        # Encode only this column; the helper returns new Series and leaves the
        # input frames untouched. The encoded test column is not needed here.
        train_col, _ = encode_categorical_features(train, test, col)

        # Select rows from the single encoded column instead of row-slicing the
        # whole frame first and then picking the column.
        X_train = train_col.iloc[itrain].to_frame()
        X_test  = train_col.iloc[itest].to_frame()

        # train model
        pipeline = Pipeline([
            ('model', RandomForestRegressor(n_estimators=10, n_jobs=-1, random_state=11))
        ])

        pipeline.fit(X_train, y_train)
        ypreds = pipeline.predict(X_test)

        score = mean_absolute_error(y_test, ypreds)
        print('Feature Name: %s and MAE: %f'%(col, score))
        evaluation.append((col, score))

    evaluation = np.array(sorted(evaluation, key=lambda x: x[1]))

    # Create the destination directory if needed so a missing folder does not
    # throw away the results after all the models have been trained.
    output_path = os.path.join(basepath, 'data/processed/%s'%(folderpath))
    output_dir = os.path.dirname(output_path)
    if output_dir and not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    joblib.dump(evaluation, output_path)

    return evaluation

In [30]:
# Score every binary-valued column on its own against `train.loss`; the ranking
# is written under <basepath>/data/processed/binary-valued-features/.
evaluate_features(train, test, train.loss, itrain, itest, bv_features, 'binary-valued-features/bv_feat_importance')

Feature Name: cat1 and MAE: 1877.110292
Feature Name: cat2 and MAE: 1875.075765
Feature Name: cat3 and MAE: 1902.790586
Feature Name: cat4 and MAE: 1945.402952
Feature Name: cat5 and MAE: 1945.085884
Feature Name: cat6 and MAE: 1933.550802
Feature Name: cat7 and MAE: 1887.780068
Feature Name: cat8 and MAE: 1949.372657
Feature Name: cat9 and MAE: 1882.908352
Feature Name: cat10 and MAE: 1848.904116
Feature Name: cat11 and MAE: 1892.182960
Feature Name: cat12 and MAE: 1816.756842
Feature Name: cat13 and MAE: 1893.334365
Feature Name: cat14 and MAE: 1947.525958
Feature Name: cat15 and MAE: 1959.402722
Feature Name: cat16 and MAE: 1904.401929
Feature Name: cat17 and MAE: 1951.994329
Feature Name: cat18 and MAE: 1957.007338
Feature Name: cat19 and MAE: 1957.722474
Feature Name: cat20 and MAE: 1957.958694
Feature Name: cat21 and MAE: 1959.230984
Feature Name: cat22 and MAE: 1959.298559
Feature Name: cat23 and MAE: 1922.472125
Feature Name: cat24 and MAE: 1946.394840
Feature Name: cat25 and M

In [6]:
# Hard-coded positions into `features` (below).
# NOTE(review): where these indices come from is not shown in this notebook —
# record the source of the ranking.
feat_importance = [79, 78, 56, 117, 122, 11, 80, 127, 129, 126, 104, 99, 71, 111, 105, 118, 120, 100, 109]
# Every column except the first and the last (presumably the id and the `loss`
# target — confirm against the raw file).
features = train.columns[1:-1]
# Column names at the selected positions.
imp_features = [features[f] for f in feat_importance]

In [7]:
# One-hot encode nine selected categorical columns. `one_hot_encode_features`
# is not defined in this notebook (presumably from `from data import *`); the
# next cell expects output columns named `<feature>_<category>`, e.g. `cat80_D`.
train_vec, test_vec = one_hot_encode_features(train, test, ['cat80', 'cat79', 'cat81', 'cat105',
                                                            'cat100', 'cat112', 'cat106', 'cat101', 'cat110'])

In [8]:
# Exactly one dummy column per one-hot encoded feature is removed, and the same
# set is removed from both frames (presumably the reference level — confirm).
dropped_dummy_cols = ['cat80_D', 'cat79_D', 'cat81_D', 'cat105_T', 'cat100_O',
                      'cat112_Y','cat106_R', 'cat101_U', 'cat110_Y']

train_vec = train_vec.drop(dropped_dummy_cols, axis=1)
test_vec  = test_vec.drop(dropped_dummy_cols, axis=1)

In [9]:
# Label-encode three columns. `label_encoding` overwrites the columns in the
# frames it is given and returns those same frames, so `train_lbl` / `test_lbl`
# are the same objects as `train` / `test`, not copies.
train_lbl, test_lbl = label_encoding(train, test, ['cat57', 'cat12', 'cat72'])

In [10]:
# Same subset of continuous columns, in the same order, for both frames.
cont_cols = ['cont2', 'cont7', 'cont12', 'cont14', 'cont11', 'cont3', 'cont5']

train_cont = train[cont_cols]
test_cont  = test[cont_cols]

In [11]:
# Assemble the model matrices column-wise: label-encoded columns, then the
# one-hot frame, then the continuous columns — same layout for train and test.
label_cols = ['cat57', 'cat12', 'cat72']

train_processed = pd.concat([train_lbl[label_cols], train_vec, train_cont], axis=1)
test_processed  = pd.concat([test_lbl[label_cols], test_vec, test_cont], axis=1)

In [12]:
# Apply the positional split made earlier to the processed features and to the
# target, so rows stay aligned between X and y.
X_train, X_test = train_processed.iloc[itrain], train_processed.iloc[itest]
y_train, y_test = train.loss.iloc[itrain], train.loss.iloc[itest]

In [13]:
# Keep a handle on the full target column; `train` itself is deleted in the
# next cell.
y = train.loss

In [14]:
# Unbind the intermediate frames. NOTE: this also removes `train` and `test`,
# so cells below can only use `train_processed`, `test_processed`, the
# X/y splits and `y`.
del train_vec, test_vec, train_lbl, test_lbl, train_cont, test_cont, train, test

In [21]:
# Single-step pipeline around an XGBoost regressor; `seed` fixes the row and
# column subsampling.
xgb_params = dict(max_depth=5, learning_rate=0.1, gamma=1,
                  subsample=0.8, colsample_bytree=0.8, seed=124)

pipeline = Pipeline([('model', xgb.XGBRegressor(**xgb_params))])

In [22]:
# Fit on the training portion of the split only; the `itest` rows are kept
# aside for the MAE check below.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('model', XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=1, learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=124, silent=True, subsample=0.8))])

**Public Leaderboard Score: 1760.47142**

In [23]:
# Mean absolute error on the held-out rows.
ypred = pipeline.predict(X_test)
holdout_mae = mean_absolute_error(y_test, ypred)
print('MAE on unseen examples: %f'%(holdout_mae))

MAE on unseen examples: 1304.152417


**Training**

In [65]:
# Refit on every training row before predicting the test set.
# `train` was deleted in an earlier cell, so the target is taken from `y`
# (saved as `train.loss` before the deletion) rather than from `train.loss`,
# which would raise NameError on a top-to-bottom run.
pipeline.fit(train_processed, y)
predictions = pipeline.predict(test_processed)

**Submissions**

In [66]:
# Fill the sample submission's `loss` column with the predictions and write it
# out without the pandas index.
submission_path = os.path.join(basepath, 'submissions/basic_simple_features.csv')

sample_sub['loss'] = predictions
sample_sub.to_csv(submission_path, index=False)