```
Questions :

1. There are far more categorical features than numerical ones; can we exploit this fact in some way?
2. How should we deal with the categorical features? One-hot encoding would yield many features, which would increase the dimensionality of the problem.
3. Do the continuous variables need any kind of transformation?

```

```
Ideas :

Forward feature selection based on minimizing MAE (mean absolute error) with 5-fold cross-validation.

```

In [3]:
import numpy as np
import pandas as pd
import os, sys

from sklearn.cross_validation import StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG first so any later draw from np.random is repeatable.
np.random.seed(2016)

# Project root, resolved under the current user's home directory.
basepath = os.path.expanduser('~/Desktop/src/AllState_Claims_Severity/')

# Put <root>/src on the import path (presumably where the `data` and `utils`
# modules imported right after this live -- confirm).
sys.path.append(os.path.join(basepath, 'src'))

from data import *
from utils import *

In [4]:
# Read the three raw CSVs (train, test, sample submission) in that order and
# unpack them into their own frames.
train, test, sample_sub = [
    pd.read_csv(os.path.join(basepath, rel_path))
    for rel_path in (
        'data/raw/train.csv',
        'data/raw/test.csv',
        'data/raw/sample_submission.csv',
    )
]

In [5]:
# Binary indicator on the loss column: 1 when loss < 4e3, 0 otherwise (a NaN
# loss compares False and therefore maps to 0). Note that, despite the name,
# 1 marks the *non*-extreme rows. The flag is only used as a stratification
# label when subsampling / splitting the training examples.
# Vectorized comparison instead of a per-row Python lambda; result is the same
# int64 0/1 Series.
train['outlier_flag'] = (train['loss'] < 4e3).astype('int64')

In [6]:
# encode categorical variables
# NOTE(review): encode_categorical_features is not defined in this notebook
# (presumably it comes from the star-imported `data` / `utils` modules); its
# encoding scheme cannot be confirmed from here.
# Rebinding `train` / `test` makes this cell non-idempotent: running it twice
# feeds the already-encoded frames back into the helper.
train, test = encode_categorical_features(train, test)

In [5]:
# get stratified sample
# NOTE(review): get_stratified_sample is not defined in this notebook
# (presumably from `data` / `utils`); it is assumed to return two collections
# of row positions stratified on the flag -- confirm.
# `itrain` / `itest` are rebound later by the train_test_split cell.
itrain, itest = get_stratified_sample(train.outlier_flag)

In [6]:
# subsample of data to work with
# Positional row selection using the indices from the stratified sample.
# NOTE(review): `train_sub` is not referenced again in the visible cells (the
# feature search and the model below both use the full `train`) -- confirm
# whether that is intended.
train_sub = train.iloc[itrain]

In [7]:
# Regression target: natural log of the claim loss. Predictions made on this
# scale are mapped back with np.exp before MAE is computed further down.
y = np.log(train['loss'])

In [25]:
def _cv_mae(X, y, folds):
    """Cross-validated MAE, on the original loss scale, of a default XGBRegressor.

    X and y are NumPy arrays (y on the log scale); folds is a sequence of
    (train_positions, validation_positions) pairs.
    """
    mae_cv = 0
    n_fold = len(folds)

    for i_trn, i_val in folds:
        est = xgb.XGBRegressor(seed=121212)
        est.fit(X[i_trn], y[i_trn])

        # model is fit on log(loss); undo the log on both sides before scoring
        yhat = np.exp(est.predict(X[i_val]))
        mae_cv += mean_absolute_error(np.exp(y[i_val]), yhat) / n_fold

    return mae_cv


def _save_selected_features(selected_features, path):
    """Write one feature name per line to `path`, creating its directory if missing."""
    out_dir = os.path.dirname(path)
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    with open(path, 'w') as f:
        f.write('{}\n'.format('\n'.join(selected_features)))


def forward_feature_selection(df):
    """Greedy forward feature selection scored by 5-fold cross-validated MAE.

    Starting from an empty set, each round tries adding every remaining
    candidate column, keeps the one with the lowest CV MAE if it strictly
    improves on the best score so far, and removes it from the candidate pool.
    The search stops when no candidate improves the score or none are left.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a `loss` column (the model is fit on log(loss)), an
        `outlier_flag` column (used only to stratify the folds) and candidate
        columns, which are all columns whose name contains 'cont' or 'cat'
        (continuous ones are tried first).

    Returns
    -------
    list of str
        Selected feature names in the order they were added. The same list is
        written to data/processed/features_xgboost/selected_features.txt under
        the notebook-level `basepath` after every accepted feature (as a
        checkpoint) and once more at the end.
    """
    # candidates: continuous columns first, then categorical ones
    cont_columns = [col for col in df.columns if 'cont' in col]
    cat_columns  = [col for col in df.columns if 'cat' in col]

    # look the targets up by name rather than by position in the frame
    y_values     = np.log(df['loss']).values
    outlier_flag = df['outlier_flag']

    out_path = os.path.join(basepath, 'data/processed/features_xgboost/selected_features.txt')

    # legacy sklearn.cross_validation API: labels go to the constructor and the
    # object itself is iterated. Materialize the folds once so every candidate
    # is scored on exactly the same splits.
    n_fold = 5
    cv     = StratifiedKFold(outlier_flag, n_folds=n_fold, shuffle=True, random_state=23232)
    folds  = list(cv)

    selected_features = []
    features_to_test  = cont_columns + cat_columns

    # start from +inf so the first round always picks its best candidate
    mae_cv_old = float('inf')

    while features_to_test:
        mae_cvs = []

        for feature in features_to_test:
            candidate = selected_features + [feature]
            print('{}'.format(candidate))

            mae_cv = _cv_mae(df[candidate].values, y_values, folds)

            print('MAE CV: {}'.format(mae_cv))
            mae_cvs.append(mae_cv)

        mae_cv_new = min(mae_cvs)

        if not mae_cv_new < mae_cv_old:
            break

        mae_cv_old = mae_cv_new

        # pop from the real candidate list so the winner is not retested
        # (as a duplicated column) in later rounds
        best_feature = features_to_test.pop(mae_cvs.index(mae_cv_new))
        selected_features.append(best_feature)
        print('selected features: {}'.format(selected_features))

        # checkpoint: the search is long-running and may be interrupted
        _save_selected_features(selected_features, out_path)

    print('final selected features: {}'.format(selected_features))

    print('saving selected feature names as a file')
    _save_selected_features(selected_features, out_path)

    return selected_features

In [26]:
# Run the greedy search on the full training frame. Every candidate feature in
# every round costs five XGBoost fits, so this cell is slow; the output kept
# below ends in a KeyboardInterrupt part-way through the first round.
forward_feature_selection(train)

['cont1']
MAE CV: 1804.9347235146147
['cont2']
MAE CV: 1802.8956409659909
['cont3']
MAE CV: 1805.7440445749385
['cont4']
MAE CV: 1805.7020035767518
['cont5']
MAE CV: 1807.6281659436088
['cont6']
MAE CV: 1806.0229665168695
['cont7']
MAE CV: 1802.6861159704754
['cont8']
MAE CV: 1807.5520615071628
['cont9']
MAE CV: 1804.0166180093172
['cont10']
MAE CV: 1805.870466065785
['cont11']
MAE CV: 1803.8369697265314
['cont12']
MAE CV: 1803.481203910519
['cont13']
MAE CV: 1806.5451203881023
['cont14']
MAE CV: 1783.3221522273861
['cat1']
MAE CV: 1755.2089525031286
['cat2']
MAE CV: 1749.8383086108192
['cat3']
MAE CV: 1765.9024558534838
['cat4']
MAE CV: 1799.7595783223792
['cat5']
MAE CV: 1797.3212144493205
['cat6']
MAE CV: 1785.9792864729225
['cat7']
MAE CV: 1760.4777183341694
['cat8']
MAE CV: 1801.1656725853848
['cat9']
MAE CV: 1755.2649805580431
['cat10']
MAE CV: 1716.4392883366938
['cat11']
MAE CV: 1752.3649031270375
['cat12']
MAE CV: 1685.4052196905805
['cat13']
MAE CV: 1755.5969356434857
['cat14

KeyboardInterrupt: 

In [9]:
# Hand-entered feature subset (all categorical columns) used for the hold-out
# experiment in the following cells; order is preserved as given.
selected_features = [
    'cat{}'.format(idx)
    for idx in (80, 101, 100, 57, 114, 79, 44, 26, 94, 38, 32, 35, 67, 59)
]

In [10]:
# Feature matrix: the training frame restricted to the hand-picked columns,
# in the order listed in `selected_features`.
X = train[selected_features]

In [11]:
# 80/20 split of row *positions* (not labels), stratified on outlier_flag, with
# a fixed random_state for repeatability.
# Note: this rebinds `itrain` / `itest` from the earlier stratified-sample cell.
itrain, itest = train_test_split(range(len(X)), stratify=train.outlier_flag, test_size=0.2, random_state=11232)

In [12]:
# Slice features and log-scale target by position into the fitting part and
# the held-out part of the split.
X_train, y_train = X.iloc[itrain], y.iloc[itrain]
X_test,  y_test  = X.iloc[itest],  y.iloc[itest]

In [19]:
# Random forest regressor fit on the log-scale target; hyper-parameters are
# gathered in one place and the seed is fixed for repeatability.
rf_params = dict(n_estimators=100, max_depth=13, n_jobs=-1, random_state=12121)

clf = RandomForestRegressor(**rf_params)
clf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=13,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=-1, oob_score=False,
           random_state=12121, verbose=0, warm_start=False)

In [20]:
# The forest predicts log(loss); map predictions and targets back to the
# original scale before computing MAE.
# NOTE(review): "unseen" only holds if `selected_features` was not chosen using
# these held-out rows -- confirm how the list was obtained.
y_hat = np.exp(clf.predict(X_test))
holdout_mae = mean_absolute_error(np.exp(y_test), y_hat)
print('MAE on unseen examples ', holdout_mae)

MAE on unseen examples  1309.51065277
