```
Create a basic model using only the continous features ?

* How to deal with the continous features ?
* How can we select continuous feature that actually relate with our target variable ?
* Which model is suitable to deal with the continuous variables ?
```

In [86]:
%matplotlib inline

import numpy as np
import pandas as pd
import os,sys

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cross_validation import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion

from scipy.stats.mstats import gmean

import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

basepath = os.path.expanduser('~/Desktop/src/AllState_Claims_Severity/')
sys.path.append(os.path.join(basepath, 'src'))

np.random.seed(2016)

from data import *

In [27]:
train, test, sample_sub = load_data()

In [28]:
data = pd.concat((train, test))

** Numerical variables. **

In [30]:
numerical_features = [col for col in data.columns if 'cont' in col]
print('Number of numerical features: {}'.format(len(numerical_features)))
print('Numerical Features: \n{}'.format(numerical_features))

Number of numerical features: 14
Numerical Features: 
['cont1', 'cont10', 'cont11', 'cont12', 'cont13', 'cont14', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9']


In [141]:
def create_features(X):
    """
        
    Modifications:

    1. cont1: create a variable defining frequency count of the variable
    2. cont2: create a categorical variable out of it.
    3. cont3: create a categorical variable out of it.
    4. cont6: create a variable defining frequency count of the variable.
    5. cont7: create a variable defining frequency count of the variable.
    6. cont9: create a variable defining frequency count of the variable.
    7. cont10: create a variable defining frequency count of the variable.
    8. cont11: create a variable defining frequency count of the variable.
    9. cont12: create a variable defining frequency count of the variable.
    10.cont13: create a variable defining frequency count of the variable.
        
    """
    
    X['cont1_count'] = X.groupby(['cont1'])['cont1'].transform(lambda x: len(x))
    X['cont2_cat']   = X['cont2'].map(lambda x: np.round(x, decimals=1))
    X['cont3_count'] = X.groupby(['cont3'])['cont3'].transform(lambda x: len(x))
    X['cont6_count'] = X.groupby(['cont6'])['cont6'].transform(lambda x: len(x))
    X['cont7_count'] = X.groupby(['cont7'])['cont7'].transform(lambda x: len(x))
    X['cont9_count'] = X.groupby(['cont9'])['cont9'].transform(lambda x: len(x))
    X['cont10_count'] = X.groupby(['cont10'])['cont10'].transform(lambda x: len(x))
    X['cont11_count'] = X.groupby(['cont11'])['cont11'].transform(lambda x: len(x))
    X['cont12_count'] = X.groupby(['cont12'])['cont12'].transform(lambda x: len(x))
    X['cont13_count'] = X.groupby(['cont13'])['cont13'].transform(lambda x: len(x))
    
    return X
        
class ContinuousFeatureMorpher(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = create_features(X)
        return X
    
class VarSelect(BaseEstimator, TransformerMixin):
    def __init__(self, keys):
        self.keys = keys
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, df):
        return df[self.keys]    

In [142]:
# feature list
feature_list = [
    'cont1',
    'cont1_count',
    'cont2',
    'cont2_cat',
    'cont3',
    'cont3_count',
    'cont6',
    'cont6_count',
    'cont7',
    'cont7_count',
    'cont9',
    'cont9_count',
    'cont10',
    'cont10_count',
    'cont11',
    'cont11_count',
    'cont12',
    'cont12_count',
    'cont13',
    'cont13_count'
]

In [143]:
train_ = data[:len(train)]
test_  = data[len(train):]

In [154]:
# remove instances with very high loss values
mask_remove_outliers = train_.loss < 2e4

train_ = train_[mask_remove_outliers]

In [155]:
# split into train and test set
itr, ite = train_test_split(range(len(train_)), test_size=0.3, random_state=21386)

Xtr = train_.iloc[itr][numerical_features]
Xte = train_.iloc[ite][numerical_features]

ytr = np.log(train_.iloc[itr]['loss'])
yte = np.log(train_.iloc[ite]['loss'])

In [156]:
# model definition
pipeline_rf = Pipeline([(
            'union', FeatureUnion([
                    ('morpher', ContinuousFeatureMorpher()),
                    ('var', VarSelect(keys=feature_list))
                ])
        ),
        ('model', RandomForestRegressor(n_estimators=50, max_depth=7, n_jobs=-1, random_state=23137))
        ])

pipeline_xgbr = Pipeline([(
            'union', FeatureUnion([
                    ('morpher', ContinuousFeatureMorpher()),
                    ('var', VarSelect(keys=feature_list))
                ])
        ),
        ('model', xgb.XGBRegressor(seed=23123137))
        ])

In [157]:
def cv(train, target, estimators, **params):
    kf = KFold(len(train), n_folds=params['n_folds'], shuffle=params['shuffle'], random_state=123731)
    scores = []
    
    for i, (itr, ite) in enumerate(kf):
        print('Fold: '.format(i))
        
        Xtr = train.iloc[itr]
        Xte = train.iloc[ite]
        
        ytr = target.iloc[itr]
        yte = target.iloc[ite]
        
        errors = []
        yhats  = []
        
        for k, est in estimators.items():
            print('Estimator: {}'.format(k))
            
            est.fit(Xtr, ytr)
            yhat = np.exp(est.predict(Xte))
            error = mean_absolute_error(np.exp(yte), yhat)
            
            yhats.append(yhat)
            errors.append(error)
            
            print('MAE: {}'.format(error))
            
        ensemble_yhat  = gmean(yhats)
        ensemble_score = mean_absolute_error(np.exp(yte), ensemble_yhat) 
        print('Ensemble MAE: {}'.format(ensemble_score))
        print('-'*50+'\n')
        
        scores.append(ensemble_score)
    
    return scores

In [158]:
params = {
    'n_folds': 3,
    'shuffle': True
}

estimators = {
    'RandomForestRegressor': pipeline_rf,
    'XGBoostRegressor':      pipeline_xgbr
}

cv(Xtr, ytr, estimators, **params)

Fold: 
Estimator: RandomForestRegressor
MAE: 1710.8088283159805
Estimator: XGBoostRegressor
MAE: 1708.1254922427784
Ensemble MAE: 1708.4919552260162
--------------------------------------------------

Fold: 
Estimator: RandomForestRegressor
MAE: 1719.64414114493
Estimator: XGBoostRegressor
MAE: 1716.3023026043495
Ensemble MAE: 1716.9879330906203
--------------------------------------------------

Fold: 
Estimator: RandomForestRegressor
MAE: 1737.9925495270613
Estimator: XGBoostRegressor
MAE: 1735.81144343213
Ensemble MAE: 1735.76850945925
--------------------------------------------------



[1708.4919552260162, 1716.9879330906203, 1735.76850945925]

In [79]:
pipeline.fit(Xtr, ytr)
preds = np.exp(pipeline.predict(Xte))
print('MAE on unseen examples: {}'.format(mean_absolute_error(np.exp(yte), preds)))

MAE on unseen examples: 1778.2843834427758
