In [18]:
%matplotlib inline

import numpy as np
import pandas as pd
import os, sys

import warnings
warnings.filterwarnings('ignore')

from sklearn.decomposition import PCA
from sklearn.cross_validation import train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

import xgboost as xgb

basepath = os.path.expanduser('~/Desktop/src/AllState_Claims_Severity/')
sys.path.append(os.path.join(basepath, 'src'))

np.random.seed(12)

In [2]:
train      = pd.read_csv(os.path.join(basepath, 'data/raw/train.csv'))
test       = pd.read_csv(os.path.join(basepath, 'data/raw/test.csv'))
sample_sub = pd.read_csv(os.path.join(basepath, 'data/raw/sample_submission.csv'))

In [3]:
# append train and test
data = pd.concat((train, test))

In [7]:
# target variable
y = np.log(train.loss)

In [4]:
# categorical and continuous variables
categorical_variables = [col for col in data.columns if 'cat' in col]
continuous_variables  = [col for col in data.columns if 'cont' in col]

In [6]:
data_cont = data[continuous_variables]

In [12]:
pca = PCA(n_components=10, whiten=True)
pca.fit(data_cont)
data_rem = pca.transform(data_cont)

In [13]:
print('Explained variance by the components {}'.format(np.sum(pca.explained_variance_ratio_)))

Explained variance by the components 0.9858669971480126


In [15]:
train_ = data_cont[:len(train)]
test_  = data_cont[len(train):]

In [16]:
itrain, itest = train_test_split(range(len(train_)), test_size=0.2, random_state=1231)

X_train = train_.iloc[itrain]
X_test  = train_.iloc[itest]

y_train = y.iloc[itrain]
y_test  = y.iloc[itest]

In [20]:
kf = KFold(len(X_train), n_folds=3, random_state=12313)

for i, (itr, ite) in enumerate(kf):
    print('Fold: {}'.format(i))
    
    Xtr = X_train.iloc[itr]
    Xte = X_train.iloc[ite]
    
    ytr = y_train.iloc[itr]
    yte = y_train.iloc[ite]
    
    est = RandomForestRegressor(n_jobs=-1, random_state=123111)
    est.fit(Xtr, ytr)
    
    yhat = est.predict(Xte)
    
    print('MAE on unseen examples: {}'.format(mean_absolute_error(np.exp(yte), np.exp(yhat))))
    

Fold: 0
MAE on unseen examples: 1935.4824954172411
Fold: 1
MAE on unseen examples: 1937.2655072917094
Fold: 2
MAE on unseen examples: 1949.493133571672
