In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import os, sys

import warnings
warnings.filterwarnings('ignore')

from sklearn.decomposition import PCA
from sklearn.cross_validation import train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

import xgboost as xgb

basepath = os.path.expanduser('~/Desktop/src/AllState_Claims_Severity/')
sys.path.append(os.path.join(basepath, 'src'))

np.random.seed(12)

In [2]:
train      = pd.read_csv(os.path.join(basepath, 'data/raw/train.csv'))
test       = pd.read_csv(os.path.join(basepath, 'data/raw/test.csv'))
sample_sub = pd.read_csv(os.path.join(basepath, 'data/raw/sample_submission.csv'))

In [3]:
# append train and test
data = pd.concat((train, test))

In [4]:
# target variable
y = np.log(train.loss)

In [5]:
# categorical and continuous variables
categorical_variables = [col for col in data.columns if 'cat' in col]
continuous_variables  = [col for col in data.columns if 'cont' in col]

In [18]:
data_cont = data[continuous_variables]

In [7]:
pca = PCA(n_components=10, whiten=True)
pca.fit(data_cont)
data_rem = pca.transform(data_cont)

In [8]:
print('Explained variance by the components {}'.format(np.sum(pca.explained_variance_ratio_)))

Explained variance by the components 0.9858669971480126


In [19]:
train_ = data_cont[:len(train)]
test_  = data_cont[len(train):]

In [15]:
itrain, itest = train_test_split(range(len(train)), test_size=0.2, random_state=1231)

In [22]:
X_train = train_.iloc[itrain]
X_test  = train_.iloc[itest]

y_train = y.iloc[itrain]
y_test  = y.iloc[itest]

In [25]:
def cv(X_train, y_train):
    kf = KFold(len(X_train), n_folds=3, random_state=12313)

    for i, (itr, ite) in enumerate(kf):
        print('Fold: {}'.format(i))

        Xtr = X_train.iloc[itr]
        Xte = X_train.iloc[ite]

        ytr = y_train.iloc[itr]
        yte = y_train.iloc[ite]

        est = RandomForestRegressor(n_jobs=-1, random_state=123111)
        est.fit(Xtr, ytr)

        yhat = est.predict(Xte)

        print('MAE on unseen examples: {}'.format(mean_absolute_error(np.exp(yte), np.exp(yhat))))    

In [26]:
def get_correlated_features(df, numerical_columns):
    """
    Arguments
    ---------
    
    df: Dataframe
    
    Returns
    -------
    List of correlated pairs
    """
    
    df_cont = df[numerical_columns]
    
    correlated_pairs = []
    df_corr          = df_cont.corr()
    index            = df_corr.index.values
    
    for i in range(len(df_corr)):
        for j in range(len(numerical_columns)):
            if i == j:
                continue
            else:
                if abs(df_cont.iloc[i, j]) > 0.8:
                    correlated_pairs.append((index[i], numerical_columns[j]))
                    
    return correlated_pairs

In [13]:
correlated_pairs = get_correlated_features(data, continuous_variables)

In [31]:
def remove_correlated_pairs(X_train, y_train, X_test, y_test, correlated_pairs):
    columns = X_train.columns
    
    for col1, col2 in correlated_pairs:
        print('Pair: {0}, {1}'.format(col1, col2))
        
        features = columns.drop([col1])
        cv(X_train[features], y_train)
        
        est = RandomForestRegressor(n_jobs=-1, random_state=123111)
        est.fit(X_train[features], y_train)
        
        yhat = est.predict(X_test[features])
        print('First feature removed')
        print('MAE on unseen examples {}'.format(mean_absolute_error(np.exp(y_test), np.exp(yhat))))
        print('-'*50)
        
        features = columns.drop([col2])
        cv(X_train[features], y_train)
        
        est = RandomForestRegressor(n_jobs=-1, random_state=123111)
        est.fit(X_train[features], y_train)
        
        yhat = est.predict(X_test[features])
        print('Second feature removed')
        print('MAE on unseen examples {}'.format(mean_absolute_error(np.exp(y_test), np.exp(yhat))))
        print('-'*50)
        
        features = columns.drop([col1, col2])
        cv(X_train[features], y_train)
        
        est = RandomForestRegressor(n_jobs=-1, random_state=123111)
        est.fit(X_train[features], y_train)
        
        yhat = est.predict(X_test[features])
        print('Both features removed')
        print('MAE on unseen examples {}'.format(mean_absolute_error(np.exp(y_test), np.exp(yhat))))
        print('-'*50)
        
        print('\n\n')

In [32]:
remove_correlated_pairs(X_train, y_train, X_test, y_test, correlated_pairs)

Pair: cont1, cont10
Fold: 0
MAE on unseen examples: 1930.230364379847
Fold: 1
MAE on unseen examples: 1935.535350540787
Fold: 2
MAE on unseen examples: 1948.797271494167
First feature removed
MAE on unseen examples 1936.212314434266
--------------------------------------------------
Fold: 0
MAE on unseen examples: 1935.9103237878917
Fold: 1
MAE on unseen examples: 1937.2062102517655
Fold: 2
MAE on unseen examples: 1950.2240258861727
Second feature removed
MAE on unseen examples 1932.9738788665557
--------------------------------------------------
Fold: 0
MAE on unseen examples: 1926.2824028341113
Fold: 1
MAE on unseen examples: 1934.4459317829198
Fold: 2
MAE on unseen examples: 1949.5650529422155
Both features removed
MAE on unseen examples 1931.2949022551961
--------------------------------------------------



Pair: cont1, cont13
Fold: 0
MAE on unseen examples: 1930.230364379847
Fold: 1
MAE on unseen examples: 1935.535350540787
Fold: 2
MAE on unseen examples: 1948.797271494167
First 