In [1]:
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from matplotlib.colors import ListedColormap
import numpy as np
import scipy as sp
from patsy import dmatrices
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor


#%matplotlib inline

In [2]:
# Read the CSV table into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='Table2_Hunt_2013_edit.csv')
data.head(n=5)

Unnamed: 0,Gross pay,Phi-h,Position,Pressure,Random 1,Random 2,Gross pay transform,Production
0,0.1,0.5,2.1,19,5,379,3.54,15.1
1,1.0,4.0,1.1,16,13,269,5.79,21.3
2,1.9,19.0,1.0,14,12,245,8.51,22.75
3,3.1,21.7,2.1,17,6,273,11.52,15.72
4,4.1,24.6,2.9,11,10,237,10.16,7.71


In [3]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import Imputer

from statsmodels.stats.outliers_influence import variance_inflation_factor

In [4]:
# Split the table into predictors (X) and target (y) WITHOUT mutating `data`.
# `X = data` followed by `X.pop(...)` only aliased the frame, so the pop
# removed 'Production' from `data` as well: re-running this cell raised a
# KeyError and `data` no longer matched what the loading cell displayed.
X = data.drop('Production', axis=1)  # new frame; `data` keeps every column
y = data['Production']
X.head()

Unnamed: 0,Gross pay,Phi-h,Position,Pressure,Random 1,Random 2,Gross pay transform
0,0.1,0.5,2.1,19,5,379,3.54
1,1.0,4.0,1.1,16,13,269,5.79
2,1.9,19.0,1.0,14,12,245,8.51
3,3.1,21.7,2.1,17,6,273,11.52
4,4.1,24.6,2.9,11,10,237,10.16


#### Modified VIF from:
https://www.kaggle.com/ffisegydd/sklearn-multicollinearity-class

In [5]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

class ReduceVIF(BaseEstimator, TransformerMixin):
    """Drop columns one at a time until every remaining VIF is <= ``thresh``.

    Parameters
    ----------
    thresh : float, default 10.0
        Largest variance inflation factor a column may have and still be kept.
    impute : bool, default True
        If true, missing values are imputed before the VIFs are computed.
    impute_strategy : str, default 'median'
        Strategy passed to the imputer when ``impute`` is true.
    """

    def __init__(self, thresh=10.0, impute=True, impute_strategy='median'):
        # From looking at documentation, values between 5 and 10 are "okay".
        # Above 10 is too high and so should be removed.
        self.thresh = thresh

        # Every constructor argument is also stored under its own name:
        # BaseEstimator.get_params() (used by clone() and repr()) reads them
        # back with getattr() and fails if one is missing.
        self.impute = impute
        self.impute_strategy = impute_strategy

        # The statsmodel function will fail with NaN values, as such we have to impute them.
        # By default we impute using the median value.
        # This imputation could be taken out and added as part of an sklearn Pipeline.
        # NOTE(review): recent scikit-learn releases replace `Imputer` with
        # `sklearn.impute.SimpleImputer` -- confirm against the installed version.
        if impute:
            self.imputer = Imputer(strategy=impute_strategy)

    def fit(self, X, y=None):
        """Fit the imputer (when one was configured) on ``X``; return ``self``."""
        print('ReduceVIF fit')
        if hasattr(self, 'imputer'):
            self.imputer.fit(X)
        return self

    def transform(self, X, y=None):
        """Impute ``X`` if configured, then return it without the high-VIF columns.

        ``X`` must be a DataFrame (its ``columns`` and ``index`` are used).
        """
        print('ReduceVIF transform')
        columns = X.columns.tolist()
        if hasattr(self, 'imputer'):
            # Keep the caller's row index so the result still lines up with y.
            X = pd.DataFrame(self.imputer.transform(X), columns=columns, index=X.index)
        return ReduceVIF.calculate_vif(X, self.thresh)

    @staticmethod
    def calculate_vif(X, thresh=10.0):
        """Repeatedly drop the column with the largest VIF while it exceeds ``thresh``.

        Parameters
        ----------
        X : pandas.DataFrame
            Numeric predictors without missing values.
        thresh : float, default 10.0
            A column is dropped only while the largest VIF is greater than this.

        Returns
        -------
        pandas.DataFrame
            ``X`` restricted to the surviving columns.

        NOTE(review): the VIFs are computed on the columns exactly as given (no
        intercept column is added here) -- confirm this is what is intended.
        """
        # Taken from https://stats.stackexchange.com/a/253620/53565 and modified
        # Stop once fewer than two columns remain: there is nothing left to
        # compare against, and an empty frame would make max() raise.
        while X.shape[1] > 1:
            values = X.values  # hoisted: identical for every column in this pass
            vif = [variance_inflation_factor(values, idx) for idx in range(values.shape[1])]

            max_vif = max(vif)
            # Written as `not >` so that a NaN maximum ends the loop instead of
            # triggering a drop.
            if not max_vif > thresh:
                break

            maxloc = vif.index(max_vif)
            # Look the name up in the *current* frame: positions in the global
            # `data` frame stop matching X as soon as one column is removed.
            worst = X.columns[maxloc]
            print('Dropping ' + str(worst) + ' with vif = ' + str(max_vif))
            X = X.drop([worst], axis=1)
        return X

In [6]:
transformer = ReduceVIF()
X = transformer.fit_transform(X, y)

X.head()

ReduceVIF fit
ReduceVIF transform
Dropping X.columns Gross pay transform with vif = 123.29817154016271
Dropping X.columns Pressure with vif = 25.237730405486285
Dropping X.columns Gross pay with vif = 15.667421811179521


Unnamed: 0,Phi-h,Position,Random 1,Random 2
0,0.5,2.1,5.0,379.0
1,4.0,1.1,13.0,269.0
2,19.0,1.0,12.0,245.0
3,21.7,2.1,6.0,273.0
4,24.6,2.9,10.0,237.0


#### What? That does not make much sense... or does it?

"I casi son due: o c'è un errore nel loro programma VIF, ma ne dubito, e comunque non ne so abbastanza da trovarlo, almeno per il momento... oppure semplicemente porta a un risultato inaspettato. O forse no, magari ha considerato Phi-h e Gross pay e ha deciso di scartare la seconda. Il domain knowledge (articolo originale) suggerirebbe di tenerle entrambe perché Gross pay e la 'h' di Phi-h sono correlate, ma Phi no, e serve... Interessante che tenga la Position. Discutibile che siano sopravvissute le due Random, o forse no, visto che erano state fabbricate a bella posta per sembrare legittime, e non sono correlate alle altre, al contrario della mia Gross pay transform."


This paper cautions (part 6) against simply "eliminating one or more of the independent variables that are highly correlated with the other independent variables":
https://pdfs.semanticscholar.org/ed1f/4466a0982f3e8de202de01ecceb473d11893.pdf