# Variance Inflation Factor

In [1]:
import pandas as pd
import numpy as np
from patsy import dmatrices
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
# Country to analyse; its name selects both the input folder and the CSV file name.
COUNTRY = "Burkina Faso"

In [3]:
# Folder that holds the time-series output data of the selected country.
PATH_TO_DATA_FOLDER = "../Dataset time-series/output_data/{}/".format(COUNTRY)

## Time-series dataset

In [4]:
# Read the daily-interpolated time series; the first two CSV rows form the column header.
df = pd.read_csv("{}{}-day.csv".format(PATH_TO_DATA_FOLDER, COUNTRY), header = [0, 1], index_col = 0)
# Parse the index labels as timestamps, name the index and declare its sampling frequency.
df.index = pd.to_datetime(df.index).rename("Datetime")
freq = "D"
df.index.freq = freq

In [5]:
# List the distinct indicator names found on the second column level.
df.columns.unique(level = 1)

Index(['1 Month Anomaly Rainfalls (%)', '3 Months Anomaly Rainfalls (%)',
       'Code', 'Exchange rate', 'FCG', 'Fatalities', 'Lat', 'Lon', 'NDVI',
       'NDVI Anomaly', 'Population', 'Price cereals and tubers',
       'Rainfalls (mm)', 'Ramadan', 'rCSI'],
      dtype='object', name='Indicator')

In [6]:
# Keep only the indicators that enter the variance inflation factor analysis.
INDICATORS_TO_CONSIDER = [
    "FCG",
    "1 Month Anomaly Rainfalls (%)",
    "3 Months Anomaly Rainfalls (%)",
    "NDVI",
    "Price cereals and tubers",
    "Exchange rate",
    "Fatalities",
    "NDVI Anomaly",
    "rCSI",
    "Rainfalls (mm)",
]

# Select the columns whose second-level label (the indicator) is in the list above.
df = df.loc[:, df.columns.isin(INDICATORS_TO_CONSIDER, level = 1)]
df.head()

AdminStrata,Boucle-Du-Mouhoun,Boucle-Du-Mouhoun,Boucle-Du-Mouhoun,Boucle-Du-Mouhoun,Boucle-Du-Mouhoun,Boucle-Du-Mouhoun,Boucle-Du-Mouhoun,Boucle-Du-Mouhoun,Boucle-Du-Mouhoun,Boucle-Du-Mouhoun,...,Sud-Ouest,Sud-Ouest,Sud-Ouest,Sud-Ouest,Sud-Ouest,Sud-Ouest,Sud-Ouest,Sud-Ouest,Sud-Ouest,Sud-Ouest
Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),Exchange rate,FCG,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI,...,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),Exchange rate,FCG,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI
Datetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2019-09-08,104.747834,106.271378,0.796155,30.9217,4.0,0.496905,95.011603,0.077833,424.5106,19.4995,...,117.742597,117.51808,0.796155,36.2381,0.0,0.6591,100.49678,0.295372,465.6788,17.7619
2019-09-09,104.800365,106.240116,0.799253,31.1039,4.0,0.500725,95.0447,0.078059,426.2755,19.5674,...,117.536886,117.497259,0.799253,35.8286,0.0,0.660844,100.491038,0.294013,467.8467,17.5612
2019-09-10,104.881777,106.220459,0.802352,32.0303,4.0,0.504537,95.083381,0.078284,428.0404,20.4007,...,117.317641,117.473018,0.802352,36.2542,0.0,0.66256,100.485497,0.292654,470.0146,17.8142
2019-09-11,104.988996,106.210471,0.805451,32.1673,4.0,0.508317,95.128519,0.078509,428.2382,20.4056,...,117.088827,117.445525,0.805451,35.8838,0.0,0.664241,100.480693,0.291296,471.5916,17.3366
2019-09-12,105.122024,106.210151,0.808549,31.7701,4.0,0.512066,95.180114,0.078734,428.436,19.2385,...,116.850446,117.414779,0.808549,34.995,0.0,0.665889,100.476627,0.289937,473.1686,17.2483


In [7]:
# Replace the indicator labels (second column level) with short lowercase identifiers;
# presumably so they can be written directly as terms of the regression formula.
# NOTE(review): "ndvianonaly" looks like a misspelling of "ndvianomaly"; it is kept
# unchanged here because the label is carried through to the results.
df.rename(
    columns = {
        "FCG": "fcg",
        "rCSI": "rcsi",
        "Exchange rate": "exchange",
        "Price cereals and tubers": "pricecereals",
        "NDVI Anomaly": "ndvianonaly",
        "Rainfalls (mm)": "rainfalls",
        "1 Month Anomaly Rainfalls (%)": "omonthrainfalls",
        "3 Months Anomaly Rainfalls (%)": "tmonthrainfalls",
    },
    level = 1,
    inplace = True,
)

In [8]:
# Discard three indicators (matched on the second column level) before computing the VIFs.
# NOTE(review): presumably removed to limit collinearity among the regressors — confirm.
df.drop(columns = ["NDVI", "exchange", "tmonthrainfalls"], level = 1, inplace = True)

In [9]:
def vif_func(group, target = "fcg"):
    """Compute the variance inflation factor of every design-matrix column of one group.

    Written to be passed to `groupby(...).apply`: the key of the group is read
    from `group.name` and used to select the columns that belong to it.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame with two-level columns and a `name` attribute holding the
        first-level key of the group. The second-level labels are inserted
        verbatim into a patsy formula.
    target : str, default "fcg"
        Column placed on the left-hand side of the formula. Every other
        column of the group becomes a right-hand-side term.

    Returns
    -------
    pandas.DataFrame
        A single "VIF Factor" column indexed by "features", with one row per
        column of the design matrix returned by patsy (the recorded results
        show that this includes an "Intercept" row).
    """
    # Keep only the indicator columns stored under this group's first-level key.
    data = group[group.name]

    # Every column except the response becomes an additive term of the formula.
    predictors = data.columns[data.columns != target]
    formula = target + " ~ " + " + ".join(predictors)

    # Only the right-hand-side design matrix is needed for the VIF computation.
    _, design = dmatrices(formula, data, return_type = "dataframe")

    # One VIF per design-matrix column, addressed by its positional index.
    vif_values = [variance_inflation_factor(design.values, position)
                  for position in range(design.shape[1])]

    return pd.DataFrame({"VIF Factor": vif_values},
                        index = pd.Index(design.columns, name = "features"))

# Run vif_func once per first-level column group (one group per AdminStrata value)
# and collect the per-group results into a single frame.
# NOTE(review): grouping with `axis = 1` is deprecated in recent pandas releases (2.1+);
# confirm against the pandas version in use before upgrading.
vif_factor = df.groupby(axis = 1, level = 0).apply(vif_func) 
vif_factor.head()

  return 1 - self.ssr/self.centered_tss


AdminStrata,Boucle-Du-Mouhoun,Cascades,Centre,Centre-Est,Centre-Nord,Centre-Ouest,Centre-Sud,Est,Hauts-Bassins,Nord,Plateau-Central,Sahel,Sud-Ouest
Unnamed: 0_level_1,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor
features,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Intercept,3633.032383,4715.285986,2705.842524,1059.159324,1661.222419,1689.397206,836.764183,1162.852701,18311.636798,9948.19506,803.02267,530.444669,5081.942481
omonthrainfalls,2.044891,2.39148,2.41062,2.049408,1.800823,1.875202,4.312438,1.510069,1.465856,4.799384,3.003253,4.01948,2.151755
Fatalities,2.019042,1.430907,1.438347,1.654952,2.119137,2.559831,3.461069,1.436318,2.152942,3.097535,,2.694591,1.152143
ndvianonaly,4.550058,4.224591,5.769851,3.367943,12.969098,3.290237,3.207451,2.423421,15.86747,28.106274,3.221617,6.658451,3.631125
pricecereals,2.318911,2.75546,1.5394,1.57619,11.5675,2.677768,2.118859,2.484708,3.7057,4.134287,3.21766,5.04724,3.359135


In [10]:
# Average each feature's VIF across all columns of the per-region result.
vif_factor.mean(axis = "columns")

features
Intercept          4010.676800
omonthrainfalls       2.602666
Fatalities            2.101401
ndvianonaly           7.483661
pricecereals          3.577140
rainfalls             5.224977
rcsi                  4.595812
dtype: float64