# Variance Inflation Factor

In [1]:
from pathlib import Path

import dataframe_image as dfi
import numpy as np
import pandas as pd
import statsmodels.api as sm
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
# Country under analysis; used to build the input folder/file names and the name of the exported image.
COUNTRY = "Burkina Faso"

In [3]:
# Folder (relative to this notebook) holding the time-series files of the selected country.
PATH_TO_DATA_FOLDER = f"../Dataset time-series/output_data/{COUNTRY}/"

## Time-series dataset

In [4]:
# Read the daily-interpolated time series: the first two CSV rows form the two-level
# column header and the first CSV column becomes the row index.
df = pd.read_csv(f"{PATH_TO_DATA_FOLDER}{COUNTRY}-day.csv", header = [0, 1], index_col = 0)

# Turn the row labels into a named datetime index.
df.index = pd.to_datetime(df.index)
df.index.name = "Datetime"

# Declare the index as daily; pandas rejects the assignment if the dates do not conform.
freq = "D"
df.index.freq = freq

In [5]:
# Show the distinct labels found in the second level of the column index (the indicator names).
df.columns.get_level_values(1).unique()

Index(['1 Month Anomaly Rainfalls (%)', '3 Months Anomaly Rainfalls (%)',
       'Code', 'Exchange rate', 'FCG', 'Fatalities', 'Lat', 'Lon', 'NDVI',
       'NDVI Anomaly', 'Population', 'Price cereals and tubers',
       'Rainfalls (mm)', 'Ramadan', 'rCSI'],
      dtype='object', name='Indicator')

In [6]:
# Indicators kept for the VIF analysis ("Exchange rate" is currently left out).
INDICATORS_TO_CONSIDER = [
    "FCG",
    "1 Month Anomaly Rainfalls (%)",
    "3 Months Anomaly Rainfalls (%)",
    "NDVI",
    "Price cereals and tubers",
    "Fatalities",
    "NDVI Anomaly",
    "rCSI",
    "Rainfalls (mm)",
]

# Boolean mask over the columns: True where the second-level label is one of the kept indicators.
indicator_mask = df.columns.get_level_values(1).isin(INDICATORS_TO_CONSIDER)
df = df.loc[:, indicator_mask]
df.head()

AdminStrata,Boucle-Du-Mouhoun,Boucle-Du-Mouhoun,Boucle-Du-Mouhoun,Boucle-Du-Mouhoun,Boucle-Du-Mouhoun,Boucle-Du-Mouhoun,Boucle-Du-Mouhoun,Boucle-Du-Mouhoun,Boucle-Du-Mouhoun,Cascades,...,Sahel,Sud-Ouest,Sud-Ouest,Sud-Ouest,Sud-Ouest,Sud-Ouest,Sud-Ouest,Sud-Ouest,Sud-Ouest,Sud-Ouest
Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),FCG,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI,1 Month Anomaly Rainfalls (%),...,rCSI,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),FCG,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI
Datetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2019-09-08,104.747834,106.271378,30.9217,4.0,0.496905,95.011603,0.077833,424.5106,19.4995,120.695958,...,23.6735,117.742597,117.51808,36.2381,0.0,0.6591,100.49678,0.295372,465.6788,17.7619
2019-09-09,104.800365,106.240116,31.1039,4.0,0.500725,95.0447,0.078059,426.2755,19.5674,120.653097,...,23.856,117.536886,117.497259,35.8286,0.0,0.660844,100.491038,0.294013,467.8467,17.5612
2019-09-10,104.881777,106.220459,32.0303,4.0,0.504537,95.083381,0.078284,428.0404,20.4007,120.554396,...,24.631,117.317641,117.473018,36.2542,0.0,0.66256,100.485497,0.292654,470.0146,17.8142
2019-09-11,104.988996,106.210471,32.1673,4.0,0.508317,95.128519,0.078509,428.2382,20.4056,120.44291,...,23.3179,117.088827,117.445525,35.8838,0.0,0.664241,100.480693,0.291296,471.5916,17.3366
2019-09-12,105.122024,106.210151,31.7701,4.0,0.512066,95.180114,0.078734,428.436,19.2385,120.318638,...,23.122,116.850446,117.414779,34.995,0.0,0.665889,100.476627,0.289937,473.1686,17.2483


In [7]:
# Replace the human-readable indicator labels (second column level) with short identifiers:
# the labels are later joined with "+" into a regression formula, where spaces, digits at the
# start and symbols such as "(%)" would not be valid variable names.
# The result is re-assigned instead of mutating the frame in place, so the cell does not rely
# on `inplace` semantics of a frame that was itself obtained by slicing.
# Labels that are absent from the frame (e.g. "Exchange rate" after the filtering above) are
# simply ignored by `rename`.
# NOTE(review): "ndvianonaly" looks like a typo of "ndvianomaly"; it is kept unchanged because
# the label is shown as-is in the results further down.
df = df.rename(columns = {"1 Month Anomaly Rainfalls (%)": "omonthrainfalls",
                          "3 Months Anomaly Rainfalls (%)": "tmonthrainfalls",
                          "Exchange rate": "exchange",
                          "FCG": "fcg",
                          "Fatalities": "fatalities",
                          "NDVI Anomaly": "ndvianonaly",
                          "NDVI": "ndvi",
                          "Price cereals and tubers": "pricecereals",
                          "Rainfalls (mm)": "rainfalls",
                          "rCSI": "rcsi"}, level = 1)

In [8]:
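# Disabled experiment: drop a few indicators before computing the VIF.
# NOTE: if this is re-enabled, the labels must match the renamed columns ("ndvi", not "NDVI").
#df.drop(["tmonthrainfalls", "NDVI", "ndvianonaly"], axis = 1, level = 1, inplace = True)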

In [9]:
def vif_func(group):
    """Compute the variance inflation factor (VIF) of each regressor of one column group.

    Parameters
    ----------
    group : pandas.DataFrame
        Data of a single top-level column group (one "AdminStrata" in this notebook).
        The columns may be the plain indicator names, or a two-level index whose outer
        level is dropped here. A column named "fcg" must be present; all the other
        columns are used as regressors, so their names must be valid formula terms.

    Returns
    -------
    pandas.DataFrame
        A single "VIF Factor" column indexed by "features", i.e. by the columns of the
        design matrix built by patsy ("Intercept" followed by the regressors).
        NOTE(review): the "Intercept" row is not a collinearity diagnostic of an
        indicator; it is kept so that the output is the same as before.
    """
    # Accept both a plain per-group frame and a sub-frame that still carries the group
    # label as outer column level (what `groupby(...).apply` used to pass).
    if isinstance(group.columns, pd.MultiIndex):
        group = group.droplevel(0, axis = 1)

    # Every column except the response is a regressor.
    regressors = [name for name in group.columns if name != "fcg"]

    # Only the design matrix X is needed for the VIF. "fcg" is nevertheless kept in the
    # formula so that the rows retained by patsy are the same as in a regression of fcg
    # on the regressors (patsy's default handling presumably drops incomplete rows).
    _, X = dmatrices("fcg ~ " + "+".join(regressors), group, return_type = "dataframe")

    # Materialise the design matrix once instead of once per column.
    design = X.values
    factors = [variance_inflation_factor(design, i) for i in range(design.shape[1])]

    # A column without variance has an undefined VIF; it shows up as NaN (together with a
    # RuntimeWarning emitted by statsmodels) rather than raising.
    return pd.DataFrame({"VIF Factor": factors}, index = pd.Index(X.columns, name = "features"))

# Compute the VIF table of every top-level column group and put the tables side by side.
# This replaces `df.groupby(axis = 1, level = 0).apply(vif_func)`: grouping along the columns
# is deprecated in recent pandas versions. The groups are visited in sorted order and the
# outer column level keeps its original name, which reproduces the layout of the former result.
vif_factor = pd.concat(
    {strata: vif_func(df[strata]) for strata in sorted(df.columns.get_level_values(0).unique())},
    axis = 1,
    names = [df.columns.names[0]],
)
vif_factor.head()

  return 1 - self.ssr/self.centered_tss


AdminStrata,Boucle-Du-Mouhoun,Cascades,Centre,Centre-Est,Centre-Nord,Centre-Ouest,Centre-Sud,Est,Hauts-Bassins,Nord,Plateau-Central,Sahel,Sud-Ouest
Unnamed: 0_level_1,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor
features,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Intercept,6031.731079,4803.129991,4537.497432,4555.297964,6457.506874,4212.899748,2396.53678,2236.228722,30361.849575,11630.659643,2889.982641,3145.708559,6002.50907
omonthrainfalls,2.122063,2.531073,2.607611,2.34845,2.371986,1.991858,4.4623,2.966524,1.805962,6.144298,3.111877,5.554796,3.085489
tmonthrainfalls,3.722025,10.046579,1.740588,3.928829,3.819422,3.652706,2.625991,4.603897,4.728978,1.461397,1.744862,14.666364,4.192655
fatalities,3.006944,2.275182,1.733049,1.801734,3.3489,3.609131,5.099451,1.97739,2.261209,3.843269,,3.123701,1.196863
ndvi,29.632675,17.474228,18.56355,8.525882,21.32614,15.782706,10.000363,27.821472,7.343053,15.659329,6.695747,24.537632,6.042172


In [10]:
# Average the VIF of each feature over all the column groups (one value per row of `vif_factor`);
# with the pandas defaults, missing values are left out of the average.
vif_factor.mean(axis = "columns")

features
Intercept          6866.272160
omonthrainfalls       3.161868
tmonthrainfalls       4.687253
fatalities            2.773069
ndvi                 16.108073
ndvianonaly          19.105223
pricecereals          6.837847
rainfalls            15.017690
rcsi                  8.946378
dtype: float64

In [11]:
# Make sure the output folder exists, so that the export does not depend on the folder having
# been created by hand before running the notebook.
output_dir = Path("./output_images")
output_dir.mkdir(parents = True, exist_ok = True)

# Average VIF per feature as a one-column table; the values are cast to strings, presumably so
# that they are rendered verbatim in the exported picture.
vif = vif_factor.mean(axis = 1).to_frame(name = "VIF").astype(str)

# `export_png` is not a pandas method: it is expected to be provided by the `dataframe_image`
# package imported at the top of the notebook.
vif.style.export_png(str(output_dir / f"VIF_{COUNTRY}.png"))