# Variance Inflation Factor

In [1]:
import pandas as pd
import numpy as np
from patsy import dmatrices
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
# Country to analyse; this name is used to build the input data paths and the output image name.
COUNTRY = "Cameroon"

In [3]:
# Folder holding the time-series output data of the selected country.
PATH_TO_DATA_FOLDER = f"../Dataset time-series/output_data/{COUNTRY}/"

## Time-series dataset

In [4]:
# Read the daily-interpolated time series: the CSV has a two-row column header
# and its first column is used as the index.
freq = "D"
df = pd.read_csv(f"{PATH_TO_DATA_FOLDER}{COUNTRY}.csv", header = [0, 1], index_col = 0)
# Parse the index as datetimes and label it.
df.index = pd.to_datetime(df.index).rename("Datetime")
# Declare the frequency of the datetime index.
df.index.freq = freq

In [5]:
# List the distinct names found on the second column level of the loaded frame.
df.columns.get_level_values(1).unique()

Index(['1 Month Anomaly Rainfalls (%)', '3 Months Anomaly Rainfalls (%)',
       'Area', 'Code', 'FCG', 'Fatalities', 'Lat', 'Lon', 'NDVI',
       'NDVI Anomaly', 'Population', 'Rainfalls (mm)', 'Ramadan', 'Waterways',
       'rCSI'],
      dtype='object', name='Indicator')

In [6]:
# Restrict the frame to the indicators that take part in the VIF analysis.
# Names listed here that are missing from the data simply select nothing.
INDICATORS_TO_CONSIDER = [
    "FCG",
    "1 Month Anomaly Rainfalls (%)",
    "3 Months Anomaly Rainfalls (%)",
    "NDVI",
    "Price cereals and tubers",
    "Fatalities",
    "NDVI Anomaly",
    "rCSI",
    "Rainfalls (mm)",
] # "Exchange rate"

# Keep a column when its second-level label is one of the selected indicators.
df = df.loc[:, df.columns.isin(INDICATORS_TO_CONSIDER, level = 1)]
df.head()

AdminStrata,Adamawa,Adamawa,Adamawa,Adamawa,Adamawa,Adamawa,Adamawa,Adamawa,Central,Central,...,South-West,South-West,West,West,West,West,West,West,West,West
Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),FCG,Fatalities,NDVI,NDVI Anomaly,Rainfalls (mm),rCSI,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),...,Rainfalls (mm),rCSI,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),FCG,Fatalities,NDVI,NDVI Anomaly,Rainfalls (mm),rCSI
Datetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2018-01-01,,,,,,,,,,,...,,,,,,,,,,
2018-01-02,,,,,,,,,,,...,,,,,,,,,,
2018-01-03,,,,,,,,,,,...,,,,,,,,,,
2018-01-04,,,,,,,,,,,...,,,,,,,,,,
2018-01-05,,,,,,,,,,,...,,,,,,,,,,


In [7]:
# Give the indicators (second column level) short names made only of letters, so
# that they can be joined with "+" into the regression formula built in `vif_func`.
# Keys that are not present in the frame are ignored by `rename`.
# A new frame is assigned instead of renaming in place, so re-running the cell is
# harmless and previously displayed frames are not silently mutated.
df = df.rename(columns = {"1 Month Anomaly Rainfalls (%)": "omonthrainfalls",
                          "3 Months Anomaly Rainfalls (%)": "tmonthrainfalls",
                          "Exchange rate": "exchange",
                          "FCG": "fcg",
                          "Fatalities": "fatalities",
                          "NDVI Anomaly": "ndvianomaly",  # was misspelled "ndvianonaly"
                          "NDVI": "ndvi",
                          "Price cereals and tubers": "pricecereals",
                          "Rainfalls (mm)": "rainfalls",
                          "rCSI": "rcsi"}, level = 1)

In [8]:
def vif_func(group):
    """Compute the variance inflation factor (VIF) of every regressor of one region.

    Parameters
    ----------
    group : pd.DataFrame
        Indicators of a single region. Either a frame whose columns are the
        indicator names, or (as handed over by `groupby(...).apply`) a frame with
        two column levels and a `name` attribute holding the region key.

    Returns
    -------
    pd.DataFrame
        One row per design-matrix column (the "Intercept" term included), indexed
        by "features", with a single "VIF Factor" column. A regressor that is
        constant over the rows used gets NaN.
    """
    if isinstance(group.columns, pd.MultiIndex):
        # Strip the outer (region) column level so that only indicator names remain.
        group = group[group.name]

    # Every indicator except the response "fcg" enters the right-hand side.
    features = "+".join(group.columns[group.columns != "fcg"])

    # Build the design matrix; the response matrix is not used afterwards.
    _, X = dmatrices('fcg ~' + features, group, return_type = "dataframe")

    def column_vif(position):
        # A constant regressor has a zero centered total sum of squares, which makes
        # statsmodels divide by zero (RuntimeWarning) when computing R^2.
        # Report NaN explicitly instead. The intercept is constant by construction
        # and keeps its original treatment.
        if X.columns[position] != "Intercept" and X.iloc[:, position].nunique() <= 1:
            return np.nan
        return variance_inflation_factor(X.values, position)

    # For each column of X, calculate the VIF and save it in a dataframe.
    vif = pd.DataFrame()
    vif["VIF Factor"] = [column_vif(position) for position in range(X.shape[1])]
    vif["features"] = X.columns

    return vif.set_index("features")

# One VIF table per region (first column level), placed side by side under the region name.
# This replaces `df.groupby(axis = 1, level = 0).apply(vif_func)`: grouping along
# columns is deprecated in recent pandas versions.
vif_factor = pd.concat({region: vif_func(df[region])
                        for region in sorted(df.columns.get_level_values(0).unique())},
                       axis = 1, names = [df.columns.names[0]])
vif_factor.head()

  return 1 - self.ssr/self.centered_tss


AdminStrata,Adamawa,Central,East,Far-North,Littoral,North,North-West,South,South-West,West
Unnamed: 0_level_1,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor
features,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Intercept,1724.964433,4713.468231,5235.628673,545.748451,9511.602386,1159.207423,1505.638928,12066.830569,12843.335903,1512.911527
omonthrainfalls,1.68598,1.396595,1.582241,1.224535,1.76693,1.221173,1.282572,1.427852,1.841063,1.592255
tmonthrainfalls,1.711412,1.708446,2.175488,1.383453,1.779595,1.747776,1.48608,1.446732,2.074952,1.615155
fatalities,1.102305,1.06176,1.446811,1.581505,1.372585,1.11723,1.154773,,1.323328,1.12544
ndvi,2.744073,2.379038,2.407369,2.293753,2.524456,2.875607,3.361605,2.156019,3.802303,2.585557


In [9]:
# Average the VIF of each feature over all regions into a single "VIF" column.
vif = vif_factor.mean(axis = 1).to_frame(name = "VIF")
vif

Unnamed: 0_level_0,VIF
features,Unnamed: 1_level_1
Intercept,5081.933652
omonthrainfalls,1.50212
tmonthrainfalls,1.712909
fatalities,1.253971
ndvi,2.712978
ndvianonaly,1.821116
rainfalls,2.984742
rcsi,1.247589


In [10]:
# `dfi` is never referenced by name below.
# NOTE(review): presumably importing dataframe_image is what makes `export_png`
# available on the Styler — confirm before moving or removing this import.
import dataframe_image as dfi

# Save the averaged VIF table as a PNG named after the country.
vif.style.export_png(f"./output_images/vif_{COUNTRY}.png")