# Variance Inflation Factor

In [51]:
import pandas as pd
import numpy as np
from patsy import dmatrices
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [52]:
# Country under analysis; it is used to build both the data folder path and the CSV file name.
COUNTRY = "Mali"

In [53]:
# Folder containing the time-series output data of the selected country.
PATH_TO_DATA_FOLDER = f"../Dataset time-series/output_data/{COUNTRY}/"

## Time-series dataset

In [54]:
# Load the daily-interpolated time-series data: the first two CSV rows form the
# two-level column header and the first column is used as the index.
df = pd.read_csv(f"{PATH_TO_DATA_FOLDER}{COUNTRY}-day.csv", header = [0, 1], index_col = 0)
# Convert the index to datetimes, name it, and declare an explicit daily frequency.
df.index = pd.to_datetime(df.index)
df.index.name = "Datetime"
freq = "D"
df.index.freq = freq

In [55]:
# Show the distinct names found in the second level of the column MultiIndex (the indicators).
df.columns.get_level_values(1).unique()

Index(['1 Month Anomaly Rainfalls (%)', '3 Months Anomaly Rainfalls (%)',
       'Code', 'FCG', 'Fatalities', 'Lat', 'Lon', 'NDVI', 'NDVI Anomaly',
       'Population', 'Price cereals and tubers', 'Rainfalls (mm)', 'Ramadan',
       'rCSI'],
      dtype='object', name='Indicator')

In [56]:
# Restrict the dataset to the indicators examined in this VIF analysis.
# Names that do not occur in the loaded data are simply not matched by `isin`.
INDICATORS_TO_CONSIDER = [
    "FCG",
    "1 Month Anomaly Rainfalls (%)",
    "3 Months Anomaly Rainfalls (%)",
    "NDVI",
    "Price cereals and tubers",
    "Exchange rate",
    "Fatalities",
    "NDVI Anomaly",
    "rCSI",
    "Rainfalls (mm)",
]

# Boolean column selection on the second column level (the indicator names).
df = df.loc[:, df.columns.get_level_values(1).isin(INDICATORS_TO_CONSIDER)]
df.head()

AdminStrata,Bamako,Bamako,Bamako,Bamako,Bamako,Bamako,Bamako,Bamako,Bamako,Kayes,...,Sikasso,Tombouctou-Taoudeni,Tombouctou-Taoudeni,Tombouctou-Taoudeni,Tombouctou-Taoudeni,Tombouctou-Taoudeni,Tombouctou-Taoudeni,Tombouctou-Taoudeni,Tombouctou-Taoudeni,Tombouctou-Taoudeni
Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),FCG,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI,1 Month Anomaly Rainfalls (%),...,rCSI,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),FCG,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI
Datetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2020-06-25,97.021539,95.768473,58.93578,4.0,0.21976,97.802744,0.177408,183.6227,33.06422,95.932145,...,25.427096,103.041785,105.416579,40.500916,14.0,0.107955,100.591062,0.319924,2.9866,47.159438
2020-06-26,97.165819,95.747304,58.80597,4.0,0.2206,97.804598,0.177382,188.3227,32.5,96.169947,...,25.427096,103.306152,105.669399,36.985597,11.0,0.107915,100.580403,0.321816,3.1442,47.427984
2020-06-27,97.295875,95.733084,58.885542,4.0,0.221462,97.806032,0.177357,193.0227,32.341867,96.40364,...,25.427096,103.56742,105.91819,35.849802,11.0,0.107876,100.569645,0.323708,3.3018,46.166008
2020-06-28,97.411706,95.725813,58.885542,4.0,0.222345,97.807045,0.177331,197.7227,32.341867,96.633224,...,25.427096,103.825589,106.162951,34.679245,11.0,0.107838,100.558788,0.325601,3.4594,44.981132
2020-06-29,97.513313,95.72549,58.510638,4.0,0.223249,97.807638,0.177306,202.4227,31.724924,96.858699,...,25.427096,104.082527,106.405931,33.423077,11.0,0.107801,100.548616,0.325601,3.5807,43.923077


In [57]:
# Give the indicators short, identifier-like names: `vif_func` pastes the column
# names directly into a patsy formula string.
# NOTE: "ndvianonaly" (sic) is the exact spelling the later `drop` call relies on.
df.rename(
    columns = {
        "1 Month Anomaly Rainfalls (%)": "omonthrainfalls",
        "3 Months Anomaly Rainfalls (%)": "tmonthrainfalls",
        "Exchange rate": "exchange",
        "FCG": "fcg",
        "NDVI Anomaly": "ndvianonaly",
        "Price cereals and tubers": "pricecereals",
        "Rainfalls (mm)": "rainfalls",
        "rCSI": "rcsi",
    },
    level = 1,
    inplace = True,
)

In [58]:
# Remove these (already renamed) indicators from every group before computing the VIF.
df.drop(columns = ["tmonthrainfalls", "NDVI", "ndvianonaly"], level = 1, inplace = True)

In [59]:
def vif_func(group, target = "fcg"):
    """Compute the variance inflation factor of each design-matrix column for one group.

    Meant to be called through `groupby(...).apply(vif_func)` on the first column
    level: the group key is read from `group.name` and used to strip that outer
    level before the design matrix is built.

    Parameters
    ----------
    group : pandas.DataFrame
        Columns belonging to a single group. `group[group.name]` must yield a
        frame whose columns contain `target` plus the candidate regressors.
        Column names are pasted into a patsy formula, so they have to be valid
        formula terms (no spaces, parentheses, etc.).
    target : str, default "fcg"
        Column used as the left-hand side of the formula; it is excluded from
        the regressors.

    Returns
    -------
    pandas.DataFrame
        A single "VIF Factor" column indexed by "features", i.e. the columns of
        the design matrix returned by patsy (this includes its "Intercept" term).
    """
    # Strip the outer column level so the columns are plain indicator names.
    group = group[group.name]

    # Every column except the target enters the right-hand side of the formula.
    features = "+".join(group.columns[group.columns != target])

    # Only the design matrix X is needed for the VIF; the response is discarded.
    _, X = dmatrices(target + " ~" + features, group, return_type = "dataframe")

    # Extract the array once instead of on every iteration.
    design = X.values
    vif_values = [variance_inflation_factor(design, i) for i in range(design.shape[1])]

    return pd.DataFrame({"VIF Factor": vif_values}, index = pd.Index(X.columns, name = "features"))

# Run the VIF computation once per first-level column group (one per AdminStrata).
# NOTE(review): `groupby(axis = 1)` is deprecated in recent pandas releases — confirm
# the pandas version in use before upgrading.
vif_factor = (
    df
    .groupby(level = 0, axis = 1)
    .apply(vif_func)
)
vif_factor.head()

AdminStrata,Bamako,Kayes,Kidal,Koulikoro,Mopti,Segou,Sikasso,Tombouctou-Taoudeni
Unnamed: 0_level_1,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor
features,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Intercept,269.442547,1136.25146,2139.537178,820.597412,560.005299,699.179672,356.465521,1813.54211
omonthrainfalls,3.343001,2.951173,5.083326,2.465971,3.560853,8.192215,3.081432,6.525906
Fatalities,8.637373,2.03568,3.485511,1.409067,4.194407,1.579309,2.650801,1.581535
pricecereals,4.384775,4.093538,9.154498,4.605924,1.33591,3.283548,6.710273,2.588772
rainfalls,12.41662,3.266797,6.484012,6.036188,2.824471,5.139652,3.657718,6.686363


In [60]:
# Average each feature's VIF across all administrative strata (row-wise mean).
vif_factor.mean(axis = "columns")

features
Intercept          974.377650
omonthrainfalls      4.400485
Fatalities           3.196710
pricecereals         4.519655
rainfalls            5.813978
rcsi                 3.156394
dtype: float64