# Variance Inflation Factor

In [1]:
import pandas as pd
import numpy as np
from patsy import dmatrices
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import dataframe_image as dfi

In [2]:
# Country whose data is analysed; the name is used to build the input and output file paths.
COUNTRY = "Cameroon"

In [3]:
# Folder that holds the time-series files of the selected country.
PATH_TO_DATA_FOLDER = f"../Dataset time-series/output_data/{COUNTRY}/"

## Time-series dataset

In [4]:
# Read the daily-interpolated time-series; the CSV has a two-row column header
# and its first column is used as the index.
df = pd.read_csv(
    f"{PATH_TO_DATA_FOLDER}{COUNTRY}-day.csv",
    header=[0, 1],
    index_col=0,
)

# Turn the index into a named datetime index with an explicit daily frequency.
freq = "D"
df.index = pd.to_datetime(df.index)
df.index.name = "Datetime"
df.index.freq = freq

In [5]:
# Distinct labels found in the second column level (the indicator names).
df.columns.get_level_values(1).unique()

Index(['1 Month Anomaly Rainfalls (%)', '3 Months Anomaly Rainfalls (%)',
       'Code', 'FCG', 'Fatalities', 'Lat', 'Lon', 'NDVI', 'NDVI Anomaly',
       'Population', 'Rainfalls (mm)', 'Ramadan', 'rCSI'],
      dtype='object', name='Indicator')

In [6]:
# Indicators kept for the variance inflation factor analysis. Labels that do not occur
# in the loaded data simply match no column.
INDICATORS_TO_CONSIDER = [
    "FCG",
    "1 Month Anomaly Rainfalls (%)",
    "3 Months Anomaly Rainfalls (%)",
    "NDVI",
    "Price cereals and tubers",
    "Exchange rate",
    "Fatalities",
    "NDVI Anomaly",
    "rCSI",
    "Rainfalls (mm)",
]

# Keep only the columns whose second-level label is one of the selected indicators.
df = df.loc[
    :,
    df.columns.get_level_values(1).isin(INDICATORS_TO_CONSIDER),
]
df.head()

AdminStrata,Adamawa,Adamawa,Adamawa,Adamawa,Adamawa,Adamawa,Adamawa,Adamawa,Central,Central,...,South-West,South-West,West,West,West,West,West,West,West,West
Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),FCG,Fatalities,NDVI,NDVI Anomaly,Rainfalls (mm),rCSI,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),...,Rainfalls (mm),rCSI,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),FCG,Fatalities,NDVI,NDVI Anomaly,Rainfalls (mm),rCSI
Datetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2019-06-30,101.837038,102.942884,26.79795,0.0,0.698378,104.961832,514.7136,16.43836,94.105788,103.036325,...,250.133,57.68452,98.404472,114.494632,39.07455,1.0,0.733825,103.350519,478.4319,20.90831
2019-07-01,101.745991,102.802374,27.08333,0.0,0.700721,104.874987,520.9204,17.0,93.430054,102.645889,...,261.6982,56.5694,97.784255,113.824392,37.46919,1.0,0.735867,103.274955,489.1485,20.0493
2019-07-02,101.651256,102.667455,25.39063,0.0,0.703054,104.788066,527.1272,15.9375,92.797532,102.271692,...,273.2634,57.57078,97.234921,113.185486,37.77065,1.0,0.737881,103.199658,499.8651,19.56696
2019-07-03,101.552834,102.538126,24.86611,0.0,0.705376,104.701069,533.334,15.60826,92.208223,101.913731,...,284.8286,54.00416,96.756472,112.577914,38.15074,1.0,0.739869,103.124628,510.5817,20.51282
2019-07-04,101.450723,102.414388,24.41773,0.0,0.707687,104.613995,539.5408,15.32682,91.662127,101.572009,...,296.3938,54.08571,96.348907,112.001678,37.79831,1.0,0.74183,103.049865,521.2983,20.32333


In [7]:
# Replace the indicator labels (second column level) with short, lowercase, letters-only
# names: vif_func joins these names with "+" to build a regression formula.
# NOTE(review): "ndvianonaly" looks like a misspelling of "ndvianomaly"; it is left
# unchanged here because the label appears in the results computed further down.
df = df.rename(
    columns={
        "1 Month Anomaly Rainfalls (%)": "omonthrainfalls",
        "3 Months Anomaly Rainfalls (%)": "tmonthrainfalls",
        "Exchange rate": "exchange",
        "FCG": "fcg",
        "Fatalities": "fatalities",
        "NDVI Anomaly": "ndvianonaly",
        "NDVI": "ndvi",
        "Price cereals and tubers": "pricecereals",
        "Rainfalls (mm)": "rainfalls",
        "rCSI": "rcsi",
    },
    level=1,
)

In [8]:
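# Disabled experiment: drop some indicators before computing the VIF.
# NOTE: the rename above relabels "NDVI" as "ndvi", so the "NDVI" entry below no longer
# matches any column and must be updated before this line is re-enabled.
#df.drop(["tmonthrainfalls", "NDVI", "ndvianonaly"], axis = 1, level = 1, inplace = True)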

In [9]:
def vif_func(group):
    """Compute the variance inflation factor (VIF) of every regressor of one region.

    The design matrix is built by patsy from the formula ``fcg ~ <all other columns>``,
    so it holds an ``Intercept`` column in addition to the indicator columns.

    Parameters
    ----------
    group : pd.DataFrame
        Either a frame whose columns are the indicator names of a single region, or
        (as handed over by ``groupby(...).apply``) a frame with multi-level columns and
        a ``name`` attribute holding the region label; in that case ``group[group.name]``
        is analysed.

    Returns
    -------
    pd.DataFrame
        A single "VIF Factor" column indexed by "features" (the design-matrix columns).
    """
    # Backward compatibility with the groupby-style call: select the region's sub-frame.
    if isinstance(group.columns, pd.MultiIndex):
        group = group[group.name]

    # Every column except the response "fcg" becomes a regressor.
    regressors = "+".join(group.columns[group.columns != "fcg"])

    # Only the design matrix X is needed; the response matrix is discarded.
    _, X = dmatrices('fcg ~' + regressors, group, return_type = "dataframe")

    # One VIF per design-matrix column.
    vif_values = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

    return pd.DataFrame({"VIF Factor": vif_values}, index = pd.Index(X.columns, name = "features"))

# Run vif_func once per region (first column level) and put the results side by side.
# `DataFrame.groupby(axis = 1)` is deprecated since pandas 2.1, so the regions are
# iterated explicitly; they are sorted to keep the column order groupby used to give.
regions = df.columns.get_level_values(0).unique().sort_values()
vif_factor = pd.concat({region: vif_func(df[region]) for region in regions}, axis = 1)
# Restore the name of the region level on the resulting columns.
vif_factor.columns = vif_factor.columns.set_names(df.columns.names[0], level = 0)
vif_factor.head()

  return 1 - self.ssr/self.centered_tss


AdminStrata,Adamawa,Central,East,Far-North,Littoral,North,North-West,South,South-West,West
Unnamed: 0_level_1,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor
features,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Intercept,43987.124329,34144.475712,173996.179538,2291.779288,87663.131169,5406.897633,11455.232419,47159.666133,31309.881156,21652.571102
omonthrainfalls,3.111942,6.364601,11.946215,3.036543,26.333586,2.550389,1.645721,3.817635,3.186574,5.919111
tmonthrainfalls,10.940597,1.88891,7.229847,5.414378,11.778486,13.680864,4.458302,3.79239,2.304299,3.613779
fatalities,3.823023,2.149146,6.471934,2.815645,6.058688,1.034919,2.335317,,1.957809,2.995179
ndvi,7.585104,20.804874,14.065741,2.833941,11.280213,11.123398,6.407964,5.73206,9.122934,13.399575


In [10]:
# Row-wise mean: average VIF of each feature over all regions.
vif_factor.mean(axis = 1)

features
Intercept          45906.693848
omonthrainfalls        6.791232
tmonthrainfalls        6.510185
fatalities             3.293518
ndvi                  10.235580
ndvianonaly            7.938556
rainfalls             10.402606
rcsi                   3.922853
dtype: float64

In [11]:
# Save the per-feature mean VIF as a PNG table; the values are cast to str before styling.
vif = (
    vif_factor.mean(axis=1)
    .to_frame(name="VIF")
    .astype(str)
)
vif.style.export_png(f"./output_images/VIF_{COUNTRY}.png")