# Variance Inflation Factor

In [1]:
import pandas as pd
import numpy as np
from patsy import dmatrices
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
# Country whose dataset is analysed; it is used to build both the data folder
# path and the name of the CSV file that is loaded.
COUNTRY = "Yemen"

In [3]:
# Folder (relative to this notebook) that holds the time-series files of the country.
PATH_TO_DATA_FOLDER = f"../Dataset time-series/output_data/{COUNTRY}/"

## Time-series dataset

In [4]:
# Read the daily-interpolated time-series of the selected country. The file has
# a two-row column header and the dates in its first column.
df = pd.read_csv(
    f"{PATH_TO_DATA_FOLDER}{COUNTRY}-day.csv",
    header=[0, 1],
    index_col=0,
)

# Turn the index into a named datetime index and declare its daily frequency.
freq = "D"
df.index = pd.to_datetime(df.index).rename("Datetime")
df.index.freq = freq

In [5]:
# List the distinct labels found in the second column level (the indicators).
df.columns.unique(level=1)

Index(['1 Month Anomaly Rainfalls (%)', '3 Months Anomaly Rainfalls (%)',
       'Code', 'Exchange rate', 'FCG', 'Fatalities', 'Lat', 'Lon', 'NDVI',
       'NDVI Anomaly', 'Population', 'Price cereals and tubers',
       'Rainfalls (mm)', 'Ramadan', 'rCSI'],
      dtype='object', name='Indicator')

In [6]:
# Indicators retained for the variance inflation factor analysis; every other
# indicator present in the dataset is discarded.
INDICATORS_TO_CONSIDER = [
    "FCG",
    "1 Month Anomaly Rainfalls (%)",
    "3 Months Anomaly Rainfalls (%)",
    "NDVI",
    "Price cereals and tubers",
    "Exchange rate",
    "Fatalities",
    "NDVI Anomaly",
    "rCSI",
    "Rainfalls (mm)",
]

# Keep only the columns whose second-level label is one of the retained indicators.
df = df.loc[:, df.columns.isin(INDICATORS_TO_CONSIDER, level=1)]
df.head()

AdminStrata,Abyan,Abyan,Abyan,Abyan,Abyan,Abyan,Abyan,Abyan,Abyan,Abyan,...,Taizz,Taizz,Taizz,Taizz,Taizz,Taizz,Taizz,Taizz,Taizz,Taizz
Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),Exchange rate,FCG,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI,...,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),Exchange rate,FCG,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI
Datetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2018-08-22,90.454758,111.487446,0.207215,31.8098,10.0,0.098889,98.450151,0.164924,12.9544,37.4219,...,93.865658,147.652361,0.207371,39.0431,171.0,0.198394,104.340193,0.181576,41.6606,50.919
2018-08-23,89.92758,110.382403,0.210332,32.3747,10.0,0.098958,98.353086,0.166085,13.0949,37.3462,...,94.296436,145.973629,0.210846,39.3717,147.0,0.198516,104.184601,0.181735,42.6437,50.4159
2018-08-24,89.43678,109.316308,0.213449,33.7721,10.0,0.099033,98.254291,0.167246,13.2354,37.9994,...,94.778817,144.313356,0.214321,36.6621,145.0,0.198655,104.02175,0.181894,43.6268,49.3913
2018-08-25,88.982357,108.289161,0.216565,34.5337,10.0,0.099113,98.153766,0.168407,13.3759,36.0682,...,95.312802,142.671542,0.217796,37.2052,156.0,0.198812,103.851638,0.182052,44.6099,50.3024
2018-08-26,88.564312,107.300961,0.219682,32.3279,10.0,0.099197,98.05151,0.169568,13.5164,38.2185,...,95.89839,141.048187,0.221272,37.0257,164.0,0.198987,103.674267,0.182211,45.593,50.293


In [7]:
# Give the indicators short names made only of letters. Presumably this is
# needed because `vif_func` joins the column names with "+" into a patsy
# formula, where spaces, "%" or parentheses would not be read as plain names.
# NOTE(review): "ndvianonaly" looks like a misspelling of "ndvianomaly"; the
# label is kept unchanged because it is the one shown in the results below.
# The renamed frame is assigned back instead of mutating with `inplace = True`,
# so the cell does not modify in place a frame that was obtained by slicing.
df = df.rename(
    columns={
        "1 Month Anomaly Rainfalls (%)": "omonthrainfalls",
        "3 Months Anomaly Rainfalls (%)": "tmonthrainfalls",
        "Exchange rate": "exchange",
        "FCG": "fcg",
        "NDVI Anomaly": "ndvianonaly",
        "Price cereals and tubers": "pricecereals",
        "Rainfalls (mm)": "rainfalls",
        "rCSI": "rcsi",
    },
    level=1,
)

In [8]:
# Remove three indicators from every province before the VIF computation.
# NOTE(review): the notebook does not state why these are excluded - presumably
# they were found to be collinear with the remaining indicators; confirm.
# The result is assigned back rather than dropped with `inplace = True`.
df = df.drop(columns=["NDVI", "tmonthrainfalls", "exchange"], level=1)

In [9]:
def vif_func(group):
    """Compute the variance inflation factors of the indicators of one province.

    A patsy design matrix is built from the formula ``fcg ~ <all other columns>``
    and the VIF is computed for every column of that matrix, including the
    ``Intercept`` column that patsy adds.

    Parameters
    ----------
    group : pandas.DataFrame
        Time-series of a single province. Either a frame whose columns are the
        indicator names, or - as handed over by ``groupby(...).apply`` - a frame
        with two column levels and a ``name`` attribute holding the province key.

    Returns
    -------
    pandas.DataFrame
        One row per design-matrix column (index named ``features``) and a single
        column ``VIF Factor``.
    """
    # A group coming from `groupby.apply` still carries the province level;
    # select it away so that the columns are the plain indicator names.
    if isinstance(group.columns, pd.MultiIndex):
        group = group[group.name]

    # Every column except the response enters the right-hand side of the formula.
    predictors = [column for column in group.columns if column != "fcg"]
    formula = "fcg ~" + "+".join(predictors)

    # Only the design matrix of the predictors is needed; the response is discarded.
    _, design = dmatrices(formula, group, return_type = "dataframe")

    vif_values = [variance_inflation_factor(design.values, position)
                  for position in range(design.shape[1])]

    return pd.DataFrame({"VIF Factor": vif_values},
                        index = pd.Index(design.columns, name = "features"))

# Run the computation province by province and put the results side by side,
# with the province as outer column level. This replaces
# `df.groupby(axis = 1, level = 0).apply(vif_func)`, because grouping along the
# columns is deprecated in recent pandas versions. The provinces are sorted to
# keep the column order that `groupby` produced.
vif_factor = pd.concat(
    {admin: vif_func(df[admin]) for admin in df.columns.unique(level = 0).sort_values()},
    axis = 1,
    names = [df.columns.names[0]],
)
vif_factor.head()

AdminStrata,Abyan,Aden,Al Bayda,Al Dhale'e,Al Hudaydah,Al Jawf,Al Maharah,Al Mahwit,Amanat Al Asimah,Amran,Dhamar,Hajjah,Ibb,Lahj,Marib,Raymah,Sa'ada,Sana'a,Shabwah,Taizz
Unnamed: 0_level_1,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor
features,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
Intercept,195.277876,162.536173,183.551323,343.010453,189.113936,8149.871893,783.43879,248.580255,359.864471,394.32558,378.344241,671.183748,367.892148,248.985426,1095.170928,223.952887,536.864205,364.660485,1322.509441,710.30206
omonthrainfalls,2.105651,1.667642,1.806171,1.780193,2.416743,2.62867,1.318039,1.801876,2.322493,2.510647,2.539547,1.862621,1.799365,1.990948,2.786799,1.972111,1.652144,2.43552,2.095277,2.643972
Fatalities,1.274635,1.055794,1.286321,1.31112,2.419964,1.571687,1.088857,1.313606,1.432929,1.264318,1.273205,3.482758,1.167755,1.468575,2.097348,1.126704,1.667148,1.170511,1.097871,2.37671
ndvianonaly,3.100545,1.619352,1.781371,2.817401,1.489187,2.124899,1.200568,1.627474,1.847555,1.485842,1.589174,3.143443,1.872213,3.142813,2.843919,1.563283,1.801126,1.905421,3.527138,4.302969
pricecereals,1.429665,1.453912,1.351644,1.407063,2.304595,1.891942,1.230092,1.477444,1.534742,1.341802,1.334436,1.601445,1.154738,1.807755,2.878715,1.225652,1.390679,1.580073,2.546115,1.245763


In [10]:
# Average the VIF of each design-matrix column over all the provinces.
vif_factor.mean(axis="columns")

features
Intercept          846.471816
omonthrainfalls      2.106821
Fatalities           1.547391
ndvianonaly          2.239285
pricecereals         1.609414
rainfalls            1.813439
rcsi                 1.206737
dtype: float64