# Variance Inflation Factor

In [1]:
import pandas as pd
import numpy as np
from patsy import dmatrices
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
# Country under analysis; used to build the data folder path and the CSV file name.
COUNTRY = "Syria"

In [3]:
# Folder with the time-series output data of the selected country.
PATH_TO_DATA_FOLDER = f"../Dataset time-series/output_data/{COUNTRY}/"

## Time-series dataset

In [4]:
# Read the daily-interpolated time-series: the CSV carries a two-row column
# header and the timestamps in its first column.
freq = "D"
df = pd.read_csv(f"{PATH_TO_DATA_FOLDER}{COUNTRY}-day.csv", index_col = 0, header = [0, 1])
# Turn the index labels into timestamps, then name the axis.
df.index = pd.to_datetime(df.index)
df.index.name = "Datetime"
# Declare the daily frequency on the datetime index.
df.index.freq = freq

In [5]:
# Distinct labels found in the second column level (the indicators).
df.columns.unique(level = 1)

Index(['1 Month Anomaly Rainfalls (%)', '3 Months Anomaly Rainfalls (%)',
       'Code', 'Exchange rate', 'FCG', 'Fatalities', 'Lat', 'Lon', 'NDVI',
       'NDVI Anomaly', 'Population', 'Price cereals and tubers',
       'Rainfalls (mm)', 'Ramadan', 'rCSI'],
      dtype='object', name='Indicator')

In [6]:
# Indicators kept for the Variance Inflation Factor analysis.
INDICATORS_TO_CONSIDER = [
    "FCG",
    "1 Month Anomaly Rainfalls (%)",
    "3 Months Anomaly Rainfalls (%)",
    "NDVI",
    "Price cereals and tubers",
    "Exchange rate",
    "Fatalities",
    "NDVI Anomaly",
    "rCSI",
    "Rainfalls (mm)",
]

# Keep only the columns whose second-level label (the indicator) is in the list.
df = df.loc[:, df.columns.isin(INDICATORS_TO_CONSIDER, level = 1)]
df.head()

AdminStrata,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,...,Tartous,Tartous,Tartous,Tartous,Tartous,Tartous,Tartous,Tartous,Tartous,Tartous
Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),Exchange rate,FCG,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI,...,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),Exchange rate,FCG,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI
Datetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2018-09-29,99.920932,103.281473,0.014694,36.83872,109.5,0.117052,89.349382,0.039965,0.1372,34.0544,...,113.829315,179.577637,0.014121,12.46537,21.0,0.445703,114.29633,0.128021,7.9716,27.31994
2018-09-30,99.894768,102.581411,0.014829,36.83872,99.5,0.116985,89.32613,0.040093,0.15,34.0544,...,113.631431,173.128816,0.014243,12.94964,21.0,0.44519,114.196334,0.128717,8.1647,26.22302
2018-10-01,99.936859,102.137474,0.015061,36.49929,97.5,0.116915,89.301813,0.040317,0.7876,33.67614,...,113.462981,166.727201,0.01443,12.45675,21.0,0.444679,114.087915,0.129239,10.0496,25.22491
2018-10-02,100.047045,101.810225,0.015157,36.37038,97.0,0.116842,89.27532,0.040412,1.4245,31.54574,...,113.241732,160.712897,0.014494,12.18369,21.0,0.44417,113.971913,0.129066,11.5057,27.92877
2018-10-03,100.225326,101.599662,0.015254,38.25699,85.0,0.116765,89.246651,0.040507,2.0614,32.7672,...,112.967684,155.085903,0.014559,12.72342,21.0,0.443665,113.848328,0.128892,12.9618,27.0827


In [7]:
# Give the indicators short, identifier-like names so that they can be used as
# terms of the regression formula built further down.
# Rebinding `df` instead of using `inplace = True` avoids mutating a frame that
# was obtained by slicing.
# NOTE(review): "ndvianonaly" is a misspelling of "ndvianomaly"; it is kept
# unchanged here so the labels keep matching the stored outputs.
df = df.rename(columns = {"1 Month Anomaly Rainfalls (%)": "omonthrainfalls",
                          "3 Months Anomaly Rainfalls (%)": "tmonthrainfalls",
                          "Exchange rate": "exchange",
                          "FCG": "fcg",
                          "NDVI Anomaly": "ndvianonaly",
                          "Price cereals and tubers": "pricecereals",
                          "Rainfalls (mm)": "rainfalls",
                          "rCSI": "rcsi"}, level = 1)

In [8]:
# Discard the indicators left out of the VIF computation; `df` is rebound
# rather than modified in place.
df = df.drop(columns = ["NDVI", "tmonthrainfalls", "exchange"], level = 1)

In [9]:
def vif_func(group):
    """Compute the Variance Inflation Factor of every term in the regression of "fcg".

    Parameters
    ----------
    group : pandas.DataFrame
        Indicator columns of a single administrative area; must contain "fcg".
        When the columns are a MultiIndex (as handed over by a column-wise
        ``groupby(...).apply``), the sub-frame stored under ``group.name`` is
        selected first, so that the columns are the bare indicator names.

    Returns
    -------
    pandas.DataFrame
        A single "VIF Factor" column indexed by "features", i.e. the columns of
        the design matrix (the intercept added by patsy included).
    """
    # Groups produced by groupby/apply still carry the group key as first column level.
    if isinstance(group.columns, pd.MultiIndex):
        group = group[group.name]

    # Every column but the response becomes a regressor of the formula.
    features = "+".join(group.columns[group.columns != "fcg"])

    # Only the design matrix X is needed to compute the VIF values.
    _, X = dmatrices("fcg ~" + features, group, return_type = "dataframe")

    # One VIF value per design-matrix column, labelled with the column name.
    return pd.DataFrame({"VIF Factor": [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]},
                        index = pd.Index(X.columns, name = "features"))

# Compute one VIF table per administrative area and place them side by side.
# An explicit loop over the first column level is used because
# DataFrame.groupby(axis = 1) is deprecated since pandas 2.1.
vif_factor = pd.concat({adminstrata: vif_func(df[adminstrata])
                        for adminstrata in df.columns.get_level_values(0).unique().sort_values()},
                       axis = 1, names = [df.columns.names[0]])
vif_factor.head()

AdminStrata,Al-Hasakeh,Aleppo,Ar-Raqqa,As-Sweida,Damascus,Dar'a,Deir-ez-Zor,Hama,Homs,Lattakia,Rural Damascus,Tartous
Unnamed: 0_level_1,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor
features,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Intercept,171.532432,125.486061,131.570924,216.766078,110.586413,158.322109,215.664793,160.236056,220.135419,4763.786321,888.982753,1722.549898
omonthrainfalls,2.573919,2.515283,2.136237,1.653601,2.370864,2.240947,2.876715,2.141955,2.141265,2.651936,1.960745,2.358759
Fatalities,1.235211,1.413148,1.160155,1.426852,1.198716,3.47887,4.117419,1.628934,1.347844,1.229832,1.458853,1.175353
ndvianonaly,1.656972,2.122485,1.285361,1.847111,2.033085,1.219747,1.688573,2.037103,1.373545,1.599243,2.547982,1.713608
pricecereals,2.277501,1.317785,1.348062,2.276638,2.452264,2.761742,1.89941,1.639688,1.716137,1.512833,2.696253,1.15537


In [10]:
# Average the VIF of each term over the columns of vif_factor.
vif_factor.mean(axis = "columns")

features
Intercept          740.468271
omonthrainfalls      2.301852
Fatalities           1.739266
ndvianonaly          1.760401
pricecereals         1.921140
rainfalls            2.280645
rcsi                 1.511084
dtype: float64