# Variance Inflation Factor

In [1]:
import pandas as pd
import numpy as np
from patsy import dmatrices
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
# Country to analyse; used below to build the data folder path and the CSV file name.
COUNTRY = "Yemen"

In [3]:
# Folder holding the time-series output files of the selected country.
PATH_TO_DATA_FOLDER = f"../Dataset time-series/output_data/{COUNTRY}/"

## Time-series dataset

In [4]:
# Read the daily-interpolated time-series of the country: the CSV has a two-row
# column header and its first column is used as the row index.
freq = "D"
df = pd.read_csv(f"{PATH_TO_DATA_FOLDER}{COUNTRY}.csv", header = [0, 1], index_col = 0)
# Turn the index into named timestamps, then declare its (daily) frequency.
df.index = pd.to_datetime(df.index).rename("Datetime")
df.index.freq = freq

In [5]:
# List the distinct labels found on the second column level.
df.columns.unique(level = 1)

Index(['1 Month Anomaly Rainfalls (%)', '3 Months Anomaly Rainfalls (%)',
       'Area', 'Code', 'FCG', 'Fatalities', 'Lat', 'Lon', 'NDVI',
       'NDVI Anomaly', 'Population', 'Price cereals and tubers',
       'Rainfalls (mm)', 'Ramadan', 'Waterways', 'rCSI'],
      dtype='object', name='Indicator')

In [6]:
# Indicators retained for the analysis carried out in this notebook.
# ("Exchange rate" is currently left out of the selection.)
INDICATORS_TO_CONSIDER = [
    "FCG",
    "1 Month Anomaly Rainfalls (%)",
    "3 Months Anomaly Rainfalls (%)",
    "NDVI",
    "Price cereals and tubers",
    "Fatalities",
    "NDVI Anomaly",
    "rCSI",
    "Rainfalls (mm)",
]

# Keep only the columns whose second-level label is one of the selected indicators.
df = df.loc[:, df.columns.isin(INDICATORS_TO_CONSIDER, level = 1)]
df.head()

AdminStrata,Abyan,Abyan,Abyan,Abyan,Abyan,Abyan,Abyan,Abyan,Abyan,Aden,...,Shabwah,Taizz,Taizz,Taizz,Taizz,Taizz,Taizz,Taizz,Taizz,Taizz
Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),FCG,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI,1 Month Anomaly Rainfalls (%),...,rCSI,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),FCG,Fatalities,NDVI,NDVI Anomaly,Price cereals and tubers,Rainfalls (mm),rCSI
Datetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2018-01-01,,,,,,,,,,,...,,,,,,,,,,
2018-01-02,,,,,,,,,,,...,,,,,,,,,,
2018-01-03,,,,,,,,,,,...,,,,,,,,,,
2018-01-04,,,,,,,,,,,...,,,,,,,,,,
2018-01-05,,,,,,,,,,,...,,,,,,,,,,


In [7]:
# Replace the labels of the second column level with short lowercase identifiers;
# the formula built in `vif_func` joins these names with "+", so they must be
# usable as plain variable names.
# NOTE: "ndvianonaly" (sic) is the exact label the later drop cell refers to.
df = df.rename(
    columns = {
        "FCG": "fcg",
        "rCSI": "rcsi",
        "Fatalities": "fatalities",
        "Exchange rate": "exchange",
        "Price cereals and tubers": "pricecereals",
        "NDVI": "ndvi",
        "NDVI Anomaly": "ndvianonaly",
        "Rainfalls (mm)": "rainfalls",
        "1 Month Anomaly Rainfalls (%)": "omonthrainfalls",
        "3 Months Anomaly Rainfalls (%)": "tmonthrainfalls",
    },
    level = 1,
)

In [8]:
# Discard the 3-month rainfall anomaly and the NDVI anomaly columns
# ("ndvianonaly" is the spelling assigned by the rename cell above).
df = df.drop(columns = ["tmonthrainfalls", "ndvianonaly"], level = 1)

In [9]:
def vif_func(group):
    """Compute the variance inflation factors for the columns of one group.

    Intended to be passed to ``groupby(...).apply`` on a frame with two column
    levels: the group key is read from ``group.name`` and used to select that
    group's sub-frame, whose columns are then the plain indicator names.

    Parameters
    ----------
    group : pandas.DataFrame
        Columns belonging to a single group. It must expose a ``name``
        attribute holding the group key, and the selected sub-frame must
        contain an ``fcg`` column.

    Returns
    -------
    pandas.DataFrame
        A single "VIF Factor" column indexed by "features", with one row per
        column of the design matrix patsy builds for ``fcg ~ <other columns>``
        (this includes the "Intercept" column).
    """
    indicators = group[group.name]

    # Right-hand side of the formula: every column except the response "fcg".
    predictors = "+".join([column for column in indicators.columns if column != "fcg"])

    # Only the predictors' design matrix is used; the response matrix is discarded.
    _, design = dmatrices(f"fcg ~{predictors}", indicators, return_type = "dataframe")

    # VIF of each design-matrix column with respect to all the other columns.
    scores = [
        variance_inflation_factor(design.values, position)
        for position in range(design.shape[1])
    ]

    return pd.DataFrame({"VIF Factor": scores}, index = pd.Index(design.columns, name = "features"))

# Run vif_func once per first-level column group (one group per AdminStrata);
# the per-group VIF tables end up side by side in a single frame.
# NOTE(review): `groupby(axis = 1)` is deprecated in recent pandas releases (2.1+) —
# confirm the pinned pandas version before upgrading; vif_func relies on the
# `group.name` attribute set by this groupby/apply call.
vif_factor = df.groupby(axis = 1, level = 0).apply(vif_func) 
vif_factor.head()

AdminStrata,Abyan,Aden,Al Bayda,Al Dhale'e,Al Hudaydah,Al Jawf,Al Maharah,Al Mahwit,Amanat Al Asimah,Amran,Dhamar,Hajjah,Ibb,Lahj,Marib,Raymah,Sa'ada,Sana'a,Shabwah,Taizz
Unnamed: 0_level_1,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor,VIF Factor
features,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
Intercept,154.21907,155.882248,156.606053,135.234363,132.917721,3744.841153,576.269456,304.372927,189.565234,206.682892,349.263969,323.181971,267.035297,112.338926,492.333265,226.012704,219.641681,241.40464,414.443246,174.332038
omonthrainfalls,2.011619,2.149107,1.829229,1.484573,2.78635,3.553919,1.190789,2.15727,3.454453,3.276343,2.615615,2.092428,1.525049,2.399057,3.278057,1.866014,1.933753,4.292737,1.814281,1.547866
fatalities,1.14423,1.065755,1.157739,1.378517,1.20631,1.857995,1.132893,1.55266,1.103857,1.116083,1.198522,1.721311,1.173521,1.139589,1.899539,1.253706,1.735263,1.396514,1.397927,1.328861
ndvi,1.902154,1.261148,1.287974,1.38168,1.36378,1.801562,1.195541,1.049612,1.149711,1.232687,1.228459,1.362234,1.346473,1.706544,1.271949,1.180635,1.241806,1.166137,1.853732,1.316029
pricecereals,1.662246,1.504573,1.34441,1.246448,1.303454,3.729532,1.064232,1.435583,1.066678,1.199403,1.187096,1.422791,1.143368,1.65945,1.719281,1.570661,1.531005,1.373119,2.239808,1.144534


In [10]:
# Row-wise mean: average VIF of each feature across all admin strata.
# NOTE(review): the "Intercept" row is averaged as well; presumably only the
# indicator rows are meant to be interpreted — confirm.
vif_factor.mean(axis = 1)

features
Intercept          428.828943
omonthrainfalls      2.362925
fatalities           1.348040
ndvi                 1.364992
pricecereals         1.527383
rainfalls            2.397046
rcsi                 1.287871
dtype: float64