## Variance Inflation Factor (VIF)

In [69]:
# `display` and `HTML` belong to the public `IPython.display` API; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

> References

In [70]:
# Reference links rendered as clickable HTML: the VIF write-up by Ernest Tavares III,
# the repository hosting the dataset, and the CSV file itself.
display(HTML("""<a href="https://etav.github.io/python/vif_factor_python.html">Ernest Tavares III: Variance Inflation Factor (VIF) </a>"""))
# Fixed: this link previously repeated the tutorial URL instead of pointing at the
# repository that contains data/loan.csv.
display(HTML("""<a href="https://github.com/h2oai/app-consumer-loan"> Dataset Github Repository </a>"""))
display(HTML("""<a href="https://github.com/h2oai/app-consumer-loan/blob/master/data/loan.csv"> Dataset Direct Link </a>"""))

In [71]:
import pandas as pd
import numpy as np

In [72]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [73]:
from patsy import dmatrices

In [74]:
# Read the loan dataset from the notebook-relative path and preview its first rows.
df = pd.read_csv(filepath_or_buffer='data/loan.csv')
df.head(n=5)

Unnamed: 0,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,purpose,addr_state,dti,delinq_2yrs,revol_util,total_acc,bad_loan,longest_credit_length,verification_status
0,5000,36 months,10.65,10.0,RENT,24000.0,credit_card,AZ,27.65,0.0,83.7,9.0,0,26.0,verified
1,2500,60 months,15.27,0.0,RENT,30000.0,car,GA,1.0,0.0,9.4,4.0,1,12.0,verified
2,2400,36 months,15.96,10.0,RENT,12252.0,small_business,IL,8.72,0.0,98.5,10.0,0,10.0,not verified
3,10000,36 months,13.49,10.0,RENT,49200.0,other,CA,20.0,0.0,21.0,37.0,0,15.0,verified
4,5000,36 months,7.9,3.0,RENT,36000.0,wedding,AZ,11.2,0.0,28.3,12.0,0,7.0,verified


In [75]:
# (rows, columns) of the frame as loaded, before any rows are dropped.
df.shape

(163987, 15)

In [76]:
# Count missing values per column to see which fields are affected.
df.isnull().sum(axis=0)

loan_amnt                   0
term                        0
int_rate                    0
emp_length               5804
home_ownership              0
annual_inc                  4
purpose                     0
addr_state                  0
dti                         0
delinq_2yrs                29
revol_util                193
total_acc                  29
bad_loan                    0
longest_credit_length      29
verification_status         0
dtype: int64

In [77]:
# Discard every row that has at least one missing value; `df` itself is left intact.
data = df.dropna(how='any', axis='index')
data.shape

(157996, 15)

In [78]:
# Preview the first rows of the frame after rows with missing values were dropped.
data.head()

Unnamed: 0,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,purpose,addr_state,dti,delinq_2yrs,revol_util,total_acc,bad_loan,longest_credit_length,verification_status
0,5000,36 months,10.65,10.0,RENT,24000.0,credit_card,AZ,27.65,0.0,83.7,9.0,0,26.0,verified
1,2500,60 months,15.27,0.0,RENT,30000.0,car,GA,1.0,0.0,9.4,4.0,1,12.0,verified
2,2400,36 months,15.96,10.0,RENT,12252.0,small_business,IL,8.72,0.0,98.5,10.0,0,10.0,not verified
3,10000,36 months,13.49,10.0,RENT,49200.0,other,CA,20.0,0.0,21.0,37.0,0,15.0,verified
4,5000,36 months,7.9,3.0,RENT,36000.0,wedding,AZ,11.2,0.0,28.3,12.0,0,7.0,verified


In [79]:
# List the available column names before choosing the ones used for the VIF check.
data.columns

Index(['loan_amnt', 'term', 'int_rate', 'emp_length', 'home_ownership',
       'annual_inc', 'purpose', 'addr_state', 'dti', 'delinq_2yrs',
       'revol_util', 'total_acc', 'bad_loan', 'longest_credit_length',
       'verification_status'],
      dtype='object')

In [80]:
# Restrict the frame to the four columns that take part in the VIF computation.
data_i = data.loc[:, ['loan_amnt', 'int_rate', 'annual_inc', 'dti']]
data_i.head()

Unnamed: 0,loan_amnt,int_rate,annual_inc,dti
0,5000,10.65,24000.0,27.65
1,2500,15.27,30000.0,1.0
2,2400,15.96,12252.0,8.72
3,10000,13.49,49200.0,20.0
4,5000,7.9,36000.0,11.2


In [81]:
# Everything except the response column `annual_inc` becomes a right-hand-side term.
# Index.difference returns the remaining names in sorted order.
req_columns = data_i.columns.difference(["annual_inc"])
# Join the predictor names with "+" so they can be appended to a patsy formula.
final_columns = "+".join(name for name in req_columns)
final_columns

'dti+int_rate+loan_amnt'

In [82]:
# Build the response (y) and design matrix (X) as DataFrames from a patsy formula
# with `annual_inc` on the left-hand side. X also receives an `Intercept` column.
y, X = dmatrices(
    'annual_inc ~ ' + final_columns,
    data=data_i,
    return_type='dataframe',
)

In [83]:
# Sanity-check the patsy output: container types and the design-matrix shape,
# printed one per line.
print(type(X), X.shape, type(y), sep="\n")

<class 'pandas.core.frame.DataFrame'>
(157996, 4)
<class 'pandas.core.frame.DataFrame'>


In [84]:
# Start from an empty frame; the VIF values and feature names are attached as
# columns in the following cells.
vif = pd.DataFrame()

In [85]:


# Compute one VIF per column of the design matrix, in column order. This includes
# the `Intercept` column, so its entry is not a predictor-collinearity measure.
vif['VIF Factor'] = [
    variance_inflation_factor(X.values, col_idx)
    for col_idx in range(len(X.columns))
]

In [86]:
# Label each VIF with its design-matrix column name; the order matches because the
# VIF list was built by iterating over X's columns by position.
vif["features"] = X.columns

In [87]:
# Display the VIF table. The `Intercept` row corresponds to the constant column in X;
# the remaining rows are the predictors.
vif

Unnamed: 0,VIF Factor,features
0,13.99937,Intercept
1,1.033368,dti
2,1.092359,int_rate
3,1.062119,loan_amnt
