## Multicollinearity In Linear Regression

In [None]:
import pandas as pd

In [None]:
import statsmodels.api as sm
# Load the advertising data; index_col=0 uses the first CSV column as the row index.
df_adv = pd.read_csv('data/Advertising.csv', index_col=0)
# Predictors are the TV, radio and newspaper columns; the target is sales.
X = df_adv[['TV', 'radio','newspaper']]
y = df_adv['sales']
df_adv.head()


Unnamed: 0,TV,radio,newspaper,sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [None]:
# Add a `const` column of ones so the OLS fit below includes an intercept term.
X = sm.add_constant(X)

In [None]:
# Inspect the design matrix: `const` is now the first column.
X

Unnamed: 0,const,TV,radio,newspaper
1,1.0,230.1,37.8,69.2
2,1.0,44.5,39.3,45.1
3,1.0,17.2,45.9,69.3
4,1.0,151.5,41.3,58.5
5,1.0,180.8,10.8,58.4
...,...,...,...,...
196,1.0,38.2,3.7,13.8
197,1.0,94.2,4.9,8.1
198,1.0,177.0,9.3,6.4
199,1.0,283.6,42.0,66.2


In [None]:
## Fit an OLS model with an intercept on TV, radio and newspaper

model= sm.OLS(y, X).fit()

In [None]:
# Print the regression table (fit statistics and per-coefficient estimates).
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  sales   R-squared:                       0.897
Model:                            OLS   Adj. R-squared:                  0.896
Method:                 Least Squares   F-statistic:                     570.3
Date:                Sun, 14 Jun 2020   Prob (F-statistic):           1.58e-96
Time:                        23:06:45   Log-Likelihood:                -386.18
No. Observations:                 200   AIC:                             780.4
Df Residuals:                     196   BIC:                             793.6
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.9389      0.312      9.422      0.0

In [None]:
import matplotlib.pyplot as plt
# Pairwise correlations of the predictors; iloc[:, 1:] skips the leading `const` column.
X.iloc[:,1:].corr()

Unnamed: 0,TV,radio,newspaper
TV,1.0,0.054809,0.056648
radio,0.054809,1.0,0.354104
newspaper,0.056648,0.354104,1.0


# Another Example

In [None]:
# Load the salary dataset and preview its first rows.
df_salary = pd.read_csv('data/Salary_Data.csv')
df_salary.head()

Unnamed: 0,YearsExperience,Age,Salary
0,1.1,21.0,39343
1,1.3,21.5,46205
2,1.5,21.7,37731
3,2.0,22.0,43525
4,2.2,22.2,39891


In [None]:
# Predictors are YearsExperience and Age; the target is Salary.
X = df_salary[['YearsExperience', 'Age']]
y = df_salary['Salary']

In [None]:
## Fit an OLS model with an intercept on YearsExperience and Age
X = sm.add_constant(X)
model= sm.OLS(y, X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                 Salary   R-squared:                       0.960
Model:                            OLS   Adj. R-squared:                  0.957
Method:                 Least Squares   F-statistic:                     323.9
Date:                Sun, 14 Jun 2020   Prob (F-statistic):           1.35e-19
Time:                        23:06:46   Log-Likelihood:                -300.35
No. Observations:                  30   AIC:                             606.7
Df Residuals:                      27   BIC:                             610.9
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const           -6661.9872   2.28e+04     

In [None]:
# Correlation between the two predictors; iloc[:, 1:] skips the leading `const` column.
X.iloc[:,1:].corr()

Unnamed: 0,YearsExperience,Age
YearsExperience,1.0,0.987258
Age,0.987258,1.0


In [None]:
# Import library for VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calc_vif(X):
    """Compute the variance inflation factor (VIF) for each column of ``X``.

    ``variance_inflation_factor`` regresses one column on all the *other*
    columns of the matrix it is handed and does not add an intercept on its
    own. When that matrix has no constant column the auxiliary regressions
    are forced through the origin and the reported values are not the usual
    ``1 / (1 - R^2)`` VIFs. To avoid this, a ``const`` column of ones is
    appended for the computation whenever ``X`` does not already have one.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns. May or may not already contain a ``const`` column
        (the name produced by ``sm.add_constant``).

    Returns
    -------
    pandas.DataFrame
        Columns ``variables`` and ``VIF`` with one row per column of ``X``,
        in the same order as ``X.columns``.
    """
    # Only used internally: the caller's frame is never modified, and the
    # helper intercept is not reported in the result.
    if "const" in X.columns:
        exog = X
    else:
        # assign() appends the new column last, so positions 0..n-1 of
        # `exog` still line up with the columns of X.
        exog = X.assign(const=1.0)

    exog_values = exog.values

    vif = pd.DataFrame()
    vif["variables"] = X.columns
    vif["VIF"] = [
        variance_inflation_factor(exog_values, position)
        for position in range(X.shape[1])
    ]

    return vif

# Use every column except the last one (Salary) as the predictors to check.
X = df_salary.iloc[:,:-1]
calc_vif(X)

Unnamed: 0,variables,VIF
0,YearsExperience,11.24047
1,Age,11.24047


# Fixing Multicollinearity
1. Leave it alone
2. Dropping one of the correlated features will help in bringing down the multicollinearity between correlated features
3. Combine the correlated features into a single derived feature (for example, `Age - YearsExperience`)

In [None]:
# List the column names of the salary data.
list(df_salary)

['YearsExperience', 'Age', 'Salary']

In [None]:
# Disabled alternative: replace YearsExperience with the derived feature
# Age - YearsExperience and use the remaining non-target columns as X.
# df2 = df_salary.copy()
# df2['Age_Years_Experience'] = df_salary.apply(lambda x: x['Age'] - x['YearsExperience'],axis=1)
# del df2['YearsExperience']
# del df2['Salary']
# X= df2

In [None]:
# Disabled alternative: drop one of the correlated features and re-check the VIF.
# NOTE(review): df_salary also contains the target 'Salary'; dropping only 'Age'
# would leave it in X, so drop 'Salary' as well before enabling this cell.
# X = df_salary.drop(['Age'],axis=1)
# calc_vif(X)

In [None]:
## Refit the OLS model with an intercept on the current X
## NOTE(review): the cells above that would change X are commented out, so X
## still holds YearsExperience and Age and this reproduces the earlier fit.
X = sm.add_constant(X)
model= sm.OLS(y, X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                 Salary   R-squared:                       0.960
Model:                            OLS   Adj. R-squared:                  0.957
Method:                 Least Squares   F-statistic:                     323.9
Date:                Sun, 14 Jun 2020   Prob (F-statistic):           1.35e-19
Time:                        23:06:46   Log-Likelihood:                -300.35
No. Observations:                  30   AIC:                             606.7
Df Residuals:                      27   BIC:                             610.9
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const           -6661.9872   2.28e+04     