In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.feature_selection import RFE  # RFE(estimator, 2, step=1)
from sklearn.svm import SVR

from sklearn.linear_model import LinearRegression
from statsmodels.formula.api import ols

In [2]:
# Load the advertising CSV; index_col=0 uses the file's first column as the row index.
data = pd.read_csv('datasets/Advertising.csv', index_col=0)

In [3]:
# Preview the first two rows to check the columns that were loaded.
data.head(2)

Unnamed: 0,TV,Radio,Newspaper,Sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4


# Using Statsmodels

In [7]:
# Simple linear regression of Sales on TV, fitted with the formula API.
model1 = ols(formula='Sales ~ TV', data=data).fit()

# print() is called as a function so the cell runs on Python 3; with a single
# argument it produces the same output on Python 2.
print(model1.params)   # fitted coefficients (Intercept, TV)
print(model1.pvalues)  # p-value of each coefficient

Intercept    7.032594
TV           0.047537
dtype: float64
Intercept    1.406300e-35
TV           1.467390e-42
dtype: float64


In [8]:
# Display the full regression summary table for model1.
model1.summary()

0,1,2,3
Dep. Variable:,Sales,R-squared:,0.612
Model:,OLS,Adj. R-squared:,0.61
Method:,Least Squares,F-statistic:,312.1
Date:,"Sun, 22 Jan 2017",Prob (F-statistic):,1.47e-42
Time:,21:11:41,Log-Likelihood:,-519.05
No. Observations:,200,AIC:,1042.0
Df Residuals:,198,BIC:,1049.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,7.0326,0.458,15.360,0.000,6.130 7.935
TV,0.0475,0.003,17.668,0.000,0.042 0.053

0,1,2,3
Omnibus:,0.531,Durbin-Watson:,1.935
Prob(Omnibus):,0.767,Jarque-Bera (JB):,0.669
Skew:,-0.089,Prob(JB):,0.716
Kurtosis:,2.779,Cond. No.,338.0


In [11]:
# Preview the first rows of the dataset (head() with its default row count).
data.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [14]:
# In-sample predictions from model1, passing only the TV column
# (selected as a one-column DataFrame) as the explanatory data.
sales_pred = model1.predict(exog=data.loc[:, ['TV']])

# Show the first five predicted values.
sales_pred[:5]

array([ 17.97077451,   9.14797405,   7.85022376,  14.23439457,  15.62721814])

In [18]:
# Multiple regression: Sales regressed on TV, Radio and Newspaper.
model2 = ols(
    data=data,
    formula='Sales~TV+Radio+Newspaper',
).fit()
model2.summary()

0,1,2,3
Dep. Variable:,Sales,R-squared:,0.897
Model:,OLS,Adj. R-squared:,0.896
Method:,Least Squares,F-statistic:,570.3
Date:,"Sun, 22 Jan 2017",Prob (F-statistic):,1.58e-96
Time:,21:22:29,Log-Likelihood:,-386.18
No. Observations:,200,AIC:,780.4
Df Residuals:,196,BIC:,793.6
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,2.9389,0.312,9.422,0.000,2.324 3.554
TV,0.0458,0.001,32.809,0.000,0.043 0.049
Radio,0.1885,0.009,21.893,0.000,0.172 0.206
Newspaper,-0.0010,0.006,-0.177,0.860,-0.013 0.011

0,1,2,3
Omnibus:,60.414,Durbin-Watson:,2.084
Prob(Omnibus):,0.0,Jarque-Bera (JB):,151.241
Skew:,-1.327,Prob(JB):,1.44e-33
Kurtosis:,6.332,Cond. No.,454.0


In [19]:
# Regression of Sales on TV and Radio only.
model3 = ols(
    data=data,
    formula='Sales~TV+Radio',
).fit()
model3.summary()

0,1,2,3
Dep. Variable:,Sales,R-squared:,0.897
Model:,OLS,Adj. R-squared:,0.896
Method:,Least Squares,F-statistic:,859.6
Date:,"Sun, 22 Jan 2017",Prob (F-statistic):,4.83e-98
Time:,21:22:32,Log-Likelihood:,-386.2
No. Observations:,200,AIC:,778.4
Df Residuals:,197,BIC:,788.3
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,2.9211,0.294,9.919,0.000,2.340 3.502
TV,0.0458,0.001,32.909,0.000,0.043 0.048
Radio,0.1880,0.008,23.382,0.000,0.172 0.204

0,1,2,3
Omnibus:,60.022,Durbin-Watson:,2.081
Prob(Omnibus):,0.0,Jarque-Bera (JB):,148.679
Skew:,-1.323,Prob(JB):,5.19e-33
Kurtosis:,6.292,Cond. No.,425.0


In [20]:
# Pairwise correlation matrix of the dataset's columns.
data.corr()

Unnamed: 0,TV,Radio,Newspaper,Sales
TV,1.0,0.054809,0.056648,0.782224
Radio,0.054809,1.0,0.354104,0.576223
Newspaper,0.056648,0.354104,1.0,0.228299
Sales,0.782224,0.576223,0.228299,1.0


In [31]:
# Variance inflation factor (VIF) per predictor: regress each predictor on the
# remaining ones and compute 1 / (1 - R^2) of that auxiliary regression.
cols = ['TV', 'Newspaper', 'Radio']
vif_values = []

for col in cols:
    # Auxiliary formula, e.g. 'TV ~ Newspaper + Radio'.
    others = [cl for cl in cols if cl != col]
    formula = '{0} ~ {1}'.format(col, ' + '.join(others))
    vif_model = ols(formula=formula, data=data).fit()
    VIF = 1. / (1 - vif_model.rsquared)
    vif_values.append(round(VIF, 3))

# Build the one-row table in a single step instead of concatenating onto an
# empty DataFrame on every iteration; columns follow the order of `cols`.
vif_df = pd.DataFrame([vif_values], columns=cols)

vif_df

Unnamed: 0,TV,Newspaper,Radio
0,1.005,1.145,1.145


# Sklearn