In [1]:
import pandas as pd
import typing
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import matplotlib.pyplot as plt
import patsy
import seaborn as sns
import numpy as np
from statsmodels.formula.api import ols
from scipy import stats

%matplotlib inline

In [2]:
# Read the Auto data set; the path is relative to the notebook's working directory.
auto_df = pd.read_csv(filepath_or_buffer='../datasets/Auto.csv')
# Bare last expression so the frame is rendered as the cell output.
auto_df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
392,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
393,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
394,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
395,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


In [3]:
# Keep only rows whose horsepower is not the literal '?' (presumably a
# missing-value placeholder in the CSV), then cast the column to float so it
# can be used as a numeric regressor.
auto_df = (
    auto_df
    .loc[auto_df['horsepower'] != '?']
    .astype({'horsepower': float})
)

In [4]:
def perform_prediction_with_formula(formula, data):
    """Build an OLS model from a formula string and return its fitted result.

    Despite the name, nothing is predicted here: the function only constructs
    the model with ``ols`` and calls ``fit()`` on it.

    Parameters
    ----------
    formula
        Formula string forwarded to ``ols`` (e.g. ``'mpg ~ year'``).
    data
        Data set forwarded to ``ols``; the callers in this notebook pass a
        pandas DataFrame whose columns are named in the formula.

    Returns
    -------
    The object returned by ``fit()``; callers use its ``summary()`` method.
    """
    return ols(formula=formula, data=data).fit()

# Year
## mpg ~ C(year)

In [6]:
# C(year) makes the formula treat year as a categorical factor, so each year
# level other than the baseline gets its own coefficient.
result_1 = perform_prediction_with_formula(
    formula='mpg ~ C(year)',
    data=auto_df,
)
result_1.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.43
Model:,OLS,Adj. R-squared:,0.412
Method:,Least Squares,F-statistic:,23.8
Date:,"Fri, 06 Dec 2019",Prob (F-statistic):,1.99e-39
Time:,12:45:27,Log-Likelihood:,-1251.1
No. Observations:,392,AIC:,2528.0
Df Residuals:,379,BIC:,2580.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,17.6897,1.112,15.913,0.000,15.504,19.875
C(year)[T.71],3.4215,1.601,2.137,0.033,0.274,6.569
C(year)[T.72],1.0246,1.586,0.646,0.519,-2.094,4.143
C(year)[T.73],-0.5897,1.460,-0.404,0.687,-3.460,2.281
C(year)[T.74],5.0796,1.617,3.142,0.002,1.900,8.259
C(year)[T.75],2.5770,1.559,1.653,0.099,-0.488,5.642
C(year)[T.76],3.8839,1.513,2.567,0.011,0.908,6.859
C(year)[T.77],5.6853,1.586,3.584,0.000,2.567,8.804
C(year)[T.78],6.3715,1.494,4.265,0.000,3.434,9.309

0,1,2,3
Omnibus:,21.476,Durbin-Watson:,0.872
Prob(Omnibus):,0.0,Jarque-Bera (JB):,14.234
Skew:,0.338,Prob(JB):,0.000811
Kurtosis:,2.357,Cond. No.,14.2


## mpg ~ year

In [7]:
# Same response, but year enters as a plain numeric regressor: one slope
# coefficient for the whole year range.
result_2 = perform_prediction_with_formula(
    formula='mpg ~ year',
    data=auto_df,
)
result_2.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.337
Model:,OLS,Adj. R-squared:,0.335
Method:,Least Squares,F-statistic:,198.3
Date:,"Fri, 06 Dec 2019",Prob (F-statistic):,1.08e-36
Time:,12:45:29,Log-Likelihood:,-1280.6
No. Observations:,392,AIC:,2565.0
Df Residuals:,390,BIC:,2573.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-70.0117,6.645,-10.536,0.000,-83.076,-56.947
year,1.2300,0.087,14.080,0.000,1.058,1.402

0,1,2,3
Omnibus:,21.407,Durbin-Watson:,0.775
Prob(Omnibus):,0.0,Jarque-Bera (JB):,15.843
Skew:,0.387,Prob(JB):,0.000363
Kurtosis:,2.391,Cond. No.,1570.0


# Year, horsepower, cylinders
## mpg ~ C(year) + horsepower + cylinders

In [8]:
# Categorical year plus horsepower and cylinders as numeric regressors.
# NOTE: this rebinds result_1, so the single-regressor 'mpg ~ C(year)' fit
# from the earlier cell is no longer reachable under that name.
result_1 = perform_prediction_with_formula(
    formula='mpg ~ C(year) + horsepower + cylinders',
    data=auto_df,
)
result_1.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.787
Model:,OLS,Adj. R-squared:,0.779
Method:,Least Squares,F-statistic:,99.46
Date:,"Fri, 06 Dec 2019",Prob (F-statistic):,4.4e-117
Time:,12:45:35,Log-Likelihood:,-1058.1
No. Observations:,392,AIC:,2146.0
Df Residuals:,377,BIC:,2206.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,39.2275,1.095,35.831,0.000,37.075,41.380
C(year)[T.71],-1.3324,1.011,-1.318,0.188,-3.320,0.655
C(year)[T.72],-2.4755,0.984,-2.515,0.012,-4.411,-0.540
C(year)[T.73],-2.4558,0.902,-2.723,0.007,-4.229,-0.682
C(year)[T.74],-1.2385,1.040,-1.191,0.235,-3.284,0.807
C(year)[T.75],-2.6535,0.999,-2.657,0.008,-4.617,-0.690
C(year)[T.76],-1.2670,0.973,-1.302,0.194,-3.180,0.646
C(year)[T.77],0.5231,1.003,0.522,0.602,-1.449,2.496
C(year)[T.78],0.6564,0.958,0.685,0.494,-1.228,2.540

0,1,2,3
Omnibus:,8.584,Durbin-Watson:,1.428
Prob(Omnibus):,0.014,Jarque-Bera (JB):,9.741
Skew:,0.254,Prob(JB):,0.00767
Kurtosis:,3.582,Cond. No.,1680.0


## mpg ~ year + horsepower + cylinders

In [9]:
# Numeric year plus horsepower and cylinders.
# NOTE: this rebinds result_2, so the single-regressor 'mpg ~ year' fit from
# the earlier cell is no longer reachable under that name.
result_2 = perform_prediction_with_formula(
    formula='mpg ~ year + horsepower + cylinders',
    data=auto_df,
)
result_2.summary()

0,1,2,3
Dep. Variable:,mpg,R-squared:,0.738
Model:,OLS,Adj. R-squared:,0.736
Method:,Least Squares,F-statistic:,364.0
Date:,"Fri, 06 Dec 2019",Prob (F-statistic):,2.15e-112
Time:,12:45:38,Log-Likelihood:,-1098.8
No. Observations:,392,AIC:,2206.0
Df Residuals:,388,BIC:,2221.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-10.1516,4.899,-2.072,0.039,-19.783,-0.520
year,0.6631,0.061,10.946,0.000,0.544,0.782
horsepower,-0.0587,0.010,-5.806,0.000,-0.079,-0.039
cylinders,-1.9462,0.221,-8.802,0.000,-2.381,-1.512

0,1,2,3
Omnibus:,21.247,Durbin-Watson:,1.193
Prob(Omnibus):,0.0,Jarque-Bera (JB):,24.198
Skew:,0.519,Prob(JB):,5.56e-06
Kurtosis:,3.635,Cond. No.,3210.0


### Which model is better? 

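The one treating `year` as a categorical variable: in both comparisons it has the higher adjusted R² (0.412 vs 0.335 for year alone, 0.779 vs 0.736 with horsepower and cylinders) and the lower AIC (2528 vs 2565, 2146 vs 2206).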


### What if there were more than 13 values for variable `year`? 

Continuous - nothing changes; `year` still contributes a single slope coefficient.

Categorical - more parameters to train: each additional year level adds one more dummy coefficient.

### Which model is easier to train? 

The continuous one - there are fewer parameters to train (a single slope for `year` instead of 12 dummy coefficients).