In [1]:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

In [2]:
boston_obj = load_boston()
data_train, data_test, price_train, price_test = train_test_split(boston_obj.data, boston_obj.target)

In [3]:
import statsmodels.api as sm
import numpy as np

In [4]:
data_train, data_test = sm.add_constant(data_train), sm.add_constant(data_test)    # Necessary to add the intercept
data_train[:5, :]

array([[1.00000e+00, 1.41500e-01, 0.00000e+00, 6.91000e+00, 0.00000e+00,
        4.48000e-01, 6.16900e+00, 6.60000e+00, 5.72090e+00, 3.00000e+00,
        2.33000e+02, 1.79000e+01, 3.83370e+02, 5.81000e+00],
       [1.00000e+00, 1.22040e-01, 0.00000e+00, 2.89000e+00, 0.00000e+00,
        4.45000e-01, 6.62500e+00, 5.78000e+01, 3.49520e+00, 2.00000e+00,
        2.76000e+02, 1.80000e+01, 3.57980e+02, 6.65000e+00],
       [1.00000e+00, 1.34284e+00, 0.00000e+00, 1.95800e+01, 0.00000e+00,
        6.05000e-01, 6.06600e+00, 1.00000e+02, 1.75730e+00, 5.00000e+00,
        4.03000e+02, 1.47000e+01, 3.53890e+02, 6.43000e+00],
       [1.00000e+00, 1.59360e-01, 0.00000e+00, 6.91000e+00, 0.00000e+00,
        4.48000e-01, 6.21100e+00, 6.50000e+00, 5.72090e+00, 3.00000e+00,
        2.33000e+02, 1.79000e+01, 3.94460e+02, 7.44000e+00],
       [1.00000e+00, 3.76800e-02, 8.00000e+01, 1.52000e+00, 0.00000e+00,
        4.04000e-01, 7.27400e+00, 3.83000e+01, 7.30900e+00, 2.00000e+00,
        3.29000e+02, 1.260

In [5]:
data_train[:5, 0]

array([1., 1., 1., 1., 1.])

In [6]:
ols1 = sm.OLS(price_train, data_train)    # Target, features
model1 = ols1.fit()
model1.params    # The parameters of the regression

array([ 3.64424531e+01, -1.14274506e-01,  4.29514615e-02,  2.13346874e-02,
        3.37968985e+00, -1.87457001e+01,  3.96451094e+00, -1.10620098e-02,
       -1.58529701e+00,  3.13376260e-01, -1.27981020e-02, -9.56905746e-01,
        9.85052307e-03, -4.53309996e-01])

In [7]:
model1.predict([[    # An example prediction
    1,      # Intercept term; always 1
    10,     # Per capita crime rate
    25,     # Proportion of land zoned for large homes
    5,      # Proportion of land zoned for non-retail business
    1,      # Tract bounds the Charles River
    0.3,    # NOX concentration
    10,     # Average number of rooms per dwelling
    2,      # Proportion of owner-occupied units built prior to 1940
    10,     # Weighted distance to employment centers
    3,      # Index for highway accessibility
    400,    # Tax rate
    15,     # Pupil/teacher ratio
    200,    # Index for number of blacks
    5       # % lower status of population
]])

array([39.17701957])

In [8]:
print(model1.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.746
Model:                            OLS   Adj. R-squared:                  0.737
Method:                 Least Squares   F-statistic:                     82.50
Date:                Mon, 18 May 2020   Prob (F-statistic):          4.81e-100
Time:                        17:39:31   Log-Likelihood:                -1120.6
No. Observations:                 379   AIC:                             2269.
Df Residuals:                     365   BIC:                             2324.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         36.4425      5.993      6.081      0.0

In [9]:
ols2 = sm.OLS(price_train, np.delete(data_train, [3, 7], axis=1))
model2 = ols2.fit()
print(model2.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.746
Model:                            OLS   Adj. R-squared:                  0.738
Method:                 Least Squares   F-statistic:                     97.82
Date:                Mon, 18 May 2020   Prob (F-statistic):          6.57e-102
Time:                        17:39:57   Log-Likelihood:                -1120.9
No. Observations:                 379   AIC:                             2266.
Df Residuals:                     367   BIC:                             2313.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         36.6486      5.962      6.147      0.0

In [10]:
np.exp((model2.aic - model1.aic)/2)

0.18366383307728376

In [11]:
np.exp((model1.aic - model2.aic)/2)

5.4447300987081695

In [12]:
from sklearn.metrics import mean_squared_error

In [13]:
price_train_pred = model2.predict(np.delete(data_train, [3, 7], axis=1))
mean_squared_error(price_train, price_train_pred)     # Performance on the training set

21.693403990588923

In [14]:
price_test_pred = model2.predict(np.delete(data_test, [3, 7], axis=1))
mean_squared_error(price_test, price_test_pred)     # Performance on the training set

23.056569590691115

In [15]:
price_test_pred_mod1 = model1.predict(data_test)
mean_squared_error(price_test, price_test_pred_mod1)

23.263516518601527