In [2]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

In [3]:
# Read the CSV (path relative to the notebook's working directory) into a DataFrame.
DATA_PATH = "./data/BP_Data.csv"
raw_data = pd.read_csv(DATA_PATH)

In [4]:
# Preview only the first rows instead of dumping the whole frame; the full
# dimensions are reported by the `.shape` cell that follows.
raw_data.head(10)

Unnamed: 0,Age,Weight,Height,Pulse,Systol,Diastol
0,21,71.0,1629,88,170,76
1,22,56.5,1569,64,120,60
2,24,56.0,1561,68,125,75
3,24,61.0,1619,52,148,120
4,25,65.0,1566,72,140,78
5,27,62.0,1639,72,106,72
6,28,53.0,1494,64,120,76
7,28,53.0,1568,80,108,62
8,31,65.0,1540,76,124,70
9,32,57.0,1530,60,134,64


In [5]:
# Dimensions of the loaded table as a (rows, columns) tuple.
n_rows, n_cols = raw_data.shape
(n_rows, n_cols)

(39, 6)

# Simple Linear Regression

In [6]:
# Response is the 'Systol' column; the single predictor is 'Weight', with an
# intercept column prepended by add_constant.
y = raw_data['Systol']
X = sm.add_constant(raw_data['Weight'])

In [7]:
# Ordinary least squares of y on X (response first, design matrix second),
# then print the text summary table.
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                 Systol   R-squared:                       0.272
Model:                            OLS   Adj. R-squared:                  0.252
Method:                 Least Squares   F-statistic:                     13.81
Date:                Tue, 15 Feb 2022   Prob (F-statistic):           0.000665
Time:                        23:11:00   Log-Likelihood:                -149.01
No. Observations:                  39   AIC:                             302.0
Df Residuals:                      37   BIC:                             305.3
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         66.5969     16.464      4.045      0.0

# Multiple Linear Regression

In [8]:
# Regress 'Systol' on four predictors plus an intercept column.
predictor_columns = ['Age', 'Weight', 'Height', 'Pulse']
y = raw_data['Systol']
X = sm.add_constant(raw_data[predictor_columns])

# Fit by ordinary least squares and print the summary table.
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                 Systol   R-squared:                       0.337
Model:                            OLS   Adj. R-squared:                  0.259
Method:                 Least Squares   F-statistic:                     4.324
Date:                Tue, 15 Feb 2022   Prob (F-statistic):            0.00619
Time:                        23:11:00   Log-Likelihood:                -147.17
No. Observations:                  39   AIC:                             304.3
Df Residuals:                      34   BIC:                             312.7
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        100.1172     59.699      1.677      0.1

## But have we been telling lies? No. Let's check the coefficients and standard errors by hand.

In [9]:
# Normal-equations estimate, beta = (X'X)^{-1} X'y, built up step by step so
# it can be compared with the coefficients statsmodels reported above.
gram_matrix = np.dot(X.T, X)                              # X'X
projection = np.dot(np.linalg.inv(gram_matrix), X.T)      # (X'X)^{-1} X'
manual_beta = np.dot(projection, y)
manual_beta

array([ 1.00117217e+02, -4.87384697e-01,  1.28796234e+00, -1.95478783e-02,
       -7.65273076e-02])

In [10]:
# Residuals: observed response minus the fitted values from the hand-computed coefficients.
fitted_by_hand = np.dot(X, manual_beta)
error_terms = y - fitted_by_hand

In [11]:
# Residual-variance estimate with divisor n - p, where p is the number of
# fitted coefficients. p is taken from the design matrix (intercept column
# included) instead of being hard-coded, so the cell stays correct if the set
# of predictors changes.
# Note: np.var subtracts the mean of the residuals before squaring; that mean
# is zero up to rounding here because X contains a constant column.
n_params = X.shape[1]
sigma2 = np.var(error_terms, ddof=n_params)
sigma2

127.3244704089468

In [12]:
# Covariance of the coefficient estimates: sigma^2 * (X'X)^{-1}.
gram_inverse = np.linalg.inv(np.dot(X.T, X))
cov_beta = gram_inverse * sigma2
# The diagonal holds the variances; their square roots are the standard errors.
var_beta = np.diag(cov_beta)
np.sqrt(var_beta)

array([5.96987115e+01, 2.68839075e-01, 3.45468570e-01, 4.00630207e-02,
       2.06015043e-01])

## Enough Monkeys on Enough Typewriters... gives you "The Moon is a Harsh Mistress" and "LOTR" at the same time!

In [13]:
# Pure-noise experiment: 99 standard-normal predictors (plus an intercept) and
# an independent standard-normal response, so no predictor has any real effect.
# A seeded generator makes the draw, and therefore the summary, reproducible
# on every Restart & Run All.
rng = np.random.default_rng(seed=2022)
X = rng.normal(loc=0.0, scale=1.0, size=(1000, 99))
X = sm.add_constant(X)
y = rng.normal(loc=0.0, scale=1.0, size=(1000, 1))
model = sm.OLS(y, X)
results = model.fit()
print(results.summary()) ## "Enough"

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.104
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                     1.053
Date:                Tue, 15 Feb 2022   Prob (F-statistic):              0.349
Time:                        23:11:02   Log-Likelihood:                -1342.8
No. Observations:                1000   AIC:                             2886.
Df Residuals:                     900   BIC:                             3376.
Df Model:                          99                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0324      0.033     -0.984      0.3

# Comparing Models

### Nested Models

In [14]:
# Simulated data for the nested-model comparison. Ten uniform predictors are
# drawn, but the response depends only on columns 1, 2 and 4 of the design
# matrix (column 0 is the intercept added by add_constant), plus Gaussian noise.
# A seeded generator keeps this draw fixed, so every later cell that reuses
# X and y gives the same numbers on each Restart & Run All.
rng = np.random.default_rng(seed=2022)
num_obs = 30
X = rng.uniform(size=(num_obs, 10))
X = sm.add_constant(X)
noise = rng.normal(loc=0.0, scale=0.6, size=num_obs)
y = 10 + 5*X[:,1] + 3*X[:,2] + 4*X[:,4] + noise

# Full model: intercept plus all ten predictors.
model_full = sm.OLS(y, X)
results_full = model_full.fit()
print(results_full.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.959
Model:                            OLS   Adj. R-squared:                  0.938
Method:                 Least Squares   F-statistic:                     44.53
Date:                Tue, 15 Feb 2022   Prob (F-statistic):           4.76e-11
Time:                        23:11:03   Log-Likelihood:                -21.555
No. Observations:                  30   AIC:                             65.11
Df Residuals:                      19   BIC:                             80.52
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         10.0807      0.738     13.652      0.0

In [15]:
# Reduced model: keep the intercept (column 0) and the three columns that
# enter the formula used to generate y (1, 2 and 4).
reduced_columns = [0, 1, 2, 4]
model_reduced = sm.OLS(endog=y, exog=X[:, reduced_columns])
results_reduced = model_reduced.fit()
print(results_reduced.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.950
Model:                            OLS   Adj. R-squared:                  0.944
Method:                 Least Squares   F-statistic:                     163.4
Date:                Tue, 15 Feb 2022   Prob (F-statistic):           5.50e-17
Time:                        23:11:04   Log-Likelihood:                -24.672
No. Observations:                  30   AIC:                             57.34
Df Residuals:                      26   BIC:                             62.95
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         10.2146      0.304     33.564      0.0

In [16]:
# Over-reduced model: intercept plus columns 1 and 2 only, i.e. column 4
# (which enters the formula used to generate y) is left out.
over_reduced_columns = [0, 1, 2]
model_dropping_useful = sm.OLS(endog=y, exog=X[:, over_reduced_columns])
results_dropping_useful = model_dropping_useful.fit()
print(results_dropping_useful.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.754
Model:                            OLS   Adj. R-squared:                  0.736
Method:                 Least Squares   F-statistic:                     41.46
Date:                Tue, 15 Feb 2022   Prob (F-statistic):           5.88e-09
Time:                        23:11:04   Log-Likelihood:                -48.437
No. Observations:                  30   AIC:                             102.9
Df Residuals:                      27   BIC:                             107.1
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         10.9687      0.639     17.164      0.0

In [17]:
from statsmodels.stats.anova import anova_lm

# F-test comparing the reduced model (listed first) against the full model.
anova_reduced_vs_full = anova_lm(results_reduced, results_full)
anova_reduced_vs_full

Unnamed: 0,df_resid,ssr,df_diff,ss_diff,F,Pr(>F)
0,26.0,9.098309,0.0,,,
1,19.0,7.391582,7.0,1.706727,0.626732,0.728005


In [18]:
# F-test comparing the model that omits column 4 (listed first) against the
# full model. anova_lm is already in scope from the preceding ANOVA cell, so
# the duplicate import is dropped.
anova_lm(results_dropping_useful, results_full)

Unnamed: 0,df_resid,ssr,df_diff,ss_diff,F,Pr(>F)
0,27.0,44.36529,0.0,,,
1,19.0,7.391582,8.0,36.973708,11.880076,6e-06


### Are we telling lies? Let's do it by hand

In [19]:
# Partial F statistic by hand:
#   numerator   = (RSS_reduced - RSS_full) / (df_reduced - df_full)
#   denominator = RSS_full / df_full
extra_df = results_reduced.df_resid - results_full.df_resid
squared_resid_gap = results_reduced.resid**2 - results_full.resid**2
numerator = np.sum(squared_resid_gap) / extra_df
denominator = np.sum(results_full.resid**2) / results_full.df_resid
F_stat = numerator / denominator
print(F_stat)

0.6267323746484085


In [20]:
from scipy.stats import f

# p-value of the hand-computed F statistic: upper-tail probability of an F
# distribution with (df_reduced - df_full, df_full) degrees of freedom.
# The survival function is used instead of 1 - cdf, which loses precision
# through cancellation when the p-value is very small.
df_numerator = results_reduced.df_resid - results_full.df_resid
f.sf(F_stat, df_numerator, results_full.df_resid)

0.728004681695053

### Let's go back and look at $R^2$ and adjusted $R^2$

### Stepwise Regression (scikit-learn's version actually uses cross-validation, but the idea is the same!)

In [24]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression

# Fit scikit-learn's linear regression on the same simulated X and y, then
# display the fitted coefficients (one per column of X).
reg = LinearRegression()
reg.fit(X, y)
reg.coef_

array([ 0.        ,  5.21231223,  2.95591151, -0.28036634,  3.16948628,
       -0.66605767,  0.39877574, -0.0134247 ,  0.10613507,  0.14290809,
        0.48617359])

In [26]:
# Intercept estimated by the fitted LinearRegression.
fitted_intercept = reg.intercept_
fitted_intercept

10.08071832678506

In [33]:
# Sequential feature selection wrapped around a fresh linear regression,
# asked to keep four columns of X.
# NOTE(review): X still carries the constant column from add_constant at
# index 0, so it is one of the candidate features and the support mask has
# one entry per column of X, constant included.
reg_sequential = LinearRegression()
sfs = SequentialFeatureSelector(reg_sequential, n_features_to_select=4).fit(X, y)
selected_mask = sfs.get_support()
selected_mask

array([False,  True,  True, False,  True,  True, False, False, False,
       False, False])