In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

In [2]:
# Load the blood-pressure measurements; the path is relative to the notebook's
# working directory.
bp_csv_path = "./data/BP_Data.csv"
raw_data = pd.read_csv(bp_csv_path)

In [3]:
# Show the loaded frame as a quick visual check of the columns and values.
raw_data

Unnamed: 0,Age,Weight,Height,Pulse,Systol,Diastol
0,21,71.0,1629,88,170,76
1,22,56.5,1569,64,120,60
2,24,56.0,1561,68,125,75
3,24,61.0,1619,52,148,120
4,25,65.0,1566,72,140,78
5,27,62.0,1639,72,106,72
6,28,53.0,1494,64,120,76
7,28,53.0,1568,80,108,62
8,31,65.0,1540,76,124,70
9,32,57.0,1530,60,134,64


In [4]:
# (number of rows, number of columns) of the loaded frame.
raw_data.shape

(39, 6)

# Simple Linear Regression

In [5]:
# Simple-regression setup: Systol is the response, and the design matrix is an
# intercept column plus Weight as the only predictor.
y = raw_data['Systol']
X = sm.add_constant(raw_data['Weight'])

In [6]:
# Inspect the response series (the Systol column) before fitting.
y

0     170
1     120
2     125
3     148
4     140
5     106
6     120
7     108
8     124
9     134
10    116
11    114
12    130
13    118
14    138
15    134
16    120
17    120
18    114
19    124
20    114
21    136
22    126
23    124
24    128
25    134
26    112
27    128
28    134
29    128
30    140
31    138
32    118
33    110
34    142
35    134
36    116
37    132
38    152
Name: Systol, dtype: int64

In [7]:
# Ordinary least squares of the response on the design matrix built above,
# followed by the text report of the fit.
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                 Systol   R-squared:                       0.272
Model:                            OLS   Adj. R-squared:                  0.252
Method:                 Least Squares   F-statistic:                     13.81
Date:                Tue, 22 Feb 2022   Prob (F-statistic):           0.000665
Time:                        19:31:59   Log-Likelihood:                -149.01
No. Observations:                  39   AIC:                             302.0
Df Residuals:                      37   BIC:                             305.3
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         66.5969     16.464      4.045      0.0

# Multiple Linear Regression

In [8]:
# Multiple regression: Systol on an intercept plus four predictors.
predictor_columns = ['Age', 'Weight', 'Height', 'Pulse']
y = raw_data['Systol']
X = sm.add_constant(raw_data[predictor_columns])

# Fit by ordinary least squares and print the text report.
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                 Systol   R-squared:                       0.337
Model:                            OLS   Adj. R-squared:                  0.259
Method:                 Least Squares   F-statistic:                     4.324
Date:                Tue, 22 Feb 2022   Prob (F-statistic):            0.00619
Time:                        19:31:59   Log-Likelihood:                -147.17
No. Observations:                  39   AIC:                             304.3
Df Residuals:                      34   BIC:                             312.7
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        100.1172     59.699      1.677      0.1

## But have we been telling lies? No. Let's check the estimates by hand.

In [9]:
# Closed-form OLS estimate: solve the normal equations (X'X) beta = X'y.
# np.linalg.solve gives the same estimate as multiplying by the explicit
# inverse of X'X, but without forming that inverse, which is the numerically
# safer route when the columns of X are on very different scales.
manual_beta = np.linalg.solve(np.dot(X.T, X), np.dot(X.T, y))
manual_beta

array([ 1.00117217e+02, -4.87384697e-01,  1.28796234e+00, -1.95478783e-02,
       -7.65273076e-02])

In [10]:
# Residuals of the hand-computed fit: observed response minus fitted values.
fitted_values = np.dot(X, manual_beta)
error_terms = y - fitted_values

In [11]:
# Unbiased estimate of the error variance: residual sum of squares divided by
# (n - p), where p is the number of fitted coefficients, i.e. the number of
# columns of X including the constant. Reading p from X keeps the degrees of
# freedom correct if the set of predictors changes.
# np.var centres the residuals before squaring; that is harmless here because
# the design matrix contains a constant column, so the residuals already
# average to zero.
num_params = X.shape[1]
sigma2 = np.var(error_terms, ddof = num_params)
sigma2

127.3244704089468

In [12]:
# Estimated covariance matrix of the coefficients: sigma^2 * (X'X)^{-1}.
xtx_inverse = np.linalg.inv(np.dot(X.T, X))
cov_beta = sigma2 * xtx_inverse
# The diagonal holds the coefficient variances; their square roots are the
# standard errors to compare with the "std err" column of the summary.
var_beta = np.diag(cov_beta)
np.sqrt(var_beta)

array([5.96987115e+01, 2.68839075e-01, 3.45468570e-01, 4.00630207e-02,
       2.06015043e-01])

## Enough Monkeys on Enough Typewriters... gives you "The Moon is a Harsh Mistress" and "LOTR" at the same time!

In [13]:
# Pure-noise experiment: 99 random predictors and a response drawn
# independently of them. Nothing here is truly related to y, so any coefficient
# that looks "significant" in the summary is a false positive that comes from
# testing many predictors at once.
# A seeded local generator makes the printed summary reproducible on re-run.
rng = np.random.default_rng(0)
X = rng.normal(loc=0.0, scale=1.0, size=[1000,99])
X = sm.add_constant(X)
y = rng.normal(loc=0.0, scale=1.0, size=[1000,1])
model = sm.OLS(y, X)
results = model.fit()
print(results.summary()) ## "Enough"

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.100
Model:                            OLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                     1.007
Date:                Tue, 22 Feb 2022   Prob (F-statistic):              0.466
Time:                        19:32:01   Log-Likelihood:                -1370.1
No. Observations:                1000   AIC:                             2940.
Df Residuals:                     900   BIC:                             3431.
Df Model:                          99                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0864      0.033     -2.585      0.0

# Comparing Models

### Nested Models

In [14]:
# Simulate a small data set in which only columns 1, 2 and 4 of X drive y
# (column 0 is the constant added by add_constant); the remaining seven
# predictors are unrelated to y.
# A seeded local generator keeps this fit, and every model comparison built on
# the same X and y, reproducible on re-run.
rng = np.random.default_rng(0)
num_obs = 30
X = rng.uniform(size=[num_obs, 10])
X = sm.add_constant(X)
y = 10 + 5*X[:,1] + 3*X[:,2] + 4*X[:,4] + rng.normal(loc = 0, scale = 0.6, size = num_obs)

# Full model: the constant plus all ten predictors.
model_full = sm.OLS(y, X)
results_full = model_full.fit()
print(results_full.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.951
Model:                            OLS   Adj. R-squared:                  0.926
Method:                 Least Squares   F-statistic:                     37.03
Date:                Tue, 22 Feb 2022   Prob (F-statistic):           2.46e-10
Time:                        19:54:32   Log-Likelihood:                -20.868
No. Observations:                  30   AIC:                             63.74
Df Residuals:                      19   BIC:                             79.15
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         10.5317      0.739     14.255      0.0

In [15]:
# Reduced model: keep the constant (column 0) and predictors 1, 2 and 4 only.
reduced_columns = [0, 1, 2, 4]
model_reduced = sm.OLS(y, X[:, reduced_columns])
results_reduced = model_reduced.fit()
print(results_reduced.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.937
Model:                            OLS   Adj. R-squared:                  0.929
Method:                 Least Squares   F-statistic:                     128.3
Date:                Tue, 22 Feb 2022   Prob (F-statistic):           1.06e-15
Time:                        19:55:30   Log-Likelihood:                -24.763
No. Observations:                  30   AIC:                             57.53
Df Residuals:                      26   BIC:                             63.13
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         10.2584      0.318     32.227      0.0

In [16]:
# Deliberately under-specified model: the constant plus predictors 1 and 2,
# leaving out column 4.
kept_columns = [0, 1, 2]
model_dropping_useful = sm.OLS(y, X[:, kept_columns])
results_dropping_useful = model_dropping_useful.fit()
print(results_dropping_useful.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.680
Model:                            OLS   Adj. R-squared:                  0.656
Method:                 Least Squares   F-statistic:                     28.70
Date:                Tue, 22 Feb 2022   Prob (F-statistic):           2.08e-07
Time:                        19:55:43   Log-Likelihood:                -49.070
No. Observations:                  30   AIC:                             104.1
Df Residuals:                      27   BIC:                             108.3
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         12.1705      0.570     21.366      0.0

In [18]:
from statsmodels.stats.anova import anova_lm
# Compare the reduced fit against the full fit. The second row of the returned
# table reports the F statistic and its p-value (columns F and Pr(>F)) for the
# extra terms of the full model.
anova_lm(results_reduced,results_full)

Unnamed: 0,df_resid,ssr,df_diff,ss_diff,F,Pr(>F)
0,26.0,9.154151,0.0,,,
1,19.0,7.060583,7.0,2.093568,0.804826,0.5936


In [19]:
# Same comparison, this time between the fit that leaves out column 4 and the
# full fit. anova_lm is already imported by the preceding cell, so the
# duplicate import is not repeated here.
anova_lm(results_dropping_useful, results_full)

Unnamed: 0,df_resid,ssr,df_diff,ss_diff,F,Pr(>F)
0,27.0,46.276957,0.0,,,
1,19.0,7.060583,8.0,39.216373,13.191387,3e-06


### Are we telling lies? Let's do it by hand

In [20]:
# Partial F statistic by hand.
# Numerator: drop in residual sum of squares from reduced to full model, per
# extra parameter in the full model.
df_diff = results_reduced.df_resid - results_full.df_resid
numerator = np.sum(results_reduced.resid**2 - results_full.resid**2) / df_diff
# Denominator: residual mean square of the full model.
denominator = np.sum(results_full.resid**2) / results_full.df_resid
F_stat = numerator/denominator
print(F_stat)

0.8048260336714497


In [21]:
from scipy.stats import f
# Upper-tail p-value of the F statistic, with (extra parameters, full-model
# residual df) degrees of freedom. sf() is the survival function, i.e. 1 - cdf
# evaluated directly, so very small p-values are not lost to the rounding that
# happens when a cdf close to 1 is subtracted from 1.
f.sf(F_stat, (results_reduced.df_resid - results_full.df_resid), results_full.df_resid)

0.5936003116813156

### Let's go back and look at R^2 and adjusted R^2

### Stepwise Regression (scikit-learn's version actually uses cross-validation, but the idea is the same!)

In [26]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression

# Refit the full model with scikit-learn. X still carries the constant column
# in position 0, so its coefficient comes back as 0 and the intercept is
# reported separately in reg.intercept_.
reg = LinearRegression()
reg.fit(X, y)
reg.coef_

array([ 0.        ,  4.45525474,  2.5796808 , -0.62446963,  4.11583567,
       -0.44134847, -0.45993292, -0.05387097,  0.41380227,  0.49132356,
        0.10481864])

In [27]:
# Intercept fitted by scikit-learn, to compare with the const row of the
# statsmodels summary for the full model.
reg.intercept_

10.53169469051823

In [28]:
# Sequential feature selection with a linear-regression estimator, asked to
# keep four columns of X. Note that X still includes the constant column, so it
# is one of the candidates.
reg_sequential = LinearRegression()
sfs = SequentialFeatureSelector(estimator=reg_sequential, n_features_to_select=4).fit(X, y)
# Boolean mask over the columns of X: True marks a selected column.
sfs.get_support()

array([False,  True,  True, False,  True, False, False, False,  True,
       False, False])