# Simple Linear Models

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from urllib.request import urlopen
from statsmodels.formula.api import ols
import statsmodels.regression.linear_model as sm
from statsmodels.stats.anova import anova_lm

## Define Models through Formulas

In [2]:
# Get the data: download the CSV file from the course repository on GitHub
inFile = 'swim100m.csv'
url_base = 'https://raw.githubusercontent.com/thomas-haslwanter/statsintro_python/master/ipynb/Data/data_kaplan/'
url = url_base + inFile

# Open the URL inside a context manager, so that the HTTP response is
# closed as soon as pandas has finished parsing the CSV content.
with urlopen(url) as response:
    data = pd.read_csv(response)

### OLS Model

In [3]:
# Model 1: the response "time" is described by a single term, "sex"
spec1 = ols(formula="time ~ sex", data=data)
model1 = spec1.fit()
print(model1.summary())

                            OLS Regression Results                            
Dep. Variable:                   time   R-squared:                       0.287
Model:                            OLS   Adj. R-squared:                  0.275
Method:                 Least Squares   F-statistic:                     24.13
Date:                Sun, 20 May 2018   Prob (F-statistic):           7.28e-06
Time:                        14:09:37   Log-Likelihood:                -219.23
No. Observations:                  62   AIC:                             442.5
Df Residuals:                      60   BIC:                             446.7
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     65.1923      1.517     42.986      0.0

### ANOVA

In [4]:
# Analysis-of-variance table for the one-term model
anova_model1 = anova_lm(model1)
print(anova_model1)

            df       sum_sq      mean_sq          F    PR(>F)
sex        1.0  1720.655232  1720.655232  24.132575  0.000007
Residual  60.0  4278.006477    71.300108        NaN       NaN


  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


In [5]:
# Model 2: two additive terms, "sex" and "year" (no interaction)
spec2 = ols(formula="time ~ sex + year", data=data)
model2 = spec2.fit()
print(model2.summary())

                            OLS Regression Results                            
Dep. Variable:                   time   R-squared:                       0.844
Model:                            OLS   Adj. R-squared:                  0.839
Method:                 Least Squares   F-statistic:                     159.6
Date:                Sun, 20 May 2018   Prob (F-statistic):           1.58e-24
Time:                        14:10:24   Log-Likelihood:                -172.12
No. Observations:                  62   AIC:                             350.2
Df Residuals:                      59   BIC:                             356.6
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    555.7168     33.800     16.441      0.0

In [6]:
# Model 3: "sex" and "year" plus their interaction
# (in the formula, "sex * year" expands to "sex + year + sex:year")
spec3 = ols(formula="time ~ sex * year", data=data)
model3 = spec3.fit()
print(model3.summary())

                            OLS Regression Results                            
Dep. Variable:                   time   R-squared:                       0.893
Model:                            OLS   Adj. R-squared:                  0.888
Method:                 Least Squares   F-statistic:                     162.1
Date:                Sun, 20 May 2018   Prob (F-statistic):           3.67e-28
Time:                        14:10:40   Log-Likelihood:                -160.30
No. Observations:                  62   AIC:                             328.6
Df Residuals:                      58   BIC:                             337.1
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept       697.3012     39.221     17.779

In [7]:
# Analysis-of-variance table for the model with interaction
anova_model3 = anova_lm(model3)
print(anova_model3)

            df       sum_sq      mean_sq           F        PR(>F)
sex        1.0  1720.655232  1720.655232  156.140793  4.299569e-18
year       1.0  3342.177104  3342.177104  303.285733  1.039245e-24
sex:year   1.0   296.675432   296.675432   26.921801  2.826421e-06
Residual  58.0   639.153942    11.019896         NaN           NaN


  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


## Polynomial Regression

In [8]:
# Generate the data: a quadratic curve, y = 4 + 3*t + 2*t^2, plus Gaussian
# noise scaled by 5. A locally seeded generator is used so that the data,
# and therefore the fit below, are reproducible on every run, without
# touching NumPy's global random state.
rng = np.random.RandomState(12345)
t = np.arange(0, 10, 0.1)
y = 4 + 3*t + 2*t**2 + 5*rng.randn(len(t))

# Make the fit. Note that this "OLS" is the array-based class from
# "statsmodels.regression.linear_model", not the formula-based "ols" used
# above: it takes the response and an explicitly built design matrix.
M = np.column_stack((np.ones(len(t)), t, t**2))   # columns: constant, t, t^2
res = sm.OLS(y, M).fit()

# Display the results
print('Summary:')
print(res.summary())

Summary:
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.995
Model:                            OLS   Adj. R-squared:                  0.995
Method:                 Least Squares   F-statistic:                     9345.
Date:                Sun, 20 May 2018   Prob (F-statistic):          1.19e-111
Time:                        14:12:13   Log-Likelihood:                -301.01
No. Observations:                 100   AIC:                             608.0
Df Residuals:                      97   BIC:                             615.8
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.1358      1.466      1.457

In [9]:
# Report the estimated coefficients, followed by their confidence
# intervals (conf_int() is called with its default settings)
params_text = str(res.params)
print('The fit parameters are: ' + params_text)
print('The confidence intervals are:')
print(res.conf_int())

The fit parameters are: [ 2.1358047   3.68499153  1.95721038]
The confidence intervals are:
[[-0.77377743  5.04538683]
 [ 2.32667129  5.04331178]
 [ 1.82444563  2.08997512]]
