In [8]:
import statsmodels.api as sm
from scipy import stats
import numpy as np
import pandas as pd

*Key Questions*
1. Can we fit a line that minimizes the least squares (find the line that has the least sum of squared residuals)?
2. How much of the sum of squares in our dependent variable can be explained by the independent variable (R^2)?
3. How can we tell if our R^2 value is significant (F-statistic)?


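 - SS(mean) = sum of (data - mean)^2, where mean is the mean of the dependent variable
 - SS(fit) = sum of (data - line)^2, where line is the fitted value for each observation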
 - R^2: ( SS(mean) - SS(fit) ) / SS(mean)
 - F-statistic: ( (SS(mean) - SS(fit)) / (pfit - pmean) ) / ( SS(fit) / (n - pfit) )

 - pfit = number of parameters in the fitted line (2 for a simple regression: intercept and slope)
 - pmean = number of parameters in the mean line (1: the mean itself)

In [15]:
# Toy data set: nine paired (weight, size) observations.
size = [1.4, 2.6, 1.0, 3.7, 5.5, 3.2, 3.0, 4.9, 6.3]
# `weight` is stored directly as the OLS design matrix: add_constant adds the
# intercept column that sm.OLS does not include on its own.
weight = sm.add_constant([0.9, 1.8, 2.4, 3.5, 3.9, 4.4, 5.1, 5.6, 6.3])

In [37]:
# Regress size (response) on weight (intercept + slope design matrix).
m = sm.OLS(endog=size, exog=weight)
results = m.fit()
# Report the residual summary statistics first, then the full regression table.
print(stats.describe(results.resid), results.summary(), sep="\n")

DescribeResult(nobs=9, minmax=(-1.5482051282051295, 1.8851794871794851), mean=-1.7516852166308026e-15, variance=1.2398456730769227, skewness=0.02705828720635625, kurtosis=-0.8218541701430309)
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.613
Model:                            OLS   Adj. R-squared:                  0.558
Method:                 Least Squares   F-statistic:                     11.10
Date:                Tue, 21 Jun 2022   Prob (F-statistic):             0.0126
Time:                        18:58:51   Log-Likelihood:                -13.208
No. Observations:                   9   AIC:                             30.42
Df Residuals:                       7   BIC:                             30.81
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 c



In [20]:
# Fetch the R `mtcars` data set through statsmodels and keep only its DataFrame.
cars = sm.datasets.get_rdataset("mtcars").data
# Preview the first five rows.
cars.head(5)

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [33]:
# Pairwise correlations between the response (mpg) and the two predictors (hp, wt).
cars.loc[:, ["mpg", "hp", "wt"]].corr()

Unnamed: 0,mpg,hp,wt
mpg,1.0,-0.776168,-0.867659
hp,-0.776168,1.0,0.658748
wt,-0.867659,0.658748,1.0


In [34]:
# Response (mpg) followed by the two candidate predictors (hp, wt).
y, x1, x2 = (cars[column] for column in ("mpg", "hp", "wt"))

In [35]:
# Single-predictor design matrices, each with an added intercept column.
X1, X2 = (sm.add_constant(predictor) for predictor in (x1, x2))
# Two-predictor design matrix: intercept + hp + wt.
X = sm.add_constant(cars.loc[:, ["hp", "wt"]])

In [37]:
# Fit mpg ~ hp, mpg ~ wt and mpg ~ hp + wt, in that order.
m1, m2, m = (sm.OLS(y, design).fit() for design in (X1, X2, X))

# Collect the estimated coefficients of each model for side-by-side comparison.
results = {
    label: fitted.params
    for label, fitted in (("m1", m1), ("m2", m2), ("m", m))
}
results

{'m1': const    30.098861
 hp       -0.068228
 dtype: float64,
 'm2': const    37.285126
 wt       -5.344472
 dtype: float64,
 'm': const    37.227270
 hp       -0.031773
 wt       -3.877831
 dtype: float64}

### Do results change when x1 = x2?
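`Yes` — with `x2` set equal to `x1`, the single-predictor fits `m1` and `m2` become identical, and in the combined model the two duplicated `hp` columns are perfectly collinear, so the `hp` coefficient (-0.068228) is split evenly between them (-0.034114 each).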

In [45]:
# Make the second predictor an exact copy of the first (hp), then rebuild the
# design matrices that depend on it. This overwrites x2, X2 and X from above.
x2 = cars.loc[:, "hp"]
X2 = sm.add_constant(x2)
# Combined design matrix now holds the hp column twice (perfectly collinear).
X = sm.add_constant(cars.loc[:, ["hp", "hp"]])

In [46]:
# Refit the same three models against the current X1, X2 and X.
m1, m2, m = (sm.OLS(y, design).fit() for design in (X1, X2, X))

# Gather each model's coefficients so they can be compared with the earlier run.
results = {
    label: fitted.params
    for label, fitted in (("m1", m1), ("m2", m2), ("m", m))
}
results

{'m1': const    30.098861
 hp       -0.068228
 dtype: float64,
 'm2': const    30.098861
 hp       -0.068228
 dtype: float64,
 'm': const    30.098861
 hp       -0.034114
 hp       -0.034114
 dtype: float64}