In [7]:
"""Chapter 3 | Linear Regression applied exercises."""

import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

from ISLP import load_data
from ISLP.models import (
    ModelSpec as MS,
    summarize,
    poly
)
from pandas.plotting import scatter_matrix
from statsmodels.stats.anova import anova_lm

In [8]:
# (a) Perform a simple linear regression of y onto x, without an intercept.
# Report the coefficient estimate βˆ, the standard error of this coefficient
# estimate, and the t-statistic and p-value associated with the null
# hypothesis H0 : β = 0. Comment on these results.

# Simulate 100 observations from y = 2x + ε, where x and ε are independent
# draws from the generator's default (standard) normal distribution.
rng = np.random.default_rng(seed=1)
x = rng.normal(size=100)
y = 2 * x + rng.normal(size=100)

# Only x is passed as the design matrix (no constant column is added), so
# the fitted model is y = βx with no intercept term.
model = sm.OLS(endog=y, exog=x)
results = model.fit()
summarize(results)

# Coefficient estimate: βˆ ≈ 1.9762, close to the true slope of 2 that was
# used to generate the data.

# Standard error: SE(βˆ) = 0.117. It measures the precision of the estimate;
# the smaller the standard error, the more precise the estimate.

# t-statistic: t = βˆ / SE(βˆ) = 16.898, which is very large. It counts how
# many standard errors the estimate lies away from 0.

# p-value: reported as 0.000, i.e. it rounds to zero at three decimals and
# is far below the usual 0.05 significance level. There is strong evidence
# to reject the null hypothesis H0 : β = 0.

Unnamed: 0,coef,std err,t,P>|t|
x1,1.9762,0.117,16.898,0.0


In [9]:
# (b) Now perform a simple linear regression of x onto y without an intercept,
# and report the coefficient estimate, its standard error, and the
# corresponding t-statistic and p-values associated with the null hypothesis
# H0 : β = 0. Comment on these results.

# Same no-intercept fit as in (a) with the roles swapped: x is now the
# response and y is the single predictor.
model_2 = sm.OLS(endog=x, exog=y)
results_2 = model_2.fit()
summarize(results_2)

# Coefficient estimate: the slope on y is 0.3757, so a one-unit increase in
# y is associated with a 0.3757-unit increase in x. The direction of the
# regression is reversed relative to (a), but the estimate is not simply
# the reciprocal of the earlier slope (1 / 1.9762 ≈ 0.506).

# Standard error: 0.022, small relative to the estimate, so the slope is
# estimated precisely, as it was in (a).

# t-statistic: 16.898, so the estimate lies many standard errors away
# from 0.

# p-value: reported as 0.000 (rounded to three decimals), well below 0.05,
# so the coefficient is statistically significant.

Unnamed: 0,coef,std err,t,P>|t|
x1,0.3757,0.022,16.898,0.0


In [None]:
# (c) What is the relationship between the results obtained in (a) and (b)?

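# Inverting the generating equation 𝑦 = 2𝑥 + 𝜖 gives 𝑥 = (1/2)(𝑦 − 𝜖), but 𝜖 is
# correlated with 𝑦, so regressing 𝑥 onto 𝑦 does not recover a slope of 1/2.
# With Var(𝑥) = Var(𝜖) = 1, the population slope of 𝑥 on 𝑦 is
# Cov(𝑥, 𝑦) / Var(𝑦) = 2 / (4 + 1) = 0.4, and the estimate 0.3757 is close to
# that value.

# The symmetry between (a) and (b) lies in the test statistic rather than
# in the coefficients: the t-statistic (16.898) and the p-value are identical
# in both fits, because without an intercept the t-statistic is the same
# function of 𝑥 and 𝑦 whichever variable is treated as the response.
# The two slopes are not reciprocals of each other; their product,
# 1.9762 × 0.3757 ≈ 0.742, equals the (uncentered) R² shared by both fits.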

In [None]:
# Skipped (d) and (e)

In [11]:
# (f) In Python, show that when regression is performed with an intercept,
# the t-statistic for H0 : β1 = 0 is the same for the regression of y onto x
# as it is for the regression of x onto y.

# Add a constant column to each predictor so both models include an
# intercept term.
X = sm.add_constant(x)
Y = sm.add_constant(y)

# Regression of y onto x, with intercept.
model_3 = sm.OLS(y, X)
results_3 = model_3.fit()

# Regression of x onto y, with intercept.
model_4 = sm.OLS(x, Y)
results_4 = model_4.fit()

# Print both coefficient tables (printed so that both appear in one cell).
print(summarize(results_3))
print(summarize(results_4))

# The tables above are rounded to three decimals, so also compare the slope
# t-statistics at full precision. Index 0 is the constant (listed first in
# the tables) and index 1 is the slope.
np.testing.assert_allclose(results_3.tvalues[1], results_4.tvalues[1])

# The t-statistic for β1 is 16.734 in both fits, and the assertion confirms
# that the two values agree beyond the displayed rounding.


         coef  std err       t  P>|t|
const -0.0760    0.101  -0.756  0.451
x1     1.9686    0.118  16.734  0.000
         coef  std err       t  P>|t|
const  0.0095    0.044   0.216  0.829
x1     0.3763    0.022  16.734  0.000
