# 8.4 Exercises

In [1]:
# Data Simulation (do not modify, just run)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.preprocessing import PolynomialFeatures

# Fix the seed of NumPy's global RNG. The four random draws below must keep
# their order (age, cog_fun noise, size, shoe_size noise); reordering them
# would change the simulated values.
np.random.seed(42)

# Dataset 1: cog_fun follows a downward-opening parabola in age, plus noise.
age = np.random.uniform(0, 80, 100)
a1, h1, k1 = -0.02, 40, 100  # curvature, vertex position on the age axis, vertex height
cog_noise = np.random.normal(0, 5, 100)
cog_fun = (a1 * (age - h1)**2 + k1 + cog_noise).clip(0, 100)
dataset1 = pd.DataFrame({'age': age, 'cog_fun': cog_fun})
# Keep only observations with age <= 65, then re-extract the arrays so that
# `age` and `cog_fun` match the filtered frame row for row.
dataset1 = dataset1.loc[dataset1['age'] <= 65]
age = dataset1['age'].to_numpy(copy=True)
cog_fun = dataset1['cog_fun'].to_numpy(copy=True)

# Dataset 2: shoe_size is a straight line in size, plus noise.
size = np.random.uniform(120, 220, 100)
b2 = (52 - 27) / (220 - 120)  # slope mapping sizes 120..220 onto shoe sizes 27..52
c2 = 27                       # shoe size at size == 120
shoe_noise = np.random.normal(0, 1.5, 100)
shoe_size = np.clip(b2 * (size - 120) + c2 + shoe_noise, 27, 52)
dataset2 = pd.DataFrame({'size': size, 'shoe_size': shoe_size})

## Exercise 1: Detecting curvilinear relationships

Two datasets were simulated in the code chunk above. Identify the one that contains a non-linear relationship.

Dataset 1: `age` is used to predict `cog_fun` (Cognitive functions).
Dataset 2: `size` is used to predict `shoe_size`.

In [None]:
# Preview the first rows of both simulated datasets
for simulated_frame in (dataset1, dataset2):
    print(simulated_frame.head())

# Plot the data

# Fit a linear model

# Plot residuals

##solution

# Plot data
# Each entry describes one panel of the side-by-side figure: the frame and
# columns to plot, the marker colour, the texts, and the axis limits.
scatter_panels = [
    {
        'data': dataset1, 'x': 'age', 'y': 'cog_fun', 'color': 'blue',
        'title': 'Dataset 1: Age vs. Cog Fun',
        'xlabel': 'Age', 'ylabel': 'Cog Fun',
        'xlim': (0, 80), 'ylim': (40, 110),
    },
    {
        'data': dataset2, 'x': 'size', 'y': 'shoe_size', 'color': 'orange',
        'title': 'Dataset 2: Size vs. Shoe Size',
        'xlabel': 'Size', 'ylabel': 'Shoe Size',
        'xlim': (120, 220), 'ylim': (27, 52),
    },
]

plt.figure(figsize=(14, 6))
for panel_position, panel in enumerate(scatter_panels, start=1):
    # Select the left (1) or right (2) panel, then draw into it
    plt.subplot(1, 2, panel_position)
    sns.scatterplot(x=panel['x'], y=panel['y'], data=panel['data'], color=panel['color'])
    plt.title(panel['title'])
    plt.xlabel(panel['xlabel'])
    plt.ylabel(panel['ylabel'])
    plt.xlim(*panel['xlim'])
    plt.ylim(*panel['ylim'])
    plt.grid(True)

# Fit linear models and plot residuals
# Both models are straight-line fits: PolynomialFeatures(degree=1, include_bias=True)
# turns the single predictor into a two-column design matrix (constant, predictor),
# which is then passed to statsmodels' OLS.

# dataset1
polynomial_features_p1 = PolynomialFeatures(degree=1, include_bias=True)
age_p1 = polynomial_features_p1.fit_transform(age.reshape(-1, 1))

linear_model1 = sm.OLS(cog_fun, age_p1).fit()
linear_fit1 = linear_model1.predict(age_p1)
linear_residuals1 = linear_model1.resid

# Fit linear model to dataset2
polynomial_features_p1 = PolynomialFeatures(degree=1, include_bias=True)
size_p1 = polynomial_features_p1.fit_transform(size.reshape(-1, 1))

linear_model2 = sm.OLS(shoe_size, size_p1).fit()
# Predict from the design matrix this model was fitted on (size_p1).
# Passing age_p1 here would produce fitted values for the wrong predictor
# and with the wrong number of rows.
linear_fit2 = linear_model2.predict(size_p1)
linear_residuals2 = linear_model2.resid

# Plotting the residuals
# One tuple per panel: (subplot position, predictor values, residuals,
# marker colour, panel title, x-axis label)
residual_panels = [
    (1, age, linear_residuals1, 'blue', 'Residuals for Dataset 1 (Age vs. Cog Fun)', 'Age'),
    (2, size, linear_residuals2, 'orange', 'Residuals for Dataset 2 (Size vs. Shoe Size)', 'Size'),
]

plt.figure(figsize=(14, 6))
for position, predictor, residuals, colour, panel_title, x_label in residual_panels:
    plt.subplot(1, 2, position)
    sns.scatterplot(x=predictor, y=residuals, color=colour, label='Residuals')
    # Dashed reference line at zero: residuals of an adequate model should
    # scatter around it without a systematic pattern
    plt.axhline(0, color='red', linestyle='--', label='Zero Residual Line')
    plt.title(panel_title)
    plt.xlabel(x_label)
    plt.ylabel('Residuals')
    plt.grid(True)
    plt.legend()

plt.tight_layout()
plt.show()

## Exercise 2: Fitting a polynomial

Fit a 2nd-degree polynomial to the dataset that suggests a non-linear relationship. Plot the model onto the data. Print the model summary and interpret it.

In [None]:
# Fit the model

# Plot the model

# Hint: if your plot looks odd, try to incorporate the following lines to sort your variables.
# sorted_indices = np.argsort(age)
# age_sorted = age[sorted_indices]
# quadratic_fit_sorted = quadratic_fit[sorted_indices]

##solution
# Dataset 1 suggests a non-linear relationship. Thus, we fit the 2nd-degree polynomial to
# Dataset 1.

# Fit the model
# Design matrix with three columns: constant, age, age squared
polynomial_features_p2 = PolynomialFeatures(degree=2, include_bias=True)
age_p2 = polynomial_features_p2.fit_transform(age.reshape(-1, 1))

quadratic_model = sm.OLS(cog_fun, age_p2).fit()
quadratic_residuals = quadratic_model.resid
quadratic_fit = quadratic_model.predict(age_p2)

# Order the fitted values by age; plotting them in the original (unsorted)
# order would make the line jump back and forth across the figure
sorted_indices = np.argsort(age)
age_sorted, quadratic_fit_sorted = age[sorted_indices], quadratic_fit[sorted_indices]

# Plot the model
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(age, cog_fun, color='blue', label='Original Data')
ax.plot(age_sorted, quadratic_fit_sorted, color='red', label='Quadratic Fit', linewidth=2)
ax.set_title('Quadratic Model Fit (Age vs Cog Fun)')
ax.set_xlabel('Age')
ax.set_ylabel('Cognitive Function (Cog Fun)')
ax.legend()
ax.grid(True)

print(quadratic_model.summary())

## Exercise 3: Centering

Fit your model from Exercise 2 again, this time with a centered predictor. How does the interpretation change?

In [None]:
# Center predictor: subtract the sample mean so that 0 corresponds to the mean age
mean_age = dataset1['age'].mean()
dataset1['age_centered'] = dataset1['age'].sub(mean_age)

# Extract your variables from the dataset (technically not needed but convenient,
# your code from above should run now without any changes).
# Note: from here on `age` holds the centered values, not the raw ages.
cog_fun = np.array(dataset1['cog_fun'])
age = np.array(dataset1['age_centered'])



#  Fit the model

# Plot the model

# Hint: if your plot looks odd, try to incorporate the following lines to sort your variables.
# sorted_indices = np.argsort(age)
# age_sorted = age[sorted_indices]
# quadratic_fit_sorted = quadratic_fit[sorted_indices]

##solution
# Dataset 1 suggests a non-linear relationship. Thus, we fit the 2nd-degree polynomial to
# Dataset 1, this time using the centered predictor.

# Fit the model
# At this point `age` holds the mean-centered ages, so the design matrix
# columns are: constant, centered age, centered age squared.
polynomial_features_p2 = PolynomialFeatures(degree=2, include_bias=True)
age_p2 = polynomial_features_p2.fit_transform(age.reshape(-1, 1))

quadratic_model = sm.OLS(cog_fun, age_p2).fit()
quadratic_fit = quadratic_model.predict(age_p2)
quadratic_residuals = quadratic_model.resid

# Sort by the predictor so the fitted curve is drawn from left to right
sorted_indices = np.argsort(age)
age_sorted = age[sorted_indices]
quadratic_fit_sorted = quadratic_fit[sorted_indices]

# Plot the model
# The axis texts state explicitly that the predictor is centered: the x values
# are deviations from the mean age (and can be negative), not raw ages.
plt.figure(figsize=(12, 6))
plt.scatter(age, cog_fun, color='blue', label='Original Data')
plt.plot(age_sorted, quadratic_fit_sorted, color='red', label='Quadratic Fit', linewidth=2)
plt.title('Quadratic Model Fit (Centered Age vs Cog Fun)')
plt.xlabel('Age (centered)')
plt.ylabel('Cognitive Function (Cog Fun)')
plt.legend()
plt.grid(True)

# Print the summary so the coefficients can be compared with Exercise 2.
# With a centered predictor the intercept is the predicted cog_fun at the
# mean age, and the linear coefficient is the slope of the curve at the mean
# age; the quadratic coefficient is unaffected by centering.
print(quadratic_model.summary())

## Voluntary Exercise 1: Higher-order polynomials

Use the same dataset to which you already fitted a 2nd-degree polynomial. This time, however, fit a 3rd-degree polynomial and plot it. Also compare the fit measures and state whether the 2nd-degree or the 3rd-degree model provides the better fit. Does the higher-order polynomial provide any value?

In [None]:
# Fit the model

# Plot the model

# Get fit measures of both models

##solution
# Fit the model
# Design matrix with four columns: constant, age, age squared, age cubed.
# NOTE: if the Exercise 3 cell has been run, `age` holds the centered values.
polynomial_features_p3 = PolynomialFeatures(degree=3, include_bias=True)
age_p3 = polynomial_features_p3.fit_transform(age.reshape(-1, 1))

cubic_model = sm.OLS(cog_fun, age_p3).fit()
cubic_fit = cubic_model.predict(age_p3)
cubic_residuals = cubic_model.resid

# Sort by the predictor so the fitted curve is drawn from left to right
sorted_indices = np.argsort(age)
age_sorted = age[sorted_indices]
cubic_fit_sorted = cubic_fit[sorted_indices]

# Plot the model
plt.figure(figsize=(12, 6))
plt.scatter(age, cog_fun, color='blue', label='Original Data')
# The legend entry names the cubic model (it previously read 'Quadratic Fit')
plt.plot(age_sorted, cubic_fit_sorted, color='red', label='Cubic Fit', linewidth=2)
plt.title('Cubic Model Fit (Age vs Cog Fun)')
plt.xlabel('Age')
plt.ylabel('Cognitive Function (Cog Fun)')
plt.legend()
plt.grid(True)

# Get fit measures
# Print the two summaries with separate calls: a single print(a, b) joins them
# with a space, which glues the start of the second table onto the last line
# of the first one.
print(quadratic_model.summary())
print(cubic_model.summary())

# Compact side-by-side view of the fit measures of both models.
# Higher (adjusted) R-squared and lower AIC/BIC indicate the better model;
# adjusted R-squared, AIC and BIC penalise the extra cubic term.
fit_measures = pd.DataFrame(
    {
        'R-squared': [quadratic_model.rsquared, cubic_model.rsquared],
        'Adj. R-squared': [quadratic_model.rsquared_adj, cubic_model.rsquared_adj],
        'AIC': [quadratic_model.aic, cubic_model.aic],
        'BIC': [quadratic_model.bic, cubic_model.bic],
    },
    index=['Quadratic', 'Cubic'],
)
print(fit_measures)

# The quadratic model is nested in the cubic one (it is the cubic model with
# the cubic coefficient fixed at zero), so an F-test can check whether the
# additional term improves the fit significantly.
f_value, p_value, df_diff = cubic_model.compare_f_test(quadratic_model)
print(f'F-test cubic vs. quadratic: F = {f_value:.3f}, p = {p_value:.4f}, df diff = {df_diff:.0f}')