# Regression, continued.

## Setup

Load the packages and configure environment.

In [None]:
%matplotlib inline

import matplotlib.pylab as plt
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

## Interaction Terms

### Advertising Data

Using the Advertising data from ISL.

In [None]:
# Download the ISL Advertising data set directly from the web using pandas.
# NOTE(review): read_csv is given a remote URL, so this cell needs network access.
url = "https://raw.githubusercontent.com/olearydj/INSY7120/refs/heads/main/notebooks/data/Advertising.csv"
data = pd.read_csv(url)

In [None]:
# The first CSV column only repeats the row numbers; remove it by label and
# keep everything else as the working table.
sales = data.drop(columns=data.columns[0])
sales.head()

If we are interested in a model based on radio, TV and their interaction, first get the **main effects**:

In [None]:
# get the predictors of interest (main effects only at this point)
X = sales[['radio', 'TV']]
# double brackets keep the target as a one-column DataFrame rather than a Series
y = sales[['sales']]

Then use `PolynomialFeatures` from SKL to transform the features before fitting the model. In this case:

- `degree=2` limits the generated terms to two-way interactions (products of two variables) between features
- `interaction_only=True` generates only the interaction terms (e.g., $radio \times TV$), without the squared terms (e.g., $radio^2$)
- `include_bias=False` lets LinearRegression compute the intercept

The process below first specifies the transformation and then fits and applies it in one step with the `fit_transform` method.

In [None]:
# build the interaction features
from sklearn.preprocessing import PolynomialFeatures

# pairwise products only (no squares) and no constant column
poly = PolynomialFeatures(
    degree=2,
    interaction_only=True,
    include_bias=False,
)
X_interact = poly.fit_transform(X)

# the transformed data has no .head(); slice the first five rows instead
X_interact[:5]

We can see that the first two columns are the original values for radio and TV and the third is their product.

To confirm the features created, use `poly.get_feature_names_out()`

In [None]:
# names of the generated columns, in the same order as the columns of X_interact
poly.get_feature_names_out()

After transforming the input features, we can continue fitting the model and evaluating the results, as before.

In [None]:
# ordinary least squares with default settings
mlr_interact = LinearRegression()

# use the transformed predictors (main effects plus the interaction column),
# not the original X
mlr_interact.fit(X_interact, y)

In [None]:
# look at the estimated model parameters
# coef_ holds one weight per column of X_interact, in column order
print(f"Model Coefficients: {mlr_interact.coef_}")
print(f"Model Intercept: {mlr_interact.intercept_}")

In [None]:
# Predict on the training rows. The model was fitted on the transformed
# features, so it must be given the transformed features here as well.
y_pred = mlr_interact.predict(X_interact)

# Evaluate the model (in-sample: the same rows were used for fitting)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)  # square root puts the error back in the units of y
r2 = r2_score(y, y_pred)

# The heading has no placeholders, so it is a plain string rather than an f-string
print("Multiple Linear Regression Model, with Interaction Terms:")
print(f"Mean Squared Error: {mse:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.4f}")

Compare without interaction.

In [None]:
# Baseline for comparison: fit on the untransformed predictors in X.
mlr = LinearRegression().fit(X, y)

# score() makes its own predictions and returns R² directly;
# any other metric needs an explicit predict() call first
r2 = mlr.score(X, y)
print(f"R² Score: {r2:.4f}")

Define a function to simplify.

In [None]:
def quick_fit(X, y):
    """Fit a ``LinearRegression`` on ``X`` and ``y`` and print its R².

    Args:
        X: Predictor data passed to ``LinearRegression.fit``.
        y: Target values for the same rows as ``X``.

    The score is computed on the same data used for fitting, and the
    function only prints; nothing is returned.
    """
    fitted = LinearRegression().fit(X, y)
    print(f"R² Score: {fitted.score(X, y):.4f}")

Compare with SLR using radio.

In [None]:
# simple linear regression on radio alone; note that this rebinds X
X = sales[['radio']]
quick_fit(X, y)

Compare with SLR using TV.

In [None]:
# simple linear regression on TV alone; note that this rebinds X
X = sales[['TV']]
quick_fit(X, y)

Comparing the $R^2$ scores of the four models: SLR radio (0.332) < SLR TV (0.612) < MLR radio + TV (0.897) < MLR radio * TV (0.968)

### Credit Data

Use `Credit` dataset from ISL.

In [None]:
# Download the ISL Credit data set directly from the web using pandas.
# NOTE(review): read_csv is given a remote URL, so this cell needs network access.
url = "https://raw.githubusercontent.com/olearydj/INSY7120/refs/heads/main/notebooks/data/Credit.csv"
credit = pd.read_csv(url)

In [None]:
# lower-case the column names only; the values inside the columns are untouched
credit.columns = credit.columns.str.lower()
# dummy-encode the categorical columns as integer 0/1 indicators;
# drop_first=True omits the first level of each variable
credit = pd.get_dummies(credit, drop_first=True, dtype=int)
credit.head()

Predict `balance` from `income` (quantitative) and `student` (qualitative).

In [None]:
# get the predictors of interest: income plus the student dummy column
X = credit[['income', 'student_Yes']]
# double brackets keep the target as a one-column DataFrame rather than a Series
y = credit[['balance']]

In [None]:
# pairwise products only (no squares) and no constant column
poly = PolynomialFeatures(
    degree=2,
    interaction_only=True,
    include_bias=False,
)
X_interact = poly.fit_transform(X)

# the transformed data has no .head(); slice the first five rows instead
X_interact[:5]

In [None]:
# names of the generated columns, in the same order as the columns of X_interact
poly.get_feature_names_out()

In [None]:
# R² using the transformed predictors (interaction column included)
quick_fit(X_interact, y)

In [None]:
# R² using the untransformed predictors, for comparison
quick_fit(X, y)

In [None]:
# Side-by-side comparison of the fitted lines for students and non-students:
# left panel from the main-effects model, right panel from the interaction model.
fig, axes = plt.subplots(1, 2, figsize=(16, 7))
fig.suptitle('Income vs Balance by Student Status: Without vs With Interaction', fontsize=16)

# Get student and non-student data
students = credit[credit['student_Yes'] == 1]
non_students = credit[credit['student_Yes'] == 0]

# Common x range for prediction lines
x_range = np.linspace(credit['income'].min(), credit['income'].max(), 100)

# Prediction grids: income sweeps x_range while the student dummy is held at
# 0 or 1. They are DataFrames carrying the training column names, because
# objects fitted on a DataFrame warn about missing feature names when they
# are later given a bare NumPy array.
X_pred_non = pd.DataFrame(
    np.column_stack([x_range, np.zeros_like(x_range)]), columns=X.columns
)  # student_Yes = 0
X_pred_stu = pd.DataFrame(
    np.column_stack([x_range, np.ones_like(x_range)]), columns=X.columns
)  # student_Yes = 1

# Fit model without interaction and predict for both groups
model_no_interact = LinearRegression().fit(X, y)
y_pred_non = model_no_interact.predict(X_pred_non)
y_pred_stu = model_no_interact.predict(X_pred_stu)

# Fit model with interaction; the grids go through the already-fitted `poly`
# so they receive the same product column as the training data
model_interact = LinearRegression().fit(X_interact, y)
X_interact_pred_non = poly.transform(X_pred_non)
X_interact_pred_stu = poly.transform(X_pred_stu)
y_interact_pred_non = model_interact.predict(X_interact_pred_non)
y_interact_pred_stu = model_interact.predict(X_interact_pred_stu)

# Both panels share the same scatter and styling; only the title and the
# fitted lines differ.
panels = [
    (axes[0], 'Without Interaction (Main Effects Only)', y_pred_non, y_pred_stu),
    (axes[1], 'With Interaction', y_interact_pred_non, y_interact_pred_stu),
]
for ax, title, line_non, line_stu in panels:
    # Scatter all data points
    ax.scatter(non_students['income'], non_students['balance'], alpha=0.5, color='blue', label='Non-Student')
    ax.scatter(students['income'], students['balance'], alpha=0.5, color='red', label='Student')

    # Plot regression lines
    ax.plot(x_range, line_non, 'b-', linewidth=2, label='Non-Student Line')
    ax.plot(x_range, line_stu, 'r-', linewidth=2, label='Student Line')
    ax.set_title(title)
    ax.set_xlabel('Income')
    ax.set_ylabel('Balance')
    ax.legend()
    ax.grid(alpha=0.3)

plt.tight_layout()
# leave room at the top for the figure-level title
plt.subplots_adjust(top=0.9)
plt.show()

## Polynomial Terms

Use `Auto` dataset.

In [None]:
# Download the Auto data set directly from the web using pandas.
# NOTE(review): read_csv is given a remote URL, so this cell needs network access.
url = "https://raw.githubusercontent.com/olearydj/INSY7120/refs/heads/main/notebooks/data/Auto.csv"
cars = pd.read_csv(url)

In [None]:
# preview the first rows of the raw data
cars.head()

From HW1, you may have seen that the `Auto` data includes question marks for some `horsepower` values (5 rows). We'll have to clean that up first. For this example we'll simply convert them to `NaN` and drop those rows.

In [None]:
# entries that cannot be parsed as numbers become NaN instead of raising
cars['horsepower'] = pd.to_numeric(cars['horsepower'], errors='coerce')
# keep only the rows with a horsepower value; `cars` itself still holds the NaN rows
cars_clean = cars.dropna(subset=['horsepower'])

Same procedure as before, except `interaction_only=False` (the default), so that squared terms (here $horsepower^2$) are included.

In [None]:
# get the predictors of interest (a single predictor, taken from the cleaned rows)
X = cars_clean[['horsepower']]
# double brackets keep the target as a one-column DataFrame rather than a Series
y = cars_clean[['mpg']]

In [None]:
# generate polynomial terms
from sklearn.preprocessing import PolynomialFeatures
# interaction_only=False keeps the squared terms; the constant column is still omitted
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
# NOTE(review): the name X_interact is carried over from the interaction examples;
# here it holds the polynomial features
X_interact = poly.fit_transform(X)

# inspect result - no head method for numpy, slice
X_interact[:5]

In [None]:
# names of the generated columns, in the same order as the columns of X_interact
poly.get_feature_names_out()

In [None]:
# still a linear regression: it is fitted on the polynomial features
# rather than on the raw X
cars_power = LinearRegression()
cars_power.fit(X_interact, y)

In [None]:
# look at the estimated model parameters
# coef_ holds one weight per column of X_interact, in column order
print(f"Model Coefficients: {cars_power.coef_}")
print(f"Model Intercept: {cars_power.intercept_}")

In [None]:
# Predict on the training rows. The model was fitted on the polynomial
# features, so it must be given the polynomial features here as well
# (they are stored under the name X_interact).
y_pred = cars_power.predict(X_interact)

# Evaluate the model (in-sample: the same rows were used for fitting)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)  # square root puts the error back in the units of y
r2 = r2_score(y, y_pred)

# The heading has no placeholders, so it is a plain string rather than an f-string
print("Multiple Linear Regression Model, with Polynomial Terms:")
print(f"Mean Squared Error: {mse:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.4f}")