In [None]:
# Import relevant libraries and read in the dataset
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures

# Load the Ames housing data from the working directory and preview the first five rows.
ames = pd.read_csv(filepath_or_buffer="AmesHousing.csv")
ames.head(n=5)

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


## 13.2.5

In [None]:
# Split the data: SalePrice is the target, every other column is a candidate predictor.
y = ames["SalePrice"]
X = ames.drop(columns=["SalePrice"])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# 1. Model using size and number of rooms

# Standardize the two numeric predictors; every other column is dropped.
ct_1 = ColumnTransformer(
    transformers=[
        ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"]),
    ],
    remainder="drop",
)

lr_pipeline_1 = Pipeline(
    steps=[
        ("preprocessing", ct_1),
        ("linear_regression", LinearRegression()),
    ]
)

# Pipeline.fit returns the pipeline itself, so lr_pipeline_1_fitted is the
# same object as lr_pipeline_1 (now fitted on the training split).
lr_pipeline_1_fitted = lr_pipeline_1.fit(X_train, y_train)
y_preds_1 = lr_pipeline_1_fitted.predict(X_test)

In [None]:
# Calculate RMSE of Model 1 on the held-out test split
root_mean_squared_error(y_test, y_preds_1)

61928.53719680031

In [None]:
# 2. Model using size, number of rooms, and building type

# Dummy-encode building type (first level dropped as the baseline) and
# standardize the two numeric predictors; all other columns are dropped.
ct_2 = ColumnTransformer(
    transformers=[
        (
            "dummify",
            OneHotEncoder(drop="first", sparse_output=False),
            ["Bldg Type"],
        ),
        (
            "standardize",
            StandardScaler(),
            ["Gr Liv Area", "TotRms AbvGrd"],
        ),
    ],
    remainder="drop",
)

lr_pipeline_2 = Pipeline(
    steps=[
        ("preprocessing", ct_2),
        ("linear_regression", LinearRegression()),
    ]
)

# Fit on the training split, then predict the held-out test split.
lr_pipeline_2_fitted = lr_pipeline_2.fit(X_train, y_train)
y_preds_2 = lr_pipeline_2_fitted.predict(X_test)

In [None]:
# Test-set RMSE for Model 2
root_mean_squared_error(y_true=y_test, y_pred=y_preds_2)

59589.20317423356

In [None]:
# 3. Model using size, building type, and their interaction
#
# Two ColumnTransformers are chained. The first encodes and scales, and is set
# to emit a pandas DataFrame so that the second can pick its output columns by
# their prefixed names ("dummify__...", "standardize__...").
ct_preprocessing_3 = ColumnTransformer(
    transformers=[
        ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
        ("standardize", StandardScaler(), ["Gr Liv Area"]),
    ],
    remainder="drop",
).set_output(transform="pandas")

# With interaction_only=True (default degree 2, default include_bias=True),
# PolynomialFeatures emits a constant column, the two inputs, and their product.
# The remaining building-type dummies are passed through untouched.
# NOTE(review): only the "1Fam" dummy is interacted with size, so the slope on
# Gr Liv Area differs for 1Fam vs. all other building types combined. If an
# interaction with every building type was intended, this needs extending --
# confirm against the assignment before changing, since it alters the results.
ct_inter = ColumnTransformer(
    transformers=[
        (
            "interaction",
            PolynomialFeatures(interaction_only=True),
            ["dummify__Bldg Type_1Fam", "standardize__Gr Liv Area"],
        ),
    ],
    remainder="passthrough",
).set_output(transform="pandas")

# The pipeline fits both transformers itself, so no separate fit_transform
# calls are needed here. To inspect the intermediate column names, run
# ct_preprocessing_3.fit(X_train).get_feature_names_out() in a scratch cell.
lr_pipeline_3 = Pipeline(
    steps=[
        ("preprocessing", ct_preprocessing_3),
        ("inter", ct_inter),
        ("linear_regression", LinearRegression()),
    ]
)

lr_pipeline_3_fitted = lr_pipeline_3.fit(X_train, y_train)
y_preds_3 = lr_pipeline_3_fitted.predict(X_test)

In [None]:
# Test-set RMSE for Model 3
root_mean_squared_error(y_true=y_test, y_pred=y_preds_3)

58698.924408821666

In [None]:
# 4. Model using a degree-5 polynomial on size, a degree-5 polynomial on number of rooms, and building type
#
# Stage 1: dummy-encode building type (first level dropped) and standardize the
# two numeric predictors. Output is a pandas DataFrame so stage 2 can select
# the prefixed column names.
ct_preprocessing_4 = ColumnTransformer(
    transformers=[
        ("dummify", OneHotEncoder(drop="first", sparse_output=False), ["Bldg Type"]),
        ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"]),
    ],
    remainder="drop",
).set_output(transform="pandas")

# Stage 2: expand each standardized predictor separately into powers 1..5
# (include_bias=False, so no constant column is added); the building-type
# dummies pass through unchanged.
ct_poly = ColumnTransformer(
    transformers=[
        (
            "poly_gr_liv_area",
            PolynomialFeatures(degree=5, include_bias=False),
            ["standardize__Gr Liv Area"],
        ),
        (
            "poly_tot_rms_abvgrd",
            PolynomialFeatures(degree=5, include_bias=False),
            ["standardize__TotRms AbvGrd"],
        ),
    ],
    remainder="passthrough",
).set_output(transform="pandas")

# The pipeline fits both stages itself, so no separate fit_transform calls are
# needed before building it.
lr_pipeline_4 = Pipeline(
    steps=[
        ("preprocessing", ct_preprocessing_4),
        ("poly", ct_poly),
        ("linear_regression", LinearRegression()),
    ]
)

lr_pipeline_4_fitted = lr_pipeline_4.fit(X_train, y_train)
y_preds_4 = lr_pipeline_4_fitted.predict(X_test)

In [None]:
# Test-set RMSE for Model 4
root_mean_squared_error(y_true=y_test, y_pred=y_preds_4)

61742.47507941424


Comparing the test-set root mean squared errors of the four models above, **Model 3 performed best**: it had the lowest RMSE (roughly 58,699).


## 13.3.1

In [None]:
# cross_val_score for Model 1
# 5-fold cross-validation on the full data set. The scorer is *negative* RMSE
# (scikit-learn scorers are "higher is better"), so the mean below is negative
# and values closer to zero are better.
scores_1 = cross_val_score(lr_pipeline_1, X, y, cv=5, scoring="neg_root_mean_squared_error")
scores_1.mean()

-55806.32634926364

In [None]:
# 5-fold cross-validated score for Model 2 (negative RMSE; closer to zero is better)
scores_2 = cross_val_score(
    estimator=lr_pipeline_2,
    X=X,
    y=y,
    cv=5,
    scoring="neg_root_mean_squared_error",
)
scores_2.mean()

-54168.081429193844

In [None]:
# 5-fold cross-validated score for Model 3 (negative RMSE; closer to zero is better)
scores_3 = cross_val_score(
    estimator=lr_pipeline_3,
    X=X,
    y=y,
    cv=5,
    scoring="neg_root_mean_squared_error",
)
scores_3.mean()

-54096.070676835996

In [None]:
# 5-fold cross-validated score for Model 4 (negative RMSE; closer to zero is better)
scores_4 = cross_val_score(
    estimator=lr_pipeline_4,
    X=X,
    y=y,
    cv=5,
    scoring="neg_root_mean_squared_error",
)
scores_4.mean()

-55176.96594338135


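Once again Model 3 performed best: its mean cross-validated RMSE (about 54,096; `cross_val_score` reports it negated) is the lowest of the four, narrowly ahead of Model 2. This agrees with the test-set RMSE comparison earlier.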


## 13.3.3

In [21]:
# Trying various degrees of size and number of rooms through GridSearchCV

# Each numeric predictor gets its own one-step Pipeline wrapping
# PolynomialFeatures, so its degree can be addressed by name in the grid
# ("preprocessing__<transformer>__polynomial__degree").
# NOTE(review): unlike Models 1-4, the predictors are not standardized before
# the polynomial expansion, and degrees up to 10 are tried on raw values. That
# may be numerically ill-conditioned for the high-degree candidates -- consider
# scaling first and confirm whether the selected degrees change.
ct = ColumnTransformer(
    transformers=[
        ("dummify", OneHotEncoder(drop="first", sparse_output=False), ["Bldg Type"]),
        ("poly_gr_liv_area", Pipeline([("polynomial", PolynomialFeatures())]), ["Gr Liv Area"]),
        ("poly_tot_rms_abvgrd", Pipeline([("polynomial", PolynomialFeatures())]), ["TotRms AbvGrd"]),
    ],
    remainder="drop",
)

lr_pipeline = Pipeline(
    steps=[
        ("preprocessing", ct),
        ("linear_regression", LinearRegression()),
    ]
)

# Degrees 1 through 10 for each predictor: 10 x 10 = 100 candidate models,
# each evaluated with 5-fold cross-validation.
degrees = np.arange(1, 11)
param_grid = {
    "preprocessing__poly_gr_liv_area__polynomial__degree": degrees,
    "preprocessing__poly_tot_rms_abvgrd__polynomial__degree": degrees,
}

gscv = GridSearchCV(lr_pipeline, param_grid, cv=5, scoring="r2")
gscv_fitted = gscv.fit(X, y)

print("Best parameters:", gscv_fitted.best_params_)
print("Best R-squared score:", gscv_fitted.best_score_)

Best parameters: {'preprocessing__poly_gr_liv_area__polynomial__degree': 3, 'preprocessing__poly_tot_rms_abvgrd__polynomial__degree': 1}
Best R-squared score: 0.5576406034746231


Q1: The model with a degree-3 polynomial for house size and a degree-1 (linear) term for number of rooms performed best, with a mean cross-validated R-squared of about 0.558.

Q2: The main downside of trying every possible model is computational cost: even this small search covers 10 × 10 = 100 degree combinations, each fit with 5-fold cross-validation, for 500 model fits in total. Personally, I would start with a smaller range of values (for example, degrees 1 through 5) and widen the search only if none of those gives a satisfactory R-squared.