In [29]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.metrics import mean_squared_error

In [3]:
# Location of the hosted AmesHousing.csv file.
AMES_CSV_URL = 'https://www.dropbox.com/scl/fi/g0n5le5p6fr136ggetfsf/AmesHousing.csv?rlkey=jlr9xtz1o6u5rghfo29a5c02f&dl=1'

# Read the CSV into a DataFrame; the bare name on the last line makes the
# notebook render the frame as the cell's output.
ames = pd.read_csv(AMES_CSV_URL)
ames

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2925,2926,923275080,80,RL,37.0,7937,Pave,,IR1,Lvl,...,0,,GdPrv,,0,3,2006,WD,Normal,142500
2926,2927,923276100,20,RL,,8885,Pave,,IR1,Low,...,0,,MnPrv,,0,6,2006,WD,Normal,131000
2927,2928,923400125,85,RL,62.0,10441,Pave,,Reg,Lvl,...,0,,MnPrv,Shed,700,7,2006,WD,Normal,132000
2928,2929,924100070,20,RL,77.0,10010,Pave,,Reg,Lvl,...,0,,,,0,4,2006,WD,Normal,170000


In [6]:
# Target is the sale price; every other column is a candidate predictor.
y = ames["SalePrice"]
X = ames.drop(columns="SalePrice")

# Hold out a test set. The fixed random_state keeps the split reproducible,
# so all models below are compared on the same rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Dummify: one-hot encode every object-dtype column and pass all remaining
# columns through unchanged. handle_unknown='ignore' keeps categories that
# only appear in the test split from raising at transform time.
ct_dummies = ColumnTransformer(
    transformers=[
        (
            "dummify",
            OneHotEncoder(handle_unknown='ignore', sparse_output=False),
            make_column_selector(dtype_include=object),
        ),
    ],
    remainder="passthrough",
)
# Request DataFrame output so the generated column names
# ("dummify__...", "remainder__...") can be referenced by later steps.
ct_dummies.set_output(transform="pandas")

# Learn the encoding on the training split only, then reuse it on the test split.
X_train_dummified = ct_dummies.fit_transform(X_train)
X_test_dummified = ct_dummies.transform(X_test)

# Interaction features are built from two columns of the dummified frame:
# the passed-through room count and the `Bldg Type_1Fam` indicator.
interaction_columns = ["remainder__TotRms AbvGrd", "dummify__Bldg Type_1Fam"]

# interaction_only=True with include_bias=False requests the selected columns
# plus their product terms, without a constant column. All other columns are
# dropped by the transformer.
ct_inter = ColumnTransformer(
    transformers=[
        (
            "interaction",
            PolynomialFeatures(include_bias=False, interaction_only=True),
            interaction_columns,
        ),
    ],
    remainder="drop",
)
ct_inter.set_output(transform="pandas")

# Fit on the dummified training data, then apply the same mapping to the test data.
X_train_inter = ct_inter.fit_transform(X_train_dummified)
X_test_inter = ct_inter.transform(X_test_dummified)

# Generic two-step model: standardize the inputs, then fit a linear regression.
lr_steps = [
    ("standardize", StandardScaler()),
    ("linear_regression", LinearRegression()),
]
lr_pipeline = Pipeline(steps=lr_steps)

In [9]:
# 1. Model using only size and number of rooms
model1_features = ["Gr Liv Area", "TotRms AbvGrd"]

# Only the listed columns are standardized and kept; the ColumnTransformer
# drops everything else by default.
model1_preprocessing = ColumnTransformer(
    transformers=[("standardize", StandardScaler(), model1_features)]
)
model1_pipeline = Pipeline(
    steps=[
        ("preprocessing", model1_preprocessing),
        ("linear_regression", LinearRegression()),
    ]
)

# Fit on the training split and score on the held-out test split.
model1_pipeline.fit(X_train, y_train)
y_preds1 = model1_pipeline.predict(X_test)
mse1 = mean_squared_error(y_test, y_preds1)
rmse1 = np.sqrt(mse1)


In [10]:
# 2. Model using size, number of rooms, and building type
model2_numeric = ["Gr Liv Area", "TotRms AbvGrd"]
model2_categorical = ["Bldg Type"]

# Numeric columns are standardized; building type is one-hot encoded.
model2_preprocessing = ColumnTransformer(
    transformers=[
        ("standardize", StandardScaler(), model2_numeric),
        (
            "dummify",
            OneHotEncoder(handle_unknown='ignore', sparse_output=False),
            model2_categorical,
        ),
    ]
)
model2_pipeline = Pipeline(
    steps=[
        ("preprocessing", model2_preprocessing),
        ("linear_regression", LinearRegression()),
    ]
)

# Fit on the training split and score on the held-out test split.
model2_pipeline.fit(X_train, y_train)
y_preds2 = model2_pipeline.predict(X_test)
mse2 = mean_squared_error(y_test, y_preds2)
rmse2 = np.sqrt(mse2)


In [11]:
# 3. Model using size and building type, and their interaction
model3_preprocessing = ColumnTransformer(
    transformers=[
        ("standardize", StandardScaler(), ["Gr Liv Area"]),
        (
            "dummify",
            OneHotEncoder(handle_unknown='ignore', sparse_output=False),
            ["Bldg Type"],
        ),
    ]
)

# The interaction step runs on every preprocessed column (scaled size and
# the building-type dummies) and adds their product terms.
model3_pipeline = Pipeline(
    steps=[
        ("preprocessing", model3_preprocessing),
        ("interaction", PolynomialFeatures(include_bias=False, interaction_only=True)),
        ("linear_regression", LinearRegression()),
    ]
)

# Fit on the training split and score on the held-out test split.
model3_pipeline.fit(X_train, y_train)
y_preds3 = model3_pipeline.predict(X_test)
mse3 = mean_squared_error(y_test, y_preds3)
rmse3 = np.sqrt(mse3)

In [12]:
# 4. Model using a 5-degree polynomial on size, number of rooms, and building type
model4_preprocessing = ColumnTransformer(
    transformers=[
        ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"]),
        (
            "dummify",
            OneHotEncoder(handle_unknown='ignore', sparse_output=False),
            ["Bldg Type"],
        ),
    ]
)

# The degree-5 expansion is applied to all preprocessed columns, including
# the building-type dummies, so the feature count grows quickly.
model4_pipeline = Pipeline(
    steps=[
        ("preprocessing", model4_preprocessing),
        ("polynomial_features", PolynomialFeatures(degree=5, include_bias=False)),
        ("linear_regression", LinearRegression()),
    ]
)

# Fit on the training split and score on the held-out test split.
model4_pipeline.fit(X_train, y_train)
y_preds4 = model4_pipeline.predict(X_test)
mse4 = mean_squared_error(y_test, y_preds4)
rmse4 = np.sqrt(mse4)


In [15]:
# Keep each model's name, description and test-set RMSE together so that the
# per-model report and the announced winner come from the same values.
test_rmse_report = [
    ("Model 1", "Size & Rooms", rmse1),
    ("Model 2", "Size, Rooms, & Building Type", rmse2),
    ("Model 3", "Size, Building Type & their Interaction", rmse3),
    ("Model 4", "5-Degree Polynomial on Size, Rooms, & Building Type", rmse4),
]

# Print RMSE for each model
for model_name, model_description, model_rmse in test_rmse_report:
    print(f"RMSE for {model_name} ({model_description}): {model_rmse}")

# Determine which model performed best. The winner's name is looked up from
# the RMSE values rather than hard-coded, so the message stays correct if the
# data or the split changes. `best_model` remains the lowest RMSE (a float).
best_entry = min(test_rmse_report, key=lambda entry: entry[2])
best_model_name = best_entry[0]
best_model = best_entry[2]
print(f"\nThe best model is {best_model_name} and has an RMSE of: {best_model}")

RMSE for Model 1 (Size & Rooms): 59261.71322786227
RMSE for Model 2 (Size, Rooms, & Building Type): 57078.218094312484
RMSE for Model 3 (Size, Building Type & their Interaction): 55945.56207987036
RMSE for Model 4 (5-Degree Polynomial on Size, Rooms, & Building Type): 734594.7128939299

The best model is Model 3 and has an RMSE of: 55945.56207987036


In [25]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_predict

def cross_val_rmse(model, X, y, cv=5):
    """Return the RMSE of cross-validated predictions for ``model``.

    Predictions are obtained with ``cross_val_predict`` and compared against
    ``y`` in a single RMSE computed over all rows at once (not an average of
    per-fold RMSEs).

    Parameters
    ----------
    model : estimator passed to ``cross_val_predict``.
    X : predictor data.
    y : target values matching ``X``.
    cv : cross-validation setting forwarded to ``cross_val_predict``
        (defaults to 5).

    Returns
    -------
    The square root of the mean squared error between ``y`` and the
    cross-validated predictions.
    """
    out_of_fold_preds = cross_val_predict(model, X, y, cv=cv)
    pooled_mse = mean_squared_error(y, out_of_fold_preds)
    return np.sqrt(pooled_mse)

# Cross-validate each candidate pipeline on the training split only.
cv_rmse1 = cross_val_rmse(model1_pipeline, X_train, y_train)
cv_rmse2 = cross_val_rmse(model2_pipeline, X_train, y_train)
cv_rmse3 = cross_val_rmse(model3_pipeline, X_train, y_train)
cv_rmse4 = cross_val_rmse(model4_pipeline, X_train, y_train)

# Name the winner under each evaluation scheme so the agreement statement
# below is computed from the results instead of being asserted unconditionally.
cv_rmse_by_model = {
    "Model 1": cv_rmse1,
    "Model 2": cv_rmse2,
    "Model 3": cv_rmse3,
    "Model 4": cv_rmse4,
}
test_rmse_by_model = {
    "Model 1": rmse1,
    "Model 2": rmse2,
    "Model 3": rmse3,
    "Model 4": rmse4,
}
best_cv_name = min(cv_rmse_by_model, key=cv_rmse_by_model.get)
best_test_name = min(test_rmse_by_model, key=test_rmse_by_model.get)

best_cv_model = min(cv_rmse1, cv_rmse2, cv_rmse3, cv_rmse4)
print(f"Cross-validated RMSE for Model 1 (Size & Rooms): {cv_rmse1}")
print(f"Cross-validated RMSE for Model 2 (Size, Rooms, & Building Type): {cv_rmse2}")
print(f"Cross-validated RMSE for Model 3 (Size, Building Type & Interaction): {cv_rmse3}")
print(f"Cross-validated RMSE for Model 4 (5-Degree Polynomial on Size, Rooms, & Building Type): {cv_rmse4}")
print(f"The best model based on cross-validated RMSE has an RMSE of: {best_cv_model}")

# Only claim agreement with the test-split comparison when the two winners match.
if best_cv_name == best_test_name:
    print(f"\nThis does in fact agree with the earlier conclusion that {best_test_name} is the best model")
else:
    print(
        f"\nThis does not agree with the earlier conclusion: cross-validation "
        f"prefers {best_cv_name}, while the test split preferred {best_test_name}"
    )

Cross-validated RMSE for Model 1 (Size & Rooms): 54555.64707408057
Cross-validated RMSE for Model 2 (Size, Rooms, & Building Type): 52911.88094497924
Cross-validated RMSE for Model 3 (Size, Building Type & Interaction): 52310.041736047235
Cross-validated RMSE for Model 4 (5-Degree Polynomial on Size, Rooms, & Building Type): 139867.4179068003
The best model based on cross-validated RMSE has an RMSE of: 52310.041736047235


In [34]:
# Rebuild the predictors and target from the full frame. The search itself is
# fit on the training split (X_train, y_train) further down.
y = ames["SalePrice"]
X = ames.drop(columns="SalePrice")

# Standardize the two numeric predictors and one-hot encode building type.
preprocessor = ColumnTransformer(
    transformers=[
        ("scale", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"]),
        (
            "dummify",
            OneHotEncoder(handle_unknown='ignore', sparse_output=False),
            ["Bldg Type"],
        ),
    ]
)

# Preprocess -> polynomial expansion -> linear regression. The polynomial
# degree is left at its default here because the grid search sets it.
pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("polynomial_features", PolynomialFeatures()),
        ("linear_regression", LinearRegression()),
    ]
)

# Tuning grid: 10 polynomial degrees x 2 centering options.
param_grid = {
    "polynomial_features__degree": range(1, 11),  # Degrees 1 to 10
    "preprocessing__scale__with_mean": [True, False],  # Scaling options
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
# The scorer is negative MSE, so flip the sign before taking the root.
best_rmse = np.sqrt(-grid_search.best_score_)

# Evaluate the selected configuration on the held-out test split.
y_preds = best_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_preds)
test_rmse = np.sqrt(test_mse)

print(f"Best Parameters: {best_params}")
print(f"Best Cross-validated RMSE: {best_rmse}")
print(f"Test RMSE: {test_rmse}")

Best Parameters: {'polynomial_features__degree': 2, 'preprocessing__scale__with_mean': True}
Best Cross-validated RMSE: 52117.3236904214
Test RMSE: 55307.38763963786


Q1:
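Shown in the output above: the grid search selected a polynomial degree of 2 with `with_mean=True` (cross-validated RMSE ≈ 52,117; test RMSE ≈ 55,307).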

Q2:
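Computational cost and the possibility of overfitting. Even this small grid (10 degrees × 2 scaling options × 5 folds) requires 100 model fits, and high-degree polynomial models can overfit badly, as the degree-5 Model 4 did (test RMSE ≈ 734,595 versus ≈ 55,946 for Model 3).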