# Practice 7.1

## Setup

In [None]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
# Star import kept last so its name-shadowing behaviour is unchanged.
from plotnine import *

In [None]:
# Load the housing data, keep the three predictors used in this practice,
# and hold out 20% of the rows as a fixed (seeded) test split.
DATA_PATH = "/content/sample_data/AmesHousing.csv"
FEATURE_COLUMNS = ["Gr Liv Area", "TotRms AbvGrd", "Bldg Type"]
TARGET_COLUMN = "SalePrice"

df = pd.read_csv(DATA_PATH)

X = df.loc[:, FEATURE_COLUMNS]
y = df.loc[:, TARGET_COLUMN]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


## Practice 1

In [None]:
# Model 1: linear regression on the two standardized numeric predictors.
# Any other column passed in is discarded by remainder='drop'.
numerical_features_m1 = ["Gr Liv Area", "TotRms AbvGrd"]
scale_numeric_m1 = ('num', StandardScaler(), numerical_features_m1)
preprocessor_m1 = ColumnTransformer(
    transformers=[scale_numeric_m1],
    remainder='drop')
m1 = make_pipeline(preprocessor_m1, LinearRegression())

# fit() returns the fitted pipeline, so prediction can be chained onto it.
y1_pred = m1.fit(X_train, y_train).predict(X_test)
mse1 = mean_squared_error(y_test, y1_pred)
rmse1 = np.sqrt(mse1)

# Model 2: the two standardized numeric predictors plus one-hot encoded
# building type (first level dropped as the reference category).
numerical_features = ["Gr Liv Area", "TotRms AbvGrd"]
categorical_features = ["Bldg Type"]
scale_numeric_m2 = ('num', StandardScaler(), numerical_features)
encode_category_m2 = (
    'cat',
    OneHotEncoder(handle_unknown='ignore', drop='first'),
    categorical_features)
preprocessor_m2 = ColumnTransformer(
    transformers=[scale_numeric_m2, encode_category_m2])
m2 = make_pipeline(preprocessor_m2, LinearRegression())

# fit() returns the fitted pipeline, so prediction can be chained onto it.
y2_pred = m2.fit(X_train, y_train).predict(X_test)
mse2 = mean_squared_error(y_test, y2_pred)
rmse2 = np.sqrt(mse2)

# Model 3: living area, building type, and the interaction between them.
#
# The interaction columns are built AFTER encoding, from the encoder's own
# fitted indicator columns. Building them beforehand on the raw frame does not
# work here: the ColumnTransformer only forwards the columns it is told to
# select, so an extra "Interaction" column would be silently dropped, and
# re-deriving dummies from each incoming batch would make the column depend on
# which building types happen to be present in that batch.
def add_interaction(Z):
    """Append living-area x building-type interaction columns.

    Parameters
    ----------
    Z : array-like of shape (n_samples, 1 + n_indicators)
        Dense output of ``preprocessor_m3``: column 0 is the standardized
        living area, the remaining columns are the one-hot building-type
        indicators.

    Returns
    -------
    numpy.ndarray of shape (n_samples, 1 + 2 * n_indicators)
        The input columns followed by ``living_area * indicator`` for every
        indicator column.
    """
    Z = np.asarray(Z)
    size = Z[:, [0]]          # kept 2-D so it broadcasts across the indicators
    indicators = Z[:, 1:]
    return np.hstack([Z, size * indicators])

numerical_features = ["Gr Liv Area"]
categorical_features = ["Bldg Type"]
preprocessor_m3 = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical_features)],
    # Always return a dense array so add_interaction can slice columns.
    sparse_threshold=0)
interaction = FunctionTransformer(add_interaction)
m3 = make_pipeline(
    preprocessor_m3,
    interaction,
    LinearRegression())
m3.fit(X_train, y_train)
y3_pred = m3.predict(X_test)
rmse3 = np.sqrt(mean_squared_error(y_test, y3_pred))
# NOTE: re-run the notebook after this change; RMSE values recorded before the
# interaction columns actually reached the regressor are no longer current.

# Model 4: degree-5 polynomial in living area and total rooms, plus building type.
#
# The numeric columns are standardized BEFORE the polynomial expansion.
# Raising raw square footage to the 5th power produces enormous, almost
# collinear columns; expanding standardized inputs spans the same family of
# degree-5 polynomials (the regression fits an intercept) but gives the
# least-squares solver a far better conditioned design matrix.
numerical_features = ["Gr Liv Area", "TotRms AbvGrd"]
categorical_features = ["Bldg Type"]
scaled_poly_m4 = make_pipeline(
    StandardScaler(),
    PolynomialFeatures(degree=5, include_bias=False))
preprocessor_m4 = ColumnTransformer(transformers=[
    ('poly', scaled_poly_m4, numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical_features)])
m4 = make_pipeline(
    preprocessor_m4,
    # Puts the expanded columns on a common scale; with_mean=False keeps this
    # step valid if the combined matrix comes back sparse.
    StandardScaler(with_mean=False),
    LinearRegression())
m4.fit(X_train, y_train)
y4_pred = m4.predict(X_test)
rmse4 = np.sqrt(mean_squared_error(y_test, y4_pred))


# Report each model's test-split RMSE, rounded to two decimals.
for model_number, test_rmse in enumerate((rmse1, rmse2, rmse3, rmse4), start=1):
    print(f"Model {model_number} RMSE:", round(test_rmse, 2))

Model 1 RMSE: 61928.54
Model 2 RMSE: 59589.2
Model 3 RMSE: 59231.89
Model 4 RMSE: 61791.59


Model 3 performed the best with the lowest RMSE.

## Practice 2

In [None]:
# Compare the four pipelines with 5-fold cross-validation on the full data.
models = {
    "Model 1": m1,
    "Model 2": m2,
    "Model 3": m3,
    "Model 4": m4}

# The scorer returns negated RMSE (higher is better), so flip the sign back
# before averaging across folds.
rmse_cv = {
    name: (-cross_val_score(
        model,
        X,
        y,
        cv=5,
        scoring="neg_root_mean_squared_error")).mean()
    for name, model in models.items()}

print("\n".join(f"{name}: {round(score, 2)}" for name, score in rmse_cv.items()))

Model 1: 55806.33
Model 2: 54168.08
Model 3: 54344.55
Model 4: 70854.54


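Model 2 performs best here, with the lowest cross-validated RMSE. Although Model 3 had the lower RMSE on our single test split, cross-validation averages the error over five different splits, so it is a more reliable estimate of how well each model fits new data.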

## Practice 3

In [None]:
# Practice 3: grid-search the polynomial degree (1-10) of each numeric
# predictor, scoring every combination by 5-fold cross-validated RMSE.
# GridSearchCV comes from sklearn.model_selection (imported in the setup cell).

def make_scaled_poly():
    """Return an unfitted branch that standardizes a column, then expands it.

    Standardizing first keeps powers up to degree 10 on a workable numeric
    scale. The expanded columns span the same polynomials as the raw feature
    would (the regression fits an intercept), but are far less collinear.
    """
    return Pipeline([
        ("scale", StandardScaler()),
        ("poly", PolynomialFeatures(include_bias=False))])

pre = ColumnTransformer([
    ("size",  make_scaled_poly(), ["Gr Liv Area"]),
    ("rooms", make_scaled_poly(), ["TotRms AbvGrd"]),
    ("cat",   OneHotEncoder(handle_unknown="ignore", drop="first"), ["Bldg Type"])])

pipe = Pipeline([
    ("pre", pre),
    # with_mean=False keeps this step valid if the combined matrix is sparse.
    ("scale", StandardScaler(with_mean=False)),
    ("lr", LinearRegression())])

# Parameter paths: pipeline step -> transformer name -> branch step -> parameter.
param_grid = {
    "pre__size__poly__degree":  np.arange(1, 11),
    "pre__rooms__poly__degree": np.arange(1, 11)}

gscv = GridSearchCV(
    pipe,
    param_grid,
    cv=5,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1)

gscv.fit(X, y)

best_deg_size  = gscv.best_params_["pre__size__poly__degree"]
best_deg_rooms = gscv.best_params_["pre__rooms__poly__degree"]
# best_score_ is the negated RMSE, so flip the sign back.
best_rmse = -gscv.best_score_

print(best_deg_size, best_deg_rooms)
print(round(best_rmse, 2))


3 1
52781.98


Q1: Model 4

Q2: Fitting every possible combination of model options would take a long time, but we can instead evaluate only a randomly chosen subset of the combinations.