In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

In [None]:
# Read the Ames housing CSV into a DataFrame and preview the first five rows
ames_csv_path = '/content/AmesHousing.csv'
ames = pd.read_csv(ames_csv_path)
ames.head(5)

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


# Once again consider four modeling options for house price:

1. Using only the size and number of rooms.
2. Using size, number of rooms, and building type.
3. Using size and building type, and their interaction.
4. Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.

Use cross_val_score with the pipelines you made earlier to find the cross-validated root mean squared error for each model.

Which do you prefer? Does this agree with your conclusion from earlier?

In [None]:
# Define Data and split

# Predictors used across all four models: size (Gr Liv Area),
# number of rooms (TotRms AbvGrd) and building type (Bldg Type).
X = ames[["Gr Liv Area", "TotRms AbvGrd","Bldg Type"]]
# Response: sale price
y = ames["SalePrice"]

# Fix the seed so the hold-out split, and therefore every validation RMSE
# computed from it, is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Model 1: size and number of rooms only
# Standardize the two numeric predictors; remainder="drop" discards Bldg Type.
model1_cols = ["Gr Liv Area", "TotRms AbvGrd"]
ct1 = ColumnTransformer(
    transformers=[("standardize", StandardScaler(), model1_cols)],
    remainder="drop",
)
lr = LinearRegression()

# Preprocessing followed by ordinary least squares, as a single estimator
lr1_pipeline = Pipeline(steps=[("preprocessing", ct1), ("OLS", lr)])

# Fit on the training split
lr1_fitted = lr1_pipeline.fit(X_train, y_train)

# Hold-out RMSE: square root of the test-split mean squared error
y_preds = lr1_fitted.predict(X_test)
mse_val1 = mean_squared_error(y_test, y_preds)
rmse_val1 = np.sqrt(mse_val1)
print("Validation RMSE 1:", rmse_val1)


Validation RMSE 1: 53063.73484334011


In [None]:
# Model 2: size, number of rooms, and building type
# Dummy-code the building type and standardize the two numeric predictors.
model2_transformers = [
    ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"]),
]
ct2 = ColumnTransformer(transformers=model2_transformers)
lr = LinearRegression()

# Preprocessing followed by ordinary least squares, as a single estimator
lr2_pipeline = Pipeline(steps=[("preprocessing", ct2), ("OLS", lr)])

# Fit on the training split
lr2_fitted = lr2_pipeline.fit(X_train, y_train)

# Hold-out RMSE: square root of the test-split mean squared error
y_preds = lr2_fitted.predict(X_test)
mse_val2 = mean_squared_error(y_test, y_preds)
rmse_val2 = np.sqrt(mse_val2)
print("Validation RMSE 2:", rmse_val2)


Validation RMSE 2: 51235.62273726728


In [None]:
# Model 3: size, building type, and their interaction
# Standardize size and dummy-code building type (first level dropped as baseline).
# X also contains TotRms AbvGrd, which is not part of this model, so unlisted
# columns are dropped instead of being passed through into the interactions.
ct3 = ColumnTransformer(
    [("standardize", StandardScaler(), ["Gr Liv Area"]),
     ("onehot", OneHotEncoder(drop='first', sparse_output=False), ["Bldg Type"])],
    remainder="drop"
)

# interaction_only=True keeps the main effects and adds pairwise products
# (size x each building-type dummy) without any squared terms.
# The pipeline owns a fresh LinearRegression: Pipeline.fit fits its steps in
# place, so reusing the `lr` object from another pipeline would overwrite that
# pipeline's fitted coefficients.
inter_pipeline = Pipeline([
    ("dummify", ct3),
    ("interaction", PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
    ("OLS", LinearRegression())
])

# Fitting pipeline
inter_fitted = inter_pipeline.fit(X_train, y_train)

# RMSE on the hold-out split
y_preds3 = inter_fitted.predict(X_test)
rmse_val3 = np.sqrt(mean_squared_error(y_test, y_preds3))
print("Validation RMSE 3:",rmse_val3)

Validation RMSE 3: 50736.4076056684


In [None]:
# Model 4: 5-degree polynomial on size, 5-degree polynomial on number of rooms, and building type
# Each numeric column gets its OWN polynomial expansion. Feeding both columns to
# a single PolynomialFeatures would also generate size x rooms cross terms,
# which this model does not call for.
# Each column is standardized before expansion: this spans the same polynomial
# space (given an intercept) but keeps the powers on a comparable scale, since
# raw fifth powers are presumably many orders of magnitude larger than the 0/1
# dummies and make the least-squares problem ill-conditioned.
# include_bias=False because LinearRegression fits the intercept itself.
ct4 = ColumnTransformer(
  [
    # Dummify Bldg Type
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    # 5-degree polynomial for size
    ("poly_size",
     Pipeline([("scale", StandardScaler()),
               ("poly", PolynomialFeatures(degree=5, include_bias=False))]),
     ["Gr Liv Area"]),
    # 5-degree polynomial for number of rooms
    ("poly_rooms",
     Pipeline([("scale", StandardScaler()),
               ("poly", PolynomialFeatures(degree=5, include_bias=False))]),
     ["TotRms AbvGrd"]),
  ],
  remainder = "drop"
)

# The pipeline owns a fresh LinearRegression so that fitting it does not refit
# an estimator object held by one of the other pipelines.
poly_pipeline = Pipeline(
  [("preprocessing", ct4),
  ("OLS", LinearRegression())]
)

# Fitting Pipeline
poly_fitted = poly_pipeline.fit(X_train, y_train)

# RMSE on the hold-out split
y_preds4 = poly_fitted.predict(X_test)
rmse_val4 = np.sqrt(mean_squared_error(y_test, y_preds4))
print("Validation RMSE 4:",rmse_val4)

Validation RMSE 4: 55537.24283776525


The third model (size, building type, and their interaction) performed best, with the lowest validation RMSE of the four models.

In [None]:
# Cross validation
def report_cv_rmse(label, pipeline):
    """Cross-validate one pipeline on (X, y) with 5 folds and print its RMSE.

    `label` is the text printed in front of the RMSE value.
    Returns a tuple of (per-fold negative-MSE scores, RMSE of their mean).
    """
    fold_scores = cross_val_score(pipeline, X, y, cv=5, scoring="neg_mean_squared_error")
    # Scores are negated MSEs: flip the sign of the mean before taking the root
    rmse = np.sqrt(-fold_scores.mean())
    print(label, rmse)
    return fold_scores, rmse


# Model 1
scores1, cv_rmse1 = report_cv_rmse("Cross-Validated RMSE for Model 1:", lr1_pipeline)

# Model 2
scores2, cv_rmse2 = report_cv_rmse("Cross-Validated RMSE for Model 2:", lr2_pipeline)

# Model 3
scores3, cv_rmse3 = report_cv_rmse("Cross-Validated RMSE for Model 3:", inter_pipeline)

# Model 4
scores4, cv_rmse4 = report_cv_rmse("Cross-Validated RMSE for Model 4:", poly_pipeline)

Cross-Validated RMSE for Model 1: 56001.24023779208
Cross-Validated RMSE for Model 2: 54304.05453641055
Cross-Validated RMSE for Model 3: 53529.85368834779
Cross-Validated RMSE for Model 4: 61154.63474475652


I prefer cross-validation because it is easier to implement and, by averaging over five folds, it depends less on one particular train/test split. It agrees with my earlier conclusion: Model 3 has the lowest cross-validated RMSE.

# Consider one hundred modeling options for house price:

- House size, trying degrees 1 through 10
- Number of rooms, trying degrees 1 through 10
- Building Type
- Hint: The dictionary of possible values that you make to give to GridSearchCV will have two elements instead of one.

In [None]:
# Preprocessing for the grid search: a separate polynomial expansion for each
# numeric column (the search below sets each degree) plus dummy-coded building type.
grid_transformers = [
    ("poly_size", PolynomialFeatures(), ["Gr Liv Area"]),
    ("poly_rooms", PolynomialFeatures(), ["TotRms AbvGrd"]),
    ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
]
ct = ColumnTransformer(transformers=grid_transformers, remainder="drop")

# Preprocessing followed by linear regression; transformers emit pandas output
lr_pipeline = Pipeline(
    steps=[("preprocessing", ct), ("linear_regression", LinearRegression())]
)
lr_pipeline.set_output(transform="pandas")

# Degrees 1 through 10 for each polynomial: 10 x 10 = 100 candidate models.
# Keys follow <pipeline step>__<transformer name>__<parameter>.
candidate_degrees = np.arange(1, 11)
degrees = {
    'preprocessing__poly_size__degree': candidate_degrees,
    'preprocessing__poly_rooms__degree': candidate_degrees,
}

# 5-fold cross-validated search over every degree combination, scored by R^2
gscv = GridSearchCV(estimator=lr_pipeline, param_grid=degrees, scoring='r2', cv=5)

# Run the search on the full data set
gscv_fitted = gscv.fit(X, y)

# Full cross-validation results table
results_df = pd.DataFrame(gscv_fitted.cv_results_)

  _data = np.array(data, dtype=dtype, copy=copy,


In [None]:
# One row per (size degree, rooms degree) combination with its mean test score
cv_results = gscv_fitted.cv_results_
results_df = pd.DataFrame(
    {
        "degree_size": cv_results["param_preprocessing__poly_size__degree"],
        "degree_rooms": cv_results["param_preprocessing__poly_rooms__degree"],
        "r2": cv_results["mean_test_score"],
    }
)

# Show the highest-scoring combinations first
results_df.sort_values("r2", ascending=False)

Unnamed: 0,degree_size,degree_rooms,r2
2,3,1,0.557641
12,3,2,0.556857
33,4,4,0.556855
43,4,5,0.556531
22,3,3,0.554039
...,...,...,...
89,10,9,-16.187893
99,10,10,-16.187893
90,1,10,-184.221752
91,2,10,-189.473425


# Q1: Which model performed the best?

The model with a degree-3 polynomial for size and a degree-1 polynomial for rooms performed best, with an $R^2$ of 0.5576.

# Q2: What downsides do you see of trying all possible model options? How might you go about choosing a smaller number of tuning values to try?

Some downsides I see of trying all possible model options are long runtimes and an increased risk of overfitting. To address this, I could test only a few values spread across the range to find effective combinations.