In [14]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.compose import ColumnTransformer

PART 1

Consider four possible models for predicting house prices:

Using only the size and number of rooms.
Using size, number of rooms, and building type.
Using size and building type, and their interaction.
Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.
Set up a pipeline for each of these four models.

Then, get predictions on the test set for each of your pipelines, and compute the root mean squared error. Which model performed best?

Note: You should only use the function train_test_split() one time in your code; that is, we should be predicting on the same test set for all three models.

In [5]:
ames = pd.read_csv("C:/Users/Luke Maier/Downloads/AmesHousing.csv")

ames.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2930 entries, 0 to 2929
Data columns (total 82 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Order            2930 non-null   int64  
 1   PID              2930 non-null   int64  
 2   MS SubClass      2930 non-null   int64  
 3   MS Zoning        2930 non-null   object 
 4   Lot Frontage     2440 non-null   float64
 5   Lot Area         2930 non-null   int64  
 6   Street           2930 non-null   object 
 7   Alley            198 non-null    object 
 8   Lot Shape        2930 non-null   object 
 9   Land Contour     2930 non-null   object 
 10  Utilities        2930 non-null   object 
 11  Lot Config       2930 non-null   object 
 12  Land Slope       2930 non-null   object 
 13  Neighborhood     2930 non-null   object 
 14  Condition 1      2930 non-null   object 
 15  Condition 2      2930 non-null   object 
 16  Bldg Type        2930 non-null   object 
 17  House Style   

In [32]:
X = ames[["Gr Liv Area", "TotRms AbvGrd","Bldg Type"]]
y = ames["SalePrice"]

X_train, X_test, y_train, y_test = train_test_split(X, y)

numerical = ['Gr Liv Area','TotRms AbvGrd']
categorical = ['Bldg Type']



In [33]:
#MODEL 1

lr = LinearRegression()

ct = ColumnTransformer(
  [
    ("standardize", 
    StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
)


lr_pipeline1 = Pipeline(
  [
  ("preprocessing", ct),
  ("linear_regression", lr)]
)

lr_fitted1 = lr_pipeline1.fit(X_train, y_train)

y_pred_test1 = lr_pipeline1.predict(X_test)

mse1 = mean_squared_error(y_test, y_pred_test1)
rmse1 = np.sqrt(mse1)
rsquared_test1 = r2_score(y_test, y_pred_test1)

In [34]:
print("RMSE:", rmse1.__round__(2))

Test RMSE: 52374.58
Test R^2:  0.57


In [35]:
#MODEL 2

ct = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False, drop = "first"), ["Bldg Type"]),
    ("standardize", 
    StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"])
  ],
  remainder = "drop"
)


lr_pipeline2 = Pipeline(
  [
  ("preprocessing", ct),
  ("linear_regression", lr)]
)

lr_fitted2 = lr_pipeline2.fit(X_train, y_train)

y_pred_test2 = lr_pipeline2.predict(X_test)

mse_test2 = mean_squared_error(y_test, y_pred_test2)
rmse2 = np.sqrt(mse_test2)
rsquared_test2 = r2_score(y_test, y_pred_test2)

In [38]:
print("RMSE:", rmse2.__round__(2))

RMSE: 50970.32


In [41]:
#MODEL 3

ct = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False, drop = "first"), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area"])
  ],
  remainder = "drop"
)

lr_pipeline3 = Pipeline(
  [
  ("preprocessing", ct),
  ("interaction", PolynomialFeatures(degree = 2, interaction_only = True, include_bias=False)),
  ("linear_regression", lr)]
)

lr_fitted3 = lr_pipeline3.fit(X_train, y_train)

y_pred_test3 = lr_pipeline3.predict(X_test)

mse_test3 = mean_squared_error(y_test, y_pred_test3)
rmse3 = np.sqrt(mse_test3)
rsquared_test3 = r2_score(y_test, y_pred_test3)

In [42]:
print("RMSE:", rmse3.__round__(2))

RMSE: 50489.2


In [44]:
#MODEL 4: 5 degree polynomial on size and num rooms

ct = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False, drop = "first"), ["Bldg Type"]),
    ("standardize", StandardScaler(), ["Gr Liv Area", "TotRms AbvGrd"]),
    
  ],
  remainder = "drop"
)

lr_pipeline4 = Pipeline(
  [
  ("preprocessing", ct),
  ("polynomial", PolynomialFeatures(degree=5, include_bias=False)),
  ("linear_regression", lr)]
)

lr_fitted4 = lr_pipeline4.fit(X_train, y_train)

y_pred_test4 = lr_pipeline4.predict(X_test)

mse_test4 = mean_squared_error(y_test, y_pred_test4)
rmse4 = np.sqrt(mse_test4)
rsquared_test4 = r2_score(y_test, y_pred_test4)

In [45]:
print("RMSE:", rmse4.__round__(2))

RMSE: 50961.37


Based on the RMSE values I would chose model 3 which uses size and building type and the interaction between the two. It has the lowest RMSE meaning it has the highest predictive accuracy.

PART 2

Once again consider four modeling options for house price:

Using only the size and number of rooms.
Using size, number of rooms, and building type.
Using size and building type, and their interaction.
Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.
Use cross_val_score with the pipelines you made earlier to find the cross-validated root mean squared error for each model.

Which do you prefer? Does this agree with your conclusion from earlier?

In [46]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score


score1 = cross_val_score(lr_pipeline1, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
score1 = np.sqrt(-score1)
cv1 = score1.mean()

score2 = cross_val_score(lr_pipeline2, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
score2 = np.sqrt(-score2)
cv2 = score2.mean()

score3 = cross_val_score(lr_pipeline3, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
score3 = np.sqrt(-score3)
cv3 = score3.mean()

score4 = cross_val_score(lr_pipeline4, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
score4 = np.sqrt(-score4)
cv4 = score4.mean()

In [48]:
print(cv1)
print(cv2)
print(cv3)
print(cv4)

56907.106690592365
55027.478422316795
54175.629742031706
145588.99809220855


Based on these RMSE values I would choose model 3 as my model. This is the same conclusion I reached earlier.

PART 3

Consider one hundred modeling options for house price:

House size, trying degrees 1 through 10
Number of rooms, trying degrees 1 through 10
Building Type
Hint: The dictionary of possible values that you make to give to GridSearchCV will have two elements instead of one.

Q1: Which model performed the best?

Q2: What downsides do you see of trying all possible model options? How might you go about choosing a smaller number of tuning values to try?

In [54]:


ct_polynomial = ColumnTransformer(
  [
    ("dummify", OneHotEncoder(sparse_output = False), ["Bldg Type"]),
    ("polynomial_size", PolynomialFeatures(), ["Gr Liv Area"]),
    ("polynomial_room", PolynomialFeatures(), ["TotRms AbvGrd"])
  ],
  remainder = "drop"
)

polynomial_lr_pipeline = Pipeline(
  [("preprocessing", ct_polynomial),
  ("linear_regression", LinearRegression())]
).set_output(transform="pandas")

degrees = {'preprocessing__polynomial_size__degree': np.arange(1, 11),
           'preprocessing__polynomial_room__degree': np.arange(1, 11)}

gscv = GridSearchCV(polynomial_lr_pipeline, degrees, cv = 5, scoring='neg_mean_squared_error')

gscv_fitted = gscv.fit(X, y)

neg_mse_scores = gscv_fitted.cv_results_['mean_test_score']
rmse_scores = np.sqrt(-neg_mse_scores)

results_df = pd.DataFrame(data = {
    "polynomial_size": np.tile(np.arange(1, 11), 10),
    "polynomial_room": np.repeat(np.arange(1, 11), 10),
    "mean_test_score": rmse_scores
})

best = results_df.loc[results_df['mean_test_score'].idxmin()]
best


polynomial_size        3.000000
polynomial_room        1.000000
mean_test_score    52896.321648
Name: 2, dtype: float64

In [53]:
gscv = GridSearchCV(polynomial_lr_pipeline, degrees, cv = 5, scoring='r2')

gscv_fitted = gscv.fit(X, y)


results_df = pd.DataFrame(data = {
    "polynomial_size": np.tile(np.arange(1, 11), 10),
    "polynomial_room": np.repeat(np.arange(1, 11), 10),
    "mean_test_score": gscv_fitted.cv_results_['mean_test_score']
})

best = results_df.loc[results_df['mean_test_score'].idxmax()]
best

polynomial_size    3.000000
polynomial_room    1.000000
mean_test_score    0.557641
Name: 2, dtype: float64

1. The model with a polynomial with a degree of 3.
2. The downside is that it takes a lot of time and is computationally heavy.