Consider four possible models for predicting house prices:

Using only the size and number of rooms.

Using size, number of rooms, and building type.

Using size and building type, and their interaction.

Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.

1. Set up a pipeline for each of these four models.

2. Then, get predictions on the test set for each of your pipelines, and compute the root mean squared error. Which model performed best?

Note: You should only use the function train_test_split() one time in your code; that is, we should be predicting on the same test set for all four models.

In [None]:
import math

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures


In [None]:
# Read the Ames housing CSV into a DataFrame and preview the first rows.
# NOTE(review): this is an absolute path (looks like a Colab upload location);
# confirm it exists before running in another environment.
ames_csv_path = '/content/AmesHousing.csv'
ames = pd.read_csv(ames_csv_path)
ames.head()

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [None]:
# Model 1: linear regression on size ('Gr Liv Area') and number of rooms
# ('TotRms AbvGrd') only.

# Target is the sale price; predictors are every other column. The
# ColumnTransformer below picks the columns it needs and drops the rest.
y = ames["SalePrice"]
X = ames.drop(columns="SalePrice")

preprocess_1 = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Gr Liv Area', 'TotRms AbvGrd']),
    ],
    remainder='drop',
)
pipeline_1 = Pipeline(
    steps=[
        ('preprocess', preprocess_1),
        ('linear_regression', LinearRegression()),
    ]
)

# Average R^2 across 5 cross-validation folds.
scores = cross_val_score(pipeline_1, X, y, cv=5, scoring='r2')
scores.mean()

0.504208752508862

In [None]:
# Model 2: size and number of rooms (standardized) plus building type
# (one-hot encoded).
preprocess_2 = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Gr Liv Area', 'TotRms AbvGrd']),
        # handle_unknown='ignore' keeps prediction from failing on a category
        # that was absent from the fitting data.
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['Bldg Type']),
    ]
)
pipeline_2 = Pipeline(
    steps=[
        ('preprocess', preprocess_2),
        ('linear_regression', LinearRegression()),
    ]
)

# Average R^2 across 5 cross-validation folds.
scores2 = cross_val_score(pipeline_2, X, y, cv=5, scoring='r2')
scores2.mean()

0.5328824390692034

In [None]:
# Model 3: size, building type, and their interaction.
preprocess_3 = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Gr Liv Area']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['Bldg Type']),
    ]
)
pipeline_3 = Pipeline(
    steps=[
        ('preprocess', preprocess_3),
        # interaction_only=True adds pairwise products of the preprocessed
        # columns (size x each building-type dummy) without squared terms.
        ('interaction', PolynomialFeatures(interaction_only=True)),
        ('linear_regression', LinearRegression()),
    ]
)

# Average R^2 across 5 cross-validation folds.
scores3 = cross_val_score(pipeline_3, X, y, cv=5, scoring='r2')
scores3.mean()

0.544867241690556

In [None]:
# Model 4: degree-5 polynomial on size, degree-5 polynomial on number of
# rooms, plus one-hot encoded building type.

# Each numeric column gets its own expand-then-standardize sub-pipeline, so
# the polynomial terms are built per column with no cross terms between them.
size_poly = Pipeline(
    steps=[
        ('poly', PolynomialFeatures(degree=5)),
        ('scaler', StandardScaler()),
    ]
)
rooms_poly = Pipeline(
    steps=[
        ('poly', PolynomialFeatures(degree=5)),
        ('scaler', StandardScaler()),
    ]
)

preprocess_4 = ColumnTransformer(
    transformers=[
        ('num_size', size_poly, ['Gr Liv Area']),
        ('num_rooms', rooms_poly, ['TotRms AbvGrd']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['Bldg Type']),
    ]
)
pipeline_4 = Pipeline(
    steps=[
        ('preprocess', preprocess_4),
        ('linear_regression', LinearRegression()),
    ]
)

# Average R^2 across 5 cross-validation folds.
scores4 = cross_val_score(pipeline_4, X, y, cv=5, scoring='r2')
scores4.mean()

0.5106643234404162

In [None]:
# Evaluate all four models on one shared hold-out set and report test RMSE.
#
# train_test_split is called exactly once, so every pipeline is fitted on the
# same training rows and scored on the same test rows. random_state pins the
# split so the RMSE values are reproducible after a kernel restart.
# (`math` and `mean_squared_error` are imported in the notebook's import cell.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

pipelines = [pipeline_1, pipeline_2, pipeline_3, pipeline_4]
rmse_scores = []

for i, pipeline in enumerate(pipelines):
    # Fit on the training rows only; the test rows stay unseen until predict.
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # RMSE is the square root of the mean squared error, in SalePrice units.
    rmse = math.sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores.append(rmse)

    print(f"Model {i+1} RMSE: {rmse}")

# Lower RMSE is better; report the 1-based number of the best model.
best_model = int(np.argmin(rmse_scores)) + 1
print(f"Best model by test RMSE: Model {best_model}")

NameError: name 'X_train' is not defined