# Consider four possible models for predicting house prices:

Using only the size and number of rooms.
Using size, number of rooms, and building type.
Using size and building type, and their interaction.
Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.
Set up a pipeline for each of these four models.

Then, get predictions on the test set for each of your pipelines, and compute the root mean squared error. Which model performed best?

Note: You should only use the function train_test_split() one time in your code; that is, we should be predicting on the same test set for all three models.

In [30]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error

In [12]:
# load the data
data = ("/Users/nicoleradovcich/Desktop/MSBA/GSB544/GSB_544/Practice_7/AmesHousing.csv")
ames = pd.read_csv(data)

In [14]:
# define variables
X = ames[['Gr Liv Area', 'TotRms AbvGrd', 'Bldg Type']]
y = ames['SalePrice']

In [15]:
# Split the data into training and test sets 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Define the feature transformation
numeric_features = ['Gr Liv Area', 'TotRms AbvGrd']
categorical_features = ['Bldg Type']

## Pipeline 1

In [18]:
#using only size and number of rooms
pipeline_1 = Pipeline(steps=[
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features)
        ])),
    ('regressor', LinearRegression())
])

## Pipeline 2

In [19]:
#using size, number of rooms, and building type
pipeline_2 = Pipeline(steps=[
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(), categorical_features)
        ])),
    ('regressor', LinearRegression())
])

## Pipeline 3

In [20]:
#Using size, building type, and their interaction
pipeline_3 = Pipeline(steps=[
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), ['Gr Liv Area']),
            ('cat', OneHotEncoder(), categorical_features)
        ])),
    ('interaction', PolynomialFeatures(interaction_only=True, include_bias=False)),
    ('regressor', LinearRegression())
])

## Pipeline 4

In [21]:
#5-degree polynomial on size, 5-degree polynomial on number of rooms, and building type
pipeline_4 = Pipeline(steps=[
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('num_poly', PolynomialFeatures(degree=5), numeric_features),
            ('cat', OneHotEncoder(), categorical_features)
        ])),
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression())
])

In [26]:
# List of pipelines
pipelines = [pipeline_1, pipeline_2, pipeline_3, pipeline_4]


In [27]:
# Dictionary to store RMSE for each model
rmse_results = {}

# Fit each pipeline and compute RMSE
for i, pipeline in enumerate(pipelines, 1):
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_results[f'Model {i}'] = rmse
    print(f'RMSE for Model {i}: {rmse}')

# Identify the best-performing model based on RMSE
best_model = min(rmse_results, key=rmse_results.get)
print(f"\nThe best-performing model is {best_model} with RMSE of {rmse_results[best_model]:.4f}")

RMSE for Model 1: 61928.53719680032
RMSE for Model 2: 59589.20317423357
RMSE for Model 3: 58276.726954679834
RMSE for Model 4: 61791.588516218035

The best-performing model is Model 3 with RMSE of 58276.7270
RMSE for Model 1: 61928.53719680032
RMSE for Model 2: 59589.20317423357
RMSE for Model 3: 58276.726954679834
RMSE for Model 4: 61791.588516218035

The best-performing model is Model 3 with RMSE of 58276.7270


# Once again consider four modeling options for house price:

Using only the size and number of rooms.
Using size, number of rooms, and building type.
Using size and building type, and their interaction.
Using a 5-degree polynomial on size, a 5-degree polynomial on number of rooms, and also building type.
Use cross_val_score with the pipelines you made earlier to find the cross-validated root mean squared error for each model.

Which do you prefer? Does this agree with your conclusion from earlier?

In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error

# Assuming 'ames' is your dataset
X = ames[['Gr Liv Area', 'TotRms AbvGrd', 'Bldg Type']]  # Input features
y = ames['SalePrice']  # Target variable

# Preprocessing for the models

# Model 1: Using size and number of rooms
model_1 = Pipeline(steps=[
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), ['Gr Liv Area', 'TotRms AbvGrd'])  # Scaling numeric features
        ])),
    ('regressor', LinearRegression())
])

# Model 2: Using size, number of rooms, and building type
model_2 = Pipeline(steps=[
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), ['Gr Liv Area', 'TotRms AbvGrd']),  # Scaling numeric features
            ('cat', OneHotEncoder(drop='first'), ['Bldg Type'])  # One-hot encoding for categorical feature
        ])),
    ('regressor', LinearRegression())
])

# Model 3: Using size and building type, including interaction terms
model_3 = Pipeline(steps=[
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), ['Gr Liv Area', 'TotRms AbvGrd']),  # Scaling numeric features
            ('cat', OneHotEncoder(drop='first'), ['Bldg Type']),  # One-hot encoding for categorical feature
        ])),
    ('interaction', PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),  # Interaction terms
    ('regressor', LinearRegression())
])

# Model 4: Using 5-degree polynomials on size and number of rooms, and building type
model_4 = Pipeline(steps=[
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('num', PolynomialFeatures(degree=5, include_bias=False), ['Gr Liv Area', 'TotRms AbvGrd']),  # Polynomial features
            ('cat', OneHotEncoder(drop='first'), ['Bldg Type'])  # One-hot encoding for categorical feature
        ])),
    ('regressor', LinearRegression())
])

# Cross-validation for each model and compute RMSE
models = [model_1, model_2, model_3, model_4]
model_names = ['Model 1: Size and Rooms', 'Model 2: Size, Rooms, and Bldg Type', 'Model 3: Interaction Features', 'Model 4: 5-Degree Polynomials']

cv_rmse_scores = []

# Perform cross-validation for each model and compute RMSE
for model in models:
    cv_rmse = cross_val_score(model, X, y, cv=5, scoring='neg_root_mean_squared_error')  # Negate RMSE for output
    cv_rmse_scores.append(np.mean(cv_rmse))  # Take the mean of the RMSE scores

# Output results
for name, score in zip(model_names, cv_rmse_scores):
    print(f"{name}: Cross-validated RMSE = {-score:.2f}")


Model 1: Size and Rooms: Cross-validated RMSE = 55806.33
Model 2: Size, Rooms, and Bldg Type: Cross-validated RMSE = 54168.08
Model 3: Interaction Features: Cross-validated RMSE = 53363.04
Model 4: 5-Degree Polynomials: Cross-validated RMSE = 60117.00
Model 1: Size and Rooms: Cross-validated RMSE = 55806.33
Model 2: Size, Rooms, and Bldg Type: Cross-validated RMSE = 54168.08
Model 3: Interaction Features: Cross-validated RMSE = 53363.04
Model 4: 5-Degree Polynomials: Cross-validated RMSE = 60117.00


## Which do you prefer? Does this agree with your conclusion from earlier?
This agrees with my conclusion from earlier with Model 3 being the best model in both of these. 

# Consider one hundred modeling options for house price:

House size, trying degrees 1 through 10
Number of rooms, trying degrees 1 through 10
Building Type
Hint: The dictionary of possible values that you make to give to GridSearchCV will have two elements instead of one.

Q1: Which model performed the best?

Q2: What downsides do you see of trying all possible model options? How might you go about choosing a smaller number of tuning values to try?

In [43]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Assuming 'ames' is your dataset
X = ames[['Gr Liv Area', 'TotRms AbvGrd', 'Bldg Type']]  # Input features
y = ames['SalePrice']  # Target variable

# Define the grid of hyperparameters to try
param_grid = {
    'preprocessor__num__transformer__degree': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],  # Polynomial degrees for features
}

# Create a pipeline
model_pipeline = Pipeline(steps=[
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('num', Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='mean')),  # Handle missing values
                ('transformer', PolynomialFeatures())  # Polynomial features for numeric columns
            ]), ['Gr Liv Area', 'TotRms AbvGrd']),  # Apply polynomial features
            ('cat', OneHotEncoder(drop='first'), ['Bldg Type'])  # One-hot encoding for categorical feature
        ])),
    ('regressor', LinearRegression())  # Linear regression model
])

# GridSearchCV with cross-validation
grid_search = GridSearchCV(model_pipeline, param_grid, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)

# Fit the model and search for the best parameters
grid_search.fit(X, y)

# Get the best model
best_model = grid_search.best_estimator_

# Get the best parameters and the best RMSE score
best_params = grid_search.best_params_
best_rmse = -grid_search.best_score_

print(f"Best model parameters: {best_params}")
print(f"Best cross-validated RMSE: {best_rmse:.2f}")


Best model parameters: {'preprocessor__num__transformer__degree': 3}
Best cross-validated RMSE: 53805.94
Best model parameters: {'preprocessor__num__transformer__degree': 3}
Best cross-validated RMSE: 53805.94


## Which model performed the best?
Model 3 performed the best. 

## What downsides do you see of trying all possible model options? How might you go about choosing a smaller number of tuning values to try?
Some downsides are that it can result in a lot of models to evaluate which can be time consuming when dealing with large datasets or more complex models. I could go about choosing a smaller number of tuning values by testing a smaller range of hyperparameters. 