In [32]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

# Part 1

In [10]:
ames = pd.read_csv("/Users/OwenBarberie/Desktop/GSB_544/AmesHousing.csv")

In [11]:
ames.head

<bound method NDFrame.head of       Order        PID  MS SubClass MS Zoning  ...  Yr Sold  Sale Type Sale Condition SalePrice
0         1  526301100           20        RL  ...     2010        WD          Normal    215000
1         2  526350040           20        RH  ...     2010        WD          Normal    105000
2         3  526351010           20        RL  ...     2010        WD          Normal    172000
3         4  526353030           20        RL  ...     2010        WD          Normal    244000
4         5  527105010           60        RL  ...     2010        WD          Normal    189900
...     ...        ...          ...       ...  ...      ...        ...            ...       ...
2925   2926  923275080           80        RL  ...     2006        WD          Normal    142500
2926   2927  923276100           20        RL  ...     2006        WD          Normal    131000
2927   2928  923400125           85        RL  ...     2006        WD          Normal    132000
2928   292

In [25]:
X = ames[['Gr Liv Area', 'TotRms AbvGrd', 'Bldg Type']]
y = ames['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Gr Liv Area', 'TotRms AbvGrd']),
        ('cat', OneHotEncoder(drop='first'), ['Bldg Type'])
    ]
)

model1 = Pipeline([
    ('preprocessor', ColumnTransformer([('num', StandardScaler(), ['Gr Liv Area', 'TotRms AbvGrd'])])),
    ('regressor', LinearRegression())
])

model2 = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

model3 = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(interaction_only=True, include_bias=False)),
    ('regressor', LinearRegression())
])

model4 = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=5, include_bias=False)),
    ('regressor', LinearRegression())
])

models = [model1, model2, model3, model4]
rmse_scores = []

for i, model in enumerate(models, 1):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores.append((f'Model {i}', rmse))
    print(f'Model {i} RMSE: {rmse}')

best_model = min(rmse_scores, key=lambda x: x[1])
print(f"Best performing model: {best_model[0]} with RMSE: {best_model[1]}")


Model 1 RMSE: 61928.537196800295
Model 2 RMSE: 59589.203174233546
Model 3 RMSE: 58312.15540974156
Model 4 RMSE: 689799.9863300155
Best performing model: Model 3 with RMSE: 58312.15540974156
Model 1 RMSE: 61928.537196800295
Model 2 RMSE: 59589.203174233546
Model 3 RMSE: 58312.15540974156
Model 4 RMSE: 689799.9863300155
Best performing model: Model 3 with RMSE: 58312.15540974156


# Part 2

In [28]:
preprocessor.fit_transform(X_train)

array([[-0.24352173, -0.26170611,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 1.25495594,  0.37795929,  0.        ,  0.        ,  0.        ,  0.        ],
       [-0.53291028, -0.26170611,  0.        ,  1.        ,  0.        ,  0.        ],
       ...,
       [ 0.46607484,  0.37795929,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.4898602 , -0.26170611,  0.        ,  0.        ,  0.        ,  0.        ],
       [-1.43675395, -0.90137151,  0.        ,  0.        ,  0.        ,  0.        ]])

In [26]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Gr Liv Area', 'TotRms AbvGrd']),
        ('cat', OneHotEncoder(drop='first'), ['Bldg Type'])
    ]
)

model1 = Pipeline([
    ('preprocessor', ColumnTransformer([('num', StandardScaler(), ['Gr Liv Area', 'TotRms AbvGrd'])])),
    ('regressor', LinearRegression())
])

model2 = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

model3 = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(interaction_only=True, include_bias=False)),
    ('regressor', LinearRegression())
])

model4 = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=5, include_bias=False)),
    ('regressor', LinearRegression())
])

models = [model1, model2, model3, model4]
model_names = ["Model 1", "Model 2", "Model 3", "Model 4"]

rmse_scores = []
for model, name in zip(models, model_names):
    neg_mse_scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=5)
    rmse_scores.append((name, np.mean(np.sqrt(-neg_mse_scores))))
    print(f"{name} Cross-validated RMSE: {np.mean(np.sqrt(-neg_mse_scores))}")

best_model = min(rmse_scores, key=lambda x: x[1])
print(f"Best performing model: {best_model[0]} with RMSE: {best_model[1]}")


Model 1 Cross-validated RMSE: 55806.32634926364
Model 2 Cross-validated RMSE: 54168.081429193844
Model 3 Cross-validated RMSE: 53363.03910983061
Model 1 Cross-validated RMSE: 55806.32634926364
Model 2 Cross-validated RMSE: 54168.081429193844
Model 3 Cross-validated RMSE: 53363.03910983061


Model 4 Cross-validated RMSE: 339899.3132584556
Best performing model: Model 3 with RMSE: 53363.03910983061
Model 4 Cross-validated RMSE: 339899.3132584556
Best performing model: Model 3 with RMSE: 53363.03910983061


Although the RMSEs are different for the original test train split and the cross validation, the order from least to greatest RMSEs remains the same. That is, Model 3 (lowest), Model 2, Model 1, and Model 4(highest). 

# Part 3

In [38]:

X = ames[['Gr Liv Area', 'TotRms AbvGrd', 'Bldg Type']]
y = ames['SalePrice']

ct_poly = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output=False), ["Bldg Type"]),
        ("poly_size", PolynomialFeatures(), ["Gr Liv Area"]),
        ("poly_rooms", PolynomialFeatures(), ["TotRms AbvGrd"])
    ],
    remainder="drop"
)

lr_pipeline_poly = Pipeline(
    [
        ("preprocessing", ct_poly),
        ("linear_regression", LinearRegression())
    ]
).set_output(transform="pandas")

degrees = {
    'preprocessing__poly_size__degree': np.arange(1, 10),
    'preprocessing__poly_rooms__degree': np.arange(1, 10)
}

gscv = GridSearchCV(
    estimator=lr_pipeline_poly,
    param_grid=degrees,
    cv=5,
    scoring='r2'
)

gscv.fit(X, y)

print("Best parameters:", gscv.best_params_)
print("Best R^2 score:", gscv.best_score_)


Best parameters: {'preprocessing__poly_rooms__degree': np.int64(1), 'preprocessing__poly_size__degree': np.int64(3)}
Best R^2 score: 0.5576405916501644
Best parameters: {'preprocessing__poly_rooms__degree': np.int64(1), 'preprocessing__poly_size__degree': np.int64(3)}
Best R^2 score: 0.5576405916501644


  _data = np.array(data, dtype=dtype, copy=copy,
