In [37]:
# Imports
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score


In [38]:
# Load dataset and create classes, 0 = low and 2 = high
train_df = pd.read_csv("train.csv")
X = train_df.drop("SalePrice", axis=1)
y = train_df["SalePrice"]

In [39]:
# Features that may or may not exist in every house
none_fill_cols = [
    "Alley","BsmtQual","BsmtCond","BsmtExposure","BsmtFinType1",
    "BsmtFinType2", "FireplaceQu","GarageType","GarageFinish",
    "GarageQual","GarageCond","PoolQC","Fence","MiscFeature"
]

# Replace missing values with "None"
for col in none_fill_cols:
    if col in X.columns:
        X[col] = X[col].fillna("None")

# Fill missing numeric values with median, as it is more resistant to outliers
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns
X[numeric_cols] = X[numeric_cols].fillna(X[numeric_cols].median())

In [40]:
# Define feature types and encoders
ordinal_mappings = {
    "ExterQual": ["Po", "Fa", "TA", "Gd", "Ex"],
    "ExterCond": ["Po", "Fa", "TA", "Gd", "Ex"],
    "HeatingQC": ["Po", "Fa", "TA", "Gd", "Ex"],
    "KitchenQual": ["Po", "Fa", "TA", "Gd", "Ex"],
    "FireplaceQu": ["Po", "Fa", "TA", "Gd", "Ex", "None"],
    "GarageQual": ["Po", "Fa", "TA", "Gd", "Ex", "None"],
    "GarageCond": ["Po", "Fa", "TA", "Gd", "Ex", "None"],
    "BsmtQual": ["Po", "Fa", "TA", "Gd", "Ex", "None"],
    "BsmtCond": ["Po", "Fa", "TA", "Gd", "Ex", "None"],
}

# Extract the column names that will use the ordinal encoding
ordinal_cols = [col for col in ordinal_mappings.keys() if col in X.columns]
ordinal_encoder = OrdinalEncoder(categories=[ordinal_mappings[col] for col in ordinal_cols])

# All other object columns are treated as nominal
nominal_cols = X.select_dtypes(include=["object"]).columns.difference(ordinal_cols)

In [41]:
# Build pipeline
preprocessor = ColumnTransformer(
    transformers = [
        # Scale numeric columns
        ("num", StandardScaler(), numeric_cols),

        # Encode ordinal columns
        ("ord", ordinal_encoder, ordinal_cols),

        # One hot encode nominal columns
        ("nom", OneHotEncoder(handle_unknown="ignore"), nominal_cols)
    ]
)

In [42]:
# Train test split
# Using 20% of data with a fixed seed
X_train, X_tst, y_train, y_tst = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Defining the full model with Polynomial Ridge Regression
ridge_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        # Adds nonlinear interaction between features
        ("poly", PolynomialFeatures(include_bias=False)),
        ("model", Ridge())
    ]
)

In [44]:
# Define hyperparameters
param_grid = {
    "poly__degree": [1, 2],
    "model__alpha": [0.1, 1.0, 10.0, 50.0]
}

In [45]:
# Grid search tests, train all models
grid_search = GridSearchCV(ridge_pipeline, param_grid, cv=5, scoring="neg_root_mean_squared_error", n_jobs=-1)
grid_search.fit(X_train, y_train)

In [46]:
# Extract best performing model from grid search
best_model = grid_search.best_estimator_
print("Best hyperparameters: ", grid_search.best_params_)

Best hyperparameters:  {'model__alpha': 50.0, 'poly__degree': 1}


In [47]:
# Evaluate best model
y_pred = best_model.predict(X_tst)

# Compute root MSE and R squared
root_mse = np.sqrt(mean_squared_error(y_tst, y_pred))
r_squared = r2_score(y_tst, y_pred)

print("Polynomial Ridge Regression Root MSE: ", root_mse)
print("Polynomial Ridge Regression R squared: ", r_squared)

Polynomial Ridge Regression Root MSE:  32025.921045002924
Polynomial Ridge Regression R squared:  0.8662821383242196
