In [37]:
# Imports
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [38]:
# Load Dataset
train_df = pd.read_csv("train.csv")
X = train_df.drop("SalePrice", axis=1)
y = train_df["SalePrice"]

In [39]:
# Features that may or may not exist in every house
none_fill_cols = [
    "Alley","BsmtQual","BsmtCond","BsmtExposure","BsmtFinType1",
    "BsmtFinType2", "FireplaceQu","GarageType","GarageFinish",
    "GarageQual","GarageCond","PoolQC","Fence","MiscFeature"
]

# Replace missing values with "None"
for col in none_fill_cols:
    if col in X.columns:
        X[col] = X[col].fillna("None")

# Fill missing numeric values with median, as it is more resistant to outliers
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns
X[numeric_cols] = X[numeric_cols].fillna(X[numeric_cols].median())


In [40]:
# Define feature types and encoders
# There is a ranking order: poor, fair, typical, good, excellent
ordinal_mappings = {
    "ExterQual": ["Po", "Fa", "TA", "Gd", "Ex"],
    "ExterCond": ["Po", "Fa", "TA", "Gd", "Ex"],
    "HeatingQC": ["Po", "Fa", "TA", "Gd", "Ex"],
    "KitchenQual": ["Po", "Fa", "TA", "Gd", "Ex"],
    "FireplaceQu": ["Po", "Fa", "TA", "Gd", "Ex", "None"],
    "GarageQual": ["Po", "Fa", "TA", "Gd", "Ex", "None"],
    "GarageCond": ["Po", "Fa", "TA", "Gd", "Ex", "None"],
    "BsmtQual": ["Po", "Fa", "TA", "Gd", "Ex", "None"],
    "BsmtCond": ["Po", "Fa", "TA", "Gd", "Ex", "None"],
}

# Extract the column names that will use the ordinal encoding
ordinal_cols = [col for col in ordinal_mappings.keys() if col in X.columns]
ordinal_encoder = OrdinalEncoder(categories=[ordinal_mappings[col] for col in ordinal_cols])

# All other object columns are treated as nominal, these have no ranking and will be One Hot Encoded
nominal_cols = X.select_dtypes(include=["object"]).columns.difference(ordinal_cols)

In [41]:
# Build pipeline
preprocessor = ColumnTransformer(
    transformers = [
        # Scale numeric columns
        ("num", StandardScaler(), numeric_cols),

        # Encode ordinal columns
        ("ord", ordinal_encoder, ordinal_cols),

        # One hot encode nominal columns
        ("nom", OneHotEncoder(handle_unknown="ignore"), nominal_cols)
    ]
)

In [42]:
# Train test split
# Using 20% of data with a fixed seed
X_train, X_tst, y_train, y_tst = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Defining the full model with Linear Regression
linear_regression_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", LinearRegression())
    ]
)

In [44]:
# Training the model
linear_regression_pipeline.fit(X_train, y_train)

In [45]:
# Making predictions on the test set
y_pred = linear_regression_pipeline.predict(X_tst)

# Compute root mean squared error and R square
root_mse = np.sqrt(mean_squared_error(y_tst, y_pred))
r_squared = r2_score(y_tst, y_pred)

print("Root Mean Squared Error: ", root_mse)
print("R squared: ", r_squared)
print("Linear Regression Pipeline: ", linear_regression_pipeline)


Root Mean Squared Error:  75494.91325318534
R squared:  0.2569439977969199
Linear Regression Pipeline:  Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'Ha...
       'CentralAir', 'Condition1', 'Condition2', 'Electrical', 'Exterior1st',
       'Exterior2nd', 'Fence', 'Foundation', 'Functional', 'GarageFinish',
       'GarageType', 'Heating', 'HouseStyle', 'LandContour', 'LandSlope',
       'LotConfig', 'LotShape', 'MSZoning', 'MasVnrType', 'MiscFeature',
       'Neighborhood', 'PavedDrive', 'PoolQC', 'RoofMatl', 'RoofStyle',
       'SaleCondition', 'SaleType', 'Street', 'Utili