In [25]:
# Imports
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [26]:
# Load dataset and create classes, 0 = low and 2 = high
train_df = pd.read_csv("train.csv")
train_df["PriceClass"] = pd.qcut(
    train_df["SalePrice"],
    q = 3,
    labels = [0, 1, 2]
)

# Features will exclude SalePrice and PriceClass
X = train_df.drop(["SalePrice", "PriceClass"], axis=1)
y = train_df["PriceClass"]

In [27]:
# Features that may or may not exist in every house
none_fill_cols = [
    "Alley","BsmtQual","BsmtCond","BsmtExposure","BsmtFinType1",
    "BsmtFinType2", "FireplaceQu","GarageType","GarageFinish",
    "GarageQual","GarageCond","PoolQC","Fence","MiscFeature"
]

# Replace missing values with "None"
for col in none_fill_cols:
    if col in X.columns:
        X[col] = X[col].fillna("None")

# Fill missing numeric values with median, as it is more resistant to outliers
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns
X[numeric_cols] = X[numeric_cols].fillna(X[numeric_cols].median())

In [28]:
# Define feature types and encoders
ordinal_mappings = {
    "ExterQual": ["Po", "Fa", "TA", "Gd", "Ex"],
    "ExterCond": ["Po", "Fa", "TA", "Gd", "Ex"],
    "HeatingQC": ["Po", "Fa", "TA", "Gd", "Ex"],
    "KitchenQual": ["Po", "Fa", "TA", "Gd", "Ex"],
    "FireplaceQu": ["Po", "Fa", "TA", "Gd", "Ex", "None"],
    "GarageQual": ["Po", "Fa", "TA", "Gd", "Ex", "None"],
    "GarageCond": ["Po", "Fa", "TA", "Gd", "Ex", "None"],
    "BsmtQual": ["Po", "Fa", "TA", "Gd", "Ex", "None"],
    "BsmtCond": ["Po", "Fa", "TA", "Gd", "Ex", "None"],
}

# Extract the column names that will use the ordinal encoding
ordinal_cols = [col for col in ordinal_mappings.keys() if col in X.columns]
ordinal_encoder = OrdinalEncoder(categories=[ordinal_mappings[col] for col in ordinal_cols])

# All other object columns are treated as nominal
nominal_cols = X.select_dtypes(include=["object"]).columns.difference(ordinal_cols)

In [29]:
# Build pipeline
preprocessor = ColumnTransformer(
    transformers = [
        # Scale numeric columns
        ("num", StandardScaler(), numeric_cols),

        # Encode ordinal columns
        ("ord", ordinal_encoder, ordinal_cols),

        # One hot encode nominal columns
        ("nom", OneHotEncoder(handle_unknown="ignore"), nominal_cols)
    ]
)

In [30]:
# Train test split
# Using 20% of data with a fixed seed
X_train, X_tst, y_train, y_tst = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
# Defining the full model with Logistic Regression
logistic_regression_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", LogisticRegression(max_iter=1000, multi_class="multinomial"))
    ]
)

In [32]:
# Training the model
logistic_regression_pipeline.fit(X_train, y_train)



In [None]:
# Making predictions on the test set
y_pred = logistic_regression_pipeline.predict(X_tst)

# Compute root mean squared error and R squared
accuracy = accuracy_score(y_tst, y_pred)

print("Logistic Regression Accuracy: ", accuracy)
print("Logistic Regression:")
print(classification_report(y_tst, y_pred))

Logistic Regression Accuracy:  0.8253424657534246
Logistic Regression:
              precision    recall  f1-score   support

           0       0.85      0.89      0.87       110
           1       0.73      0.70      0.72        88
           2       0.88      0.86      0.87        94

    accuracy                           0.83       292
   macro avg       0.82      0.82      0.82       292
weighted avg       0.82      0.83      0.82       292

