In [4]:
# regression-comparison.ipynb

# Required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import (LinearRegression, Lasso, Ridge, ElasticNet, BayesianRidge)
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor

In [11]:
# Load California housing dataset (CSV file)
df = pd.read_csv(r"C:\Users\mmedl\Projects\ml-regression\data\AmesHousing.csv")
# df = df.dropna()
df = df.fillna(0) 
X = df.drop(["SalePrice"], axis=1)
y = df["SalePrice"]
print(X.head(10))

   Order        PID  MS SubClass MS Zoning  Lot Frontage  Lot Area Street  \
0      1  526301100           20        RL         141.0     31770   Pave   
1      2  526350040           20        RH          80.0     11622   Pave   
2      3  526351010           20        RL          81.0     14267   Pave   
3      4  526353030           20        RL          93.0     11160   Pave   
4      5  527105010           60        RL          74.0     13830   Pave   
5      6  527105030           60        RL          78.0      9978   Pave   
6      7  527127150          120        RL          41.0      4920   Pave   
7      8  527145080          120        RL          43.0      5005   Pave   
8      9  527146030          120        RL          39.0      5389   Pave   
9     10  527162130           60        RL          60.0      7500   Pave   

  Alley Lot Shape Land Contour  ... Screen Porch Pool Area Pool QC  Fence  \
0     0       IR1          Lvl  ...            0         0       0      0  

In [12]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Models to compare
models = {
    "Linear Regression": LinearRegression(),
    "Lasso (L1) Regression": Lasso(alpha=0.1),
    "Ridge (L2) Regression": Ridge(alpha=1.0),
    "Elastic Net": ElasticNet(alpha=0.1, l1_ratio=0.5),
    "Bayesian Ridge": BayesianRidge(),
    "SVR (Linear Kernel)": SVR(kernel='linear', C=1.0),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost": xgb.XGBRegressor(n_estimators=100, random_state=42, verbosity=0),
    "LightGBM": lgb.LGBMRegressor(n_estimators=100, random_state=42),
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42),
    "KNN Regression": KNeighborsRegressor(n_neighbors=5)
}

In [14]:
# Evaluate models
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results.append({"Model": name, "MSE": mse, "R2": r2})

ValueError: could not convert string to float: 'RL'

In [None]:
# Convert results to DataFrame
results_df = pd.DataFrame(results).sort_values(by="R2", ascending=False)

# Display results
print(results_df)

In [3]:
# Visualization
plt.figure(figsize=(12, 6))
sns.barplot(data=results_df, x="R2", y="Model", hue="Model", palette="viridis", legend=False)
plt.title("Regression Model Comparison (R^2 Score)")
plt.xlabel("R^2 Score")
plt.ylabel("Model")
plt.tight_layout()
plt.show()


NameError: name 'results_df' is not defined

<Figure size 1200x600 with 0 Axes>