---
## 1.Dataset

In [1]:
import pandas as pd
#download csv
url = "https://drive.google.com/file/d/1Je0e4sj5uEh2f86t8SeQweFcwvO6XLjX/view?usp=drive_link"
path = "https://drive.google.com/uc?export=download&id="+url.split("/")[-2]
data = pd.read_csv(path, index_col='Id')
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1460 entries, 1 to 1460
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuilt    

# 2. Data Preprocessing Setup

In [2]:
import numpy as np
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import SGDRegressor, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score, mean_squared_error

# Define target and features
y = data['SalePrice']
X = data.drop(columns=['SalePrice'])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Define preprocessing
nums = X.select_dtypes(include='number').columns
cats = X.select_dtypes(exclude='number').columns
drop = ['FireplaceQu', 'Alley', 'MasVnrType', 'PoolQC', 'Fence', 'MiscFeature']

preprocessor = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore', sparse_output=True), cats),
    (SimpleImputer(strategy='mean'), nums),
    ('drop', drop)
)

# Prepare a DataFrame for errors
errors_df = pd.DataFrame(columns=['MAE', 'MAPE', 'RMSE', 'R2'])

# 3. Models

1. Dummy model
2. Decision Tree Regressor with Grid Search
3. SGD Regressor
4. Linear Regression


In [3]:
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_absolute_percentage_error, root_mean_squared_error

# Dummy Regressor - used as a baseline model to compare against more complex models.
dummy = DummyRegressor(strategy='mean')
dummy.fit(X_train, y_train)
dummy_prediction = dummy.predict(X_test)

errors_df.loc['Dummy'] = [
    mean_absolute_error(y_test, dummy_prediction),
    mean_absolute_percentage_error(y_test, dummy_prediction),
    root_mean_squared_error(y_test, dummy_prediction),
    r2_score(y_test, dummy_prediction)
]

# Decision Tree Regressor with Grid Search
dtr = make_pipeline(preprocessor, DecisionTreeRegressor(random_state=123))

param_grid = {
    'decisiontreeregressor__max_depth': [5, 10, 15],
    'decisiontreeregressor__max_leaf_nodes': range(20, 161, 20),
    'decisiontreeregressor__min_samples_split': range(5, 51, 5)
}

dtr_search = GridSearchCV(dtr, param_grid, scoring='r2', n_jobs=-1, verbose=2)
dtr_search.fit(X_train, y_train)
dtr_search_pred = dtr_search.predict(X_test)

errors_df.loc['Decision Tree (Tuned)'] = [
    mean_absolute_error(y_test, dtr_search_pred),
    mean_absolute_percentage_error(y_test, dtr_search_pred),
    root_mean_squared_error(y_test, dtr_search_pred),
    r2_score(y_test, dtr_search_pred)
]

Fitting 5 folds for each of 240 candidates, totalling 1200 fits


  _data = np.array(data, dtype=dtype, copy=copy,


In [4]:
# SGD Regressor
sgd_pipeline = make_pipeline(preprocessor, StandardScaler(with_mean=False), SGDRegressor(random_state=123, max_iter=1000))
sgd_pipeline.fit(X_train, y_train)
sgd_predictions = sgd_pipeline.predict(X_test)

errors_df.loc['SGD Regressor'] = [
    mean_absolute_error(y_test, sgd_predictions),
    mean_absolute_percentage_error(y_test, sgd_predictions),
    root_mean_squared_error(y_test, sgd_predictions),
    r2_score(y_test, sgd_predictions)
]

# Linear Regression
lr_pipeline = make_pipeline(preprocessor, StandardScaler(with_mean=False), LinearRegression())
lr_pipeline.fit(X_train, y_train)
lr_predictions = lr_pipeline.predict(X_test)

errors_df.loc['Linear Regression'] = [
    mean_absolute_error(y_test, lr_predictions),
    mean_absolute_percentage_error(y_test, lr_predictions),
    root_mean_squared_error(y_test, lr_predictions),
    r2_score(y_test, lr_predictions)
]

In [5]:
# Display errors
print(errors_df)

# Display best parameters for Decision Tree
print("\nBest Parameters for Decision Tree:", dtr_search.best_params_)

                                MAE          MAPE          RMSE            R2
Dummy                  5.621416e+04  3.440144e-01  7.861715e+04 -1.685689e-04
Decision Tree (Tuned)  2.503386e+04  1.407838e-01  3.916657e+04  7.517612e-01
SGD Regressor          3.857443e+14  2.471323e+09  3.857812e+14 -2.408356e+19
Linear Regression      1.645584e+04  9.598396e-02  2.505182e+04  8.984413e-01

Best Parameters for Decision Tree: {'decisiontreeregressor__max_depth': 10, 'decisiontreeregressor__max_leaf_nodes': 40, 'decisiontreeregressor__min_samples_split': 30}
