In [None]:
# house prices dataset 
# https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data

import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

Loading data

In [4]:
df = pd.read_csv("./house_prices_ready_analysis_stage23.csv")
df = df.iloc[:, 1: ]
# df = pd.read_csv("./house-prices-advanced-regression-techniques/train.csv")
print(df.shape)

(1351, 17)


Data splitting

In [5]:
X = df.drop('SalePrice', axis=1)  # Replace 'SalePrice' with your target variable
y = df['SalePrice']

# Step 1: Split the data into trainin
# g (90%) and temporary set (10%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Split the temporary set into validation (50% of temp) and testing (50% of temp)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Check the sizes to ensure correct splitting
print(f"Training set: {X_train.shape[0]} samples")
print(f"Validation set: {X_val.shape[0]} samples")
print(f"Testing set: {X_test.shape[0]} samples")

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Training set: 1080 samples
Validation set: 135 samples
Testing set: 136 samples


## Regression algorithms

In [18]:
# Step 1: Initialize the Linear Regression model
model = LinearRegression()

# Step 2: Train the model using the training set
model.fit(X_train, y_train)

# Step 3: Make predictions on the validation set
y_pred = model.predict(X_test)

# Step 4: Evaluate the model using RMSE, MAE, and R²
rmse_linear = np.sqrt(mean_squared_error(y_test, y_pred))
mae_linear = mean_absolute_error(y_test, y_pred)
r2_linear = r2_score(y_test, y_pred)

# Print the evaluation metrics
print("linear regression model: ")
print("--------------------------- ")
print(f"Root Mean Squared Error (RMSE): {rmse_linear}")
print(f"Mean Absolute Error (MAE): {mae_linear}")
print(f"R² Score: {r2_linear}")


linear regression model: 
--------------------------- 
Root Mean Squared Error (RMSE): 26334.66931903928
Mean Absolute Error (MAE): 19158.459423223543
R² Score: 0.8302912687775024


If we took the average of RMSE and MAE we can say that on a prediction scale there is around 22,000 dollars variance or error range. 

A trial to see if there is overfitting in the training data. 

In [19]:
# Step 1: Initialize the Linear Regression model
model = LinearRegression()

# Step 2: Train the model using the training set
model.fit(X_train, y_train)

# Step 3: Make predictions on the validation set
y_pred = model.predict(X_train)

# Step 4: Evaluate the model using RMSE, MAE, and R²
rmse_linear = np.sqrt(mean_squared_error(y_train, y_pred))
mae_linear = mean_absolute_error(y_train, y_pred)
r2_linear = r2_score(y_train, y_pred)

# Print the evaluation metrics
print("linear regression model: ")
print("--------------------------- ")
print(f"Root Mean Squared Error (RMSE): {rmse_linear}")
print(f"Mean Absolute Error (MAE): {mae_linear}")
print(f"R² Score: {r2_linear}")

linear regression model: 
--------------------------- 
Root Mean Squared Error (RMSE): 23992.202848990168
Mean Absolute Error (MAE): 17150.462404685582
R² Score: 0.8604958212172376


## Ridge model 

In [11]:
# Step 1: Initialize Ridge regression
ridge = Ridge()

# Step 2: Define the hyperparameter grid
param_grid = {
    'alpha': [0.01, 0.1, 1, 10, 100],
    # 'fit_intercept': [True, False],
    # 'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
    # 'max_iter': [1000, 5000]
}

# Step 3: Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1)

# Step 4: Train the model using the training set and find the best hyperparameters
grid_search.fit(X_train, y_train)

# Step 5: Get the best hyperparameters
best_params = grid_search.best_params_
print(f"Best hyperparameters: {best_params}")

# Step 6: Train the Ridge model with the best hyperparameters
best_ridge = Ridge(**best_params)
best_ridge.fit(X_train, y_train)

# Step 7: Make predictions on the testing set
y_pred_ridge = best_ridge.predict(X_test)

# Step 8: Evaluate the model using RMSE, MAE, and R²
rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

# Print the evaluation metrics
print("Ridge Regression model with tuned hyperparameters: ")
print("--------------------------------------------------")
print(f"Root Mean Squared Error (RMSE): {rmse_ridge}")
print(f"Mean Absolute Error (MAE): {mae_ridge}")
print(f"R² Score: {r2_ridge}")


Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best hyperparameters: {'alpha': 10}
Ridge Regression model with tuned hyperparameters: 
--------------------------------------------------
Root Mean Squared Error (RMSE): 26323.04930744234
Mean Absolute Error (MAE): 19130.826094331842
R² Score: 0.8304410016130064


## lasso regression 

In [13]:
# Step 1: Initialize Lasso regression
lasso = Lasso()

# Step 2: Define the hyperparameter grid
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 1, 10, 100],  # Alpha values for L1 regularization
    # 'fit_intercept': [True, False],            # Whether to fit an intercept
    # 'max_iter': [1000, 5000],                  # Number of iterations for convergence
    # 'tol': [1e-4, 1e-3, 1e-2]                 # Tolerance for stopping criteria
}

# Step 3: Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(lasso, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1)

# Step 4: Train the model using the training set and find the best hyperparameters
grid_search.fit(X_train, y_train)

# Step 5: Get the best hyperparameters
best_params = grid_search.best_params_
print(f"Best hyperparameters: {best_params}")

# Step 6: Train the Lasso model with the best hyperparameters
best_lasso = Lasso(**best_params)
best_lasso.fit(X_train, y_train)

# Step 7: Make predictions on the validation set
y_pred_lasso = best_lasso.predict(X_test)

# Step 8: Evaluate the model using RMSE, MAE, and R²
rmse_lasso = np.sqrt(mean_squared_error(y_test, y_pred_lasso))
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)

# Print the evaluation metrics
print("Lasso Regression model with tuned hyperparameters: ")
print("--------------------------------------------------")
print(f"Root Mean Squared Error (RMSE): {rmse_lasso}")
print(f"Mean Absolute Error (MAE): {mae_lasso}")
print(f"R² Score: {r2_lasso}")


Fitting 5 folds for each of 6 candidates, totalling 30 fits


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

Best hyperparameters: {'alpha': 100}
Lasso Regression model with tuned hyperparameters: 
--------------------------------------------------
Root Mean Squared Error (RMSE): 26336.776486077884
Mean Absolute Error (MAE): 19119.5048081803
R² Score: 0.8302641092223836


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


## Decision tree 

In [58]:
# Import necessary libraries for Decision Tree Regression and hyperparameter tuning
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Step 1: Initialize Decision Tree Regressor
tree = DecisionTreeRegressor(random_state=42)

# Step 2: Define the hyperparameter grid
param_grid = {
    'max_depth': [5, 10, 15, 20, None],                # Depth of the tree
    'min_samples_split': [2, 10, 20],                  # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 5, 10],                    # Minimum number of samples required to be at a leaf node
    # 'max_features': [None, 'auto', 'sqrt', 'log2'],     # Number of features to consider at each split
    # 'criterion': ['mse', 'friedman_mse']                # Quality measure of the split
}

# Step 3: Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(tree, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1)

# Step 4: Train the model using the training set and find the best hyperparameters
grid_search.fit(X_train, y_train)

# Step 5: Get the best hyperparameters
best_params = grid_search.best_params_
print(f"Best hyperparameters: {best_params}")

# Step 6: Train the Decision Tree model with the best hyperparameters
best_tree = DecisionTreeRegressor(**best_params, random_state=42)
best_tree.fit(X_train, y_train)

# Step 7: Make predictions on the validation set
y_pred_tree = best_tree.predict(X_train)

# Step 8: Evaluate the model using RMSE, MAE, and R²
rmse_tree = np.sqrt(mean_squared_error(y_train, y_pred_tree))
mae_tree = mean_absolute_error(y_train, y_pred_tree)
r2_tree = r2_score(y_train, y_pred_tree)

# Print the evaluation metrics
print("Decision Tree Regression model with tuned hyperparameters: ")
print("---------------------------------------------------------")
print(f"Root Mean Squared Error (RMSE): {rmse_tree}")
print(f"Mean Absolute Error (MAE): {mae_tree}")
print(f"R² Score: {r2_tree}")


Fitting 5 folds for each of 45 candidates, totalling 225 fits
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 10, 'min_samples_split': 2}
Decision Tree Regression model with tuned hyperparameters: 
---------------------------------------------------------
Root Mean Squared Error (RMSE): 28225.508669425286
Mean Absolute Error (MAE): 16898.914698222496
R² Score: 0.875628380359854


In [59]:
# Replace these with the actual numeric (float) values from your experiments
metrics_data = {
    'Method': ['Linear Regression', 'Ridge Regression', 'Lasso Regression', 'Decision Tree'],
    'RMSE': [rmse_linear, rmse_ridge, rmse_lasso, rmse_tree],  # Ensure these are float values
    'MAE': [mae_linear, mae_ridge, mae_lasso, mae_tree],        # Ensure these are float values
    'R² Score': [r2_linear, r2_ridge, r2_lasso, r2_tree]        # Ensure these are float values
}

# Create a DataFrame
metrics_df = pd.DataFrame(metrics_data)

# Print the table
print(metrics_df)

# If you want to format the table with two decimal places, use this:
metrics_df = metrics_df.style.format({"RMSE": "{:.2f}", "MAE": "{:.2f}", "R² Score": "{:.2f}"})

# Display the styled table (only works in Jupyter or a notebook interface)
metrics_df.set_caption("Regression Models Performance Comparison")


              Method          RMSE           MAE  R² Score
0  Linear Regression  28410.892119  18544.830309  0.880137
1   Ridge Regression  31340.737451  18934.365768  0.854141
2   Lasso Regression  25052.598313  15519.910027  0.906799
3      Decision Tree  28225.508669  16898.914698  0.875628


Unnamed: 0,Method,RMSE,MAE,R² Score
0,Linear Regression,28410.89,18544.83,0.88
1,Ridge Regression,31340.74,18934.37,0.85
2,Lasso Regression,25052.6,15519.91,0.91
3,Decision Tree,28225.51,16898.91,0.88


As you can notice linear regression, ridge, and lasso regressions are almost the same results. The ridge is much closer to the linear regression which makes sense because the alpha value, the hyperparameter, for ridge is only 10 which means there is not a big bias to the slope change than the linear regression. While the alpha value for the lasso regression is much bigger, value of 100, which means more slope bias than the linear regression and more difference in the prediction. 

Linear regression is the best that means the training and testing datasets are extremely close to each other, almost the same. 



What about training on the whole dataset. 

In [22]:
# data = pd.read_csv("./house_prices_ready_analysis_stage23.csv")
data = pd.read_csv("./house-prices-advanced-regression-techniques/train.csv")

print(data.shape)

(1460, 81)


preprocessing 

In [35]:
# dropping IID columns: 
pre = data.copy()
pre = pre.drop(['Id'], axis=1)

# dropping null values: 
print(pre.isnull().sum())

# test = prepro_data.copy()
null_counts = pre.isnull().mean()*100
print(pre.shape, "\n\n **************** ")

# drop those features having null values more than 500 
columns_with_50nulls = null_counts[null_counts >= 45]
pre = pre.drop(columns = columns_with_50nulls.index)
print("Those are the columns with null values almost greater than 50% to drop the whole feature column from the entire dataset: \n", columns_with_50nulls)
print(pre.shape , "\n\n **************** ")


# Identify columns with fewer than 5 but more than 0 null values
columns_with_nulls = null_counts[(null_counts < 6) & (null_counts > 0)].index
# Either drop the rows 
# # test = test.dropna(subset = columns_with_nulls)
# Or fill them with the most repetitive value in the column which is more preferable to not decrease the dataset. 
for column in columns_with_nulls:
    pre[column].fillna(pre[column].mode()[0], inplace=True)
# columns_with_nulls = columns_with_nulls[columns_with_nulls > 0]
print("Those are the columns with null values almost less than 5% \nI replaced with the mode imputation (most repetitive value in the column): \n\n", columns_with_nulls)
print(pre.shape, " \n\n ************ ")

# 
null_counts = pre.isnull().sum()
null_counts = null_counts[null_counts > 0]
print(pre.shape, null_counts)
        


MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
Street             0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 80, dtype: int64
(1460, 80) 

 **************** 
Those are the columns with null values almost greater than 50% to drop the whole feature column from the entire dataset: 
 Alley          93.767123
MasVnrType     59.726027
FireplaceQu    47.260274
PoolQC         99.520548
Fence          80.753425
MiscFeature    96.301370
dtype: float64
(1460, 74) 

 **************** 
Those are the columns with null values almost less than 5% 
I replaced with the mode imputation (most repetitive value in the column): 

 Index(['MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
       'BsmtFinType2', 'Electrical', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageQual', 'GarageCond'],
      dtype='object')
(1460, 74)  

 ************ 
(1460, 7

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  pre[column].fillna(pre[column].mode()[0], inplace=True)


In [36]:
print(pre.isnull().sum().sum())
pre = pre.dropna()
print(pre.isnull().sum().sum())

259
0


encdoing categorical features: 

In [49]:
df = pd.get_dummies(pre)

# Convert all boolean values (True/False) to integers (1/0)
df = df.astype({col: 'int' for col in df.select_dtypes(include=['bool']).columns})

# dropping non-numerical features
# df = df.select_dtypes(include=[np.number])

print(df.shape)

(1201, 260)


Dropping all non-numerical features: 

Data Splitting 

In [50]:
X = df.drop('SalePrice', axis=1)  # Replace 'SalePrice' with your target variable
y = df['SalePrice']

# Step 1: Split the data into trainin
# g (90%) and temporary set (10%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Split the temporary set into validation (50% of temp) and testing (50% of temp)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Check the sizes to ensure correct splitting
print(f"Training set: {X_train.shape[0]} samples")
print(f"Validation set: {X_val.shape[0]} samples")
print(f"Testing set: {X_test.shape[0]} samples")

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Training set: 960 samples
Validation set: 120 samples
Testing set: 121 samples


In [51]:
# linear regression on hte whole dataset columns

# Step 1: Initialize the Linear Regression model
model = LinearRegression()

# Step 2: Train the model using the training set
model.fit(X_train, y_train)

# Step 3: Make predictions on the validation set
y_pred = model.predict(X_test)

# Step 4: Evaluate the model using RMSE, MAE, and R²
rmse_linear = np.sqrt(mean_squared_error(y_test, y_pred))
mae_linear = mean_absolute_error(y_test, y_pred)
r2_linear = r2_score(y_test, y_pred)

# Print the evaluation metrics
print("linear regression model: ")
print("--------------------------- ")
print(f"Root Mean Squared Error (RMSE): {rmse_linear}")
print(f"Mean Absolute Error (MAE): {mae_linear}")
print(f"R² Score: {r2_linear}")


linear regression model: 
--------------------------- 
Root Mean Squared Error (RMSE): 28410.892118550353
Mean Absolute Error (MAE): 18544.8303086884
R² Score: 0.8801372479249101


In [52]:
# Step 1: Initialize Ridge regression
ridge = Ridge()

# Step 2: Define the hyperparameter grid
param_grid = {
    'alpha': [0.01, 0.1, 1, 10, 100],
    # 'fit_intercept': [True, False],
    # 'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
    # 'max_iter': [1000, 5000]
}

# Step 3: Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1)

# Step 4: Train the model using the training set and find the best hyperparameters
grid_search.fit(X_train, y_train)

# Step 5: Get the best hyperparameters
best_params = grid_search.best_params_
print(f"Best hyperparameters: {best_params}")

# Step 6: Train the Ridge model with the best hyperparameters
best_ridge = Ridge(**best_params)
best_ridge.fit(X_train, y_train)

# Step 7: Make predictions on the testing set
y_pred_ridge = best_ridge.predict(X_test)

# Step 8: Evaluate the model using RMSE, MAE, and R²
rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

# Print the evaluation metrics
print("Ridge Regression model with tuned hyperparameters: ")
print("--------------------------------------------------")
print(f"Root Mean Squared Error (RMSE): {rmse_ridge}")
print(f"Mean Absolute Error (MAE): {mae_ridge}")
print(f"R² Score: {r2_ridge}")


Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best hyperparameters: {'alpha': 10}
Ridge Regression model with tuned hyperparameters: 
--------------------------------------------------
Root Mean Squared Error (RMSE): 31340.737451250054
Mean Absolute Error (MAE): 18934.365767706655
R² Score: 0.8541411028841343


In [55]:
# lasso regression 

# Step 1: Initialize Lasso regression
lasso = Lasso()

# Step 2: Define the hyperparameter grid
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 1, 10, 100],  # Alpha values for L1 regularization
    # 'fit_intercept': [True, False],            # Whether to fit an intercept
    # 'max_iter': [1000, 5000],                  # Number of iterations for convergence
    # 'tol': [1e-4, 1e-3, 1e-2]                 # Tolerance for stopping criteria
}

# Step 3: Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(lasso, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1)

# Step 4: Train the model using the training set and find the best hyperparameters
grid_search.fit(X_train, y_train)

# Step 5: Get the best hyperparameters
best_params = grid_search.best_params_
print(f"Best hyperparameters: {best_params}")

# Step 6: Train the Lasso model with the best hyperparameters
best_lasso = Lasso(**best_params)
best_lasso.fit(X_train, y_train)

# Step 7: Make predictions on the validation set
y_pred_lasso = best_lasso.predict(X_test)

# Step 8: Evaluate the model using RMSE, MAE, and R²
rmse_lasso = np.sqrt(mean_squared_error(y_test, y_pred_lasso))
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)

# Print the evaluation metrics
print("Lasso Regression model with tuned hyperparameters: ")
print("--------------------------------------------------")
print(f"Root Mean Squared Error (RMSE): {rmse_lasso}")
print(f"Mean Absolute Error (MAE): {mae_lasso}")
print(f"R² Score: {r2_lasso}")


Fitting 5 folds for each of 6 candidates, totalling 30 fits


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Best hyperparameters: {'alpha': 100}
Lasso Regression model with tuned hyperparameters: 
--------------------------------------------------
Root Mean Squared Error (RMSE): 25052.598313042316
Mean Absolute Error (MAE): 15519.910027131218
R² Score: 0.9067991113910054


In [56]:
# Import necessary libraries for Decision Tree Regression and hyperparameter tuning
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Step 1: Initialize Decision Tree Regressor
tree = DecisionTreeRegressor(random_state=42)

# Step 2: Define the hyperparameter grid
param_grid = {
    'max_depth': [5, 10, 15, 20, None],                # Depth of the tree
    'min_samples_split': [2, 10, 20],                  # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 5, 10],                    # Minimum number of samples required to be at a leaf node
    # 'max_features': [None, 'auto', 'sqrt', 'log2'],     # Number of features to consider at each split
    # 'criterion': ['mse', 'friedman_mse']                # Quality measure of the split
}

# Step 3: Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(tree, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1)

# Step 4: Train the model using the training set and find the best hyperparameters
grid_search.fit(X_train, y_train)

# Step 5: Get the best hyperparameters
best_params = grid_search.best_params_
print(f"Best hyperparameters: {best_params}")

# Step 6: Train the Decision Tree model with the best hyperparameters
best_tree = DecisionTreeRegressor(**best_params, random_state=42)
best_tree.fit(X_train, y_train)

# Step 7: Make predictions on the validation set
y_pred_tree = best_tree.predict(X_test)

# Step 8: Evaluate the model using RMSE, MAE, and R²
rmse_tree = np.sqrt(mean_squared_error(y_test, y_pred_tree))
mae_tree = mean_absolute_error(y_test, y_pred_tree)
r2_tree = r2_score(y_test, y_pred_tree)

# Print the evaluation metrics
print("Decision Tree Regression model with tuned hyperparameters: ")
print("---------------------------------------------------------")
print(f"Root Mean Squared Error (RMSE): {rmse_tree}")
print(f"Mean Absolute Error (MAE): {mae_tree}")
print(f"R² Score: {r2_tree}")


Fitting 5 folds for each of 45 candidates, totalling 225 fits
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 10, 'min_samples_split': 2}
Decision Tree Regression model with tuned hyperparameters: 
---------------------------------------------------------
Root Mean Squared Error (RMSE): 42332.54562087181
Mean Absolute Error (MAE): 27722.23726465462
R² Score: 0.7338887266689347


In [57]:
# Replace these with the actual numeric (float) values from your experiments
metrics_data = {
    'Method': ['Linear Regression', 'Ridge Regression', 'Lasso Regression', 'Decision Tree'],
    'RMSE': [rmse_linear, rmse_ridge, rmse_lasso, rmse_tree],  # Ensure these are float values
    'MAE': [mae_linear, mae_ridge, mae_lasso, mae_tree],        # Ensure these are float values
    'R² Score': [r2_linear, r2_ridge, r2_lasso, r2_tree]        # Ensure these are float values
}

# Create a DataFrame
metrics_df = pd.DataFrame(metrics_data)

# Print the table
print(metrics_df)

# If you want to format the table with two decimal places, use this:
metrics_df = metrics_df.style.format({"RMSE": "{:.2f}", "MAE": "{:.2f}", "R² Score": "{:.2f}"})

# Display the styled table (only works in Jupyter or a notebook interface)
metrics_df.set_caption("Regression Models Performance Comparison")


              Method          RMSE           MAE  R² Score
0  Linear Regression  28410.892119  18544.830309  0.880137
1   Ridge Regression  31340.737451  18934.365768  0.854141
2   Lasso Regression  25052.598313  15519.910027  0.906799
3      Decision Tree  42332.545621  27722.237265  0.733889


Unnamed: 0,Method,RMSE,MAE,R² Score
0,Linear Regression,28410.89,18544.83,0.88
1,Ridge Regression,31340.74,18934.37,0.85
2,Lasso Regression,25052.6,15519.91,0.91
3,Decision Tree,42332.55,27722.24,0.73
