<a href="https://colab.research.google.com/github/mohamadnaji/Machine-Learning-Real-Estate-Price-Prediction-Lebanon/blob/features%2Ftrying-to-enhance/house_prices_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# House Prices Prediction using scikit-learn Regressors

## Import the libraries

In [109]:
# Make Google Drive visible under /content/drive (the dataset is read from there).
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [110]:
import tensorflow as tf
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Comment this out if the data visualisations don't work on your side
%matplotlib inline

## Load the dataset


In [111]:
# Other snapshots of the export that were used at some point (kept for reference):
# train_file_path = "/content/drive/Othercomputers/MyMonty Laptop/university M2/M2 final project/Dataset/step 3 - with spatial analysis/SELL_REAL_ESTATE_2024-09-15-2.xlsx"
# train_file_path = "/content/drive/MyDrive/university M2/M2 final project/Dataset/step 3 - with spatial analysis/SELL_REAL_ESTATE_2024-10-29 - Copy.xlsx"

# Snapshot currently in use.
train_file_path = "/content/drive/Othercomputers/MyMonty Laptop/university M2/M2 final project/Dataset/step 3 - with spatial analysis/SELL_REAL_ESTATE_2024-11-17.xlsx"

# Read the workbook into a DataFrame and report its (rows, columns) size.
data = pd.read_excel(train_file_path)
print(f"Full train dataset shape is {data.shape}")

Full train dataset shape is (10003, 50)


## Preprocess the data

In [112]:
# Function to extract numbers
def extract_number(value):
    """Return the digits found in ``value`` joined into one integer.

    Parameters
    ----------
    value : str, number or None
        Raw cell content. Strings have every non-digit character removed
        (currency text, thousands separators, spaces, ...) before conversion.
        Real numbers are truncated to ``int``. ``None``, NaN and infinities are
        treated as missing.

    Returns
    -------
    int or None
        The parsed integer, or ``None`` when the value is missing or contains
        no digit at all.
    """
    import math
    import numbers

    if value is None:
        return None

    # Spreadsheet cells may already be numeric (or NaN for empty cells); passing
    # those to re.sub would raise TypeError, so they are handled up front.
    if isinstance(value, numbers.Real) and not isinstance(value, bool):
        return int(value) if math.isfinite(value) else None

    # Remove any non-numeric characters
    digits = re.sub(r"[^\d]", "", str(value))
    return int(digits) if digits else None

# Turn the raw 'price' text into a numeric 'Price' column.
data['Price'] = data['price'].apply(extract_number)

# Columns removed before modelling: the raw price text plus the fields listed
# below, and finally the 'X' / 'Y' columns.
columns_not_used = [
    'price', 'Listing Type', 'Join_Count', 'creation date',
    'NEAR_FID', 'OBJECTID', 'TARGET_FID', 'source',
    'source_name', 'Reference Id', 'img_src', 'description',
    'NEAR_UNIVERSITY_DIST',
    'X', 'Y',
]
data = data.drop(columns=columns_not_used)

In [113]:
# Keep a row only when every condition below holds:
#   - it has a parsed price,
#   - it is an apartment,
#   - its 'Floors' value is missing,
#   - bedrooms and bathrooms are both known,
#   - its size is below 800 m².
rows_to_keep = (
    data['Price'].notna()
    & (data['Property Type'] == 'Apartment')
    & data['Floors'].isna()
    & data['Bedrooms'].notna()
    & data['Bathrooms'].notna()
    & (data['Size (m²)'] < 800)
)
data = data[rows_to_keep]

In [114]:
# Count the number of rows for each governorate
governorate_counts = data['governorate'].value_counts()

# Get the top 3 governorates with the highest row counts
top_3_governorates = governorate_counts.head(3).index

# Filter the data for only the top 3 governorates.
# .copy() makes the result an independent frame, so the relabelling below
# writes to real data instead of a slice (no SettingWithCopyWarning).
data = data[data['governorate'].isin(top_3_governorates)].copy()

# Relabel 'Keserwan-Jbeil' rows as 'Mont-Liban'.
# NOTE(review): the relabelling happens after the top-3 ranking, so the two
# labels are counted separately when the ranking is computed — confirm intended.
data.loc[data['governorate'] == 'Keserwan-Jbeil', 'governorate'] = 'Mont-Liban'

In [115]:
# Size of the frame after the row filters: (rows, columns).
n_rows, n_cols = data.shape
print((n_rows, n_cols))

(7204, 36)


Drop the columns that contain more than 1500 null values

In [116]:
# Missing-value count per column; anything above 1500 nulls is dropped.
null_counts = data.isnull().sum()
columns_to_drop = null_counts.index[null_counts.gt(1500)]

data = data.drop(columns=columns_to_drop)

# Shape after removing the sparse columns
data.shape

(7204, 29)

### Use data augmentation to add more samples for high-priced listings

In [117]:
import numpy as np
import pandas as pd

# --- Settings for this step --------------------------------------------------
HIGH_PRICE_THRESHOLD = 700000   # rows with Price >= this value are augmented
AUGMENT_FRACTION = 0.8          # synthetic rows = this fraction of the high-price rows
NOISE_SCALE = 0.05              # standard deviation of the relative Gaussian noise
AUGMENT_SEED = 42               # fixed seed so a re-run produces the same synthetic rows
MIN_PRICE = 15000               # rows must have Price strictly above this
MAX_PRICE = 2000000             # rows must have Price strictly below this

# Identify numerical, binary, and categorical features
numerical_features = data.select_dtypes(include=[np.number]).columns
binary_features = [col for col in numerical_features if data[col].nunique() == 2]
continuous_features = [col for col in numerical_features if col not in binary_features]
categorical_features = data.select_dtypes(exclude=[np.number]).columns

# Rows that serve as templates for the synthetic samples
high_price_data = data[data['Price'] >= HIGH_PRICE_THRESHOLD]

# Number of synthetic samples to generate
n_samples = int(AUGMENT_FRACTION * len(high_price_data))
print(f'Number of new samples {n_samples}')

# NOTE(review): this augmentation runs before the train/test split performed
# later in the notebook, so a noisy copy of a listing can end up on the other
# side of the split from its original. Consider augmenting the training part only.
if n_samples > 0:
    rng = np.random.default_rng(AUGMENT_SEED)

    # Draw all template rows in one go. Sampling with replacement matches
    # drawing one random row at a time.
    augmented_high_price_data = high_price_data.sample(
        n=n_samples, replace=True, random_state=AUGMENT_SEED
    ).reset_index(drop=True)

    # One independent relative perturbation per (row, continuous column):
    # value -> value * (1 + noise). 'Price' is numeric and non-binary, so it is
    # perturbed as well.
    noise = rng.normal(loc=0, scale=NOISE_SCALE, size=(n_samples, len(continuous_features)))
    augmented_high_price_data[continuous_features] = (
        augmented_high_price_data[continuous_features] * (1 + noise)
    )

    # Binary and non-numeric columns are never touched above, so they keep
    # their original values without any extra rounding or clipping.

    # Concatenate the augmented data with the original data
    data = pd.concat([data, augmented_high_price_data], ignore_index=True)

# Keep prices strictly inside (MIN_PRICE, MAX_PRICE). .copy() detaches the
# result from the filtered slice so the column assignment below is safe.
data = data[(data['Price'] > MIN_PRICE) & (data['Price'] < MAX_PRICE)].copy()

# Noise makes the room counts fractional; round them up to whole numbers.
integer_features = ['Bedrooms', 'Bathrooms']
data[integer_features] = np.ceil(data[integer_features]).astype(int)

print("Augmented data shape:", data.shape)

Number of new samples 794
Augmented data shape: (7769, 29)


Encode the object (categorical) features using a label encoder

In [118]:
from sklearn.preprocessing import LabelEncoder

# Find categorical columns
categorical_columns = data.select_dtypes(include=['object', 'category']).columns

# One fitted LabelEncoder per column. They are kept in this dict so the same
# text -> integer mapping can be re-applied (or inverted) later, e.g. when new
# listings are prepared for the saved model.
label_encoders = {}

# NOTE(review): missing values are encoded as a class of their own (the printed
# mappings contain a `nan` entry), so the imputation step that follows no longer
# sees these cells as missing — confirm this is intended.
for col in categorical_columns:
    encoder = LabelEncoder()
    data[col] = encoder.fit_transform(data[col])
    label_encoders[col] = encoder

    # Show the integer code assigned to every original label.
    print(col)
    print(dict(enumerate(encoder.classes_)))

Property Type
{0: 'Apartment'}
Ownership
{0: 'By Company', 1: 'By Owner', 2: nan}
Payment method
{0: 'Cash', 1: 'Cheque', 2: 'Installments', 3: 'Other', 4: nan}
Condition
{0: 'Ready to move in', 1: 'Under Construction', 2: nan}
LULC
{0: 'Banana', 1: 'Bare Rocks', 2: 'Clear Grasslands', 3: 'Clear Mixed Wooded Lands', 4: 'Clear Oaks', 5: 'Clear Pines', 6: 'Dense Informal Urban Fabric', 7: 'Dense Mixed Wooded Lands', 8: 'Dense Oaks', 9: 'Dense Pines', 10: 'Dense Urban Fabric', 11: 'Diverse Equipment', 12: 'Fruit Trees', 13: 'Green Sports Area', 14: 'Highway', 15: 'Industrial or Commercial Areas', 16: 'Low Density Urban Fabric', 17: 'Medium Density Urban Fabric', 18: 'Meium Denisty Informal Urban Fabric', 19: 'Olives', 20: 'Port Areas', 21: 'Port Basin', 22: 'Protected Agriculture', 23: 'Rocky Outcrops', 24: 'Scrubland', 25: 'Scrubland with Some Dispersed Bigger Trees', 26: 'Uran Extension and/or Construction Sites', 27: 'Urban Vacant Land'}
governorate
{0: 'Beyrouth', 1: 'Mont-Liban'}


Filling null values using iterative imputer

In [119]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer  # Required for IterativeImputer
from sklearn.impute import IterativeImputer

# Record each column's role before imputing: the result below is rebuilt from a
# plain array, so binary and integer columns have to be restored afterwards.
numerical_features = data.select_dtypes(include=[np.number]).columns
binary_features = [col for col in numerical_features if data[col].nunique() == 2]
integer_features = [col for col in numerical_features if col not in binary_features and pd.api.types.is_integer_dtype(data[col])]

# Apply Iterative Imputer
# NOTE(review): the imputer is fitted on every column of `data`, the target
# 'Price' included, and on all rows (before the train/test split) — confirm
# this is acceptable for the evaluation.
iterative_imputer = IterativeImputer()
df_imputed = iterative_imputer.fit_transform(data)

# Convert the imputed data back to a DataFrame (row index restarts at 0)
data = pd.DataFrame(df_imputed, columns=data.columns)

# Convert binary features back to 0/1
data[binary_features] = data[binary_features].round().clip(0, 1)

# Convert integer features back to integers
data[integer_features] = data[integer_features].round().astype(int)

# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() only appended a stray "None" line to the output.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7769 entries, 0 to 7768
Data columns (total 29 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Property Type             7769 non-null   int64  
 1   Ownership                 7769 non-null   int64  
 2   Bedrooms                  7769 non-null   int64  
 3   Bathrooms                 7769 non-null   int64  
 4   Size (m²)                 7769 non-null   float64
 5   Payment method            7769 non-null   int64  
 6   Condition                 7769 non-null   int64  
 7   Near Amenities            7769 non-null   float64
 8   Heating and Cooling       7769 non-null   float64
 9   Outdoor and Landscaping   7769 non-null   float64
 10  Security Features         7769 non-null   float64
 11  Storage and Space         7769 non-null   float64
 12  Views                     7769 non-null   float64
 13  Technology and Utilities  7769 non-null   float64
 14  Luxury a

Removing outliers

In [120]:
from sklearn.ensemble import IsolationForest

def remove_outliers_isolation_forest(data, contamination=0.15, random_state=42):
    """Return the rows of ``data`` that an Isolation Forest labels as inliers.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. Only its numeric columns are given to the detector;
        the returned frame keeps all columns.
    contamination : float, default 0.15
        Passed to ``IsolationForest(contamination=...)``.
    random_state : int, default 42
        Seed passed to ``IsolationForest`` so the result is repeatable.

    Returns
    -------
    pandas.DataFrame
        The subset of ``data`` whose predicted label is ``1``.
    """
    iso_forest = IsolationForest(contamination=contamination, random_state=random_state)

    # fit_predict yields one label per row; rows labelled 1 are kept.
    labels = iso_forest.fit_predict(data.select_dtypes(include=np.number))
    return data[labels == 1]  # Keep only inliers

# Filter out the rows flagged as outliers, then report the remaining size.
data = data.pipe(remove_outliers_isolation_forest)
print(data.shape)

(6603, 29)


Select the most important features

In [121]:
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt
# from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.linear_model import Lasso
# from sklearn.preprocessing import StandardScaler
# from sklearn.feature_selection import RFE
# from sklearn.linear_model import LinearRegression
# import shap
# from xgboost import XGBRegressor

# # Assuming you have loaded your data into 'data'
# X = data.drop(columns=['Price'])
# y = data['Price']

# # Standardize the data for methods that require scaling
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# # Dictionary to store feature importance scores
# feature_scores = pd.DataFrame(index=X.columns)

# # 1. SelectKBest (f_regression)
# skb = SelectKBest(score_func=f_regression, k='all')
# skb.fit(X, y)
# feature_scores['SelectKBest'] = skb.scores_

# # 2. Recursive Feature Elimination (RFE) with Linear Regression
# rfe = RFE(estimator=LinearRegression(), n_features_to_select=20)
# rfe.fit(X, y)
# # Since RFE provides rankings, we convert them to importance scores (inverse of ranking)
# feature_scores['RFE'] = 1 / rfe.ranking_
# print(feature_scores['RFE'])

# # 3. Random Forest Feature Importance
# rf = RandomForestRegressor()
# rf.fit(X, y)
# feature_scores['RandomForest'] = rf.feature_importances_

# # 4. Lasso Regression
# lasso = Lasso(alpha=0.01)
# lasso.fit(X_scaled, y)
# feature_scores['Lasso'] = np.abs(lasso.coef_)

# # 5. Mutual Information
# mi = mutual_info_regression(X, y)
# feature_scores['MutualInfo'] = mi

# # 6. SHAP Values with XGBoost
# xgb = XGBRegressor()
# xgb.fit(X, y)
# explainer = shap.Explainer(xgb)
# shap_values = explainer(X)
# shap_importance = np.abs(shap_values.values).mean(axis=0)
# feature_scores['SHAP'] = shap_importance

# # Normalize scores for better comparison (0 to 1)
# feature_scores = feature_scores.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))

# # Select top 20 features based on the average score across all methods
# top_features = feature_scores.mean(axis=1).nlargest(20).index

# # Filter scores for top features only
# top_feature_scores = feature_scores.loc[top_features]

# # Plot the grouped bar chart
# top_feature_scores.plot(kind='bar', figsize=(15, 8), width=0.8)
# plt.title('Feature Importance Scores Across Different Methods')
# plt.xlabel('Features')
# plt.ylabel('Normalized Importance Score')
# plt.xticks(rotation=45, ha='right')
# plt.legend(title='Methods')
# plt.grid(axis='y')
# plt.tight_layout()
# plt.show()


In [122]:
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt
# from sklearn.feature_selection import mutual_info_regression

# # Assuming you have loaded your data into 'data'
# X = data.drop(columns=['Price'])
# y = data['Price']

# # Calculate mutual information
# mi_scores = mutual_info_regression(X, y)
# mi_scores = pd.Series(mi_scores, index=X.columns)

# # Select top 20 features
# selected_features = mi_scores.nlargest(25)
# print(selected_features)

# # Plot the mutual information scores for the top 20 features
# plt.figure(figsize=(12, 8))
# bars = plt.barh(selected_features.index, selected_features.values, color='skyblue')
# plt.xlabel('Mutual Information Score')
# plt.title('Top 20 Features Based on Mutual Information')

# # Annotate bars with the score values
# for bar in bars:
#     plt.text(bar.get_width() + 0.001, bar.get_y() + bar.get_height()/2,
#              f'{bar.get_width():.3f}', va='center', ha='left')

# plt.gca().invert_yaxis()  # Invert y-axis to have the highest score at the top
# plt.grid(axis='x', linestyle='--', alpha=0.7)
# plt.tight_layout()
# plt.show()


# X = data[selected_features.index.tolist()]

In [None]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_regression
import matplotlib.pyplot as plt

# Number of best-scoring features kept for modelling. The printed message and
# the chart title are derived from it so they can no longer disagree with the
# selection.
N_TOP_FEATURES = 14

# Save the preprocessed DataFrame to an Excel file
data.to_excel('/content/drive/Othercomputers/MyMonty Laptop/university M2/M2 final project/Dataset/step 3 - with spatial analysis/FINAL_SELL_REAL_ESTATE_2024-11-17-2.xlsx', index=False)

# Separate the predictors from the target
X = data.drop(columns=['Price'])
y = data['Price']

# Score every feature with f_regression (k='all' keeps all of them, so the
# selector is used for scoring only)
# NOTE(review): the scores are computed on all rows, before the train/test split.
selector = SelectKBest(score_func=f_regression, k='all')
selector.fit(X, y)

# Create a DataFrame for feature scores
feature_scores = pd.DataFrame({
    'Feature': X.columns,
    'Score': selector.scores_
}).sort_values(by='Score', ascending=False)  # Sort by score for easy ranking

# Keep the N_TOP_FEATURES features with the highest score
selected_features = feature_scores.nlargest(N_TOP_FEATURES, 'Score')
n_selected = len(selected_features)
print(f"Top {n_selected} Selected Features:")
print(selected_features['Feature'].tolist())

plt.figure(figsize=(12, 8))
bars = plt.barh(selected_features['Feature'], selected_features['Score'], color='skyblue')
plt.xlabel('Score (F-Regression)')
plt.title(f'Top {n_selected} Features Based on F-Regression Scores')

# Annotate bars with the score values
for bar in bars:
    plt.text(
        bar.get_width() + 0.1, bar.get_y() + bar.get_height() / 2,
        f'{bar.get_width():.3f}', va='center', ha='left'
    )

# Invert y-axis to display the highest score at the top
plt.gca().invert_yaxis()
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# Update X to include only the selected features
X = X[selected_features['Feature'].tolist()]

# Check the shapes of X and y
print("Feature Matrix Shape:", X.shape)
print("Target Variable Shape:", y.shape)


## House Price Distribution

Now let us take a look at how the house prices are distributed.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# One figure per selected feature: a histogram for numeric columns, a count
# plot for categorical / object columns.
for col in X.columns:
    column_dtype = data[col].dtype
    is_numeric = pd.api.types.is_numeric_dtype(column_dtype)
    # isinstance(..., pd.CategoricalDtype) replaces the deprecated
    # pd.api.types.is_categorical_dtype.
    is_categorical = isinstance(column_dtype, pd.CategoricalDtype) or column_dtype == 'object'

    # Decide before creating the figure, so a skipped column does not leave an
    # empty, never-closed figure behind.
    if not (is_numeric or is_categorical):
        print(f"Skipping column: {col} (unsupported data type)")
        continue

    plt.figure(figsize=(8, 4))
    if is_numeric:
        # Plot histogram for numerical columns
        plt.hist(data[col].dropna(), bins=30, color='skyblue', edgecolor='black')
        plt.title(f'Histogram of {col}')
        plt.xlabel(col)
        plt.ylabel('Frequency')
    else:
        # Plot bar plot for categorical columns
        sns.countplot(data=data, x=col, palette='viridis')
        plt.title(f'Distribution of {col}')
        plt.xlabel(col)
        plt.ylabel('Count')
        plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()

In [None]:
# Summary statistics of the target, followed by its distribution.
print(data['Price'].describe())
plt.figure(figsize=(9, 8))

# sns.distplot is deprecated; histplot with a density-scaled histogram and a
# KDE overlay draws the same kind of figure (settings follow seaborn's
# distplot migration notes).
sns.histplot(
    data['Price'], color='g', bins=100,
    stat='density', kde=True, kde_kws=dict(cut=3),
    alpha=0.4, edgecolor=(1, 1, 1, 0.4),
);

## Numerical data distribution

We will now take a look at how the numerical features are distributed. In order to do this, let us first list all the types of data from our dataset and select only the numerical ones.

In [None]:
# Numeric subset of the selected features.
df_num = X.select_dtypes(include='number')

In [None]:
# Grid of histograms, one panel per numeric feature.
histogram_options = dict(figsize=(12, 12), bins=50, xlabelsize=6, ylabelsize=6)
df_num.hist(**histogram_options);

In [None]:
# One scatter plot per selected feature, with the target on the y-axis.
# (The former `num_plots = len(df_num)` was never used and counted rows, not
# plots, so it has been removed.)
for feature in selected_features['Feature'].tolist():
    plt.figure(figsize=(8, 6))  # Adjust the figure size as needed
    sns.scatterplot(x=feature, y=y, data=X)
    plt.title(f'{feature} vs House price')
    plt.xlabel(feature)
    plt.ylabel('House Price')
    plt.show()


In [None]:
# Hold out 30% of the rows as the test set; the fixed seed keeps the split
# identical between runs.
split_parts = train_test_split(X, y, test_size=0.30, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pandas as pd
import numpy as np

# Candidate models, each with its default-ish settings and a fixed seed where
# the estimator accepts one.
regressors = dict([
    ("Random Forest", RandomForestRegressor(n_estimators=100, random_state=42)),
    ("Gradient Boosting", GradientBoostingRegressor(n_estimators=100, random_state=42)),
    ("AdaBoost", AdaBoostRegressor(random_state=42)),
    ("Linear Regression", LinearRegression()),
    ("Ridge", Ridge(alpha=1.0, random_state=42)),
    ("Lasso", Lasso(alpha=1.0, random_state=42)),
    ("ElasticNet", ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=42)),
    ("K-Nearest Neighbors", KNeighborsRegressor()),
    ("Decision Tree", DecisionTreeRegressor(random_state=42)),
    ("Extra Trees", ExtraTreesRegressor(n_estimators=100, random_state=42)),
])

# One metrics row per model
results = []

for name in regressors:
    reg = regressors[name]

    # Fit on the training split, predict the hold-out split
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)

    # Error metrics on the hold-out split
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r_squared = reg.score(X_test, y_test)

    row = {"Model": name}
    row["MSE"] = mse
    row["RMSE"] = rmse
    row["MAE"] = mae
    row["R-squared"] = r_squared
    results.append(row)

# Tabulate, best (lowest RMSE) first
results_df = pd.DataFrame(results).sort_values(by="RMSE")
results_df = results_df.reset_index(drop=True)

print(results_df)

In [None]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import mean_squared_error, mean_absolute_error
# import pandas as pd
# import numpy as np

# # Define regressors and their parameter grids
# param_grids = {
#     "Random Forest": (RandomForestRegressor(random_state=42), {
#         "n_estimators": [100, 200, 500],
#         "max_depth": [None, 10, 20],
#         "min_samples_split": [2, 5, 10],
#         "min_samples_leaf": [1, 2, 4]
#     }),
#     "Extra Trees": (ExtraTreesRegressor(random_state=42), {
#         "n_estimators": [100, 200, 500],
#         "max_depth": [None, 10, 20],
#         "min_samples_split": [2, 5, 10]
#     }),
#     "Gradient Boosting": (GradientBoostingRegressor(random_state=42), {
#         "n_estimators": [100, 200, 500],
#         "learning_rate": [0.01, 0.1, 0.2],
#         "max_depth": [3, 5, 7],
#         "min_samples_split": [2, 5, 10]
#     }),
#     "AdaBoost": (AdaBoostRegressor(random_state=42), {
#         "n_estimators": [50, 100, 200],
#         "learning_rate": [0.01, 0.1, 1.0]
#     }),
#     "Ridge": (Ridge(), {
#         "alpha": [0.001, 0.01, 0.1, 1, 10, 100]
#     }),
#     "Lasso": (Lasso(), {
#         "alpha": [0.001, 0.01, 0.1, 1, 10, 100]
#     }),
#     "ElasticNet": (ElasticNet(random_state=42), {
#         "alpha": [0.001, 0.01, 0.1, 1, 10, 100],
#         "l1_ratio": [0.1, 0.5, 0.9]
#     }),
#     "K-Nearest Neighbors": (KNeighborsRegressor(), {
#         "n_neighbors": [3, 5, 10, 20],
#         "weights": ['uniform', 'distance'],
#         "p": [1, 2]
#     }),
#     "Decision Tree": (DecisionTreeRegressor(random_state=42), {
#         "max_depth": [None, 5, 10, 20],
#         "min_samples_split": [2, 5, 10],
#         "min_samples_leaf": [1, 2, 4]
#     })
# }

# # Initialize a list to store results
# results = []

# # Train and evaluate each regressor using GridSearchCV
# for name, (reg, param_grid) in param_grids.items():
#     print(f"Running GridSearchCV for {name}...")
#     grid_search = GridSearchCV(estimator=reg, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
#     grid_search.fit(X_train, y_train)

#     best_model = grid_search.best_estimator_
#     y_pred = best_model.predict(X_test)

#     # Calculate metrics
#     mse = mean_squared_error(y_test, y_pred)
#     rmse = np.sqrt(mse)
#     mae = mean_absolute_error(y_test, y_pred)
#     r_squared = best_model.score(X_test, y_test)

#     # Display the result for the current model
#     print(f"\nModel: {name}")
#     print(f"Best Parameters: {grid_search.best_params_}")
#     print(f"MSE: {mse:.4f}")
#     print(f"RMSE: {rmse:.4f}")
#     print(f"MAE: {mae:.4f}")
#     print(f"R-squared: {r_squared:.4f}\n")

#     # Append the results to the list
#     results.append({
#         "Model": name,
#         "Best Parameters": grid_search.best_params_,
#         "MSE": mse,
#         "RMSE": rmse,
#         "MAE": mae,
#         "R-squared": r_squared
#     })

# # Create a DataFrame to display the results
# results_df = pd.DataFrame(results)
# results_df = results_df.sort_values(by="RMSE").reset_index(drop=True)

# # Display the final sorted results
# print("\nModel Performance Results:")
# print(results_df)

In [None]:
!pip install optuna

In [None]:
import optuna
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pandas as pd
import numpy as np
from sklearn.ensemble import VotingRegressor, StackingRegressor
from sklearn.model_selection import train_test_split

# Split data into training and test sets (example setup)
# Replace X and y with your actual data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the objective function for each model
def objective(trial, model_name, return_model=False):
    """Build one candidate regressor from a trial's suggestions and score it.

    Parameters
    ----------
    trial : optuna trial
        Source of the hyper-parameter values (``suggest_*`` calls).
    model_name : str
        One of "Random Forest", "Extra Trees", "Gradient Boosting", "AdaBoost",
        "Ridge", "Lasso", "ElasticNet", "K-Nearest Neighbors", "Decision Tree".
    return_model : bool, default False
        When True, fit the candidate on the full training split
        (``X_train`` / ``y_train``) and return the fitted estimator instead of
        a score.

    Returns
    -------
    float or estimator
        Mean squared error on a validation fold carved out of the training
        split, or the fitted estimator when ``return_model`` is True.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.

    Notes
    -----
    The score used for the search is computed on a validation fold taken from
    the training split. The test split (``X_test`` / ``y_test``) is not used
    here, so hyper-parameters are not selected on the data that is later used
    to report the final metrics.
    """
    if model_name == "Random Forest":
        from sklearn.ensemble import RandomForestRegressor
        reg = RandomForestRegressor(
            n_estimators=trial.suggest_categorical("n_estimators", [100, 200, 500]),
            max_depth=trial.suggest_categorical("max_depth", [None, 10, 20]),
            min_samples_split=trial.suggest_categorical("min_samples_split", [2, 5, 10]),
            min_samples_leaf=trial.suggest_categorical("min_samples_leaf", [1, 2, 4]),
            random_state=42
        )
    elif model_name == "Extra Trees":
        from sklearn.ensemble import ExtraTreesRegressor
        reg = ExtraTreesRegressor(
            n_estimators=trial.suggest_categorical("n_estimators", [100, 200, 500]),
            max_depth=trial.suggest_categorical("max_depth", [None, 10, 20]),
            min_samples_split=trial.suggest_categorical("min_samples_split", [2, 5, 10]),
            random_state=42
        )
    elif model_name == "Gradient Boosting":
        from sklearn.ensemble import GradientBoostingRegressor
        reg = GradientBoostingRegressor(
            n_estimators=trial.suggest_categorical("n_estimators", [100, 200, 500]),
            learning_rate=trial.suggest_float("learning_rate", 0.01, 0.2),
            max_depth=trial.suggest_categorical("max_depth", [3, 5, 7]),
            min_samples_split=trial.suggest_categorical("min_samples_split", [2, 5, 10]),
            random_state=42
        )
    elif model_name == "AdaBoost":
        from sklearn.ensemble import AdaBoostRegressor
        reg = AdaBoostRegressor(
            n_estimators=trial.suggest_categorical("n_estimators", [50, 100, 200]),
            learning_rate=trial.suggest_float("learning_rate", 0.01, 1.0),
            random_state=42
        )
    elif model_name == "Ridge":
        from sklearn.linear_model import Ridge
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        reg = Ridge(
            alpha=trial.suggest_float("alpha", 0.001, 100, log=True)
        )
    elif model_name == "Lasso":
        from sklearn.linear_model import Lasso
        reg = Lasso(
            alpha=trial.suggest_float("alpha", 0.001, 100, log=True)
        )
    elif model_name == "ElasticNet":
        from sklearn.linear_model import ElasticNet
        reg = ElasticNet(
            alpha=trial.suggest_float("alpha", 0.001, 100, log=True),
            l1_ratio=trial.suggest_float("l1_ratio", 0.1, 0.9),
            random_state=42
        )
    elif model_name == "K-Nearest Neighbors":
        from sklearn.neighbors import KNeighborsRegressor
        reg = KNeighborsRegressor(
            n_neighbors=trial.suggest_categorical("n_neighbors", [3, 5, 10, 20]),
            weights=trial.suggest_categorical("weights", ['uniform', 'distance']),
            p=trial.suggest_categorical("p", [1, 2])
        )
    elif model_name == "Decision Tree":
        from sklearn.tree import DecisionTreeRegressor
        reg = DecisionTreeRegressor(
            max_depth=trial.suggest_categorical("max_depth", [None, 5, 10, 20]),
            min_samples_split=trial.suggest_categorical("min_samples_split", [2, 5, 10]),
            min_samples_leaf=trial.suggest_categorical("min_samples_leaf", [1, 2, 4]),
            random_state=42
        )
    else:
        raise ValueError(f"Unsupported model: {model_name}")

    if return_model:
        # Final model for the chosen parameters: trained on the whole training split.
        reg.fit(X_train, y_train)
        return reg

    # Score the candidate on a validation fold taken from the training split.
    # The fixed seed gives every trial the same fold, so scores are comparable.
    from sklearn.model_selection import train_test_split
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )
    reg.fit(X_fit, y_fit)
    mse = mean_squared_error(y_val, reg.predict(X_val))
    return mse  # Optuna minimizes the objective

# Models to optimize
models = ["Random Forest", "Extra Trees", "Gradient Boosting", "AdaBoost",
          "Ridge", "Lasso", "ElasticNet", "K-Nearest Neighbors", "Decision Tree"]

# Number of hyper-parameter combinations tried per model, and the sampler seed
# that makes the search repeatable between runs.
N_TRIALS = 50
OPTUNA_SEED = 42

# Results storage
results = []

# Run Optuna for each model
for model_name in models:
    print(f"Optimizing {model_name}...")

    # A seeded sampler proposes the same sequence of trials on every run.
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=OPTUNA_SEED),
    )
    study.optimize(lambda trial: objective(trial, model_name), n_trials=N_TRIALS)

    best_params = study.best_params
    best_value = study.best_value
    print(f"Best parameters for {model_name}: {best_params}")
    print(f"Best MSE: {best_value:.4f}")

    # Re-train the model with the best parameters (the best trial replays its
    # stored parameter values through the same suggest_* calls)
    trained_model = objective(study.best_trial, model_name, return_model=True)
    y_pred = trained_model.predict(X_test)

    # Calculate metrics on the hold-out split
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r_squared = trained_model.score(X_test, y_test)

    results.append({
        "Model": model_name,
        "MSE": mse,
        "RMSE": rmse,
        "MAE": mae,
        "R-squared": r_squared,
        "Best Parameters": best_params
    })

# Convert results to DataFrame, best (lowest RMSE) first
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="RMSE").reset_index(drop=True)

# Display results
print("\nModel Performance Results:")
print(results_df)

In [None]:
from sklearn.ensemble import (
    AdaBoostRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
    StackingRegressor,
    VotingRegressor,
)
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

# Rows this cell adds to `results`. They are removed first so the cell can be
# re-run: previously a second run tried to rebuild a base model from its own
# "Voting Regressor" row and stopped with "Unsupported model".
ENSEMBLE_NAMES = ("Voting Regressor", "Stacking Regressor")
results = [res for res in results if res["Model"] not in ENSEMBLE_NAMES]

# Estimator class and fixed keyword arguments for every tuned base model; the
# tuned parameters stored in `results` are added on top of these.
BASE_MODEL_FACTORIES = {
    "Random Forest": (RandomForestRegressor, {"random_state": 42}),
    "Extra Trees": (ExtraTreesRegressor, {"random_state": 42}),
    "Gradient Boosting": (GradientBoostingRegressor, {"random_state": 42}),
    "AdaBoost": (AdaBoostRegressor, {"random_state": 42}),
    "Ridge": (Ridge, {}),
    "Lasso": (Lasso, {}),
    "ElasticNet": (ElasticNet, {"random_state": 42}),
    "K-Nearest Neighbors": (KNeighborsRegressor, {}),
    "Decision Tree": (DecisionTreeRegressor, {"random_state": 42}),
}


def _holdout_metrics(model_label, fitted_model):
    """Score a fitted ensemble on X_test / y_test; return (predictions, results row)."""
    predictions = fitted_model.predict(X_test)
    mse_value = mean_squared_error(y_test, predictions)
    row = {
        "Model": model_label,
        "MSE": mse_value,
        "RMSE": np.sqrt(mse_value),
        "MAE": mean_absolute_error(y_test, predictions),
        "R-squared": fitted_model.score(X_test, y_test),
        "Best Parameters": "Combined"
    }
    return predictions, row


# Step 1: Prepare base models for ensemble methods using the best parameters
model_instances = []
for res in results:
    model_name = res["Model"]
    best_params = res["Best Parameters"]

    if model_name not in BASE_MODEL_FACTORIES:
        raise ValueError(f"Unsupported model: {model_name}")

    model_class, fixed_kwargs = BASE_MODEL_FACTORIES[model_name]
    # A fresh, unfitted estimator per entry (so no clone() is needed).
    model_instances.append((model_name, model_class(**fixed_kwargs, **best_params)))

# Step 2: Voting Regressor over all tuned base models
voting_regressor = VotingRegressor(estimators=model_instances)
voting_regressor.fit(X_train, y_train)
y_pred_voting, voting_row = _holdout_metrics("Voting Regressor", voting_regressor)
results.append(voting_row)

# Step 3: Stacking Regressor, using Linear Regression as the final estimator
final_estimator = LinearRegression()
stacking_regressor = StackingRegressor(estimators=model_instances, final_estimator=final_estimator)
stacking_regressor.fit(X_train, y_train)
y_pred_stacking, stacking_row = _holdout_metrics("Stacking Regressor", stacking_regressor)
results.append(stacking_row)

# Convert results to DataFrame, best (lowest RMSE) first
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="RMSE").reset_index(drop=True)

# Display results
print("\nModel Performance Results:")
print(results_df)

In [None]:
# Model evaluated here and saved further down: the stacking ensemble.
# (The variable keeps its historical name `gb_model`.)
#
# Alternatives that were tried in this slot, kept for reference:
# gb_model = ExtraTreesRegressor(
#     max_depth=20,
#     min_samples_split=5,
#     n_estimators=500,
#     random_state=42,  # For reproducibility
#     n_jobs=-1         # Utilize all CPU cores for training
# )
# gb_model = KNeighborsRegressor(
#     n_neighbors=5,  # Number of neighbors
#     p=2,            # Minkowski distance metric (p=2 corresponds to Euclidean distance)
#     weights='distance'  # Weight by distance (closer neighbors have higher influence)
# )
# Best parameters noted from an earlier search:
# {'learning_rate': 0.1, 'max_depth': 7, 'min_samples_split': 10, 'n_estimators': 100}
gb_model = stacking_regressor

# Fit on the training split, then predict the hold-out split
gb_model.fit(X_train, y_train)
y_pred = gb_model.predict(X_test)

# Hold-out error and coefficient of determination
mse = mean_squared_error(y_test, y_pred)
r_squared = gb_model.score(X_test, y_test)

for label, value in (("Mean Squared Error:", mse), ("R-squared:", r_squared)):
    print(label, value)

In [None]:
# Predicted price against actual price for the hold-out rows.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, color='blue', alpha=0.5)

ax.set_title('Actual vs. Predicted Values')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')

# Dashed diagonal = perfect prediction (predicted equals actual)
lowest, highest = min(y_test), max(y_test)
ax.plot([lowest, highest], [lowest, highest], color='red', linestyle='--')

plt.show()


## Save the model

In [None]:
from google.colab import drive

# Google Drive must be mounted before the model file can be written to it.
drive.mount(mountpoint='/content/drive')

In [None]:
import sklearn
import joblib

# Print the scikit-learn version, then the joblib version.
for library in (sklearn, joblib):
    print(library.__version__)

In [None]:
import joblib

# Google Drive folder that holds the saved models
model_path = "/content/drive/MyDrive/university M2/M2 final project/finalModel/"

# Full path of the file the model is written to
model_file = f'{model_path}StackingRegressor2.pkl'

# Serialise the fitted model
joblib.dump(gb_model, model_file)

In [None]:
# Read a saved model back from the models folder.
# NOTE(review): this loads 'ExtraTreesRegressor.pkl', not the
# 'StackingRegressor2.pkl' file written by the save cell — confirm that the
# file exists and was trained on the same feature columns.
saved_model_file = f'{model_path}ExtraTreesRegressor.pkl'
loaded_model = joblib.load(saved_model_file)

In [None]:
# Show one hold-out example: its features, then its true price.
# `X_valid` / `y_valid` are not defined anywhere in this notebook, so the
# hold-out split created by train_test_split is used instead.
print(X_test.iloc[22])
print(y_test.iloc[22])

In [None]:
# Predict with the reloaded model on the hold-out split.
# `X_valid` / `y_valid` are not defined anywhere in this notebook, so the
# hold-out split created by train_test_split is used instead.
# Single-row variant: y_pred = loaded_model.predict(X_test.iloc[[22]])
y_pred = loaded_model.predict(X_test)
print(y_pred)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print results
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")