In [148]:
# Import libraries
try:
    # Importing general libraries
    import glob
    import pandas as pd

    # Importing libraries for model building
    from sklearn.preprocessing import OneHotEncoder, LabelEncoder
    from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import root_mean_squared_error, r2_score, mean_absolute_error, mean_squared_log_error

    # Importing libraries for data preprocessing
    from scipy.stats import randint

except Exception as e:
    print(f"Error : {e}")

In [149]:
onehot_encoder = OneHotEncoder()
label_encoder = LabelEncoder()

In [150]:
# Find the CSV file in the Datasets directory
data_path = '../Datasets/*.csv'
file_list = glob.glob(data_path)

for file in file_list:
    print(f"Found file: {file}")

# Ensure there is exactly one file
if len(file_list) == 1:
    # Load the dataset
    df = pd.read_csv(file_list[0])
    print(f"Loaded dataset: {file_list[0]}")
else:
    raise FileNotFoundError("No CSV file found or multiple CSV files found in the Datasets directory.")

Found file: ../Datasets/Dataset.csv
Loaded dataset: ../Datasets/Dataset.csv


In [151]:
# Define the categorical columns
categorical_cols_unified = ['partType', 'microstructure', 'seedLocation', 'castType']

In [152]:
# Creating a copy of the dataframe to ensure we maintain the original intact
label_encoded_df = df.copy()
encoder = "Label Encoding"

# Apply Label Encoding to each categorical column
label_encoders = {}
for col in categorical_cols_unified:
    le = LabelEncoder()
    label_encoded_df[col] = le.fit_transform(label_encoded_df[col])
    label_encoders[col] = le  # Store the encoder for inverse transformation if needed later

# Display the first few rows to verify
display(label_encoded_df.head())

Unnamed: 0,Lifespan,partType,microstructure,coolingRate,quenchTime,forgeTime,HeatTreatTime,Nickel%,Iron%,Cobalt%,Chromium%,smallDefects,largeDefects,sliverDefects,seedLocation,castType
0,1469.17,2,1,13,3.84,6.47,46.87,65.73,16.52,16.82,0.93,10,0,0,0,1
1,1793.64,1,2,19,2.62,3.48,44.7,54.22,35.38,6.14,4.26,19,0,0,0,2
2,700.6,0,1,28,0.76,1.34,9.54,51.83,35.95,8.81,3.41,35,3,0,0,2
3,1082.1,2,0,9,2.01,2.19,20.29,57.03,23.33,16.86,2.78,0,1,0,1,0
4,1838.83,0,0,16,4.13,3.87,16.13,59.62,27.37,11.45,1.56,10,0,0,1,1


In [153]:
# Define the target variable and feature set
X = label_encoded_df.drop(columns=['Lifespan'])  # Features
y = label_encoded_df['Lifespan']  # Target

# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the training and testing sets to verify
print(f'--- {encoder} Shape ---\n')
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

--- Label Encoding Shape ---

X_train shape: (800, 15)
X_test shape: (200, 15)
y_train shape: (800,)
y_test shape: (200,)


In [154]:
# Initialize the Random Forest Regressor with default parameters
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Fit the model to the training data
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model using RMSE, R² Score, and MAE
rmse = root_mean_squared_error(y_test, y_pred)  # Root Mean Squared Error
r2 = r2_score(y_test, y_pred)  # R² Score
mae = mean_absolute_error(y_test, y_pred)  # Mean Absolute Error
msle = mean_squared_log_error(y_test, y_pred)  # Mean Squared Log Error

print(f"--- Performance of {encoder} with default parameters ---\n")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")
print(f"MAE: {mae:.2f}")
print(f"MSLE: {msle:.2f}")

--- Performance of Label Encoding with default parameters ---

RMSE: 90.95
R² Score: 0.92
MAE: 72.50
MSLE: 0.01


In [155]:
# Define the parameter distributions for RandomizedSearchCV
param_distributions = {
    'n_estimators': randint(100, 500),  # Randomly sample number of trees between 100 and 500
    'max_depth': [None] + list(range(10, 50, 5)),  # None or range from 10 to 50, step 5
    'min_samples_split': randint(2, 20),  # Random split values between 2 and 20
    'min_samples_leaf': randint(1, 10),  # Random leaf values between 1 and 10
    'max_features': ['sqrt', 'log2', None]  # Use predefined feature subsets
}

# Initialize the Random Forest Regressor
rf_model = RandomForestRegressor(random_state=42, n_jobs=-1)

# Use RandomizedSearchCV to find the best hyperparameters
search_cv = RandomizedSearchCV(estimator=rf_model, param_distributions=param_distributions,
                                   n_iter=100, cv=3, verbose=1, random_state=42, n_jobs=-1,
                                   scoring='neg_mean_squared_error')

# Fit the model to the training data
search_type = "RandomizedSearchCV"
search_cv.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [156]:
# Get the best estimator and parameters
params = search_cv.best_params_

print(f"Best parameters found for {encoder} by {search_type}")
print(f"-" * 62)
for param, value in params.items():
    print(f"    {param}={value},")

Best parameters found for Label Encoding by RandomizedSearchCV
--------------------------------------------------------------
    max_depth=15,
    max_features=None,
    min_samples_leaf=2,
    min_samples_split=3,
    n_estimators=387,


In [157]:
# Initialize the Random Forest Regressor with best parameters
rf_model = RandomForestRegressor(n_estimators=387, max_depth=15, random_state=42)

# Fit the model to the training data
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model using RMSE, R² Score, and MAE
rmse = root_mean_squared_error(y_test, y_pred)  # Root Mean Squared Error
r2 = r2_score(y_test, y_pred)  # R² Score
mae = mean_absolute_error(y_test, y_pred)  # Mean Absolute Error
msle = mean_squared_log_error(y_test, y_pred)  # Mean Squared Log Error

print(f'--- {encoder} Performance with best parameters ---\n')
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")
print(f"MAE: {mae:.2f}")
print(f"MSLE: {msle:.2f}")

--- Label Encoding Performance with best parameters ---

RMSE: 89.18
R² Score: 0.92
MAE: 70.88
MSLE: 0.01


In [158]:
# Separate features for encoding
onehot_features = ['microstructure', 'seedLocation', 'castType']
label_features = ['partType']
encoder = "Hybrid Encoding"

hybrid_encoded_df = df.copy()

hybrid_encoded_df['partType'] = label_encoder.fit_transform(hybrid_encoded_df['partType'])  # Apply label encoding directly

# Fit and transform the specified categorical columns
encoded_array = onehot_encoder.fit_transform(hybrid_encoded_df[onehot_features])

# Convert the encoded array to a DataFrame with appropriate column names
encoded_df = pd.DataFrame(encoded_array.toarray(), columns=onehot_encoder.get_feature_names_out(onehot_features))

# Concatenate the encoded columns with the original DataFrame (excluding original categorical columns)
hybrid_encoded_df = pd.concat([hybrid_encoded_df.drop(columns=onehot_features), encoded_df], axis=1)

# Display the first few rows to verify
display(hybrid_encoded_df.head())


Unnamed: 0,Lifespan,partType,coolingRate,quenchTime,forgeTime,HeatTreatTime,Nickel%,Iron%,Cobalt%,Chromium%,...,largeDefects,sliverDefects,microstructure_colGrain,microstructure_equiGrain,microstructure_singleGrain,seedLocation_Bottom,seedLocation_Top,castType_Continuous,castType_Die,castType_Investment
0,1469.17,2,13,3.84,6.47,46.87,65.73,16.52,16.82,0.93,...,0,0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1,1793.64,1,19,2.62,3.48,44.7,54.22,35.38,6.14,4.26,...,0,0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
2,700.6,0,28,0.76,1.34,9.54,51.83,35.95,8.81,3.41,...,3,0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
3,1082.1,2,9,2.01,2.19,20.29,57.03,23.33,16.86,2.78,...,1,0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,1838.83,0,16,4.13,3.87,16.13,59.62,27.37,11.45,1.56,...,0,0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [159]:
# Define the target variable and feature set
X = hybrid_encoded_df.drop(columns=['Lifespan'])  # Features
y = hybrid_encoded_df['Lifespan']  # Target

# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the training and testing sets to verify
print(f'--- {encoder} Shape ---\n')
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

--- Hybrid Encoding Shape ---

X_train shape: (800, 20)
X_test shape: (200, 20)
y_train shape: (800,)
y_test shape: (200,)


In [160]:
# Initialize the Random Forest Regressor with default parameters
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Fit the model to the training data
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model using RMSE, R² Score, and MAE
rmse = root_mean_squared_error(y_test, y_pred)  # Root Mean Squared Error
r2 = r2_score(y_test, y_pred)  # R² Score
mae = mean_absolute_error(y_test, y_pred)  # Mean Absolute Error
msle = mean_squared_log_error(y_test, y_pred)  # Mean Squared Log Error

print(f'--- {encoder} Performance with default parameters ---\n')
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")
print(f"MAE: {mae:.2f}")
print(f"MSLE: {msle:.2f}")

--- Hybrid Encoding Performance with default parameters ---

RMSE: 91.30
R² Score: 0.92
MAE: 72.78
MSLE: 0.01


In [161]:
# Define the parameter distributions for RandomizedSearchCV
param_distributions = {
    'n_estimators': randint(100, 500),  # Randomly sample number of trees between 100 and 500
    'max_depth': [None] + list(range(10, 50, 5)),  # None or range from 10 to 50, step 5
    'min_samples_split': randint(2, 20),  # Random split values between 2 and 20
    'min_samples_leaf': randint(1, 10),  # Random leaf values between 1 and 10
    'max_features': ['sqrt', 'log2', None]  # Use predefined feature subsets
}

# Initialize the Random Forest Regressor
rf_model = RandomForestRegressor(random_state=42, n_jobs=-1)

# Use RandomizedSearchCV to find the best hyperparameters
search_cv = RandomizedSearchCV(estimator=rf_model, param_distributions=param_distributions,
                                   n_iter=100, cv=3, verbose=1, random_state=42, n_jobs=-1,
                                   scoring='neg_mean_squared_error')

# Fit the model to the training data
search_type = "RandomizedSearchCV"
search_cv.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [162]:
# Get the best estimator and parameters
params = search_cv.best_params_

print(f"Best parameters found for {encoder} by {search_type}")
print(f"-" * 63)
for param, value in params.items():
    print(f"    {param}={value},")

Best parameters found for Hybrid Encoding by RandomizedSearchCV
---------------------------------------------------------------
    max_depth=15,
    max_features=None,
    min_samples_leaf=2,
    min_samples_split=3,
    n_estimators=387,


In [163]:
# Initialize the Random Forest Regressor with best parameters
rf_model = RandomForestRegressor(n_estimators=387, max_depth=15, random_state=42)

# Fit the model to the training data
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model using RMSE, R² Score, and MAE
rmse = root_mean_squared_error(y_test, y_pred)  # Root Mean Squared Error
r2 = r2_score(y_test, y_pred)  # R² Score
mae = mean_absolute_error(y_test, y_pred)  # Mean Absolute Error
msle = mean_squared_log_error(y_test, y_pred)  # Mean Squared Log Error

print(f'--- {encoder} Performance with best parameters ---\n')
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")
print(f"MAE: {mae:.2f}")
print(f"MSLE: {msle:.2f}")

--- Hybrid Encoding Performance with best parameters ---

RMSE: 89.80
R² Score: 0.92
MAE: 71.53
MSLE: 0.01


In [164]:
# Create a copy of the DataFrame to preserve the original
onehot_encoded_df = df.copy()
encoder = "One-Hot Encoding"

# Fit and transform the specified categorical columns
encoded_array = onehot_encoder.fit_transform(onehot_encoded_df[categorical_cols_unified])

# Convert the encoded array to a DataFrame with appropriate column names
encoded_df = pd.DataFrame(encoded_array.toarray(), columns=onehot_encoder.get_feature_names_out(categorical_cols_unified))

# Concatenate the encoded columns with the original DataFrame (excluding original categorical columns)
onehot_encoded_df = pd.concat([onehot_encoded_df.drop(columns=categorical_cols_unified), encoded_df], axis=1)

# Display the first few rows to verify
display(onehot_encoded_df.head())

Unnamed: 0,Lifespan,coolingRate,quenchTime,forgeTime,HeatTreatTime,Nickel%,Iron%,Cobalt%,Chromium%,smallDefects,...,partType_Nozzle,partType_Valve,microstructure_colGrain,microstructure_equiGrain,microstructure_singleGrain,seedLocation_Bottom,seedLocation_Top,castType_Continuous,castType_Die,castType_Investment
0,1469.17,13,3.84,6.47,46.87,65.73,16.52,16.82,0.93,10,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1,1793.64,19,2.62,3.48,44.7,54.22,35.38,6.14,4.26,19,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
2,700.6,28,0.76,1.34,9.54,51.83,35.95,8.81,3.41,35,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
3,1082.1,9,2.01,2.19,20.29,57.03,23.33,16.86,2.78,0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,1838.83,16,4.13,3.87,16.13,59.62,27.37,11.45,1.56,10,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [165]:
onehot_encoded_df.shape

(1000, 24)

In [166]:
# Define the target variable and feature set
X = onehot_encoded_df.drop(columns=['Lifespan'])  # Features
y = onehot_encoded_df['Lifespan']  # Target

# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the training and testing sets to verify
print(f'--- {encoder} Shape ---\n')
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

--- One-Hot Encoding Shape ---

X_train shape: (800, 23)
X_test shape: (200, 23)
y_train shape: (800,)
y_test shape: (200,)


In [167]:
# Initialize the Random Forest Regressor with default parameters
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

# Fit the model to the training data
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model using RMSE, R² Score, and MAE
rmse = root_mean_squared_error(y_test, y_pred)  # Root Mean Squared Error
r2 = r2_score(y_test, y_pred)  # R² Score
mae = mean_absolute_error(y_test, y_pred)  # Mean Absolute Error
msle = mean_squared_log_error(y_test, y_pred)  # Mean Squared Log Error

print(f'--- {encoder} Performance with default parameters ---\n')
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")
print(f"MAE: {mae:.2f}")
print(f"MSLE: {msle:.2f}")

--- One-Hot Encoding Performance with default parameters ---

RMSE: 85.15
R² Score: 0.93
MAE: 67.46
MSLE: 0.01


In [168]:
# Define the parameter distributions for RandomizedSearchCV
param_distributions = {
    'n_estimators': randint(100, 500),  # Randomly sample number of trees between 100 and 500
    'max_depth': [None] + list(range(10, 50, 5)),  # None or range from 10 to 50, step 5
    'min_samples_split': randint(2, 20),  # Random split values between 2 and 20
    'min_samples_leaf': randint(1, 10),  # Random leaf values between 1 and 10
    'max_features': ['sqrt', 'log2', None]  # Use predefined feature subsets
}

# Initialize the Random Forest Regressor
rf_model = RandomForestRegressor(random_state=42, n_jobs=-1)

# Use RandomizedSearchCV to find the best hyperparameters
search_cv = RandomizedSearchCV(estimator=rf_model, param_distributions=param_distributions,
                                   n_iter=100, cv=3, verbose=1, random_state=42, n_jobs=-1,
                                   scoring='neg_mean_squared_error')

# Fit the model to the training data
search_type = "RandomizedSearchCV"
search_cv.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [169]:
# Get the best estimator and parameters
params = search_cv.best_params_

print(f"Best parameters found for {encoder} by {search_type}")
print(f"-" * 64)
for param, value in params.items():
    print(f"    {param}={value},")

Best parameters found for One-Hot Encoding by RandomizedSearchCV
----------------------------------------------------------------
    max_depth=15,
    max_features=None,
    min_samples_leaf=2,
    min_samples_split=3,
    n_estimators=387,


In [170]:
# Initialize the Random Forest Regressor with best parameters
rf_model = RandomForestRegressor(n_estimators=387, max_depth=15, random_state=42)

# Fit the model to the training data
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model using RMSE, R² Score, and MAE
rmse = root_mean_squared_error(y_test, y_pred)  # Root Mean Squared Error
r2 = r2_score(y_test, y_pred)  # R² Score
mae = mean_absolute_error(y_test, y_pred)  # Mean Absolute Error
msle = mean_squared_log_error(y_test, y_pred)  # Mean Squared Log Error

print(f'--- {encoder} Performance with best parameters ---\n')
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")
print(f"MAE: {mae:.2f}")
print(f"MSLE: {msle:.2f}")

--- One-Hot Encoding Performance with best parameters ---

RMSE: 84.22
R² Score: 0.93
MAE: 66.66
MSLE: 0.01


In [171]:
# Define the parameter grid for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200, 300, 400, 500],  # Number of trees
    'max_depth': [None, 10, 20, 30, 40, 50, 60],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10, 15, 20],  # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4, 6, 8],  # Minimum samples required at a leaf node
    'max_features': ['sqrt', 'log2', None, 'auto', 0.5, 0.7],  # More options for max_features
}

# Initialize the Random Forest Regressor
rf_model = RandomForestRegressor(random_state=42, n_jobs=-1)

# Use GridSearchCV to find the best hyperparameters
search_cv = GridSearchCV(estimator=rf_model, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=1, scoring='neg_mean_squared_error')

# Fit the model to the training data
search_type = "GridSearchCV"
search_cv.fit(X_train, y_train)

Fitting 3 folds for each of 6300 candidates, totalling 18900 fits




3150 fits failed out of a total of 18900.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1397 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/muhammedazhar/.miniconda3/envs/COMP1801-ML/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/muhammedazhar/.miniconda3/envs/COMP1801-ML/lib/python3.10/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/Users/muhammedazhar/.miniconda3/envs/COMP1801-ML/lib/python3.10/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Users/muhammedazhar/.miniconda3/e

In [172]:
# Get the best estimator and parameters
params = search_cv.best_params_

print(f"Best parameters found for {encoder} by {search_type}")
print(f"-" * 64)
for param, value in params.items():
    print(f"    {param}={value},")

Best parameters found for One-Hot Encoding by GridSearchCV
----------------------------------------------------------------
    max_depth=20,
    max_features=None,
    min_samples_leaf=2,
    min_samples_split=2,
    n_estimators=500,


In [173]:
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'max_features': ['sqrt', 'log2', None],  # Removed 'auto'
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the Random Forest Regressor
rf = RandomForestRegressor(random_state=42)

# Initialize RandomizedSearchCV
search_cv = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit the RandomizedSearchCV to the data
search_type = "RandomizedSearchCV"
search_cv.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.1s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.1s
[CV] END max_depth=40, max_features=None, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   0.2s
[CV] END max_depth=40, max_features=None, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   0.2s
[CV] END max_depth=40, max_features=None, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   0.2s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.1s
[CV] END max_depth=50, max_features=None, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=   0.5s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimator

In [174]:
# Get the best estimator and parameters
params = search_cv.best_params_

print(f"Best parameters found for {encoder} by {search_type}")
print(f"-" * 64)
for param, value in params.items():
    print(f"    {param}={value},")

Best parameters found for One-Hot Encoding by RandomizedSearchCV
----------------------------------------------------------------
    n_estimators=500,
    min_samples_split=2,
    min_samples_leaf=2,
    max_features=None,
    max_depth=None,


In [175]:
# Initialize the Random Forest Regressor with best parameters
rf_model = RandomForestRegressor(n_estimators=500, max_depth=40, random_state=42)

# Fit the model to the training data
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model using RMSE, R² Score, and MAE
rmse = root_mean_squared_error(y_test, y_pred)  # Root Mean Squared Error
r2 = r2_score(y_test, y_pred)  # R² Score
mae = mean_absolute_error(y_test, y_pred)  # Mean Absolute Error
msle = mean_squared_log_error(y_test, y_pred)  # Mean Squared Log Error

print(f'--- {encoder} Performance with best parameters ---\n')
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")
print(f"MAE: {mae:.2f}")
print(f"MSLE: {msle:.2f}")

--- One-Hot Encoding Performance with best parameters ---

RMSE: 84.31
R² Score: 0.93
MAE: 66.63
MSLE: 0.01
