In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, RandomizedSearchCV
from xgboost import XGBRegressor



Helper function that computes RMSE on the original scale from log-transformed targets and predictions

Reverse log transform (np.expm1 is the inverse of np.log1p)

In [2]:

def rmse_from_log_rmse(y_true_log, y_pred_log):
    """Return the RMSE on the original scale for log1p-scale inputs.

    Both arguments are mapped back with ``np.expm1`` (the inverse of
    ``np.log1p``) before the error is measured.

    Args:
        y_true_log: True target values on the log1p scale.
        y_pred_log: Predicted target values on the log1p scale.

    Returns:
        Square root of the mean squared error between the back-transformed
        true values and the back-transformed predictions.
    """
    actual_values, predicted_values = np.expm1(y_true_log), np.expm1(y_pred_log)
    mse_original_scale = mean_squared_error(actual_values, predicted_values)
    return np.sqrt(mse_original_scale)

Loading data

In [3]:
# Load the preprocessed train and test tables (paths are relative to the
# notebook's working directory). Missing values are replaced with 0 in BOTH
# tables so that train and test receive identical imputation; previously only
# the training table was filled.
train_filled = pd.read_csv('final_processed_train_V2.csv').fillna(0)
X_test = pd.read_csv('final_processed_test_V2.csv').fillna(0)

# Split the target from the features by deriving new objects instead of
# mutating the loaded frame in place.
# NOTE(review): fillna(0) runs before the split, so a missing HotelValue would
# silently become 0 -- confirm the target column has no gaps.
y_train = train_filled['HotelValue']
X_train = train_filled.drop(columns=['HotelValue'])

5-fold cross-validation is a standard choice

In [4]:
# Shared 5-fold splitter: rows are shuffled with a fixed seed, so each search
# that receives `cv=kf` is evaluated on the same train/validation partitions.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

Ridge Regression Model

In [5]:
# Ridge regression: tune the L2 penalty strength `alpha`.
ridge = Ridge(random_state=42)
param_grid_ridge = {
    # 10 candidate values on a logarithmic scale from 0.001 to 1000
    'alpha': np.logspace(-3, 3, 10)
}

# Randomized search with cross-validation. The grid holds exactly 10 candidates
# and n_iter=10, so every alpha is evaluated once.
# scikit-learn maximises scores, hence MSE is requested in its negated form.
ridge_search = RandomizedSearchCV(
    estimator=ridge,
    param_distributions=param_grid_ridge,
    scoring='neg_mean_squared_error',
    cv=kf,  # KFold splitter defined in an earlier cell
    n_iter=10,
    random_state=42,
    n_jobs=-1
)

print("Starting Ridge Model Tuning...")
# y_train is fitted exactly as loaded: no log transform is applied in the
# visible cells, so the score below is in the units of y_train.
ridge_search.fit(X_train, y_train)

# best_score_ is the mean negated MSE over the folds; flip the sign and take the
# square root to report an RMSE. It was previously printed as "Log RMSE", which
# does not match the target actually fitted (recorded values were ~2e4).
best_score = np.sqrt(-ridge_search.best_score_)
print(f"\nRidge - Best Alpha: {ridge_search.best_params_['alpha']:.4f}")
print(f"Ridge - Cross-Validation RMSE: {best_score:.4f}")

Starting Ridge Model Tuning...



Ridge - Best Alpha: 46.4159
Ridge - Cross-Validation Log RMSE: 19973.1153


LASSO

In [6]:


# Lasso regression: tune the L1 penalty strength `alpha`.
# max_iter is raised above the default to give coordinate descent more room to converge.
lasso = Lasso(random_state=42, max_iter=5000)

param_grid_lasso = {
    # 20 candidate penalties on a logarithmic scale from 1e-5 to 1e-1.
    # NOTE(review): the recorded run picked alpha=0.1, the upper edge of this
    # grid, and the solver emitted warnings -- consider extending the range
    # upward and/or scaling the features and target.
    'alpha': np.logspace(-5, -1, 20)
}

# Randomized search with cross-validation: 15 of the 20 candidates are sampled.
lasso_search = RandomizedSearchCV(
    estimator=lasso,
    param_distributions=param_grid_lasso,
    scoring='neg_mean_squared_error',
    cv=kf,
    n_iter=15,  # number of alpha values sampled from the grid
    random_state=42,
    n_jobs=-1
)

print("\nStarting Lasso Model Tuning (15 iterations)...")
# X_train and y_train are passed as loaded; no scaling or log transform is
# applied in the visible cells.
lasso_search.fit(X_train, y_train)

# best_score_ is the mean negated MSE over the folds; sqrt(-score) is an RMSE in
# the units of y_train (previously mislabelled as "Log RMSE").
best_lasso = lasso_search.best_estimator_
best_score = np.sqrt(-lasso_search.best_score_)
print("\n--- Lasso Tuning Complete ---")
print(f"Lasso - Best Alpha: {lasso_search.best_params_['alpha']:.6f}")
print(f"Lasso - Cross-Validation RMSE: {best_score:.4f}")


Starting Lasso Model Tuning (15 iterations)...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c


--- Lasso Tuning Complete ---
Lasso - Best Alpha: 0.100000
Lasso - Cross-Validation Log RMSE: 20655.5881


Elastic Net

In [7]:


# Elastic Net: tune the overall penalty strength and the L1/L2 mix.
elastic_net = ElasticNet(random_state=42, max_iter=5000)

param_grid_elastic = {
    # Alpha controls the total regularization strength
    'alpha': np.logspace(-5, 0, 10),
    # l1_ratio controls the mix:
    # 0 = Pure Ridge (L2), 1 = Pure Lasso (L1)
    'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
}

# Randomized search with cross-validation over the 10 x 5 = 50 candidate pairs.
elastic_search = RandomizedSearchCV(
    estimator=elastic_net,
    param_distributions=param_grid_elastic,
    scoring='neg_mean_squared_error',
    cv=kf,
    n_iter=20,  # sample 20 of the 50 (alpha, l1_ratio) combinations
    random_state=42,
    n_jobs=-1
)

print("\nStarting ElasticNet Model Tuning (20 iterations)...")
# y_train is fitted as loaded (no log transform in the visible cells).
elastic_search.fit(X_train, y_train)

# best_score_ is the mean negated MSE over the folds; sqrt(-score) is an RMSE in
# the units of y_train (previously mislabelled as "Log RMSE").
best_elastic = elastic_search.best_estimator_
best_score = np.sqrt(-elastic_search.best_score_)
print("\n--- ElasticNet Tuning Complete ---")
print(f"ElasticNet - Best Parameters: {elastic_search.best_params_}")
print(f"ElasticNet - Cross-Validation RMSE: {best_score:.4f} ")


Starting ElasticNet Model Tuning (20 iterations)...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c


--- ElasticNet Tuning Complete ---
ElasticNet - Best Parameters: {'l1_ratio': 0.5, 'alpha': np.float64(0.07742636826811278)}
ElasticNet - Cross-Validation Log RMSE: 20018.2698 


In [8]:
# --- 4. Random Forest Tuning ---
print('\nStarting Random Forest Model Tuning (20 iterations)...')
# Base estimator; the hyperparameters below are overridden by the search.
random_forest = RandomForestRegressor(random_state=42, n_jobs=-1)

# Candidate values for each tuned hyperparameter (4 * 4 * 3 * 3 = 144 combinations)
param_grid_rf = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [10, 15, 20, None], # None means nodes are expanded until all leaves are pure
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Randomized search with cross-validation.
# NOTE: relies on the 'kf' KFold object defined in an earlier cell.
rf_search = RandomizedSearchCV(
    estimator=random_forest,
    param_distributions=param_grid_rf,
    scoring='neg_mean_squared_error',
    cv=kf,
    n_iter=20, # Number of parameter settings that are sampled
    random_state=42,
    n_jobs=-1
)

# y_train is fitted as loaded (no log transform in the visible cells).
rf_search.fit(X_train, y_train)

# Output Results
best_rf = rf_search.best_estimator_
# best_score_ is the mean negated MSE over the folds; sqrt(-score) is an RMSE in
# the units of y_train (previously mislabelled as "Log RMSE").
best_score = np.sqrt(-rf_search.best_score_)
print('\n--- Random Forest Tuning Complete ---')
print(f'Random Forest - Best Parameters: {rf_search.best_params_}')
print(f'Random Forest - Best RMSE (CV): {best_score:.4f}')


Starting Random Forest Model Tuning (20 iterations)...

--- Random Forest Tuning Complete ---
Random Forest - Best Parameters: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 20}
Random Forest - Best Log RMSE (CV): 23554.3063


XGBoost

In [9]:
# --- 5. XGBoost Tuning ---
print('\nStarting XGBoost Model Tuning (20 iterations)...')
# Base estimator; the hyperparameters below are overridden by the search.
# tree_method='hist' is often much faster for large datasets
xgb_model = XGBRegressor(random_state=42, tree_method='hist', n_jobs=-1)

# Candidate values for each tuned hyperparameter (3 * 3 * 3 * 2 * 2 = 108 combinations)
param_grid_xgb = {
    'n_estimators': [300, 500, 700],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.9],
    'colsample_bytree': [0.7, 0.9]
}

# Randomized search with cross-validation.
# NOTE: relies on the 'kf' KFold object defined in an earlier cell.
xgb_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid_xgb,
    scoring='neg_mean_squared_error',
    cv=kf,
    n_iter=20, # Sample 20 combinations
    random_state=42,
    n_jobs=-1
)

# y_train is fitted as loaded (no log transform in the visible cells).
xgb_search.fit(X_train, y_train)

# Output Results
best_xgb = xgb_search.best_estimator_
# best_score_ is the mean negated MSE over the folds; sqrt(-score) is an RMSE in
# the units of y_train (previously mislabelled as "Log RMSE").
best_score = np.sqrt(-xgb_search.best_score_)
print('\n--- XGBoost Tuning Complete ---')
print(f'XGBoost - Best Parameters: {xgb_search.best_params_}')
print(f'XGBoost - Best RMSE (CV): {best_score:.4f}')


Starting XGBoost Model Tuning (20 iterations)...

--- XGBoost Tuning Complete ---
XGBoost - Best Parameters: {'subsample': 0.7, 'n_estimators': 300, 'max_depth': 3, 'learning_rate': 0.05, 'colsample_bytree': 0.7}
XGBoost - Best Log RMSE (CV): 19678.8988


Linear Regression (OLS)

In [13]:
# --- 13. OLS (Linear Regression) Baseline Test with RandomizedSearchCV ---
# LinearRegression is imported with the other estimators in the notebook's
# import cell rather than inside this cell.
print('\nStarting OLS (Linear Regression) Baseline Test (1 iteration, 5 folds)...')
# Plain least squares: nothing is tuned here, the model is only cross-validated.
ols_model = LinearRegression(n_jobs=-1)

# Empty search space: the search evaluates the single default configuration,
# which keeps this baseline on the same CV/scoring path as the tuned models.
param_grid_ols = {}

# Randomized search with cross-validation.
# NOTE: n_iter is set to 1 since there are no parameters to sample.
ols_search = RandomizedSearchCV(
    estimator=ols_model,
    param_distributions=param_grid_ols,
    scoring='neg_mean_squared_error',
    cv=kf,
    n_iter=1, # Fixed to 1 since we are not tuning
    random_state=42,
    n_jobs=-1
)

# Run the OLS baseline test; y_train is fitted as loaded (no log transform in
# the visible cells).
ols_search.fit(X_train, y_train)

# Output Results
best_ols = ols_search.best_estimator_
# best_score_ is the mean negated MSE over the folds; sqrt(-score) is an RMSE in
# the units of y_train (previously mislabelled as "Log RMSE").
best_score = np.sqrt(-ols_search.best_score_)
print('\n--- OLS Baseline Complete ---')
print(f"OLS - Cross-Validation RMSE: {best_score:.4f}")


Starting OLS (Linear Regression) Baseline Test (1 iteration, 5 folds)...

--- OLS Baseline Complete ---
OLS - Cross-Validation Log RMSE: 20655.5357
