In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet



Helper function to convert Log RMSE back to RMSE

Reverse log transform (np.expm1 is the inverse of np.log1p)

In [2]:

def rmse_from_log_rmse(y_true_log, y_pred_log):

    y_true = np.expm1(y_true_log)
    y_pred = np.expm1(y_pred_log)
    return np.sqrt(mean_squared_error(y_true, y_pred))

Loading data

In [3]:
X_train = pd.read_csv('final_processed_train.csv')
X_test = pd.read_csv('final_processed_test.csv')
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)
y_train_log = X_train['HotelValue_Log']
X_train.drop(columns=['HotelValue_Log'], inplace=True)

5 Fold cross validation is standard

In [4]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

Linear Regression Model

In [5]:

# Define the model (OLS has no hyperparameters to tune)
ols_model = LinearRegression(n_jobs=-1)

# The parameter grid is empty since OLS has no tuning parameters.
# GridSearchCV will simply use the model as-is across all CV folds.
param_grid_ols = {}

# Perform Grid Search Cross-Validation
ols_grid_search = GridSearchCV(
    estimator=ols_model,
    param_grid=param_grid_ols,  # Empty grid just runs the model once per CV fold
    scoring='neg_mean_squared_error',
    cv=kf,
    n_jobs=-1,
    verbose=1
)

# Run the OLS baseline test
ols_grid_search.fit(X_train, y_train_log)

# Output Results
best_ols = ols_grid_search.best_estimator_
# Convert 'neg_mean_squared_error' score back to positive Log RMSE
best_score = np.sqrt(-ols_grid_search.best_score_)
print('\n--- OLS Baseline Complete ---')
print(f"OLS - Cross-Validation Log RMSE: {best_score:.4f}")

Fitting 5 folds for each of 1 candidates, totalling 5 fits

--- OLS Baseline Complete ---
OLS - Cross-Validation Log RMSE: 0.1222


Ridge Regression Model

In [6]:
# Define the model
ridge = Ridge(random_state=42)

# Define a narrow, targeted parameter grid for GridSearchCV
# We focus the search around the best alpha of 215.4435
param_grid_ridge_fine = {
    # Creating 10 steps for alpha between 100 and 300
    'alpha': np.linspace(100, 300, 10)
}

# Perform Grid Search Cross-Validation
ridge_grid_search = GridSearchCV(
    estimator=ridge,
    param_grid=param_grid_ridge_fine,
    scoring='neg_mean_squared_error',
    cv=kf,
    n_jobs=-1,
    verbose=1
)

# Run the grid search
# X_train and y_train_log are the scaled features and log target
ridge_grid_search.fit(X_train, y_train_log)

# Output Results
best_ridge = ridge_grid_search.best_estimator_
# Convert 'neg_mean_squared_error' score back to positive Log RMSE
best_score = np.sqrt(-ridge_grid_search.best_score_)
print('\n--- Ridge Fine-Tuning Complete ---')
print(f"Ridge - Best Alpha (Grid): {ridge_grid_search.best_params_['alpha']:.4f}")
print(f"Ridge - Best Log RMSE (Grid CV): {best_score:.4f}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits

--- Ridge Fine-Tuning Complete ---
Ridge - Best Alpha (Grid): 211.1111
Ridge - Best Log RMSE (Grid CV): 0.1134


LASSO

In [7]:
# Define the model and set max_iter high for convergence
lasso = Lasso(random_state=42, max_iter=5000)

# Define a narrow, targeted parameter grid for GridSearchCV
# Focusing the search around the best alpha of 0.002069
param_grid_lasso_fine = {
    # Creating 10 steps for alpha between 0.001 and 0.003
    'alpha': np.linspace(0.001, 0.003, 10)
}

# Perform Grid Search Cross-Validation
lasso_grid_search = GridSearchCV(
    estimator=lasso,
    param_grid=param_grid_lasso_fine,
    scoring='neg_mean_squared_error',
    cv=kf,
    n_jobs=-1,
    verbose=1
)

# Run the grid search
# X_train and y_train_log are the scaled features and log target
lasso_grid_search.fit(X_train, y_train_log)

# Output Results
best_lasso = lasso_grid_search.best_estimator_
# Convert 'neg_mean_squared_error' score back to positive Log RMSE
best_score = np.sqrt(-lasso_grid_search.best_score_)
print('\n--- Lasso Fine-Tuning Complete ---')
print(f"Lasso - Best Alpha (Grid): {lasso_grid_search.best_params_['alpha']:.6f}")
print(f"Lasso - Best Log RMSE (Grid CV): {best_score:.4f}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits

--- Lasso Fine-Tuning Complete ---
Lasso - Best Alpha (Grid): 0.003000
Lasso - Best Log RMSE (Grid CV): 0.1081


Elastic Net

In [8]:
# Initialize the model with max_iter for convergence
elastic_net = ElasticNet(random_state=42, max_iter=5000)

# Define a narrow, targeted parameter grid for GridSearchCV
# We focus the search around the best alpha (0.0215) and keep l1_ratio close to 0.1
param_grid_elastic_fine = {
    # Narrow range for alpha around 0.0215
    'alpha': np.linspace(0.01, 0.04, 10),
    # Narrow range for l1_ratio around 0.1
    'l1_ratio': [0.05, 0.1, 0.15]
}

# Perform Grid Search Cross-Validation
elastic_grid_search = GridSearchCV(
    estimator=elastic_net,
    param_grid=param_grid_elastic_fine,
    scoring='neg_mean_squared_error',
    cv=kf,
    n_jobs=-1,
    verbose=1
)

# Run the grid search
elastic_grid_search.fit(X_train, y_train_log)

# Output Results
best_elastic = elastic_grid_search.best_estimator_
# Convert 'neg_mean_squared_error' score back to positive Log RMSE
best_score = np.sqrt(-elastic_grid_search.best_score_)
print('\n--- ElasticNet Fine-Tuning Complete ---')
print(f'ElasticNet - Best Parameters (Grid): {elastic_grid_search.best_params_}')
print(f'ElasticNet - Best Log RMSE (Grid CV): {best_score:.4f} ')

Fitting 5 folds for each of 30 candidates, totalling 150 fits

--- ElasticNet Fine-Tuning Complete ---
ElasticNet - Best Parameters (Grid): {'alpha': np.float64(0.02333333333333333), 'l1_ratio': 0.15}
ElasticNet - Best Log RMSE (Grid CV): 0.1078 


Random Forest

In [9]:
# Initialize the model
# Use a reasonable base based on what RandomizedSearch may have suggested
random_forest = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)

# Define a narrow, targeted parameter grid for GridSearchCV
# These values should be based on the best results from your prior RandomizedSearchCV
param_grid_rf_fine = {
    # Check around the best n_estimators (e.g., 250, 300, 350)
    'n_estimators': [250, 300, 350],
    # Check slightly above and below the best max_depth
    'max_depth': [12, 14, 16],
    # Fine-tune min_samples_leaf
    'min_samples_leaf': [1, 2]
}

# Perform Grid Search Cross-Validation
rf_grid_search = GridSearchCV(
    estimator=random_forest,
    param_grid=param_grid_rf_fine,
    scoring='neg_mean_squared_error',
    cv=kf,
    n_jobs=-1,
    verbose=1
)

# Run the grid search
rf_grid_search.fit(X_train, y_train_log)

# Output Results
best_rf = rf_grid_search.best_estimator_
# Convert 'neg_mean_squared_error' score back to positive Log RMSE
best_score = np.sqrt(-rf_grid_search.best_score_)
print('\n--- Random Forest Fine-Tuning Complete ---')
print(f'Random Forest - Best Parameters (Grid): {rf_grid_search.best_params_}')
print(f'Random Forest - Best Log RMSE (Grid CV): {best_score:.4f}')

Fitting 5 folds for each of 18 candidates, totalling 90 fits

--- Random Forest Fine-Tuning Complete ---
Random Forest - Best Parameters (Grid): {'max_depth': 16, 'min_samples_leaf': 1, 'n_estimators': 350}
Random Forest - Best Log RMSE (Grid CV): 0.1266


XGboost

In [10]:
# Initialize the model with good default parameters
# Use 'gbtree' booster which is standard tree-based model
xgb_model = XGBRegressor(random_state=42, booster='gbtree', n_jobs=-1)

# Define a narrow, targeted parameter grid for GridSearchCV
# These values should be based on the best results from your prior RandomizedSearchCV
param_grid_xgb_fine = {
    # Check around the best learning rate
    'learning_rate': [0.03, 0.05, 0.07],
    # Check around the best tree complexity
    'max_depth': [3, 4, 5],
    # Check around the best number of trees
    'n_estimators': [500, 600, 700]
}

# Perform Grid Search Cross-Validation
xgb_grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid_xgb_fine,
    scoring='neg_mean_squared_error',
    cv=kf,
    n_jobs=-1,
    verbose=1
)

# Run the grid search
xgb_grid_search.fit(X_train, y_train_log)

# Output Results
best_xgb = xgb_grid_search.best_estimator_
# Convert 'neg_mean_squared_error' score back to positive Log RMSE
best_score = np.sqrt(-xgb_grid_search.best_score_)
print('\n--- XGBoost Fine-Tuning Complete ---')
print(f'XGBoost - Best Parameters (Grid): {xgb_grid_search.best_params_}')
print(f'XGBoost - Best Log RMSE (Grid CV): {best_score:.4f}')

Fitting 5 folds for each of 27 candidates, totalling 135 fits

--- XGBoost Fine-Tuning Complete ---
XGBoost - Best Parameters (Grid): {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 700}
XGBoost - Best Log RMSE (Grid CV): 0.1135


Lasso Submission

In [12]:
# --- 10. Final Prediction and Submission (Corrected) ---
print('\nStarting Final Prediction using Tuned Lasso Model...')
from sklearn.linear_model import Lasso
import pandas as pd
import numpy as np

# 1. LOAD THE IDs from the original RAW test file
# This file is guaranteed to have the original 'Id' column.
test_ids_df = pd.read_csv('test.csv')
test_ids = test_ids_df['Id'] # Extract the Id series

# 2. LOAD the PROCESSED features for prediction
# This file contains all the scaled and encoded features without 'Id'
X_test_processed = pd.read_csv('final_processed_test.csv')
X_test_processed.fillna(0, inplace=True)

# 3. Use the best Lasso model found by GridSearchCV
# NOTE: This assumes the variable 'best_lasso' was defined and trained in a previous cell.
final_lasso_model = best_lasso

# 4. Make predictions on the processed test data
# The prediction will be in the log-transformed space (Log(1 + HotelValue))
predictions_log = final_lasso_model.predict(X_test_processed)

# 5. Reverse the transformation: exp(x) - 1
predictions = np.expm1(predictions_log)

# Ensure predictions are non-negative
predictions = np.maximum(0, predictions)

# 6. Create the submission DataFrame by combining the raw IDs and the predictions
submission_df = pd.DataFrame({
    'Id': test_ids,
    'HotelValue': predictions
})

# 7. Save the file in the required format
submission_df.to_csv('submission_lasso_final.csv', index=False)

print("\n--- Submission File Created ---")
print("File Name: submission_lasso_final.csv")
print("Submission Head:")
print(submission_df.head())


Starting Final Prediction using Tuned Lasso Model...

--- Submission File Created ---
File Name: submission_lasso_final.csv
Submission Head:
     Id     HotelValue
0   893  151324.515321
1  1106  326795.295705
2   414  106447.127882
3   523  156977.966693
4  1037  292659.232959


Linear Submission

In [13]:
# --- 11. Final Prediction and Submission (OLS) ---
print('\nStarting Final Prediction using OLS Linear Regression Model...')
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np

# 1. LOAD the IDs from the original RAW test file
# This is crucial as the processed file drops 'Id'.
test_ids_df = pd.read_csv('test.csv')
test_ids = test_ids_df['Id'] # Extract the Id series

# 2. LOAD the PROCESSED features for prediction
# This file contains all the scaled and encoded features
X_test_processed = pd.read_csv('final_processed_test.csv')
X_test_processed.fillna(0, inplace=True)

# 3. Use the best OLS model found by GridSearchCV
# NOTE: This assumes the variable 'best_ols' was defined and trained in a previous cell.
final_ols_model = best_ols

# 4. Make predictions on the processed test data
# The prediction will be in the log-transformed space (Log(1 + HotelValue))
predictions_log = final_ols_model.predict(X_test_processed)

# 5. Reverse the transformation: exp(x) - 1
predictions = np.expm1(predictions_log)

# Ensure predictions are non-negative
predictions = np.maximum(0, predictions)

# 6. Create the submission DataFrame by combining the raw IDs and the predictions
submission_df = pd.DataFrame({
    'Id': test_ids,
    'HotelValue': predictions
})

# 7. Save the file in the required format
submission_df.to_csv('submission_ols_final.csv', index=False)

print("\n--- OLS Submission File Created ---")
print("File Name: submission_ols_final.csv")
print("Submission Head:")
print(submission_df.head())


Starting Final Prediction using OLS Linear Regression Model...

--- OLS Submission File Created ---
File Name: submission_ols_final.csv
Submission Head:
     Id     HotelValue
0   893  151343.562522
1  1106  341336.599794
2   414  101185.942667
3   523  166420.682297
4  1037  300134.214359


xgBoost Submission

In [14]:
# --- 14. Final Prediction and Submission (XGBoost) ---
print('\nStarting Final Prediction using Tuned XGBoost Model...')
# 1. LOAD the IDs from the original RAW test file
# This is crucial as the processed file drops 'Id'.
test_ids_df = pd.read_csv('test.csv')
test_ids = test_ids_df['Id'] # Extract the Id series

# 2. LOAD the PROCESSED features for prediction
# This file contains all the scaled and encoded features
X_test_processed = pd.read_csv('final_processed_test.csv')
X_test_processed.fillna(0, inplace=True)

# 3. Use the best XGBoost model found by GridSearchCV/RandomizedSearchCV
# NOTE: This assumes the variable 'best_xgb' was defined and trained in a previous cell.
final_xgb_model = best_xgb

# 4. Make predictions on the processed test data
# The prediction will be in the log-transformed space (Log(1 + HotelValue))
predictions_log = final_xgb_model.predict(X_test_processed)

# 5. Reverse the transformation: exp(x) - 1
predictions = np.expm1(predictions_log)

# Ensure predictions are non-negative
predictions = np.maximum(0, predictions)

# 6. Create the submission DataFrame by combining the raw IDs and the predictions
submission_df = pd.DataFrame({
    'Id': test_ids,
    'HotelValue': predictions
})

# 7. Save the file in the required format
submission_df.to_csv('submission_xgb_final.csv', index=False)

print("\n--- XGBoost Submission File Created ---")
print("File Name: submission_xgb_final.csv")
print("Submission Head:")
print(submission_df.head())


Starting Final Prediction using Tuned XGBoost Model...

--- XGBoost Submission File Created ---
File Name: submission_xgb_final.csv
Submission Head:
     Id     HotelValue
0   893  148385.640625
1  1106  346520.500000
2   414  102781.953125
3   523  156082.156250
4  1037  336281.250000


Elastic net Submission

In [15]:
# --- 15. Final Prediction and Submission (ElasticNet) ---
print('\nStarting Final Prediction using Tuned ElasticNet Model...')
# 1. LOAD the IDs from the original RAW test file
# This is crucial as the processed file drops 'Id'.
test_ids_df = pd.read_csv('test.csv')
test_ids = test_ids_df['Id'] # Extract the Id series

# 2. LOAD the PROCESSED features for prediction
# This file contains all the scaled and encoded features
X_test_processed = pd.read_csv('final_processed_test.csv')
X_test_processed.fillna(0, inplace=True)

# 3. Use the best ElasticNet model found by GridSearchCV
# NOTE: This assumes the variable 'best_elastic' was defined and trained in a previous cell.
final_elastic_model = best_elastic

# 4. Make predictions on the processed test data
# The prediction will be in the log-transformed space (Log(1 + HotelValue))
predictions_log = final_elastic_model.predict(X_test_processed)

# 5. Reverse the transformation: exp(x) - 1
predictions = np.expm1(predictions_log)

# Ensure predictions are non-negative
predictions = np.maximum(0, predictions)

# 6. Create the submission DataFrame by combining the raw IDs and the predictions
submission_df = pd.DataFrame({
    'Id': test_ids,
    'HotelValue': predictions
})

# 7. Save the file in the required format
submission_df.to_csv('submission_elastic_final.csv', index=False)

print("\n--- ElasticNet Submission File Created ---")
print("File Name: submission_elastic_final.csv")
print("Submission Head:")
print(submission_df.head())


Starting Final Prediction using Tuned ElasticNet Model...

--- ElasticNet Submission File Created ---
File Name: submission_elastic_final.csv
Submission Head:
     Id     HotelValue
0   893  150333.617267
1  1106  324441.623139
2   414  107186.999218
3   523  155746.235155
4  1037  293219.443373


Ridge Submission

In [16]:
# --- Final Prediction and Submission using Tuned Ridge Model ---
print('\nStarting Final Prediction using Tuned Ridge Model...')
from sklearn.linear_model import Ridge # Import the Ridge model

import pandas as pd
import numpy as np

# 1. LOAD the IDs from the original RAW test file
# This file is guaranteed to have the original 'Id' column.
# NOTE: Ensure 'test.csv' is available in the current directory.
test_ids_df = pd.read_csv('test.csv')
test_ids = test_ids_df['Id'] # Extract the Id series

# 2. LOAD the PROCESSED features for prediction
# This file contains all the scaled and encoded features without 'Id'
# NOTE: Ensure 'final_processed_test.csv' is available and contains the correct features.
X_test_processed = pd.read_csv('final_processed_test.csv')
X_test_processed.fillna(0, inplace=True) # Assuming NaN imputation strategy is 0

# 3. Use the best Ridge model found by GridSearchCV/RandomizedSearchCV
# IMPORTANT: This assumes a variable named 'best_ridge' was defined and trained
# in a preceding step (e.g., using a cross-validation script).
# Replace 'best_ridge' with your actual best model variable name if different.
final_ridge_model = best_ridge

# 4. Make predictions on the processed test data
# The prediction will be in the log-transformed space (Log(1 + HotelValue))
predictions_log = final_ridge_model.predict(X_test_processed)

# 5. Reverse the transformation: exp(x) - 1
predictions = np.expm1(predictions_log)

# Ensure predictions are non-negative, as HotelValue cannot be less than zero
predictions = np.maximum(0, predictions)

# 6. Create the submission DataFrame by combining the raw IDs and the predictions
submission_df = pd.DataFrame({
    'Id': test_ids,
    'HotelValue': predictions
})

# 7. Save the file in the required format
submission_df.to_csv('submission_ridge_final.csv', index=False)

print("\n--- Submission File Created ---")
print("File Name: submission_ridge_final.csv")
print("Submission Head:")
print(submission_df.head())


Starting Final Prediction using Tuned Ridge Model...

--- Submission File Created ---
File Name: submission_ridge_final.csv
Submission Head:
     Id     HotelValue
0   893  147277.722743
1  1106  324375.355492
2   414  102665.984532
3   523  156372.304749
4  1037  295728.816615


Random Forest Submission

In [17]:
# --- Final Prediction and Submission using Tuned Random Forest Model ---
print('\nStarting Final Prediction using Tuned Random Forest Model...')
from sklearn.ensemble import RandomForestRegressor # Import the Random Forest Regressor model

import pandas as pd
import numpy as np

# 1. LOAD the IDs from the original RAW test file
# This file is guaranteed to have the original 'Id' column.
# NOTE: Ensure 'test.csv' is available in the current directory.
test_ids_df = pd.read_csv('test.csv')
test_ids = test_ids_df['Id'] # Extract the Id series

# 2. LOAD the PROCESSED features for prediction
# This file contains all the scaled and encoded features without 'Id'
# NOTE: Ensure 'final_processed_test.csv' is available and contains the correct features.
X_test_processed = pd.read_csv('final_processed_test.csv')
X_test_processed.fillna(0, inplace=True) # Assuming NaN imputation strategy is 0

# 3. Use the best Random Forest model found by GridSearchCV/RandomizedSearchCV
# IMPORTANT: This assumes a variable named 'best_rf' was defined and trained
# in a preceding step (e.g., using a cross-validation script).
# Replace 'best_rf' with your actual best model variable name if different.
final_rf_model = best_rf

# 4. Make predictions on the processed test data
# The prediction will be in the log-transformed space (Log(1 + HotelValue))
predictions_log = final_rf_model.predict(X_test_processed)

# 5. Reverse the transformation: exp(x) - 1
predictions = np.expm1(predictions_log)

# Ensure predictions are non-negative, as HotelValue cannot be less than zero
predictions = np.maximum(0, predictions)

# 6. Create the submission DataFrame by combining the raw IDs and the predictions
submission_df = pd.DataFrame({
    'Id': test_ids,
    'HotelValue': predictions
})

# 7. Save the file in the required format
submission_df.to_csv('submission_rf_final.csv', index=False)

print("\n--- Submission File Created ---")
print("File Name: submission_rf_final.csv")
print("Submission Head:")
print(submission_df.head())


Starting Final Prediction using Tuned Random Forest Model...

--- Submission File Created ---
File Name: submission_rf_final.csv
Submission Head:
     Id     HotelValue
0   893  143329.964509
1  1106  321564.227682
2   414  109989.776342
3   523  154115.632842
4  1037  299901.453113
