In [1]:
import pandas as pd

# Read the imputed feature tables for the "other" split and the test split
other_features_csv = "../data/X_other_imputed_xgb.csv"
test_features_csv = "../data/X_test_imputed_xgb.csv"
X_other_imputed_xgb = pd.read_csv(other_features_csv)
X_test_imputed_xgb = pd.read_csv(test_features_csv)

In [None]:
import pandas as pd

# Load the target variable for the "other" split and the test split.
# Assumes each CSV holds a single target column -- TODO confirm.
# squeeze("columns") collapses only the column axis, so a one-row file still
# yields a length-1 array instead of a bare scalar (which has no .to_numpy()).
y_other = pd.read_csv("../data/y_other.csv").squeeze("columns").to_numpy()
y_test = pd.read_csv("../data/y_test.csv").squeeze("columns").to_numpy()

# Show the shapes as a quick sanity check on what was loaded.
print(y_other.shape)
print(y_test.shape)

(3207,)
(802,)


In [18]:
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

def MLpipeline(X_other, y_other, X_test, y_test, ML_algo, param_grid, scoring, cv=4):
    '''
    Tune a model with cross-validated grid search, then score it on the test set.

    GridSearchCV is fit on the "other" (non-test) data. Because refit=True, the
    best parameter combination is refit on that data, and the resulting model
    is evaluated on the test set with MAE, RMSE, and R².

    Args:
    - X_other, y_other: Features and target used for the cross-validated grid search
    - X_test, y_test: Test dataset used only for the final evaluation
    - ML_algo: Machine Learning algorithm (unfitted estimator passed to GridSearchCV)
    - param_grid: Parameter grid for hyperparameter tuning
    - scoring: Single scoring metric passed to GridSearchCV
      (e.g. 'neg_mean_absolute_error')
    - cv: Cross-validation setting passed to GridSearchCV (default: 4 folds)

    Returns:
    - test_scores: Dictionary with keys 'MAE', 'RMSE', 'R2' holding the test metrics
    - best_model: The refit best estimator found by the grid search
    - validation_score: Best cross-validation score; its sign is flipped when
      the scoring name contains 'neg', so errors are reported as positive values
    '''
    # GridSearchCV
    print("\nPerforming GridSearchCV...")
    grid = GridSearchCV(
        estimator=ML_algo,
        param_grid=param_grid,
        scoring=scoring,  # Single scoring metric
        refit=True,
        cv=cv,
        return_train_score=True,
        n_jobs=-1,
        verbose=True
    )
    grid.fit(X_other, y_other)

    # Save the best model
    best_model = grid.best_estimator_
    # Only string scorer names can carry the 'neg' marker; guarding with
    # isinstance keeps a callable scorer from raising TypeError on `in`.
    is_negated = isinstance(scoring, str) and 'neg' in scoring
    validation_score = -grid.best_score_ if is_negated else grid.best_score_
    print('Best model parameters:', grid.best_params_)
    print(f'Validation score for {scoring}:', validation_score)

    # Predictions and metrics on the test set
    y_test_pred = best_model.predict(X_test)
    test_scores = {
        'MAE': mean_absolute_error(y_test, y_test_pred),
        'RMSE': root_mean_squared_error(y_test, y_test_pred),
        'R2': r2_score(y_test, y_test_pred)
    }

    print('Test MAE:', test_scores['MAE'])
    print('Test RMSE:', test_scores['RMSE'])
    print('Test R²:', test_scores['R2'])

    # Return results
    return test_scores, best_model, validation_score

In [13]:
import numpy as np
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score

# Baseline: a constant predictor that outputs the mean of y_test for every instance
y_mean = np.mean(y_test)
constant_prediction = [y_mean] * len(y_test)

baseline_rmse = root_mean_squared_error(y_test, constant_prediction)
baseline_mae = mean_absolute_error(y_test, constant_prediction)
baseline_r2 = r2_score(y_test, constant_prediction)

# One print call with a newline separator emits the same four lines
print(
    "Baseline Metrics:",
    f"Baseline RMSE: {baseline_rmse:.4f}",
    f"Baseline MAE: {baseline_mae:.4f}",
    f"Baseline R²: {baseline_r2:.4f}",
    sep="\n",
)

Baseline Metrics:
Baseline RMSE: 0.3833
Baseline MAE: 0.2889
Baseline R²: 0.0000


In [20]:
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

# Shared seed so every estimator that accepts one is seeded the same way
RANDOM_STATE = 42

# Parameter grids: one entry per model, keyed by the same name used in `models`
param_grids = {
    'Lasso': {'alpha': [0.01, 0.1, 1, 10, 100]},
    'Ridge': {'alpha': [0.01, 0.1, 1, 10, 100]},
    'ElasticNet': {
        'alpha': [0.01, 0.1, 1, 10, 100],
        'l1_ratio': [0.2, 0.4, 0.6, 0.8]
    },
    'RandomForestRegressor': {
        'max_depth': [1, 3, 10, 30, 100],
        'max_features': [0.25, 0.5, 0.75, 1.0]
    },
    # NOTE(review): SVR is disabled here and in `models`; the reason is not
    # recorded in this notebook -- confirm, then either re-enable or delete.
    # 'SVR': {
    #     'C': [0.1, 1, 10, 100],
    #     'epsilon': [0.1, 0.2, 0.5],
    #     'kernel': ['linear', 'rbf'],
    #     'gamma': ['scale', 'auto', 0.01, 0.1, 1]
    # },
    'KNeighborsRegressor': {
        'n_neighbors': [3, 5, 10, 20],
        'weights': ['uniform', 'distance']
    },
    'XGBRegressor': {
        'n_estimators': [50, 100, 150],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    }
}

# Models to train (unfitted estimators; each is tuned with its grid above)
models = {
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet(),
    'RandomForestRegressor': RandomForestRegressor(random_state=RANDOM_STATE),
    # 'SVR': SVR(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    # Seeded like the random forest so both tree ensembles are configured consistently
    'XGBRegressor': XGBRegressor(random_state=RANDOM_STATE)
}

# Fail fast if a model has no grid (or a grid has no model) instead of
# hitting a KeyError later when the grid is looked up by model name.
mismatched_names = set(models) ^ set(param_grids)
assert not mismatched_names, f"models and param_grids disagree on: {sorted(mismatched_names)}"

In [21]:
# Initialize results dictionary and track best overall models
results = {}
best_models_overall = {
    'MAE': {'method': None, 'scoring': None, 'model': None, 'value': float('inf')},  # Lower is better
    'RMSE': {'method': None, 'scoring': None, 'model': None, 'value': float('inf')},  # Lower is better
    'R2': {'method': None, 'scoring': None, 'model': None, 'value': float('-inf')}   # Higher is better
}
# Direction of improvement for each tracked test metric
higher_is_better = {'MAE': False, 'RMSE': False, 'R2': True}
scorings = ['neg_mean_absolute_error', 'neg_root_mean_squared_error', 'r2']
# (model name, scoring, error message) for every run that raised
failed_runs = []

# Training pipeline: every model is tuned once per scoring metric.
# NOTE(review): the "best overall" model is chosen by its test-set score, so the
# reported best values are optimistic; consider selecting on 'Validation' instead.
for s in scorings:
    print(f"\nUsing scoring metric: {s}")
    for model_name, model in models.items():
        print(f"\nTraining {model_name}...")
        try:
            test_scores, best_model, validation_score = MLpipeline(
                X_other_imputed_xgb, y_other, X_test_imputed_xgb, y_test, model, param_grids[model_name], s
            )
            # Keep only the hyperparameters that were actually tuned; the full
            # get_params() dump buries them among every estimator default.
            tuned_params = {
                name: value
                for name, value in best_model.get_params().items()
                if name in param_grids[model_name]
            }
            # Save results for each model
            results.setdefault(model_name, {})[s] = {
                'Best Parameters': tuned_params,
                'Validation': validation_score,
                'Test MAE': test_scores['MAE'],
                'Test RMSE': test_scores['RMSE'],
                'Test R²': test_scores['R2']
            }

            # Update best overall models for each metric
            for metric, higher in higher_is_better.items():
                candidate = test_scores[metric]
                incumbent = best_models_overall[metric]['value']
                improved = candidate > incumbent if higher else candidate < incumbent
                if improved:
                    best_models_overall[metric] = {
                        'method': model_name,
                        'scoring': s,  # which tuning run produced this model
                        'model': best_model,
                        'value': candidate
                    }
        except Exception as e:
            # Best-effort: one failing model must not abort the whole sweep,
            # but remember the failure so it is not lost in the verbose log.
            print(f"Error training {model_name} with scoring {s}: {e}")
            failed_runs.append((model_name, s, str(e)))

# Print the best overall models for each metric
print("\nBest Overall Models:")
for metric, info in best_models_overall.items():
    print(f"{metric} - Method: {info['method']}, Value: {info['value']:.4f}")

# Surface any failures at the end, where they are easy to spot
if failed_runs:
    print(f"\n{len(failed_runs)} run(s) failed:")
    for failed_model, failed_scoring, message in failed_runs:
        print(f"  {failed_model} [{failed_scoring}]: {message}")


Using scoring metric: neg_mean_absolute_error

Training Lasso...

Performing GridSearchCV...
Fitting 4 folds for each of 5 candidates, totalling 20 fits
Best model parameters: {'alpha': 0.01}
Validation score for neg_mean_absolute_error: 0.12842513368452407
Test MAE: 0.129476189512615
Test RMSE: 0.1972147845355382
Test R²: 0.7352044595938109

Training Ridge...

Performing GridSearchCV...
Fitting 4 folds for each of 5 candidates, totalling 20 fits
Best model parameters: {'alpha': 1}
Validation score for neg_mean_absolute_error: 0.1066015768296514
Test MAE: 0.11100750474710881
Test RMSE: 0.17864719491602327
Test R²: 0.7827178097071597

Training ElasticNet...

Performing GridSearchCV...
Fitting 4 folds for each of 20 candidates, totalling 80 fits
Best model parameters: {'alpha': 0.01, 'l1_ratio': 0.2}
Validation score for neg_mean_absolute_error: 0.1190792523725422
Test MAE: 0.11982591739455209
Test RMSE: 0.18673270589167684
Test R²: 0.7626044855834969

Training RandomForestRegressor...


In [24]:
# Save results as a text file
from pathlib import Path

results_path = Path("../results/xgb_imputed_best_models_results.txt")
# Create the output directory first so open() does not fail when it is missing.
results_path.parent.mkdir(parents=True, exist_ok=True)

# Explicit UTF-8: the report contains "²", which the platform-default
# encoding is not guaranteed to be able to encode.
with open(results_path, "w", encoding="utf-8") as f:
    # Write detailed results for each model
    f.write("Detailed Results for Each Model:\n\n")
    for model_name, model_results in results.items():
        f.write(f"Model: {model_name}\n")
        # One sub-section per scoring metric the model was tuned with
        for scoring_metric, metrics in model_results.items():
            f.write(f"  Scoring Metric: {scoring_metric}\n")
            f.write(f"    Best Parameters: {metrics['Best Parameters']}\n")
            f.write(f"    Validation Score: {metrics['Validation']:.4f}\n")
            f.write(f"    Test MAE: {metrics['Test MAE']:.4f}\n")
            f.write(f"    Test RMSE: {metrics['Test RMSE']:.4f}\n")
            f.write(f"    Test R²: {metrics['Test R²']:.4f}\n")
        f.write("\n")

    # Write the best overall models
    f.write("Best Overall Models:\n")
    for metric, info in best_models_overall.items():
        f.write(f"{metric} - Method: {info['method']}, Value: {info['value']:.4f}\n")
