In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import SGDRegressor, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
import joblib
import os
import numpy as np

# Define the configuration for each result column.
#
# Each top-level key is used both as the configuration name and as the name of
# the target column read from the CSV (the driver loop passes the key to
# process_csv as `result_column`). Every entry provides:
#   'data_columns'    - feature columns selected from the CSV
#   'model'           - estimator class; instantiated with no arguments and
#                       placed after a StandardScaler via make_pipeline
#   'hyperparameters' - passed to Pipeline.set_params, hence the
#                       '<lowercased class name>__<param>' key format
#   'csv_file'        - path (relative to the working directory) of the imputed dataset
#   'model_directory' - directory in which the fitted pipeline is pickled
configurations = {
    'PR': {
        'data_columns': ['F43', 'F44', 'F45', 'F5', 'OF18', 'OF27', 'OF34', 'S4'],
        'model': GradientBoostingRegressor,
        'hyperparameters': {
            'gradientboostingregressor__learning_rate': 0.01,
            'gradientboostingregressor__loss': 'squared_error',
            'gradientboostingregressor__n_estimators': 200,
            'gradientboostingregressor__warm_start': False
        },
        'csv_file': '../../Data_ML/4_out_csvs_regression/output_bfill_imputed.csv',
        'model_directory': 'PR'
    },
    'NR': {
        'data_columns': ['F43', 'F44', 'F45','F5', 'OF18', 'OF27','OF34', 'S4'],
        'model': MLPRegressor,
        # NOTE(review): the recorded cell output shows lbfgs stopping at the
        # iteration limit for this model; max_iter is left at its default here.
        # NOTE(review): per the scikit-learn docs the 'learning_rate' schedule
        # appears to apply only to solver='sgd' - confirm it has any effect with 'lbfgs'.
        'hyperparameters': {
            'mlpregressor__activation': 'tanh',
            'mlpregressor__hidden_layer_sizes': (50, 50, 50),
            'mlpregressor__learning_rate': 'invscaling',
            'mlpregressor__solver': 'lbfgs'
        },
        'csv_file': '../../Data_ML/4_out_csvs_regression/output_iterative_imputed.csv',
        'model_directory': 'NR'
    },
    'SR': {
        'data_columns': ['F24', 'F28', 'F31', 'F43', 'F44', 'F45', 'F5', 'OF18', 'OF28'],
        'model': SGDRegressor,
        # TODO(review): the recorded cell output reports an MSE of ~5.7e+25 for
        # this configuration, which looks like a diverged fit - revisit these
        # hyperparameters before relying on the saved model.
        'hyperparameters': {
            'sgdregressor__learning_rate': 'optimal',
            'sgdregressor__loss': 'squared_error',
            'sgdregressor__penalty': 'elasticnet',
            'sgdregressor__warm_start': True
        },
        'csv_file': '../../Data_ML/4_out_csvs_regression/output_mean_imputed.csv',
        'model_directory': 'SR'
    },
    'WS': {
        'data_columns': ['F22', 'F31', 'F43', 'F44', 'F46', 'F5', 'OF18', 'OF27'],
        'model': GradientBoostingRegressor,
        # NOTE(review): the estimator is freshly constructed and fitted once per
        # run, so warm_start=True presumably changes nothing - confirm.
        'hyperparameters': {
            'gradientboostingregressor__learning_rate': 0.1,
            'gradientboostingregressor__loss': 'squared_error',
            'gradientboostingregressor__n_estimators': 200,
            'gradientboostingregressor__warm_start': True
        },
        'csv_file': '../../Data_ML/4_out_csvs_regression/output_bfill_imputed.csv',
        'model_directory': 'WS'
    },
    'SFST': {
        'data_columns': [ 'F43', 'F44', 'F45', 'F46', 'OF18', 'OF27'],
        'model': DecisionTreeRegressor,
        'hyperparameters': {
            'decisiontreeregressor__criterion': 'squared_error',
            'decisiontreeregressor__max_features': None,
            'decisiontreeregressor__min_samples_split': 5,
            'decisiontreeregressor__splitter': 'best'
        },
        'csv_file': '../../Data_ML/4_out_csvs_regression/output_bfill_imputed.csv',
        'model_directory': 'SFST'
    },
    'PR_Benefit': {
        'data_columns': ['F14', 'F3_c', 'F41','F44', 'OF18', 'OF19', 'OF27'],
        'model': DecisionTreeRegressor,
        'hyperparameters': {
            'decisiontreeregressor__criterion': 'squared_error',
            'decisiontreeregressor__max_features': 'log2',
            'decisiontreeregressor__min_samples_split': 2,
            'decisiontreeregressor__splitter': 'random'
        },
        'csv_file': '../../Data_ML/4_out_csvs_regression/output_iterative_imputed.csv',
        'model_directory': 'PR_Benefit'
    },
    'NR_Benefit': {
        'data_columns': ['F41','F5', 'OF10', 'OF18', 'OF22', 'OF27', 'OF30'],
        'model': SVR,
        # NOTE(review): per the scikit-learn docs 'degree' seems to be used only
        # by the 'poly' kernel - confirm it is inert with kernel='rbf'.
        'hyperparameters': {
            'svr__C': 10.0,
            'svr__degree': 1,
            'svr__gamma': 'scale',
            'svr__kernel': 'rbf'
        },
        'csv_file': '../../Data_ML/4_out_csvs_regression/output_iterative_imputed.csv',
        'model_directory': 'NR_Benefit'
    },
    'SR_Benefit': {
        'data_columns': ['F12', 'F41', 'OF18', 'OF22', 'OF27','OF30'],
        'model': Ridge,
        'hyperparameters': {
            'ridge__alpha': 0.1,
            'ridge__solver': 'auto'
        },
        'csv_file': '../../Data_ML/4_out_csvs_regression/output_iterative_imputed.csv',
        'model_directory': 'SR_Benefit'
    },
    'WS_Benefit': {
        'data_columns': ['F5', 'OF17', 'OF18', 'OF23', 'OF27', 'OF34', 'S4'],
        'model': DecisionTreeRegressor,
        'hyperparameters': {
            'decisiontreeregressor__criterion': 'squared_error',
            'decisiontreeregressor__max_features': None,
            'decisiontreeregressor__min_samples_split': 5,
            'decisiontreeregressor__splitter': 'random'
        },
        'csv_file': '../../Data_ML/4_out_csvs_regression/output_iterative_imputed.csv',
        'model_directory': 'WS_Benefit'
    },
    'SFST_Benefit': {
        'data_columns': ['F12', 'F43', 'F44', 'F45', 'F5', 'OF18', 'OF27', 'OF30' ],
        'model': DecisionTreeRegressor,
        'hyperparameters': {
            'decisiontreeregressor__criterion': 'squared_error',
            'decisiontreeregressor__max_features': None,
            'decisiontreeregressor__min_samples_split': 3,
            'decisiontreeregressor__splitter': 'random'
        },
        'csv_file': '../../Data_ML/4_out_csvs_regression/output_bfill_imputed.csv',
        'model_directory': 'SFST_Benefit'
    }
}

# Create the output directory of every configuration.
# exist_ok=True makes re-running the cell a no-op and removes the
# check-then-create race of a separate os.path.exists() test.
for config in configurations.values():
    os.makedirs(config['model_directory'], exist_ok=True)

# Function to process the CSV file
def process_csv(result_column, config_key, config, random_state=42):
    """Train, evaluate and persist the configured model for one target column.

    The CSV named in ``config['csv_file']`` is loaded, split 90/10 into train
    and test sets, and a ``StandardScaler`` + ``config['model']`` pipeline is
    fitted with ``config['hyperparameters']``. The test-set MSE is printed and
    the fitted pipeline is pickled into ``config['model_directory']``.

    Parameters
    ----------
    result_column : str
        Name of the target column in the CSV.
    config_key : str
        Name of the configuration. Not used by this function; kept so that
        existing callers keep working.
    config : dict
        One entry of ``configurations`` with the keys ``'data_columns'``,
        ``'model'``, ``'hyperparameters'``, ``'csv_file'`` and
        ``'model_directory'``.
    random_state : int, optional
        Seed for the train/test split and for any estimator exposing a
        ``random_state`` parameter. Defaults to 42, the value the split
        already used.

    Returns
    -------
    dict
        ``'csv_file'`` (base name), ``'model_name'``, ``'hyperparameters'``,
        ``'mse'`` and ``'grouped_df'``: the test rows grouped by feature
        values, actual and predicted value, with an ``'Occurrences'`` count.

    Raises
    ------
    KeyError
        If a feature column or the target column is missing from the CSV.
    """
    data = pd.read_csv(config['csv_file'])
    feature_columns = list(config['data_columns'])

    # Fail early with a message that names the file, not just the column.
    missing = [col for col in feature_columns + [result_column] if col not in data.columns]
    if missing:
        raise KeyError(f"Columns {missing} not found in {config['csv_file']}")

    X = data[feature_columns]
    y = data[result_column]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=random_state
    )

    # Seed stochastic estimators (MLP, SGD, boosting, random-splitter trees) so
    # that repeated runs give the same model. This happens before the
    # configured hyperparameters are applied, so an explicit
    # '<step>__random_state' entry in the configuration still takes precedence.
    estimator = config['model']()
    if 'random_state' in estimator.get_params():
        estimator.set_params(random_state=random_state)

    pipeline = make_pipeline(StandardScaler(), estimator)

    # Set hyperparameters directly in the model
    pipeline.set_params(**config['hyperparameters'])

    # Fit the model
    pipeline.fit(X_train, y_train)

    # Make predictions
    y_pred = pipeline.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"{config['model'].__name__} MSE for {result_column}: {mse}")

    # Save the model. The directory is created here as well so the function
    # does not depend on a separate setup step having run first.
    os.makedirs(config['model_directory'], exist_ok=True)
    model_filename = os.path.join(config['model_directory'], f"{os.path.basename(config['csv_file'])}_{config['model'].__name__}_model.pkl")
    joblib.dump(pipeline, model_filename)

    # Combine feature values, predicted values, and actual values.
    # assign() returns a new frame, so the slice produced by train_test_split
    # is never mutated (no SettingWithCopyWarning).
    combined_df = X_test.assign(Actual=y_test.values, Predicted=y_pred)

    # Group by unique feature combinations and count occurrences
    grouped_df = (
        combined_df
        .groupby(feature_columns + ['Actual', 'Predicted'])
        .size()
        .reset_index(name='Occurrences')
    )

    return {
        'csv_file': os.path.basename(config['csv_file']),
        'model_name': config['model'].__name__,
        'hyperparameters': config['hyperparameters'],
        'mse': mse,
        'grouped_df': grouped_df
    }

# Process each configuration.
# The configuration key doubles as the name of the target column, so a single
# pass over items() replaces zipping the key view with itself.
all_grouped_results = {}
for config_key, config in configurations.items():
    result_column = config_key
    best_model_info = process_csv(result_column, config_key, config)
    all_grouped_results[config_key] = best_model_info['grouped_df']

# Generate LaTeX table with occurrences column and save all tables in one file

# Characters that LaTeX treats specially in text mode and their escaped forms.
_LATEX_SPECIAL_CHARS = {
    '\\': r'\textbackslash{}',
    '&': r'\&',
    '%': r'\%',
    '$': r'\$',
    '#': r'\#',
    '_': r'\_',
    '{': r'\{',
    '}': r'\}',
    '~': r'\textasciitilde{}',
    '^': r'\textasciicircum{}',
}


def _latex_escape(value):
    """Return ``str(value)`` with LaTeX special characters escaped."""
    # Character-wise replacement avoids re-escaping already inserted backslashes.
    return "".join(_LATEX_SPECIAL_CHARS.get(ch, ch) for ch in str(value))


def generate_latex_table(df, config_key):
    """Render ``df`` as a LaTeX ``table`` environment.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to render; every column becomes one tabular column and every
        row one tabular row.
    config_key : str
        Configuration name, used in the caption and in the label
        ``tab:<config_key>_results``.

    Returns
    -------
    str
        The LaTeX source, ending with ``\\clearpage`` and no trailing newline.
    """
    # One left-aligned column followed by centred ones; also correct when the
    # frame has a single column.
    column_spec = "|l|" + "c|" * max(len(df.columns) - 1, 0)

    buffer = [
        "\\begin{table}[htbp]",
        "\\centering",
        "\\begin{tabular}{" + column_spec + "}",
        "\\hline",
        # Header names such as 'F3_c' contain '_' and must be escaped.
        " & ".join(_latex_escape(column) for column in df.columns) + " \\\\",
        "\\hline",
    ]

    # itertuples keeps each column's dtype; iterrows would upcast a mixed row
    # to float and print integer counts as e.g. '2.0'.
    for row in df.itertuples(index=False, name=None):
        feature_values = " & ".join(_latex_escape(value) for value in row)
        buffer.append(f"{feature_values} \\\\")

    buffer.extend([
        "\\hline",
        "\\end{tabular}",
        # An unescaped '_' in the caption text (e.g. 'PR_Benefit') is a LaTeX error.
        f"\\caption{{Results for {_latex_escape(config_key)}}}",
        # The label is a key, not typeset text, so it is left unescaped.
        f"\\label{{tab:{config_key}_results}}",
        "\\end{table}",
        "\\clearpage",
    ])

    return "\n".join(buffer)

# Combine all tables into one .tex file.
# Tables are rendered before the file is opened so that a rendering error does
# not leave a truncated file behind, and the encoding is set explicitly so the
# output does not depend on the platform default.
latex_tables = [
    generate_latex_table(grouped_df, config_key)
    for config_key, grouped_df in all_grouped_results.items()
]
with open("combined_results_tables.tex", "w", encoding="utf-8") as f:
    for latex_table in latex_tables:
        f.write(latex_table + "\n")

print("Combined LaTeX tables saved to combined_results_tables.tex")


GradientBoostingRegressor MSE for PR: 0.39447187233831427


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPRegressor MSE for NR: 0.40652212294757273
SGDRegressor MSE for SR: 5.67812257289403e+25
GradientBoostingRegressor MSE for WS: 0.2380870603004818
DecisionTreeRegressor MSE for SFST: 0.4247549064153437
DecisionTreeRegressor MSE for PR_Benefit: 1.837624142857143
SVR MSE for NR_Benefit: 4.612989194408565
Ridge MSE for SR_Benefit: 3.9253735321666916
DecisionTreeRegressor MSE for WS_Benefit: 3.678602899801587
DecisionTreeRegressor MSE for SFST_Benefit: 0.7709285119047619
Combined LaTeX tables saved to combined_results_tables.tex
