In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier, RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
import joblib
import os
import numpy as np

# Per-target training configuration.
#
# Each top-level key is used by the processing loop below both as the
# dictionary key and as the name of the target (label) column read from the
# CSV file. Each entry holds:
#   'data_columns'    - feature column names selected from the CSV as X
#   'model'           - estimator class; instantiated with no arguments and
#                       placed after a StandardScaler in a pipeline
#   'hyperparameters' - keyword arguments for Pipeline.set_params, so every
#                       name carries the '<lowercased class name>__' step
#                       prefix that make_pipeline assigns
#   'csv_file'        - path of the CSV file to load (relative to the
#                       current working directory)
#   'model_directory' - directory in which the fitted pipeline is saved
configurations = {
    'PR': {
        'data_columns': ['F43', 'F45'],
        'model': GradientBoostingClassifier,
        'hyperparameters': {
            'gradientboostingclassifier__learning_rate': 0.01,
            'gradientboostingclassifier__loss': 'log_loss',
            'gradientboostingclassifier__n_estimators': 200,
            'gradientboostingclassifier__warm_start': False
        },
        'csv_file': '../../Data_ML/4_out_csvs_classification/output_bfill_imputed.csv',
        'model_directory': 'PR'
    },
    'NR': {
        'data_columns': ['F24', 'F43', 'F44', 'F45'],
        'model': MLPClassifier,
        # NOTE(review): scikit-learn documents 'learning_rate' as only used
        # with solver='sgd'; with 'lbfgs' it presumably has no effect - confirm.
        'hyperparameters': {
            'mlpclassifier__activation': 'tanh',
            'mlpclassifier__hidden_layer_sizes': (50, 50, 50),
            'mlpclassifier__learning_rate': 'invscaling',
            'mlpclassifier__solver': 'lbfgs'
        },
        'csv_file': '../../Data_ML/4_out_csvs_classification/output_iterative_imputed.csv',
        'model_directory': 'NR'
    },
    'SR': {
        'data_columns': ['OF22', 'F35', 'F43', 'F44', 'F45', 'F49'],
        'model': SGDClassifier,
        'hyperparameters': {
            'sgdclassifier__learning_rate': 'optimal',
            'sgdclassifier__loss': 'modified_huber',
            'sgdclassifier__penalty': 'elasticnet',
            'sgdclassifier__warm_start': True
        },
        'csv_file': '../../Data_ML/4_out_csvs_classification/output_mean_imputed.csv',
        'model_directory': 'SR'
    },
    'WS': {
        'data_columns': ['F31', 'F43', 'F44', 'F45'],
        'model': GradientBoostingClassifier,
        'hyperparameters': {
            'gradientboostingclassifier__learning_rate': 0.1,
            'gradientboostingclassifier__loss': 'log_loss',
            'gradientboostingclassifier__n_estimators': 200,
            'gradientboostingclassifier__warm_start': True
        },
        'csv_file': '../../Data_ML/4_out_csvs_classification/output_bfill_imputed.csv',
        'model_directory': 'WS'
    },
    'SFST': {
        'data_columns': ['F43', 'F44'],
        'model': DecisionTreeClassifier,
        'hyperparameters': {
            'decisiontreeclassifier__criterion': 'entropy',
            'decisiontreeclassifier__max_features': None,
            'decisiontreeclassifier__min_samples_split': 5,
            'decisiontreeclassifier__splitter': 'best'
        },
        'csv_file': '../../Data_ML/4_out_csvs_classification/output_bfill_imputed.csv',
        'model_directory': 'SFST'
    },
    # The '*_Benefit' entries below follow the same schema; each key is again
    # used as the target column name.
    'PR_Benefit': {
        'data_columns': ['OF22', 'F41'],
        'model': DecisionTreeClassifier,
        'hyperparameters': {
            'decisiontreeclassifier__criterion': 'entropy',
            'decisiontreeclassifier__max_features': 'log2',
            'decisiontreeclassifier__min_samples_split': 2,
            'decisiontreeclassifier__splitter': 'random'
        },
        'csv_file': '../../Data_ML/4_out_csvs_classification/output_iterative_imputed.csv',
        'model_directory': 'PR_Benefit'
    },
    'NR_Benefit': {
        'data_columns': ['OF19', 'OF21', 'F41'],
        'model': SVC,
        # NOTE(review): scikit-learn documents 'degree' as applying to the
        # 'poly' kernel; with kernel='rbf' it is presumably ignored - confirm.
        'hyperparameters': {
            'svc__C': 10.0,
            'svc__degree': 1,
            'svc__gamma': 'scale',
            'svc__kernel': 'rbf'
        },
        'csv_file': '../../Data_ML/4_out_csvs_classification/output_iterative_imputed.csv',
        'model_directory': 'NR_Benefit'
    },
    'SR_Benefit': {
        'data_columns': ['OF19', 'OF21', 'F41'],
        'model': RidgeClassifier,
        'hyperparameters': {
            'ridgeclassifier__alpha': 0.1,
            'ridgeclassifier__solver': 'auto'
        },
        'csv_file': '../../Data_ML/4_out_csvs_classification/output_iterative_imputed.csv',
        'model_directory': 'SR_Benefit'
    },
    'WS_Benefit': {
        'data_columns': ['OF17', 'OF23'],
        'model': DecisionTreeClassifier,
        'hyperparameters': {
            'decisiontreeclassifier__criterion': 'entropy',
            'decisiontreeclassifier__max_features': None,
            'decisiontreeclassifier__min_samples_split': 5,
            'decisiontreeclassifier__splitter': 'random'
        },
        'csv_file': '../../Data_ML/4_out_csvs_classification/output_iterative_imputed.csv',
        'model_directory': 'WS_Benefit'
    },
    'SFST_Benefit': {
        'data_columns': ['F43', 'F44'],
        'model': DecisionTreeClassifier,
        'hyperparameters': {
            'decisiontreeclassifier__criterion': 'entropy',
            'decisiontreeclassifier__max_features': None,
            'decisiontreeclassifier__min_samples_split': 3,
            'decisiontreeclassifier__splitter': 'random'
        },
        'csv_file': '../../Data_ML/4_out_csvs_classification/output_bfill_imputed.csv',
        'model_directory': 'SFST_Benefit'
    }
}

# Create every model output directory up front. exist_ok=True makes the call
# idempotent and avoids the check-then-create race of a separate
# os.path.exists() test; it still raises if the path exists but is not a
# directory, which surfaces the problem before any training starts.
for config in configurations.values():
    os.makedirs(config['model_directory'], exist_ok=True)

# Train, evaluate and persist the configured model for one target column
def process_csv(result_column, config_key, config):
    """Fit the configured pipeline on one CSV file and summarise its test predictions.

    The CSV named by ``config['csv_file']`` is loaded, the feature columns
    ``config['data_columns']`` and the target ``result_column`` are selected,
    and the rows are split 90/10 into train/test sets with a fixed
    ``random_state`` of 42. A ``StandardScaler`` followed by a default-built
    ``config['model']`` is configured via ``config['hyperparameters']``,
    fitted, scored on the test split, and saved with joblib inside
    ``config['model_directory']``.

    NOTE(review): only the split is seeded; the estimator gets no
    ``random_state`` unless one is supplied in the hyperparameters.

    Args:
        result_column: Name of the target column in the CSV file.
        config_key: Key of the configuration entry. Currently unused; kept
            so existing callers keep working.
        config: One entry of ``configurations``.

    Returns:
        dict with the keys ``'csv_file'`` (base name of the CSV),
        ``'model_name'``, ``'hyperparameters'``, ``'accuracy'`` (test-set
        accuracy) and ``'grouped_df'`` (one row per distinct combination of
        feature values, actual label and predicted label in the test set,
        with the number of occurrences in an ``'Occurrences'`` column).
    """
    data = pd.read_csv(config['csv_file'])
    X = data[config['data_columns']]
    y = data[result_column]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    pipeline = make_pipeline(StandardScaler(), config['model']())

    # Hyperparameter names are prefixed with the pipeline step name, so they
    # can be applied to the pipeline as a whole.
    pipeline.set_params(**config['hyperparameters'])

    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    model_name = config['model'].__name__
    csv_name = os.path.basename(config['csv_file'])
    print(f"{model_name} Accuracy for {result_column}: {accuracy}")

    # Make sure the target directory exists so the function also works when
    # it is called without the module-level directory setup having run.
    os.makedirs(config['model_directory'], exist_ok=True)
    model_filename = os.path.join(config['model_directory'], f"{csv_name}_{model_name}_model.pkl")
    joblib.dump(pipeline, model_filename)

    # Work on an explicit copy so adding columns never touches (or warns
    # about) the frame returned by train_test_split.
    combined_df = X_test.copy()
    combined_df['Actual'] = y_test.values
    combined_df['Predicted'] = y_pred

    # Count how often each (features, actual, predicted) combination occurs.
    grouped_df = combined_df.groupby(config['data_columns'] + ['Actual', 'Predicted']).size().reset_index(name='Occurrences')

    return {
        'csv_file': csv_name,
        'model_name': model_name,
        'hyperparameters': config['hyperparameters'],
        'accuracy': accuracy,
        'grouped_df': grouped_df
    }

# Process each configuration. The configuration key is also the name of the
# target column, so it is passed for both of process_csv's first two arguments.
all_grouped_results = {}
for config_key, config in configurations.items():
    best_model_info = process_csv(config_key, config_key, config)
    all_grouped_results[config_key] = best_model_info['grouped_df']

# Render a grouped-results DataFrame as a LaTeX table (returned as a string)

# Single-pass translation table for characters that are special in LaTeX text
# mode. str.translate visits each input character once, so the backslashes
# introduced by one replacement are never escaped again.
_LATEX_ESCAPES = str.maketrans({
    "\\": r"\textbackslash{}",
    "&": r"\&",
    "%": r"\%",
    "$": r"\$",
    "#": r"\#",
    "_": r"\_",
    "{": r"\{",
    "}": r"\}",
    "~": r"\textasciitilde{}",
    "^": r"\textasciicircum{}",
})


def _latex_escape(value):
    """Return ``str(value)`` with LaTeX special characters escaped."""
    return str(value).translate(_LATEX_ESCAPES)


def generate_latex_table(df, config_key):
    """Build a LaTeX ``table`` environment from ``df``.

    One table row is emitted per DataFrame row. When the frame has both an
    ``'Actual'`` and a ``'Predicted'`` column and their values differ in a
    row, those two cells are wrapped in ``\\textbf{}`` to highlight the
    misclassification. Header cells, body cells and the caption text are
    escaped for LaTeX, so names containing ``_`` produce a compilable
    caption; the ``\\label`` keeps the raw key.

    Args:
        df: DataFrame to render; all columns are written in order.
        config_key: Name used in the caption and in the
            ``tab:<config_key>_results`` label.

    Returns:
        The LaTeX source as a single newline-joined string, ending with
        ``\\clearpage``.
    """
    column_names = list(df.columns)
    column_count = len(column_names)

    # Locate the label columns by name rather than by position so the
    # comparison stays correct if the column order ever changes.
    compare_labels = 'Actual' in column_names and 'Predicted' in column_names
    if compare_labels:
        actual_idx = column_names.index('Actual')
        predicted_idx = column_names.index('Predicted')
        label_positions = {actual_idx, predicted_idx}
    else:
        label_positions = set()

    buffer = [
        "\\begin{table}[htbp]",
        "\\centering",
        # One left-aligned column followed by centred columns for the rest.
        "\\begin{tabular}{|l|" + "c|" * max(column_count - 1, 0) + "}",
        "\\hline",
        " & ".join(_latex_escape(name) for name in column_names) + " \\\\",
        "\\hline",
    ]

    # itertuples keeps each column's own dtype; iterrows would upcast a row
    # mixing ints and floats, printing integer labels/counts as e.g. "3.0".
    for row in df.itertuples(index=False, name=None):
        mismatch = compare_labels and row[actual_idx] != row[predicted_idx]
        cells = []
        for position, value in enumerate(row):
            text = _latex_escape(value)
            if mismatch and position in label_positions:
                text = f"\\textbf{{{text}}}"
            cells.append(text)
        buffer.append(" & ".join(cells) + " \\\\")

    buffer.extend([
        "\\hline",
        "\\end{tabular}",
        f"\\caption{{Results for {_latex_escape(config_key)}}}",
        f"\\label{{tab:{config_key}_results}}",
        "\\end{table}",
        "\\clearpage",
    ])

    return "\n".join(buffer)

# Render each grouped result as a LaTeX table and write them, in dictionary
# order and each followed by a newline, into a single .tex file
with open("combined_results_tables.tex", "w") as tex_file:
    tex_file.writelines(
        generate_latex_table(table_df, table_key) + "\n"
        for table_key, table_df in all_grouped_results.items()
    )

print("Combined LaTeX tables saved to combined_results_tables.tex")



GradientBoostingClassifier Accuracy for PR: 0.8571428571428571
MLPClassifier Accuracy for NR: 0.7142857142857143
SGDClassifier Accuracy for SR: 0.6190476190476191
GradientBoostingClassifier Accuracy for WS: 0.8571428571428571
DecisionTreeClassifier Accuracy for SFST: 0.9523809523809523
DecisionTreeClassifier Accuracy for PR_Benefit: 0.8571428571428571
SVC Accuracy for NR_Benefit: 0.7619047619047619
RidgeClassifier Accuracy for SR_Benefit: 1.0
DecisionTreeClassifier Accuracy for WS_Benefit: 0.9523809523809523
DecisionTreeClassifier Accuracy for SFST_Benefit: 0.8095238095238095
Combined LaTeX tables saved to combined_results_tables.tex
