# Machine Learning Algorithm Performance Comparison

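This notebook compares machine learning algorithms using the evaluation metrics (MAE, MSE, RMSE, R²) stored in each algorithm's `results*.csv` file under `../output/`. It averages the metrics per algorithm, ranks the algorithms on those averages, and saves the resulting summary table.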

In [59]:
# Import necessary libraries
import pandas as pd
import os
from pathlib import Path


In [60]:
# Walk the output directory recursively and keep every CSV whose file name
# starts with "results".
output_dir = Path("../output")
csv_files = [
    candidate
    for candidate in output_dir.rglob("*.csv")
    if candidate.name.startswith("results")
]

print(csv_files)

[PosixPath('../output/mlr/results.csv'), PosixPath('../output/gru/results.csv'), PosixPath('../output/lstm/results.csv'), PosixPath('../output/mpr/results.csv')]


In [61]:
def extract_algorithm_name(file_path):
    """Return the name of the folder that directly contains ``file_path``.

    The parent folder name is used as the algorithm name
    (e.g. ``../output/gru/results.csv`` -> ``gru``).
    """
    # First split drops the file name, second split isolates the last folder component.
    parent_dir, _file_name = os.path.split(file_path)
    _ancestors, algorithm_folder = os.path.split(parent_dir)
    return algorithm_folder


In [62]:
# Read each discovered results file into a DataFrame, keyed by the name of the
# folder that contains it.
algorithm_data = {}

for file in csv_files:
    algorithm_name = extract_algorithm_name(file)

    # An empty folder name gives nothing to key the frame on, so skip the file.
    if not algorithm_name:
        continue

    df = pd.read_csv(file)
    algorithm_data[algorithm_name] = df
    print(f"Loaded {file} for algorithm '{algorithm_name}' with {len(df)} stations")

Loaded ../output/mlr/results.csv for algorithm 'mlr' with 19 stations
Loaded ../output/gru/results.csv for algorithm 'gru' with 19 stations
Loaded ../output/lstm/results.csv for algorithm 'lstm' with 19 stations
Loaded ../output/mpr/results.csv for algorithm 'mpr' with 19 stations


In [63]:
# Reduce every algorithm's results table to one record holding the column
# means of its four metrics, then assemble the records into a single frame.
algorithm_metrics = []

for algorithm, df in algorithm_data.items():
    print(algorithm)

    metrics_columns = ['mae', 'mse', 'rmse', 'r2']
    # Column-wise mean over all rows of this algorithm's table.
    metrics = df[metrics_columns].mean()

    record = {'algorithm': algorithm}
    record['avg_mae'] = metrics['mae']
    record['avg_mse'] = metrics['mse']
    record['avg_rmse'] = metrics['rmse']
    record['avg_r2'] = metrics['r2']
    algorithm_metrics.append(record)

metrics_df = pd.DataFrame(algorithm_metrics)
metrics_df

mlr
gru
lstm
mpr


Unnamed: 0,algorithm,avg_mae,avg_mse,avg_rmse,avg_r2
0,mlr,0.479654,0.560587,0.622177,0.356808
1,gru,0.419946,0.396521,0.559296,0.31994
2,lstm,0.414779,0.38502,0.545646,0.391826
3,mpr,0.321654,0.291103,0.450385,0.635857


In [64]:
# Rank the algorithms on each averaged metric and order them by mean rank.
# The error metrics rank ascending (smallest value gets rank 1); R² ranks
# descending (largest value gets rank 1).
ranked_df = (
    metrics_df
    .assign(
        rank_mae=metrics_df['avg_mae'].rank(),
        rank_mse=metrics_df['avg_mse'].rank(),
        rank_rmse=metrics_df['avg_rmse'].rank(),
        rank_r2=metrics_df['avg_r2'].rank(ascending=False),
    )
    # Overall rank is the row-wise mean of the four per-metric ranks.
    .assign(
        overall_rank=lambda frame: frame[
            ['rank_mae', 'rank_mse', 'rank_rmse', 'rank_r2']
        ].mean(axis=1)
    )
    .sort_values('overall_rank')
)

ranked_df

Unnamed: 0,algorithm,avg_mae,avg_mse,avg_rmse,avg_r2,rank_mae,rank_mse,rank_rmse,rank_r2,overall_rank
3,mpr,0.321654,0.291103,0.450385,0.635857,1.0,1.0,1.0,1.0,1.0
2,lstm,0.414779,0.38502,0.545646,0.391826,2.0,2.0,2.0,2.0,2.0
1,gru,0.419946,0.396521,0.559296,0.31994,3.0,3.0,3.0,4.0,3.25
0,mlr,0.479654,0.560587,0.622177,0.356808,4.0,4.0,4.0,3.0,3.75


In [65]:
# Build the presentation table: keep the averaged metrics and the overall
# rank, order by rank, round for readability and renumber the rows from 0.
summary_df = (
    ranked_df[['algorithm', 'avg_mae', 'avg_mse', 'avg_rmse', 'avg_r2', 'overall_rank']]
    .sort_values('overall_rank')
    .assign(
        # Metrics are shown with 4 decimals, the overall rank with 2.
        avg_mae=lambda table: table['avg_mae'].round(4),
        avg_mse=lambda table: table['avg_mse'].round(4),
        avg_rmse=lambda table: table['avg_rmse'].round(4),
        avg_r2=lambda table: table['avg_r2'].round(4),
        overall_rank=lambda table: table['overall_rank'].round(2),
    )
    .reset_index(drop=True)
)

print("Algorithm Performance Summary (Best to Worst):")
summary_df

Algorithm Performance Summary (Best to Worst):


Unnamed: 0,algorithm,avg_mae,avg_mse,avg_rmse,avg_r2,overall_rank
0,mpr,0.3217,0.2911,0.4504,0.6359,1.0
1,lstm,0.4148,0.385,0.5456,0.3918,2.0
2,gru,0.4199,0.3965,0.5593,0.3199,3.25
3,mlr,0.4797,0.5606,0.6222,0.3568,3.75


In [66]:
# Persist the summary table as CSV (the row index is written as the first column).
summary_path = Path("../output/data/result-comparison.csv")
# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing; otherwise this cell raises OSError.
summary_path.parent.mkdir(parents=True, exist_ok=True)
summary_df.to_csv(summary_path)
print("successful to save summary")

successful to save summary
