In [None]:
import pandas as pd

In [None]:
import pandas as pd
import glob
import os

# Collect every per-experiment metrics CSV. The pattern is relative to the
# notebook's working directory, i.e. the code looks in "Results/" (not
# "../Results"). glob.glob() returns matches in arbitrary order, so the list
# is sorted to make the row order of the combined dataframe reproducible.
files = sorted(glob.glob("Results/*_metrics.csv"))
if not files:
    # Without this guard pd.concat() further down fails with the far less
    # helpful "No objects to concatenate".
    raise FileNotFoundError(
        f"No files matching 'Results/*_metrics.csv' found from {os.getcwd()!r}"
    )

# Option 1: one dataframe per file, keyed by the file's base name (kept separate)
dfs = {os.path.basename(f): pd.read_csv(f) for f in files}

# Example: access one file
# dfs["experiment1_metrics.csv"].head()

# Option 2: a single dataframe with a "source_file" column recording the file
# each row came from. The frames loaded above are reused rather than parsing
# every CSV a second time; .assign() returns a new frame, so the entries in
# `dfs` are left without the extra column.
all_metrics = [df.assign(source_file=name) for name, df in dfs.items()]

all_metrics_df = pd.concat(all_metrics, ignore_index=True)

# Preview the combined results
all_metrics_df.head()


In [None]:
# Mean of every numeric column per source file, ordered by ascending "PR AUC".
# numeric_only=True keeps the aggregation working when the CSVs contain a
# non-numeric column: since pandas 2.0 GroupBy.mean() raises TypeError on such
# columns instead of silently dropping them.
all_metrics_df.groupby("source_file").mean(numeric_only=True).sort_values(by="PR AUC")

In [None]:
# Minimum per-file mean a source file must exceed, on both metrics, to be kept.
MIN_MEAN_SPECIFICITY = 0.6
MIN_MEAN_RECALL = 0.6

# Mean of every numeric column per source file. numeric_only=True is required
# on pandas >= 2.0 if the frame holds any non-numeric column besides the
# grouping key; otherwise GroupBy.mean() raises TypeError.
grouped_means = all_metrics_df.groupby("source_file").mean(numeric_only=True)

# Keep the rows (one per source file) whose mean Specificity AND mean Recall
# are both strictly above their thresholds. The result is still indexed by
# "source_file".
filtered_df = grouped_means[
    (grouped_means["Specificity"] > MIN_MEAN_SPECIFICITY) &
    (grouped_means["Recall"] > MIN_MEAN_RECALL)
]


In [None]:
# Turn the index back into a regular column and collect the "source_file"
# values of the current filtered_df as a plain Python list.
source_list = filtered_df.reset_index().loc[:, "source_file"].to_list()

In [None]:
# Rebind filtered_df to the rows of all_metrics_df whose "source_file" is
# listed in source_list.
keep_row = all_metrics_df["source_file"].isin(source_list)
filtered_df = all_metrics_df.loc[keep_row]

In [None]:
# Mean of every numeric column per source file for the rows kept in
# filtered_df. numeric_only=True avoids the TypeError that pandas >= 2.0
# raises from GroupBy.mean() when a non-numeric column is present.
filtered_df.groupby("source_file").mean(numeric_only=True)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Metrics to plot: every numeric column of filtered_df apart from the
# TN / TP / FP / FN columns.
numeric_cols = list(filtered_df.select_dtypes(include="number").columns)
metrics = [col for col in numeric_cols if col not in ("TN", "TP", "FP", "FN")]

# Long format: one row per (source_file, metric, value). Values are coerced
# to numbers and rows that end up as NaN are dropped.
melted = (
    filtered_df
    .melt(
        id_vars=["source_file"],
        value_vars=metrics,
        var_name="metric",
        value_name="value",
    )
    .assign(value=lambda long_df: pd.to_numeric(long_df["value"], errors="coerce"))
    .dropna(subset=["value"])
)

# One "tab20" colour per distinct source file
unique_sources = melted["source_file"].nunique()
palette = sns.color_palette("tab20", n_colors=unique_sources)

# Grouped box plot: metrics along the x axis, one box per source file within
# each metric. sns.boxplot draws on the figure created just above and returns
# its Axes.
plt.figure(figsize=(12, 6))
ax = sns.boxplot(data=melted, x="metric", y="value", hue="source_file", palette=palette)

plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
ax.set_ylabel("Score")
ax.set_xlabel("Metric")
ax.set_title("Metric Distribution Across Folds per Models")

# Replace the hue legend with an empty, frameless one so it is not shown
ax.legend([], [], frameon=False)

plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Metrics to plot: every numeric column of filtered_df apart from the
# TN / TP / FP / FN columns.
numeric_cols = filtered_df.select_dtypes(include="number").columns.tolist()
metrics = [c for c in numeric_cols if c not in ["TN", "TP", "FP", "FN"]]
if not metrics:
    # plt.subplots(1, 0) would fail anyway; fail here with a clearer message.
    raise ValueError("filtered_df has no numeric metric columns to plot")

# Melt into long format: one row per (source_file, metric, value)
melted = filtered_df.melt(
    id_vars=["source_file"],
    value_vars=metrics,
    var_name="metric",
    value_name="value"
)

# Ensure numeric type and drop anything that could not be converted
melted["value"] = pd.to_numeric(melted["value"], errors="coerce")
melted = melted.dropna(subset=["value"])

# One "tab20" colour per source file. The explicit order pins each file to the
# same position and colour in every subplot, even when a file has no values
# left for some metric.
source_order = melted["source_file"].unique().tolist()
unique_sources = len(source_order)
palette = sns.color_palette("tab20", n_colors=unique_sources)

# Plot: separate subplot for each metric.
# squeeze=False makes plt.subplots always return a 2-D array of Axes; with the
# default, a single metric yields one bare Axes object and zip() below fails.
# ravel() flattens it so `axes` stays a 1-D array as before.
num_metrics = len(metrics)
fig, axes = plt.subplots(
    1, num_metrics, figsize=(5 * num_metrics, 6), sharey=False, squeeze=False
)
axes = axes.ravel()

for ax, metric in zip(axes, metrics):
    # seaborn 0.13 deprecates `palette` without `hue` (removal announced for
    # 0.14), so the x variable is also passed as hue. dodge=False keeps one
    # full-width box per file on seaborn versions that would otherwise dodge.
    sns.boxplot(
        data=melted[melted["metric"] == metric],
        x="source_file", y="value",
        hue="source_file",
        order=source_order,
        hue_order=source_order,
        dodge=False,
        palette=palette,
        ax=ax
    )
    # Drop the hue legend if this seaborn version added one; the original
    # plot had none.
    hue_legend = ax.get_legend()
    if hue_legend is not None:
        hue_legend.remove()
    ax.set_title(metric)
    ax.set_xlabel("")
    ax.set_ylabel("Score")
    ax.set_xticks([])  # Remove x-ticks

fig.tight_layout()
fig.savefig("Metricsvisualisation.pdf")
