In [31]:
import logging
import re
from pathlib import Path

import pandas as pd

from correlation_analysis import reports_dir

In [32]:
# Directory that is globbed for the per-pair correlation CSV files.
pair_corr_dir = reports_dir
# Aggregated outputs are written to a "total_pairs" subdirectory of the reports dir.
out_dir = reports_dir / "total_pairs"
# Create the output directory at import/cell-run time; no error if it already exists.
out_dir.mkdir(parents=True, exist_ok=True)

In [33]:
# Glob pattern (relative to pair_corr_dir) selecting the per-pair correlation CSVs.
file_pattern = "PAIR*_correlations*.csv"

def extract_pair_id_from_filename(path: Path) -> str:
    """Return the pair identifier embedded in a file name.

    The file stem (name without its final suffix) is searched for the first
    occurrence of ``PAIR`` followed by one or more digits, ignoring case.

    Args:
        path: Path whose stem is inspected.

    Returns:
        The matched identifier converted to upper case (e.g. ``"PAIR12"``),
        or an empty string when the stem contains no such token.
    """
    pair_token = re.compile(r"(PAIR\d+)", re.IGNORECASE)
    hit = pair_token.search(path.stem)
    return hit.group(1).upper() if hit is not None else ""

In [34]:
def clean_metric_name(m: str) -> str:
    """Normalise a metric label into a lowercase, underscore-separated token.

    The label is lower-cased, parentheses are removed, and every space,
    slash and hyphen is replaced by an underscore. Consecutive separators
    are not collapsed.

    Args:
        m: Raw metric label.

    Returns:
        The normalised label.
    """
    # One translation pass: drop "(" and ")", map separators to "_".
    substitutions = str.maketrans({
        "(": None,
        ")": None,
        " ": "_",
        "/": "_",
        "-": "_",
    })
    return m.lower().translate(substitutions)

In [35]:
_logger = logging.getLogger(__name__)

# Columns every per-pair correlation CSV must provide to be aggregated,
# in the order they are written to the long-format output.
_EXPECTED_COLS = ["pair_id", "metric", "n_overlap", "pearson", "spearman"]


def _load_pair_frame(path):
    """Read one per-pair CSV; return its expected columns or None if unusable."""
    try:
        df = pd.read_csv(path)
    except Exception as exc:
        # Best-effort aggregation: one unreadable file must not abort the run,
        # but unlike a bare ``except`` this lets KeyboardInterrupt/SystemExit
        # propagate and records why the file was skipped.
        _logger.warning("Skipping %s: could not be read (%s)", path, exc)
        return None

    if "pair_id" not in df.columns:
        # Fall back to the identifier embedded in the file name.
        pid = extract_pair_id_from_filename(path)
        if not pid:
            _logger.warning(
                "Skipping %s: no 'pair_id' column and no pair id in the file name",
                path,
            )
            return None
        df["pair_id"] = pid

    missing = [col for col in _EXPECTED_COLS if col not in df.columns]
    if missing:
        _logger.warning("Skipping %s: missing column(s) %s", path, missing)
        return None

    return df[_EXPECTED_COLS].copy()


def _pivot_by_metric(long_df, value_col):
    """Pivot ``value_col`` to one row per pair_id and one prefixed column per metric."""
    # NOTE: pivot_table aggregates with its default (mean), so several rows
    # sharing the same (pair_id, metric_clean) are averaged into one cell.
    return long_df.pivot_table(
        index="pair_id",
        columns="metric_clean",
        values=value_col,
    ).add_prefix(f"{value_col}_")


def main():
    """Combine all per-pair correlation CSVs into long and wide summary files.

    Every file in ``pair_corr_dir`` matching ``file_pattern`` is read. A file
    is skipped (with a logged warning) when it cannot be read, when it has no
    ``pair_id`` column and none can be derived from its name, or when any of
    the expected columns is missing.

    Two CSV files are written to ``out_dir``:

    * ``all_pairs_correlation_long.csv`` - the concatenated rows, sorted by
      ``pair_id`` and ``metric`` ascending and ``n_overlap`` descending.
    * ``all_pairs_correlation_wide.csv`` - one row per ``pair_id`` with
      ``pearson_<metric>`` columns followed by ``spearman_<metric>`` columns,
      where ``<metric>`` is the name produced by ``clean_metric_name``.

    Nothing is written when no usable input file is found.
    """
    csv_files = sorted(pair_corr_dir.glob(file_pattern))
    frames = [
        frame
        for frame in (_load_pair_frame(path) for path in csv_files)
        if frame is not None
    ]

    if not frames:
        _logger.warning(
            "No usable files matching %r in %s; nothing written",
            file_pattern,
            pair_corr_dir,
        )
        return

    long_df = pd.concat(frames, ignore_index=True)
    # pair_id may be parsed as a number in some files; unify to str before sorting.
    long_df["pair_id"] = long_df["pair_id"].astype(str)
    long_df = long_df.sort_values(
        ["pair_id", "metric", "n_overlap"],
        ascending=[True, True, False],
    )

    out_long = out_dir / "all_pairs_correlation_long.csv"
    long_df.to_csv(out_long, index=False)

    # Added after the long file is written so it only affects the wide pivot.
    long_df["metric_clean"] = long_df["metric"].astype(str).apply(clean_metric_name)

    wide_df = pd.concat(
        [_pivot_by_metric(long_df, kind) for kind in ("pearson", "spearman")],
        axis=1,
    ).reset_index()

    out_wide = out_dir / "all_pairs_correlation_wide.csv"
    wide_df.to_csv(out_wide, index=False)


In [36]:
# Run the aggregation only when executed as a script/notebook cell, not on import.
if __name__ == "__main__":
    main()