In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix,multilabel_confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import coo_matrix

In [10]:
from pathlib import Path
import re
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score

# -----------------------
# CONFIG
# -----------------------
# Folder that contains every results directory. Paths are assembled from their
# components instead of backslash-separated literals, so they are also valid
# directory paths on non-Windows systems (on Windows they are unchanged).
OUTPUTS_DIR = Path("outputs")

# Single source of truth: results directory -> corpus label.
# The labels are written verbatim into the `corpus` column of the metrics table.
CORPUS_BY_DIR = {
    OUTPUTS_DIR / "results_raw_ROS_HV-master": "raw_ROS_HV",
    OUTPUTS_DIR / "results_raw_ROS-master": "raw_ROS",
    OUTPUTS_DIR / "results_transcripciones-segunda-vuelta" / "highVol": r"segunda-vuelta\highVol",
    OUTPUTS_DIR / "results_transcripciones-segunda-vuelta" / "random": r"segunda-vuelta\random",
}

# Directories scanned for *_results.csv files, in the order they are processed.
BASE_DIRS = list(CORPUS_BY_DIR)

# Same mapping keyed by the resolved (absolute) directory, which is what the
# processing loop looks up via `csv_path.parent.resolve()`.
DIR_TO_CORPUS = {directory.resolve(): corpus for directory, corpus in CORPUS_BY_DIR.items()}

# Type suffixes; metrics are computed from the `Elan_<type>` / `Diar_<type>` column pairs.
TYPES = ["KCHI", "OCH", "FEM", "MAL", "SIL"]

# Code found at the end of the file stem -> value stored in `nivel_socioeco`.
NS_MAP = {
    "nsb": "bajo",
    "nsm": "medio",
}

# Captures everything before the "_results.csv" suffix (case-insensitive) as `stem`.
FNAME_RE = re.compile(r"^(?P<stem>.+)_results\.csv$", re.IGNORECASE)

def parse_filename(csv_path: Path):
    """
    Split the name of a results file into its metadata fields.

    Expected file name:
      name-audio-nsb_results.csv   or   name-audio-nsm_results.csv
    where ``name`` may itself contain hyphens.

    Returns a 5-tuple ``(name, audio, ns_code, nivel_socioeco, status)``:
      * ``status == "ok"``: every field is filled in.
      * ``status`` starting with ``bad_filename`` or ``bad_stem_parts``: the
        other four fields are ``None``.
      * ``status`` starting with ``bad_ns_code``: ``name``, ``audio`` and the
        lower-cased ``ns_code`` are returned, ``nivel_socioeco`` is ``None``.
    """
    matched = FNAME_RE.match(csv_path.name)
    if matched is None:
        return None, None, None, None, f"bad_filename: {csv_path.name}"

    stem = matched.group("stem")

    # The last two hyphen-separated fields are audio and ns code; splitting
    # from the right keeps any hyphens inside the name untouched.
    fields = stem.rsplit("-", 2)
    if len(fields) < 3:
        return None, None, None, None, f"bad_stem_parts({len(fields)}): {stem}"

    name, audio, raw_code = fields
    ns_code = raw_code.lower()

    known_code = ns_code in NS_MAP
    status = "ok" if known_code else f"bad_ns_code: {ns_code}"
    nivel_socioeco = NS_MAP[ns_code] if known_code else None
    return name, audio, ns_code, nivel_socioeco, status

def compute_metrics(df: pd.DataFrame, types=TYPES):
    """
    Compute binary precision / recall / F1 for each type in ``types``.

    For a type ``t`` the column ``Elan_<t>`` is used as the reference labels
    and ``Diar_<t>`` as the predictions; both are cast to ``int`` before
    scoring (a failing cast propagates to the caller).

    Parameters
    ----------
    df : DataFrame holding the ``Elan_*`` / ``Diar_*`` columns of one file.
    types : iterable of type suffixes to evaluate (defaults to ``TYPES``).

    Returns
    -------
    DataFrame with one row per type and the columns ``Tipo``, ``Precisión``,
    ``Recall``, ``F1-score``, ``support_true``, ``support_pred``, ``n``
    (number of rows in ``df``) and ``status``. ``status`` is ``"ok"`` or
    ``"missing_cols: <names>"``; in the latter case the metric and support
    fields are ``None``.
    """
    rows = []
    n = len(df)

    for t in types:
        col_true = f"Elan_{t}"
        col_pred = f"Diar_{t}"

        # List the absent columns once and join them, so the status never
        # carries a stray double space when only one of the pair is missing.
        missing = [col for col in (col_true, col_pred) if col not in df.columns]
        if missing:
            rows.append({
                "Tipo": t,
                "Precisión": None,
                "Recall": None,
                "F1-score": None,
                "support_true": None,
                "support_pred": None,
                "n": n,
                "status": "missing_cols: " + " ".join(missing),
            })
            continue

        y_true = df[col_true].astype(int)
        y_pred = df[col_pred].astype(int)

        # zero_division=0: a score whose denominator is zero is reported as 0
        # and the row still gets status "ok", so a 0 paired with zero support
        # means "undefined", not "everything was wrong".
        rows.append({
            "Tipo": t,
            "Precisión": precision_score(y_true, y_pred, zero_division=0),
            "Recall": recall_score(y_true, y_pred, zero_division=0),
            "F1-score": f1_score(y_true, y_pred, zero_division=0),
            "support_true": int(y_true.sum()),
            "support_pred": int(y_pred.sum()),
            "n": n,
            "status": "ok"
        })

    return pd.DataFrame(rows)

# -----------------------
# MAIN
# -----------------------
# Column layout of the resulting table: per-file metadata first, then the
# per-type metric columns produced by compute_metrics.
META_COLUMNS = [
    "name", "audio", "ns_code", "nivel_socioeco", "corpus",
    "parse_status", "dataset_dir", "file", "path",
]
METRIC_COLUMNS = [
    "Tipo", "Precisión", "Recall", "F1-score",
    "support_true", "support_pred", "n", "status",
]

all_rows = []
csv_files = []

# Collect the result files of every configured directory (sorted per directory).
for d in BASE_DIRS:
    if d.exists():
        csv_files.extend(sorted(d.glob("*_results.csv")))
    else:
        print(f"[WARN] No existe: {d}")

print(f"Encontré {len(csv_files)} archivos *_results.csv en los {len(BASE_DIRS)} directorios.")

for csv_path in tqdm(csv_files, desc="Procesando archivos"):
    name, audio, ns_code, nivel_socioeco, parse_status = parse_filename(csv_path)

    # The corpus is derived from the parent directory of the file.
    corpus = DIR_TO_CORPUS.get(csv_path.parent.resolve(), "DESCONOCIDO")

    # Metadata shared by every row of this file (success and error rows alike).
    file_meta = dict(zip(META_COLUMNS, (
        name, audio, ns_code, nivel_socioeco, corpus,
        parse_status, str(csv_path.parent), csv_path.name, str(csv_path),
    )))

    try:
        df_results = pd.read_csv(csv_path)
        df_m = compute_metrics(df_results, types=TYPES)
        # Prepend the metadata columns in their declared order.
        for position, (column, value) in enumerate(file_meta.items()):
            df_m.insert(position, column, value)
    except Exception as e:
        # Best effort: a file that cannot be read or scored becomes a single
        # row whose `status` carries the error, instead of aborting the run.
        df_m = pd.DataFrame([{
            **file_meta,
            **dict.fromkeys(METRIC_COLUMNS),
            "status": f"error: {repr(e)}",
        }])

    all_rows.append(df_m)

if all_rows:
    df_metrics_all = pd.concat(all_rows, ignore_index=True)
else:
    # pd.concat raises ValueError on an empty list; fall back to an empty
    # table with the expected columns so the rest of the cell still runs.
    df_metrics_all = pd.DataFrame(columns=META_COLUMNS + METRIC_COLUMNS)

df_metrics_all = df_metrics_all.sort_values(["corpus", "name", "audio", "ns_code", "Tipo"], na_position="last").reset_index(drop=True)

out_path = Path("outputs") / "metrics_por_archivo_y_tipo.csv"
out_path.parent.mkdir(parents=True, exist_ok=True)
df_metrics_all.to_csv(out_path, index=False)

print(f"\nListo. Guardado en: {out_path}")
df_metrics_all


Encontré 22 archivos *_results.csv en los 4 directorios.


Procesando archivos: 100%|██████████| 22/22 [00:15<00:00,  1.46it/s]


Listo. Guardado en: outputs\metrics_por_archivo_y_tipo.csv





Unnamed: 0,name,audio,ns_code,nivel_socioeco,corpus,parse_status,dataset_dir,file,path,Tipo,Precisión,Recall,F1-score,support_true,support_pred,n,status
0,Gaell,a1,nsb,bajo,raw_ROS,ok,outputs\results_raw_ROS-master,Gaell-a1-nsb_results.csv,outputs\results_raw_ROS-master\Gaell-a1-nsb_re...,FEM,0.258619,0.150674,0.190412,39330.0,22914.0,180000,ok
1,Gaell,a1,nsb,bajo,raw_ROS,ok,outputs\results_raw_ROS-master,Gaell-a1-nsb_results.csv,outputs\results_raw_ROS-master\Gaell-a1-nsb_re...,KCHI,0.126050,0.081728,0.099162,27359.0,17739.0,180000,ok
2,Gaell,a1,nsb,bajo,raw_ROS,ok,outputs\results_raw_ROS-master,Gaell-a1-nsb_results.csv,outputs\results_raw_ROS-master\Gaell-a1-nsb_re...,MAL,0.138408,0.003579,0.006977,33532.0,867.0,180000,ok
3,Gaell,a1,nsb,bajo,raw_ROS,ok,outputs\results_raw_ROS-master,Gaell-a1-nsb_results.csv,outputs\results_raw_ROS-master\Gaell-a1-nsb_re...,OCH,0.000000,0.000000,0.000000,0.0,0.0,180000,ok
4,Gaell,a1,nsb,bajo,raw_ROS,ok,outputs\results_raw_ROS-master,Gaell-a1-nsb_results.csv,outputs\results_raw_ROS-master\Gaell-a1-nsb_re...,SIL,,,,,,180000,missing_cols: Elan_SIL Diar_SIL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,camilob,a1,nsm,medio,segunda-vuelta\random,ok,outputs\results_transcripciones-segunda-vuelta...,camilob-a1-nsm_results.csv,outputs\results_transcripciones-segunda-vuelta...,FEM,0.196280,0.563433,0.291138,40452.0,116120.0,180000,ok
106,camilob,a1,nsm,medio,segunda-vuelta\random,ok,outputs\results_transcripciones-segunda-vuelta...,camilob-a1-nsm_results.csv,outputs\results_transcripciones-segunda-vuelta...,KCHI,0.154154,0.146861,0.150420,15198.0,14479.0,180000,ok
107,camilob,a1,nsm,medio,segunda-vuelta\random,ok,outputs\results_transcripciones-segunda-vuelta...,camilob-a1-nsm_results.csv,outputs\results_transcripciones-segunda-vuelta...,MAL,0.015411,0.063269,0.024785,2276.0,9344.0,180000,ok
108,camilob,a1,nsm,medio,segunda-vuelta\random,ok,outputs\results_transcripciones-segunda-vuelta...,camilob-a1-nsm_results.csv,outputs\results_transcripciones-segunda-vuelta...,OCH,0.000000,0.000000,0.000000,0.0,0.0,180000,ok


In [5]:
# NOTE(review): the output saved under this cell has no `corpus` column and the
# cell's execution count (5) is lower than that of the metrics cell (10), so it
# appears to predate the current `df_metrics_all` - re-run after the metrics cell.
df_metrics_all

Unnamed: 0,name,audio,ns_code,nivel_socioeco,parse_status,dataset_dir,file,path,Tipo,Precisión,Recall,F1-score,support_true,support_pred,n,status
0,Gaell,a1,nsb,bajo,ok,outputs\results_raw_ROS-master,Gaell-a1-nsb_results.csv,outputs\results_raw_ROS-master\Gaell-a1-nsb_re...,FEM,0.258619,0.150674,0.190412,39330.0,22914.0,180000,ok
1,Gaell,a1,nsb,bajo,ok,outputs\results_raw_ROS-master,Gaell-a1-nsb_results.csv,outputs\results_raw_ROS-master\Gaell-a1-nsb_re...,KCHI,0.126050,0.081728,0.099162,27359.0,17739.0,180000,ok
2,Gaell,a1,nsb,bajo,ok,outputs\results_raw_ROS-master,Gaell-a1-nsb_results.csv,outputs\results_raw_ROS-master\Gaell-a1-nsb_re...,MAL,0.138408,0.003579,0.006977,33532.0,867.0,180000,ok
3,Gaell,a1,nsb,bajo,ok,outputs\results_raw_ROS-master,Gaell-a1-nsb_results.csv,outputs\results_raw_ROS-master\Gaell-a1-nsb_re...,OCH,0.000000,0.000000,0.000000,0.0,0.0,180000,ok
4,Gaell,a1,nsb,bajo,ok,outputs\results_raw_ROS-master,Gaell-a1-nsb_results.csv,outputs\results_raw_ROS-master\Gaell-a1-nsb_re...,SIL,,,,,,180000,missing_cols: Elan_SIL Diar_SIL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,verag,a2,nsm,medio,ok,outputs\results_raw_ROS_HV-master,verag-a2-nsm_results.csv,outputs\results_raw_ROS_HV-master\verag-a2-nsm...,FEM,0.756727,0.893037,0.819251,25981.0,30661.0,168000,ok
106,verag,a2,nsm,medio,ok,outputs\results_raw_ROS_HV-master,verag-a2-nsm_results.csv,outputs\results_raw_ROS_HV-master\verag-a2-nsm...,KCHI,0.718064,0.894803,0.796750,30248.0,37693.0,168000,ok
107,verag,a2,nsm,medio,ok,outputs\results_raw_ROS_HV-master,verag-a2-nsm_results.csv,outputs\results_raw_ROS_HV-master\verag-a2-nsm...,MAL,0.623563,0.513132,0.562983,10356.0,8522.0,168000,ok
108,verag,a2,nsm,medio,ok,outputs\results_raw_ROS_HV-master,verag-a2-nsm_results.csv,outputs\results_raw_ROS_HV-master\verag-a2-nsm...,OCH,0.000000,0.000000,0.000000,0.0,0.0,168000,ok


In [12]:
# Types that enter the averaged F1.
tipos_obj = ["FEM", "MAL", "KCHI"]


def _unico_o_mixto(s):
    """Collapse a group to its single distinct non-null value.

    Returns that value when the group holds exactly one distinct non-null
    value, "MIXTO" when it holds several, and None when it holds none (so a
    group with no information is not mislabelled as mixed).
    """
    valores = s.dropna().unique()
    if len(valores) == 0:
        return None
    return valores[0] if len(valores) == 1 else "MIXTO"


# Step 1: one row per file - mean F1 over the target types, using only rows
# whose metrics were computed successfully.
df_por_archivo = (
    df_metrics_all[
        (df_metrics_all["status"] == "ok") &
        (df_metrics_all["Tipo"].isin(tipos_obj))
    ]
    .groupby(["name", "corpus", "path"], as_index=False)
    .agg(
        f1_avg_archivo=("F1-score", "mean"),
        nivel_socioeco=("nivel_socioeco", "first"),
        ns_code=("ns_code", "first"),
    )
)

# Step 2: one row per (name, corpus) - mean of the per-file averages, ranked
# from best to worst. A single sort is enough: the previous intermediate sort
# by corpus was immediately overwritten by this one. The stable algorithm keeps
# ties in groupby order (name, corpus), making the ranking deterministic.
df_f1_por_ninio_y_corpus = (
    df_por_archivo
    .groupby(["name", "corpus"], as_index=False)
    .agg(
        f1_avg_FEM_MAL_KCHI=("f1_avg_archivo", "mean"),
        n_archivos=("f1_avg_archivo", "size"),
        nivel_socioeco=("nivel_socioeco", _unico_o_mixto),
        ns_code=("ns_code", _unico_o_mixto),
    )
    .sort_values("f1_avg_FEM_MAL_KCHI", ascending=False, kind="mergesort")
    .reset_index(drop=True)
)

df_f1_por_ninio_y_corpus

Unnamed: 0,name,corpus,f1_avg_FEM_MAL_KCHI,n_archivos,nivel_socioeco,ns_code
0,verag,raw_ROS_HV,0.726328,1,medio,nsm
1,franciscop,raw_ROS_HV,0.67321,1,medio,nsm
2,dantep,raw_ROS_HV,0.656136,1,medio,nsm
3,julietap,raw_ROS_HV,0.614153,1,medio,nsm
4,dantec,raw_ROS,0.584581,1,medio,nsm
5,angelinap,raw_ROS,0.564382,1,medio,nsm
6,alma,raw_ROS,0.557003,1,bajo,nsb
7,ummaq,raw_ROS,0.524177,1,bajo,nsb
8,lucior,raw_ROS,0.521247,1,medio,nsm
9,francescam,raw_ROS_HV,0.485962,1,medio,nsm
