In [14]:
import json
import pandas as pd
import numpy as np

def cohens_d(a: pd.Series, b: pd.Series) -> float:
    """Cohen's d para dos series numéricas, permitiendo n=1 en uno de los grupos."""
    m1, m2 = a.mean(), b.mean()
    s1, s2 = a.std(ddof=1), b.std(ddof=1)
    n1, n2 = len(a), len(b)
    # al menos 3 observaciones en total (p.ej. 2 orig + 1 rep)
    if n1 + n2 < 3:
        return np.nan
    # desviación agrupada; si un grupo tiene n=1, su s=0
    s1 = 0.0 if n1 < 2 else s1
    s2 = 0.0 if n2 < 2 else s2
    s_pooled = np.sqrt(((n1-1)*s1**2 + (n2-1)*s2**2) / (n1+n2-2))
    # si s_pooled=0 sale nan
    return np.nan if s_pooled == 0 else (m1 - m2) / s_pooled

if __name__ == "__main__":
    # 1) Carga tu JSON
    with open("Griegos_f.json", "r", encoding="utf-8") as f:
        mps = json.load(f)

    # 2) Preparar acumuladores
    orig_sent, rep_sent = [], []
    orig_hate, rep_hate = [], []
    emotions = ["optimism", "joy", "anger"]
    orig_em = {e: [] for e in emotions}
    rep_em  = {e: [] for e in emotions}

    # 3) Recorre cada MP y sus tweets/respuestas
    for mp in mps:
        for tw in mp.get("tweets", []):
            ana = tw.get("analysis") or {}
            # sentiment original
            if isinstance(ana.get("sentiment"), dict):
                val = ana["sentiment"].get("score")
                if isinstance(val, (int,float)):
                    orig_sent.append(val)
            # hate original
            if isinstance(ana.get("hate"), dict):
                val = ana["hate"].get("score")
                if isinstance(val, (int,float)):
                    orig_hate.append(val)
            # emociones originales
            for emo in ana.get("emotions") or []:
                if isinstance(emo, dict):
                    lab, sc = emo.get("label"), emo.get("score")
                    if lab in emotions and isinstance(sc, (int,float)):
                        orig_em[lab].append(sc)

            # respuestas
            for rep in tw.get("replies") or []:
                r = rep.get("analysis") or {}
                # sentiment respuesta
                if isinstance(r.get("sentiment"), dict):
                    val = r["sentiment"].get("score")
                    if isinstance(val, (int,float)):
                        rep_sent.append(val)
                # hate respuesta
                if isinstance(r.get("hate"), dict):
                    val = r["hate"].get("score")
                    if isinstance(val, (int,float)):
                        rep_hate.append(val)
                # emociones respuesta
                for emo2 in r.get("emotions") or []:
                    if isinstance(emo2, dict):
                        lab2, sc2 = emo2.get("label"), emo2.get("score")
                        if lab2 in emotions and isinstance(sc2, (int,float)):
                            rep_em[lab2].append(sc2)

    # 4) Convertir a pandas.Series
    orig_sent = pd.Series(orig_sent)
    rep_sent  = pd.Series(rep_sent)
    orig_hate = pd.Series(orig_hate)
    rep_hate  = pd.Series(rep_hate)
    for e in emotions:
        orig_em[e] = pd.Series(orig_em[e])
        rep_em[e]  = pd.Series(rep_em[e])

    # 5) Calcula Cohen’s d para cada categoría
    results = {
        "sentiment": cohens_d(orig_sent, rep_sent),
        "hate":      cohens_d(orig_hate, rep_hate)
    }
    for e in emotions:
        results[e] = cohens_d(orig_em[e], rep_em[e])

    # 6) Crear la matriz final
    matriz = pd.DataFrame.from_dict(
        {"promoted_vs_received": results},
        orient="index",
        columns=["d_cohen"]
    )
    matriz.index.name = "category"

    # 7) Mostrar y guardar
    print(matriz.round(3))
    matriz.to_csv("cohen_mps_original_vs_replies.csv")


Empty DataFrame
Columns: [d_cohen]
Index: []


In [15]:
import json, random
import pandas as pd
from sklearn.metrics import confusion_matrix

# ---------- 1. Carga ----------
with open("Griegos_f.json", encoding="utf-8") as f:
    mps = json.load(f)

# ---------- 2. Aplanado ----------
LABEL_MAP = {"-1": "negative", "0": "neutral", "1": "positive"}

rows = []
for mp in mps:
    gender = mp.get("gender") or mp.get("genero")
    age = mp.get("age")
    age_group = "<40" if age and age < 40 else "40-60" if age and age <= 60 else ">60"
    ideologia = mp.get("ideologia") or mp.get("party")

    for tw in mp.get("tweets", []):
        real = LABEL_MAP.get(str(tw.get("label")))
        pred = tw.get("analysis", {}).get("sentiment", {}).get("label", "").lower()
        if real is None or pred not in {"negative", "neutral", "positive"}:
            continue
        rows.append({
            "gender": gender,
            "age_group": age_group,
            "ideologia": ideologia,
            "y_true": real,
            "y_pred": pred
        })

full_df = pd.DataFrame(rows)

# ---------- 3. Muestreo balanceado ----------
random.seed(42)
TARGET = {"negative": 300, "neutral": 400, "positive": 300}
sample_idx = []

for label, target in TARGET.items():
    idxs = full_df[full_df["y_true"] == label].index.tolist()
    if not idxs:
        raise ValueError(f"No hay muestras para {label}")
    sample_idx += random.sample(idxs, target) if len(idxs) >= target else idxs + random.choices(idxs, k=target - len(idxs))

df = full_df.loc[sample_idx].reset_index(drop=True)

# ---------- 4. Matrices ----------
LABELS = ["negative", "neutral", "positive"]

def confusion_df(data):
    cm = confusion_matrix(data["y_true"], data["y_pred"], labels=LABELS)
    return pd.DataFrame(cm, index=LABELS, columns=LABELS)

# Global
print("######## MATRICES – Modelo: CARDIFF – Grecia ########\n")
print("===== Global =====")
print(confusion_df(df))

# Por grupos protegidos
for attr in ["gender", "age_group", "ideologia"]:
    for val, group in df.groupby(attr):
        print(f"\n===== {attr} = {val} =====")
        print(confusion_df(group))


######## MATRICES – Modelo: CARDIFF – Grecia ########

===== Global =====
          negative  neutral  positive
negative         3      297         0
neutral          0      400         0
positive         0      292         8

===== gender = femenino =====
          negative  neutral  positive
negative         1       57         0
neutral          0      114         0
positive         0       81         1

===== gender = masculino =====
          negative  neutral  positive
negative         2      240         0
neutral          0      286         0
positive         0      211         7

===== age_group = 40-60 =====
          negative  neutral  positive
negative         0      228         0
neutral          0      273         0
positive         0      204         5

===== age_group = <40 =====
          negative  neutral  positive
negative         0        8         0
neutral          0        7         0
positive         0        5         0

===== age_group = >60 =====
          nega