In [6]:
import pandas as pd
import numpy as np

def cohens_d(a, b):
    """Compute Cohen's d (standardized mean difference) between two samples.

    The effect size is ``(mean(a) - mean(b)) / s_pooled``, where ``s_pooled``
    is the pooled standard deviation built from the unbiased (``ddof=1``)
    variances of both samples.

    Args:
        a: First sample; any 1-D numeric sequence (list, NumPy array,
            pandas Series).
        b: Second sample; same kinds of input as ``a``.

    Returns:
        float: Cohen's d, positive when ``a`` has the larger mean. ``nan`` is
        returned when either sample has fewer than two non-missing values.
        If the pooled standard deviation is zero the result follows IEEE
        division (``nan`` for equal means, ``±inf`` otherwise).
    """
    # Coerce to float arrays so plain lists work too, and drop NaNs up front:
    # pandas' mean()/std() skip NaN while len() counts them, which would give
    # the pooled variance inconsistent weights.
    a = np.asarray(a, dtype=float).ravel()
    b = np.asarray(b, dtype=float).ravel()
    a = a[~np.isnan(a)]
    b = b[~np.isnan(b)]

    n1, n2 = a.size, b.size
    if n1 < 2 or n2 < 2:
        # A sample variance needs at least two observations per group.
        return np.nan

    m1, m2 = a.mean(), b.mean()
    v1, v2 = a.var(ddof=1), b.var(ddof=1)
    s_pooled = np.sqrt(((n1 - 1) * v1 + (n2 - 1) * v2) / (n1 + n2 - 2))

    # Zero pooled variance yields nan/inf; silence the NumPy warning because
    # that outcome is documented above.
    with np.errstate(divide="ignore", invalid="ignore"):
        return (m1 - m2) / s_pooled

# Pairs of groups to compare, keyed by grouping variable: (group 1, group 2).
pares = dict(
    gender=('femenino', 'masculino'),
    age=('<40', '>60'),
    ideologia=('izquierda', 'derecha'),
)

if __name__ == "__main__":
    # 1) Load the analysis results from CSV.
    df = pd.read_csv("resultados_analisis.csv")

    # 2) Unify analysis-type labels: each canonical label lists the raw
    #    spellings folded into it; values not listed become NaN after .map().
    variantes_por_tipo = {
        'promoted': ('promovido', 'promoted', 'prmoved'),  # 'prmoved': in case of a typo
        'received': ('recibido', 'received'),
        'promoted_vs_received': (
            'promovido_vs_recibido',
            'promoted_vs_recibido',
            'promoted_vs_received',
        ),
    }
    mapeo = {
        variante: canonico
        for canonico, variantes in variantes_por_tipo.items()
        for variante in variantes
    }
    tipo_limpio = df['tipo_analisis'].astype(str).str.strip().str.lower()
    df['tipo_analisis'] = tipo_limpio.map(mapeo)

    # 3) Rows and columns of the effect-size matrix
    #    (one column per grouping variable and analysis type).
    filas = ['sentiment', 'emotion', 'hate']
    tipos = ['promoted', 'received']
    columnas = ["_".join((agr, t)) for agr in pares for t in tipos]
    matriz = pd.DataFrame(index=filas, columns=columnas, dtype=float)

    # 4) Fill the matrix cell by cell.
    for categoria in filas:
        es_categoria = df['categoria'] == categoria
        for agr, (g1, g2) in pares.items():
            es_agrupacion = es_categoria & (df['agrupacion'] == agr)
            for tipo in tipos:
                seleccion = df.loc[es_agrupacion & (df['tipo_analisis'] == tipo)]
                v1 = seleccion.loc[seleccion['grupo'] == g1, 'score_medio']
                v2 = seleccion.loc[seleccion['grupo'] == g2, 'score_medio']
                # Cohen's d is only computed with at least two scores per group.
                suficientes = len(v1) > 1 and len(v2) > 1
                matriz.at[categoria, f"{agr}_{tipo}"] = (
                    cohens_d(v1, v2) if suficientes else np.nan
                )

    # 5) Show and save.
    print(matriz.round(3))
    matriz.to_csv("matriz_cohens_espana.csv")


           gender_promoted  gender_received  age_promoted  age_received  \
sentiment           -0.023            0.075         0.365         0.031   
emotion             -0.186            0.130         0.119         0.149   
hate                -0.019            0.037         0.385        -0.039   

           ideologia_promoted  ideologia_received  
sentiment               0.035               0.012  
emotion                 0.191               0.135  
hate                    0.028               0.024  


In [7]:
import json
import pandas as pd

# 1) Load the JSON file
with open("Spain_Completo.json", "r", encoding="utf-8") as f:
    mps = json.load(f)

# 2) Flatten the structure: one row per tweet
rows = []
for mp in mps:
    # Bucket the age; a missing age leaves the group as None.
    edad = mp.get("age")
    if edad is None:
        age_group = None
    elif edad < 40:
        age_group = "<40"
    elif edad <= 60:
        age_group = "40-60"
    else:
        age_group = ">60"

    # Attributes shared by every tweet of this entry.
    mp_attrs = {
        "gender": mp.get("gender"),
        "ideologia": mp.get("ideologia") or mp.get("party"),   # adjust if your field is named differently
        "age_group": age_group,
    }

    for tw in mp.get("tweets", []):
        analysis = tw.get("analysis", {})
        row = dict(mp_attrs)
        row["sentiment"] = analysis.get("sentiment", {}).get("label")
        row["hate"] = analysis.get("hate", {}).get("label")
        # Only the first emotion is kept as the label (optional).
        emotions = analysis.get("emotions", [])
        row["emotion"] = emotions[0]["label"] if emotions else None
        rows.append(row)

df = pd.DataFrame(rows)

# 3-8) Label counts: global and per group.
# Each entry is (header, label column, grouping column or None for global).
reports = [
    ("== Sentimiento global ==", "sentiment", None),
    ("== Sentimiento por Género ==", "sentiment", "gender"),
    ("== Sentimiento por Ideología ==", "sentiment", "ideologia"),
    ("== Sentimiento por Edad ==", "sentiment", "age_group"),
    ("== Odio global ==", "hate", None),
    ("== Odio por Género ==", "hate", "gender"),
    ("== Emoción global ==", "emotion", None),
    ("== Emoción por Género ==", "emotion", "gender"),
]

for header, label_col, group_col in reports:
    print(header)
    if group_col is None:
        counts = df[label_col].value_counts()
    else:
        # One row per group, one column per label; absent pairs count as 0.
        counts = df.groupby(group_col)[label_col].value_counts().unstack(fill_value=0)
    print(counts, "\n")


== Sentimiento global ==
sentiment
neutral     637
positive     31
negative      4
Name: count, dtype: int64 

== Sentimiento por Género ==
sentiment  negative  neutral  positive
gender                                
femenino          2      247        15
masculino         2      390        16 

== Sentimiento por Ideología ==
sentiment  negative  neutral  positive
ideologia                             
derecha           2      327        21
izquierda         2      310        10 

== Sentimiento por Edad ==
sentiment  negative  neutral  positive
age_group                             
40-60             3      458        22
<40               0       49         5
>60               1      130         4 

== Odio global ==
hate
not hate    582
hate         90
Name: count, dtype: int64 

== Odio por Género ==
hate       hate  not hate
gender                   
femenino     35       229
masculino    55       353 

== Emoción global ==
emotion
optimism    460
joy         155
anger        34


In [8]:
"""
Crea matrices de confusión del análisis de sentimiento para España
a partir de la estructura JSON de tweets/políticos mostrada arriba.

• Usa `annotations.Total` (-1, 0, 1) como etiqueta real.
• Usa `analysis.sentiment.label` ("negative", "neutral", "positive")  
  como etiqueta predicha por el modelo.
• Agrupa por cada característica protegida:
    - gender        (femenino / masculino)
    - age_group     (<40 / 40-60 / >60)
    - ideologia     (izquierda / derecha)

Imprime cada matriz de confusión en consola (y opcionalmente en LaTeX).
"""

import json
import pandas as pd
from sklearn.metrics import confusion_matrix

# ---------- 1.  Load the file ----------
PATH = "Spain_Completo.json"        # ← set your path here
with open(PATH, "r", encoding="utf-8") as f:
    mps = json.load(f)

# ---------- 2.  Flatten into a DataFrame ----------
# Numeric annotation value -> text label.
ANNOTATION_TO_LABEL = {-1: "negative", 0: "neutral", 1: "positive"}

rows = []
for mp in mps:
    # Bucket the age; a missing age leaves the group as None.
    edad = mp.get("age")
    if edad is None:
        age_group = None
    elif edad < 40:
        age_group = "<40"
    elif edad <= 60:
        age_group = "40-60"
    else:
        age_group = ">60"

    gender = mp.get("gender")
    ideologia = mp.get("ideologia") or mp.get("party")

    for tw in mp.get("tweets", []):
        pred = tw.get("analysis", {}).get("sentiment", {}).get("label")
        real_val = tw.get("annotations", {}).get("Total")

        # Keep only tweets that have both a prediction and an annotation.
        if pred is not None and real_val is not None:
            # Normalize both sides to text labels.
            pred = pred.lower().strip()
            real = ANNOTATION_TO_LABEL.get(real_val)
            if real is not None:
                rows.append({
                    "gender": gender,
                    "age_group": age_group,
                    "ideologia": ideologia,
                    "y_true": real,
                    "y_pred": pred
                })

df = pd.DataFrame(rows)
labels = ["negative", "neutral", "positive"]

# ---------- 3.  Función auxiliar ----------
def print_confusion(sub_df: pd.DataFrame, title: str, to_latex: bool = False):
    """Print the confusion matrix of ``y_true`` vs ``y_pred`` for ``sub_df``.

    Rows are the true labels and columns the predicted labels, both in the
    order given by the module-level ``labels`` list.

    Args:
        sub_df: Frame with ``y_true`` and ``y_pred`` columns.
        title: Heading printed above the matrix; also used as the LaTeX caption.
        to_latex: When True, additionally print the matrix wrapped in a LaTeX
            ``table`` environment.
    """
    cm = confusion_matrix(sub_df["y_true"], sub_df["y_pred"], labels=labels)
    cm_df = pd.DataFrame(cm, index=labels, columns=labels)
    print(f"\n===== {title} =====")
    print(cm_df)
    if to_latex:
        # Titles such as "age_group = <40" contain characters that are special
        # in LaTeX text mode ("_" aborts compilation, "<"/">" render as other
        # glyphs with the default font encoding), so escape them per character
        # before placing the title inside \caption{}.
        latex_escapes = {
            "\\": r"\textbackslash{}",
            "&": r"\&",
            "%": r"\%",
            "$": r"\$",
            "#": r"\#",
            "_": r"\_",
            "{": r"\{",
            "}": r"\}",
            "~": r"\textasciitilde{}",
            "^": r"\textasciicircum{}",
            "<": r"\textless{}",
            ">": r"\textgreater{}",
        }
        safe_title = "".join(latex_escapes.get(ch, ch) for ch in title)
        print("\n\\begin{table}[ht]")
        print("\\centering")
        print(cm_df.to_latex(escape=False))
        print(f"\\caption{{Matriz de confusión: {safe_title}.}}")
        print("\\end{table}\n")

# ---------- 4.  Matrices per protected attribute ----------
for attr in ("gender", "age_group", "ideologia"):
    grupos = df.groupby(attr)
    for val, grp in grupos:
        # Switch to_latex to True to also get the LaTeX table.
        print_confusion(grp, f"{attr} = {val}", to_latex=False)

# ---------- 5.  Global matrix (no grouping) ----------
print_confusion(df, "Global España", to_latex=False)



===== gender = femenino =====
          negative  neutral  positive
negative         0       48         1
neutral          2      157         4
positive         0       40        10

===== gender = masculino =====
          negative  neutral  positive
negative         1       88         0
neutral          1      246         5
positive         0       56        11

===== age_group = 40-60 =====
          negative  neutral  positive
negative         1       95         0
neutral          2      291         7
positive         0       70        15

===== age_group = <40 =====
          negative  neutral  positive
negative         0        5         0
neutral          0       36         2
positive         0        8         3

===== age_group = >60 =====
          negative  neutral  positive
negative         0       36         1
neutral          1       76         0
positive         0       18         3

===== ideologia = derecha =====
          negative  neutral  positive
negative         

In [16]:
"""
Matrices de confusión (España) con las tres salidas de sentimiento
– sentiment           → analysis.sentiment.label           (string)
– sentiment_alt       → sentiment_alt_label                (string)
– sentiment_stanza    → analysis.sentiment_stanza (0/1/2)  (num)

Se equilibran los 1 000 tuits reales a 300 / 400 / 300
y se imprimen todas las matrices (global + protegidas).
"""

import json, random
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix

# ---------- 1.  Load ----------
DATA = Path("Spain_Completo_analizado.json")
with open(DATA, mode="r", encoding="utf-8") as f:
    mps = json.loads(f.read())

# ---------- 2.  Aplanado ----------
LABELS      = ["negative", "neutral", "positive"]               # fixed order
REAL_MAP    = {-1: "negative", 0: "neutral", 1: "positive"}     # annotations
STANZA_MAP  = { 0: "negative", 1: "neutral", 2: "positive"}     # stanza output

def stanza_label(val):
    """Convert a Stanza sentiment code (0/1/2) to its text label.

    Returns None when ``val`` is None or is not a key of ``STANZA_MAP``.
    """
    return STANZA_MAP.get(val)

def get_pred(tw, which):
    """Return the predicted sentiment label of tweet ``tw`` for one model.

    Args:
        tw: Tweet dict.
        which: Model name. ``"sentiment"`` reads ``analysis.sentiment.label``,
            ``"sentiment_alt"`` reads ``sentiment_alt_label`` and
            ``"sentiment_stanza"`` reads ``analysis.sentiment_stanza`` and maps
            it through ``stanza_label``.

    Returns:
        The label, or None when it is missing or ``which`` is not one of the
        names above. String labels are stripped and lower-cased so they can
        match the entries of ``LABELS``.
    """
    # `or {}` also covers keys that are present but hold null/None, which
    # `.get(key, {})` alone would not.
    analysis = tw.get("analysis") or {}

    if which == "sentiment":
        label = (analysis.get("sentiment") or {}).get("label")
    elif which == "sentiment_alt":
        label = tw.get("sentiment_alt_label")
    elif which == "sentiment_stanza":
        return stanza_label(analysis.get("sentiment_stanza"))
    else:
        return None

    # Normalize text labels; non-string values are passed through untouched.
    return label.strip().lower() if isinstance(label, str) else label

# One row per annotated tweet, carrying its MP's attributes and the
# prediction of each sentiment model.
filas = []
for mp in mps:
    gender     = mp.get("gender")
    ideologia  = mp.get("ideologia") or mp.get("party")

    # Bucket the age. A missing age must stay None: the previous
    # `edad and ...` chain fell through to ">60" for None (and for 0),
    # silently counting MPs of unknown age in the oldest group.
    edad = mp.get("age")
    if edad is None:
        age_group = None
    elif edad < 40:
        age_group = "<40"
    elif edad <= 60:
        age_group = "40-60"
    else:
        age_group = ">60"

    for tw in mp.get("tweets", []):
        real = REAL_MAP.get(tw.get("annotations", {}).get("Total"))
        if real is None:
            continue                                               # skip tweets without a usable annotation

        filas.append({
            "gender":     gender,
            "age_group":  age_group,
            "ideologia":  ideologia,
            "y_true":     real,
            "pred_sentiment":        get_pred(tw, "sentiment"),
            "pred_sentiment_alt":    get_pred(tw, "sentiment_alt"),
            "pred_sentiment_stanza": get_pred(tw, "sentiment_stanza")
        })

full = pd.DataFrame(filas)

# ---------- 3.  Resample to 1 000 tweets (300/400/300) ----------
random.seed(42)
TARGET = dict(negative=300, neutral=400, positive=300)
idx = []

for lab, target in TARGET.items():
    pool = full.loc[full["y_true"] == lab].index.tolist()
    faltan = target - len(pool)
    if faltan <= 0:
        # Enough rows: draw `target` of them without replacement.
        idx.extend(random.sample(pool, target))
    else:
        # Too few rows: keep them all and oversample with replacement.
        idx.extend(pool)
        idx.extend(random.choices(pool, k=faltan))

df = full.loc[idx].reset_index(drop=True)

# ---------- 4.  Funciones ----------
def conf(df_sub, col_pred):
    """Build the confusion matrix of ``y_true`` against column ``col_pred``.

    Rows whose prediction is missing are left out. The result is a DataFrame
    indexed by true label with one column per predicted label, both ordered
    as in ``LABELS``.
    """
    tiene_pred = df_sub[col_pred].notna()
    validos = df_sub.loc[tiene_pred]
    conteos = confusion_matrix(validos["y_true"], validos[col_pred], labels=LABELS)
    return pd.DataFrame(conteos, index=LABELS, columns=LABELS)

def show(title, cm):
    """Print a banner line containing ``title`` followed by ``cm``."""
    print(f"\n===== {title} =====", cm, sep="\n")

# ---------- 5.  Computation ----------
# Model name -> column holding that model's prediction.
MODELS = {
    nombre: f"pred_{nombre}"
    for nombre in ("sentiment", "sentiment_alt", "sentiment_stanza")
}

for mdl, col in MODELS.items():
    print(f"\n######## MATRICES – Modelo: {mdl.upper()} ########")

    # Whole resampled set first.
    show("Global", conf(df, col))

    # Then one matrix per value of each protected attribute.
    for attr in ("gender", "age_group", "ideologia"):
        for val, grp in df.groupby(attr):
            show(f"{attr} = {val}", conf(grp, col))



######## MATRICES – Modelo: SENTIMENT ########

===== Global =====
          negative  neutral  positive
negative         3      296         1
neutral          3      390         7
positive         0      242        58

===== gender = femenino =====
          negative  neutral  positive
negative         0       93         1
neutral          2      151         2
positive         0      101        25

===== gender = masculino =====
          negative  neutral  positive
negative         3      203         0
neutral          1      239         5
positive         0      141        33

===== age_group = 40-60 =====
          negative  neutral  positive
negative         3      214         0
neutral          2      285         5
positive         0      190        44

===== age_group = <40 =====
          negative  neutral  positive
negative         0       10         0
neutral          0       35         2
positive         0       16         8

===== age_group = >60 =====
          negative  