In [8]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Fine-tuning multimodal RoBERTa + tabular para GENDER.
Guarda checkpoints por fold, el mejor modelo, y un JSON
con los tuits que clasifica correctamente con confianza ≥ CONF_TH,
incluyendo la traducción automática al español.
"""

# ─────────────────── IMPORTS ───────────────────
import pathlib, json
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, classification_report
from sklearn.utils.class_weight import compute_class_weight

from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
from googletrans import Translator        # pip install googletrans==4.0.0-rc1
from joblib import dump

tr = Translator()

# ─────────────────── CONFIG ────────────────────
MODEL_NAME   = "cardiffnlp/twitter-roberta-base"
INPUT_CSV    = "features_linguisticas_en_con_glove.csv"

BATCH_SIZE   = 16
EPOCHS       = 10
LR           = 1e-5
FOLDS        = 5
CONF_TH      = 0.70           # confianza mínima para “tweets claros”

ckpt_dir = pathlib.Path("checkpoints_gender"); ckpt_dir.mkdir(exist_ok=True)
out_dir  = pathlib.Path("outputs_gender");     out_dir.mkdir(exist_ok=True)

# ──────────────── CARGA Y PRE-PROCESADO ─────────
df = pd.read_csv(INPUT_CSV)

# → sólo femenino / masculino
df = df[df["gender"].isin(["Female", "Male"])].copy()
df["label"] = df["gender"].map({"Female": 0, "Male": 1})


# Variables tabulares elegidas (basadas en Cohen’s d y viabilidad)
selected_vars = ['Xtwice', 'Xstop', 'Xdet', 'Xprep', 'Xmentions']
print("🔎 Variables seleccionadas:", selected_vars)

text_col = "clean_text"
req_cols = [text_col, "label"] + selected_vars
missing  = [c for c in req_cols if c not in df.columns]
if missing:
    raise KeyError(f"❌ Faltan columnas en el CSV: {missing}")

# descarta filas con NaN en cualquiera de las columnas requeridas
print("Filas antes del dropna:", len(df))
df = df[req_cols].dropna().reset_index(drop=True)
print("Filas después del dropna:", len(df))

if df.empty:
    raise ValueError("❌ El DataFrame quedó vacío tras el filtrado; "
                     "revisa las columnas y los NaN.")

# Escalado de las variables numéricas
scaler = StandardScaler()
df[selected_vars] = scaler.fit_transform(df[selected_vars])

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# ───────────────────── DATASET ──────────────────
class MultiModalDataset(Dataset):
    def __init__(self, texts, nums, labels):
        self.texts, self.nums, self.labels = texts, nums, labels

    def __len__(self): return len(self.labels)

    def __getitem__(self, idx):
        enc = tokenizer(
            self.texts[idx],
            truncation=True, padding="max_length", max_length=128,
            return_tensors="pt"
        )
        item = {k: v.squeeze(0) for k, v in enc.items()}
        item["nums"]   = torch.tensor(self.nums[idx], dtype=torch.float32)
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# ───────────────────── MODELO ───────────────────
class TransformerWithTabular(nn.Module):
    def __init__(self, backbone, n_tab):
        super().__init__()
        self.transformer = AutoModel.from_pretrained(backbone)
        self.tabular_net = nn.Sequential(
            nn.Linear(n_tab, 64), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(64, 32)
        )
        self.classifier  = nn.Sequential(
            nn.Linear(768 + 32, 64), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(64, 2)
        )

    def forward(self, input_ids, attention_mask, nums):
        cls = self.transformer(input_ids, attention_mask).last_hidden_state[:, 0]
        tab = self.tabular_net(nums)
        return self.classifier(torch.cat([cls, tab], dim=1))

# ────────────────── TRAIN / EVAL ────────────────
texts    = df[text_col].tolist()
features = df[selected_vars].to_numpy(dtype=np.float32)
labels   = df["label"].to_numpy()
device   = torch.device("cuda" if torch.cuda.is_available() else "cpu")

weights   = compute_class_weight("balanced", classes=np.unique(labels), y=labels)
loss_fn   = nn.CrossEntropyLoss(weight=torch.tensor(weights, dtype=torch.float32).to(device))
skf       = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

f1_scores, selected = [], []
best_f1, best_state, best_fold = 0.0, None, None

for fold, (tr_idx, va_idx) in enumerate(skf.split(texts, labels), 1):
    print(f"\n📦 Fold {fold}/{FOLDS}")
    train_ds = MultiModalDataset([texts[i] for i in tr_idx], features[tr_idx], labels[tr_idx])
    val_ds   = MultiModalDataset([texts[i] for i in va_idx], features[va_idx], labels[va_idx])
    train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
    val_dl   = DataLoader(val_ds,   batch_size=BATCH_SIZE)

    model = TransformerWithTabular(MODEL_NAME, len(selected_vars)).to(device)
    opt   = torch.optim.AdamW(model.parameters(), lr=LR)

    # —— entrenamiento —— #
    model.train()
    for ep in range(1, EPOCHS + 1):
        tot_loss = 0
        for batch in tqdm(train_dl, desc=f"Fold {fold}·Ep {ep}", leave=False):
            ids   = batch["input_ids"].to(device)
            attn  = batch["attention_mask"].to(device)
            nums  = batch["nums"].to(device)
            lbls  = batch["labels"].to(device)

            loss = loss_fn(model(ids, attn, nums), lbls)
            opt.zero_grad(); loss.backward(); opt.step()
            tot_loss += loss.item()
        print(f"   Ep {ep} → loss={tot_loss/len(train_dl):.4f}")

    # —— validación —— #
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for b_idx, batch in enumerate(val_dl):
            ids  = batch["input_ids"].to(device)
            attn = batch["attention_mask"].to(device)
            nums = batch["nums"].to(device)
            lbls = batch["labels"].to(device)

            probs = torch.softmax(model(ids, attn, nums), dim=1)
            confs, preds = probs.max(dim=1)

            y_true.extend(lbls.cpu().numpy())
            y_pred.extend(preds.cpu().numpy())

            # tuits claramente clasificados
            mask = (preds == lbls) & (confs >= CONF_TH)
            for loc in torch.where(mask)[0]:
                idx = va_idx[b_idx * BATCH_SIZE + loc.item()]
                txt = texts[idx]
                try:
                    txt_es = tr.translate(txt, dest="es").text
                except Exception:
                    txt_es = ""
                selected.append({
                    "row": int(idx), "text_en": txt, "text_es": txt_es,
                    "prob": round(confs[loc].item(), 4),
                    "pred": int(preds[loc]), "true": int(lbls[loc]),
                    "fold": fold
                })
    f1 = f1_score(y_true, y_pred, average="macro")
    f1_scores.append(f1)
    print(f"✅ Fold {fold} F1={f1:.4f}")
    print(classification_report(y_true, y_pred, target_names=["femenino", "masculino"]))

    if f1 > best_f1:
        best_f1, best_fold = f1, fold
        best_state = {
            "model_state": model.state_dict(),
            "backbone": MODEL_NAME,
            "selected_vars": selected_vars
        }

    # checkpoint de la rama transformer
    ckpt_fold = ckpt_dir / f"fold{fold}"
    ckpt_fold.mkdir(exist_ok=True)
    model.transformer.save_pretrained(ckpt_fold); tokenizer.save_pretrained(ckpt_fold)

# ────────────────── RESULTADOS GLOBALES ─────────
print("\n📊 F1 por fold →", [round(x,4) for x in f1_scores])
print(f"🏁 F1 macro medio: {np.mean(f1_scores):.4f}")

# —— guardar tweets claros —— #
json_path = out_dir / "clear_tweets_gender.json"
with open(json_path, "w", encoding="utf-8") as fh:
    json.dump(selected, fh, ensure_ascii=False, indent=2)
print(f"📝 Tweets claros → {json_path}")

# —— persistir mejor modelo y scaler —— #
if best_state:
    torch.save(best_state, out_dir / "best_model_gender.pt")
    print(f"🔒 Mejor modelo guardado (fold {best_fold}, F1={best_f1:.4f})")
dump(scaler, out_dir / "scaler_gender.pkl")
print("💾 Scaler guardado en scaler_gender.pkl")


🔎 Variables seleccionadas: ['Xtwice', 'Xstop', 'Xdet', 'Xprep', 'Xmentions']
Filas antes del dropna: 961
Filas después del dropna: 960

📦 Fold 1/5


                                                            

   Ep 1 → loss=0.6942


                                                            

   Ep 2 → loss=0.6904


                                                            

   Ep 3 → loss=0.6876


                                                            

   Ep 4 → loss=0.6759


                                                            

   Ep 5 → loss=0.6236


                                                            

   Ep 6 → loss=0.4902


                                                            

   Ep 7 → loss=0.2770


                                                            

   Ep 8 → loss=0.1218


                                                            

   Ep 9 → loss=0.0589


  txt_es = ""


   Ep 10 → loss=0.0346
✅ Fold 1 F1=0.5613
              precision    recall  f1-score   support

    femenino       0.49      0.60      0.54        82
   masculino       0.64      0.54      0.58       110

    accuracy                           0.56       192
   macro avg       0.57      0.57      0.56       192
weighted avg       0.58      0.56      0.56       192


📦 Fold 2/5


                                                            

   Ep 1 → loss=0.6926


                                                            

   Ep 2 → loss=0.6886


                                                            

   Ep 3 → loss=0.6847


                                                            

   Ep 4 → loss=0.6661


                                                            

   Ep 5 → loss=0.5775


                                                            

   Ep 6 → loss=0.3825


                                                            

   Ep 7 → loss=0.1833


                                                            

   Ep 8 → loss=0.0682


                                                            

   Ep 9 → loss=0.0460


  txt_es = ""


   Ep 10 → loss=0.0278
✅ Fold 2 F1=0.5720
              precision    recall  f1-score   support

    femenino       0.51      0.53      0.52        83
   masculino       0.63      0.61      0.62       109

    accuracy                           0.58       192
   macro avg       0.57      0.57      0.57       192
weighted avg       0.58      0.58      0.58       192


📦 Fold 3/5


                                                            

   Ep 1 → loss=0.6957


                                                            

   Ep 2 → loss=0.6912


                                                            

   Ep 3 → loss=0.6883


                                                            

   Ep 4 → loss=0.6836


                                                            

   Ep 5 → loss=0.6578


                                                            

   Ep 6 → loss=0.5656


                                                            

   Ep 7 → loss=0.3956


                                                            

   Ep 8 → loss=0.2267


                                                            

   Ep 9 → loss=0.1287


  txt_es = ""


   Ep 10 → loss=0.0671
✅ Fold 3 F1=0.5843
              precision    recall  f1-score   support

    femenino       0.55      0.45      0.49        83
   masculino       0.63      0.72      0.68       109

    accuracy                           0.60       192
   macro avg       0.59      0.59      0.58       192
weighted avg       0.60      0.60      0.60       192


📦 Fold 4/5


                                                            

   Ep 1 → loss=0.6954


                                                            

   Ep 2 → loss=0.6938


                                                            

   Ep 3 → loss=0.6806


                                                            

   Ep 4 → loss=0.6440


                                                            

   Ep 5 → loss=0.5349


                                                            

   Ep 6 → loss=0.3474


                                                            

   Ep 7 → loss=0.2006


                                                            

   Ep 8 → loss=0.1094


                                                            

   Ep 9 → loss=0.0567


  txt_es = ""


   Ep 10 → loss=0.0489
✅ Fold 4 F1=0.5304
              precision    recall  f1-score   support

    femenino       0.47      0.57      0.51        83
   masculino       0.60      0.50      0.55       109

    accuracy                           0.53       192
   macro avg       0.53      0.54      0.53       192
weighted avg       0.54      0.53      0.53       192


📦 Fold 5/5


                                                            

   Ep 1 → loss=0.6946


                                                            

   Ep 2 → loss=0.6905


                                                            

   Ep 3 → loss=0.6749


                                                            

   Ep 4 → loss=0.6409


                                                            

   Ep 5 → loss=0.5254


                                                            

   Ep 6 → loss=0.3349


                                                            

   Ep 7 → loss=0.1659


                                                            

   Ep 8 → loss=0.0988


                                                            

   Ep 9 → loss=0.0761


  txt_es = ""


   Ep 10 → loss=0.0399
✅ Fold 5 F1=0.5343
              precision    recall  f1-score   support

    femenino       0.47      0.54      0.50        83
   masculino       0.60      0.53      0.57       109

    accuracy                           0.54       192
   macro avg       0.54      0.54      0.53       192
weighted avg       0.55      0.54      0.54       192


📊 F1 por fold → [0.5613, 0.572, 0.5843, 0.5304, 0.5343]
🏁 F1 macro medio: 0.5565
📝 Tweets claros → outputs_gender/clear_tweets_gender.json
🔒 Mejor modelo guardado (fold 3, F1=0.5843)
💾 Scaler guardado en scaler_gender.pkl


In [10]:
# ──────────────────── RESULTADOS FINALES ─────────
print("\n📊 F1 macro por fold:", [round(f, 4) for f in f1_scores])
print(f"🏁 F1 macro promedio final: {np.mean(f1_scores):.4f}")
out_dir    = pathlib.Path("outputs")
# ----------- JSON con tweets claros --------------
json_path = out_dir / "clear_tweets_gender.json"
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(selected, f, ensure_ascii=False, indent=2)

print(f"📝 Guardados {len(selected)} tweets claros en {json_path.resolve()}")

# ----------- Guardar mejor modelo ---------------
# Guardamos el mejor fold completo: arquitectura + pesos + vars
torch.save(best_state, out_dir / "best_model.pt")
print(f"🔒 Guardado modelo completo en best_model.pt (fold {best_fold}, F1={best_f1:.4f})")

# ----------- Guardar scaler ---------------------
from joblib import dump
dump(scaler, out_dir / "scaler.pkl")
print("💾 Guardado StandardScaler en scaler.pkl")



📊 F1 macro por fold: [0.5613, 0.572, 0.5843, 0.5304, 0.5343]
🏁 F1 macro promedio final: 0.5565
📝 Guardados 492 tweets claros en /home/jupyter-lquijano/marcos/ingleses/outputs/clear_tweets_gender.json
🔒 Guardado modelo completo en best_model.pt (fold 3, F1=0.5843)
💾 Guardado StandardScaler en scaler.pkl


In [9]:
import pandas as pd

# ──────── CARGA DE DATOS ────────
df_raw = pd.read_csv("features_linguisticas_en_con_glove.csv")

# a) columnas reales
print("\n🔍 Columnas reales:")
print(sorted(df_raw.columns.tolist()))

# b) distribución de género (sin filtrar)
print("\n👥 Distribución de gender:")
print(df_raw["gender"].value_counts(dropna=False))

# c) filtrar a solo "Male" y "Female"
df_g = df_raw[df_raw["gender"].isin(["Male", "Female"])].copy()
print("\nFilas tras filtrar género:", len(df_g))

# d) % de NaN por cada variable candidata
vars_try = ['Xtwice', 'Xstop', 'Xdet', 'Xprep', 'Xmentions']
print("\n📊 Porcentaje de NaN por columna:")
print(df_g[vars_try].isna().mean().round(3) * 100)



🔍 Columnas reales:
['XWE-GloVe_1', 'XWE-GloVe_10', 'XWE-GloVe_11', 'XWE-GloVe_12', 'XWE-GloVe_13', 'XWE-GloVe_14', 'XWE-GloVe_15', 'XWE-GloVe_16', 'XWE-GloVe_17', 'XWE-GloVe_18', 'XWE-GloVe_19', 'XWE-GloVe_2', 'XWE-GloVe_20', 'XWE-GloVe_21', 'XWE-GloVe_22', 'XWE-GloVe_23', 'XWE-GloVe_24', 'XWE-GloVe_25', 'XWE-GloVe_26', 'XWE-GloVe_27', 'XWE-GloVe_28', 'XWE-GloVe_29', 'XWE-GloVe_3', 'XWE-GloVe_30', 'XWE-GloVe_31', 'XWE-GloVe_32', 'XWE-GloVe_33', 'XWE-GloVe_34', 'XWE-GloVe_35', 'XWE-GloVe_36', 'XWE-GloVe_37', 'XWE-GloVe_38', 'XWE-GloVe_39', 'XWE-GloVe_4', 'XWE-GloVe_40', 'XWE-GloVe_41', 'XWE-GloVe_42', 'XWE-GloVe_43', 'XWE-GloVe_44', 'XWE-GloVe_45', 'XWE-GloVe_46', 'XWE-GloVe_47', 'XWE-GloVe_48', 'XWE-GloVe_49', 'XWE-GloVe_5', 'XWE-GloVe_50', 'XWE-GloVe_51', 'XWE-GloVe_52', 'XWE-GloVe_53', 'XWE-GloVe_54', 'XWE-GloVe_55', 'XWE-GloVe_56', 'XWE-GloVe_57', 'XWE-GloVe_58', 'XWE-GloVe_59', 'XWE-GloVe_6', 'XWE-GloVe_60', 'XWE-GloVe_61', 'XWE-GloVe_62', 'XWE-GloVe_63', 'XWE-GloVe_64', 'XWE-GloV