# Fase 3b: Deep Learning Models (BiLSTM)

En este notebook, usamos `Word2Vec` para entrenar embeddings y `BiLSTM` para la clasificación.

In [None]:
%load_ext autoreload
%autoreload 2
import sys
import os
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Add src
sys.path.append(os.path.abspath("../src"))
from dl_models import AdvancedDLManager

## 1. Load Data

In [None]:
# Load the processed corpus and fix a single stratified train/test partition
# (by row index) that every experiment below reuses.
data_path = Path("../data/processed_corpus.csv")

# Text columns that will each be tried as the model input.
input_columns = ['clean_text', 'lemmas_text']

# Drop rows missing the target or ANY candidate input column. Filtering only on
# 'clean_text' lets rows with a missing 'lemmas_text' through, and a later
# `.astype(str)` on that column would turn them into the literal string "nan".
df_full = pd.read_csv(data_path)
df_full = df_full.dropna(subset=[*input_columns, 'sentiment_score'])

# 'clean_text' is only used here as a carrier for the row index; the actual
# input column is selected later via `train_idx` / `test_idx`.
X_indices = df_full['clean_text']
y = df_full['sentiment_score']

# Held-out test set: stratified 80/20 split with a fixed seed so the same rows
# are held out regardless of which input column is being evaluated.
X_train_raw, X_test_real, y_train_raw, y_test_real = train_test_split(
    X_indices, y, test_size=0.2, random_state=42, stratify=y
)
train_idx = X_train_raw.index
test_idx = X_test_real.index

print(f"Indices fijados -> Train Total: {len(train_idx)}, Test Intocable: {len(test_idx)}")

Train: 94186, Test: 23547


## 2. Entrenamiento de modelos de deep learning

In [None]:
# Experiment grid. Each entry is consumed by the training loop, which passes
# 'strategy' and 'model' to AdvancedDLManager(strategy=..., model_name=...);
# 'name' is only used for logging and for labelling results.
# The Word2Vec baseline carries no model identifier (None).
experiments_config = [
    dict(name=label, strategy=strategy, model=model_id)
    for label, strategy, model_id in (
        ('Baseline: Word2Vec + BiLSTM', 'w2v', None),
        ('SOTA: All-MiniLM + MLP', 'transformer', 'sentence-transformers/all-MiniLM-L6-v2'),
        ('SOTA: BGE-Small + MLP', 'transformer', 'BAAI/bge-small-en-v1.5'),
        ('GenAI: Gemma-Embed + MLP', 'ollama', 'embeddinggemma:latest'),
    )
]

In [None]:
# For every candidate input column: build a class-balanced training set, carve a
# validation split out of it, then train and evaluate each configured experiment
# on the fixed held-out test rows. One dict per successful run is appended to
# `results_dl`.
results_dl = []

for col in input_columns:
    print(f"\n{'='*60}")
    print(f">>> PROCESANDO FEATURE: {col.upper()} <<<")
    print(f"{'='*60}")

    X_full_col = df_full[col].astype(str)

    # Re-use the fixed index partition so every column sees the same rows.
    X_train_curr = X_full_col.loc[train_idx]
    y_train_curr = y.loc[train_idx]

    X_test_curr = X_full_col.loc[test_idx]
    y_test_curr = y.loc[test_idx]

    # Temporary frame pairing text and label, to make per-class sampling easy.
    train_df_temp = pd.DataFrame({'feature': X_train_curr, 'target': y_train_curr})
    min_c = train_df_temp['target'].value_counts().min()

    print(f"Balanceando Train a {min_c} muestras por clase...")

    # Undersample every class to the size of the rarest one. Sampling each group
    # explicitly (instead of groupby(...).apply(...)) keeps the 'target' column in
    # the result regardless of how `apply` treats grouping columns: pandas 2.2
    # deprecates operating on them and newer versions exclude them, which would
    # make `balanced_train['target']` fail. Each group is seeded identically, so
    # the selected rows match the previous apply-based implementation.
    balanced_train = pd.concat(
        [
            class_rows.sample(n=min_c, random_state=42)
            for _, class_rows in train_df_temp.groupby('target')
        ],
        ignore_index=True,
    )

    X_train_bal = balanced_train['feature']
    y_train_bal = balanced_train['target']

    # 90/10 stratified train/validation split of the balanced data.
    X_tr_final, X_val_final, y_tr_final, y_val_final = train_test_split(
        X_train_bal, y_train_bal, test_size=0.1, random_state=42, stratify=y_train_bal
    )

    print(f"   Datos Finales DL -> Train: {len(X_tr_final)}, Val: {len(X_val_final)}")

    for exp in experiments_config:
        exp_id = f"{exp['name']} ({col})"
        print(f"\n   >>> Entrenando: {exp_id}")

        try:
            # Fresh manager per experiment.
            dl_man = AdvancedDLManager(strategy=exp['strategy'], model_name=exp['model'])

            # Word2Vec embeddings are only trained for the 'w2v' strategy.
            if exp['strategy'] == 'w2v':
                # Deliberately trained on ALL balanced training text (validation
                # rows included) to get a larger vocabulary.
                # NOTE(review): this means validation text is seen during embedding
                # training; validation metrics may be slightly optimistic.
                dl_man.train_w2v(X_train_bal)

            # Train the neural network.
            history = dl_man.train(X_tr_final, y_tr_final, X_val_final, y_val_final,
                                 epochs=5, batch_size=32)

            # Evaluate on the held-out (unbalanced) test set.
            print("      Evaluando en Test Set...")
            rep = dl_man.evaluate(X_test_curr, y_test_curr)

            # Store the raw report and training history for later inspection.
            results_dl.append({
                'Feature': col,
                'Model': exp['name'],
                'Report_Raw': rep,
                'History': history
            })

            # Quick one-line summary.
            # NOTE(review): assumes `rep` is a text report ending in a newline whose
            # last three rows are accuracy / macro avg / weighted avg (the layout of
            # sklearn's classification_report) — confirm against
            # AdvancedDLManager.evaluate.
            lines = rep.split('\n')
            print(f"RESULTADO: {lines[-4].strip()} | {lines[-3].strip()}")

        except Exception as e:
            # Best-effort: one failing experiment must not abort the whole grid.
            # KeyboardInterrupt is not an Exception subclass, so a manual interrupt
            # still stops the loop.
            print(f"ERROR en {exp_id}: {e}")

Using device: cpu
Training Word2Vec...
Training Word2Vec...
Word2Vec trained.
Training BiLSTM...


Epoch 1/5:   3%|▎         | 41/1325 [00:11<05:49,  3.68it/s]


KeyboardInterrupt: 

## 3. Evaluation

In [None]:
def _report_metric(report_lines, row_label, token_pos):
    """Return one whitespace-separated token from a labelled row of a text report.

    Rows are scanned from the end so the summary footer is preferred over any
    earlier row that happens to start with the same text.

    Args:
        report_lines: The report split into lines.
        row_label: Text the (stripped) row must start with, e.g. 'accuracy'.
        token_pos: Index of the token to return after `str.split()`
            (negative indices allowed).

    Returns:
        The requested token as a string, or 'n/a' when no such row exists or the
        row has too few tokens.
    """
    for raw_line in reversed(report_lines):
        row = raw_line.strip()
        if row.startswith(row_label):
            tokens = row.split()
            try:
                return tokens[token_pos]
            except IndexError:
                return 'n/a'
    return 'n/a'


# Print accuracy and macro-F1 for every stored run. Rows are located by label
# rather than by fixed position, so a report without a trailing newline (or with
# a different footer) yields 'n/a' instead of silently printing the wrong number.
for res in results_dl:
    print(f"\n[{res['Feature']}] {res['Model']}")
    lines = res['Report_Raw'].split('\n')
    # 'accuracy' row: label followed by the score -> token 1.
    print(f"   Accuracy: {_report_metric(lines, 'accuracy', 1)}")
    # 'macro avg' row: the second-to-last token is the f1-score (last is support).
    print(f"   Macro F1: {_report_metric(lines, 'macro avg', -2)}")