In [1]:
import unicodedata
import re
import random
from datasets import load_dataset
from collections import defaultdict
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from collections import Counter
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

  from .autonotebook import tqdm as notebook_tqdm


# Ejercicio a)

In [None]:
# linea que arregla algunos errores de loadeo de datasets
!pip install --upgrade datasets

In [2]:
DATA_URLS = {
    "train": "https://huggingface.co/datasets/PlanTL-GOB-ES/SQAC/resolve/main/train.json",
    "dev":   "https://huggingface.co/datasets/PlanTL-GOB-ES/SQAC/resolve/main/dev.json",
    "test":  "https://huggingface.co/datasets/PlanTL-GOB-ES/SQAC/resolve/main/test.json",
}

raw = load_dataset(
    "json",
    data_files=DATA_URLS,
    field="data",
)

questions = []

for i in range(0, len(raw["train"])):
  for p in raw["train"][i]['paragraphs']:
    p_questions = [qas['question'] for qas in p['qas']]
    questions += p_questions

N_QUESTIONS = 5000
questions = questions[:N_QUESTIONS]
print(f"Se descargaron {len(questions)} preguntas en Español.")

Se descargaron 5000 preguntas en Español.


In [3]:
dataset_rnn = load_dataset("google/wmt24pp", "en-es_MX", split="train")
oraciones_rnn = dataset_rnn['target'][1:]

print(f"Se descargaron {len(oraciones_rnn)} oraciones en Español (del dataset del notebook 10).")

Se descargaron 997 oraciones en Español (del dataset del notebook 10).


In [4]:
oraciones_sinteticas = []
import json
with open('./datasets.json', 'r') as file:
  data = json.load(file)

oraciones_sinteticas = data['otros'] + data['marcas']
print(f"Hay {len(oraciones_sinteticas)} oraciones sintéticas.")

Hay 1413 oraciones sintéticas.


In [5]:
oraciones_raw = oraciones_rnn + questions + oraciones_sinteticas

In [6]:
def build_vocabulario(oraciones):
    text = " ".join(oraciones)
    text = unicodedata.normalize("NFD", text)
    text = text.encode("ascii", "ignore").decode("utf-8")
    words = re.findall(r"[A-Za-z]+", text.lower())

    unique = dict.fromkeys(words)
    return {word: idx for idx, word in enumerate(unique)}

word_to_index = build_vocabulario(oraciones_raw)
print(f"construido un vocabulario de {len(word_to_index)} palabras.")

construido un vocabulario de 14423 palabras.


## Funciones

In [7]:
def _tiene_acento(word):
    return any(unicodedata.category(c) == 'Mn' for c in unicodedata.normalize('NFD', word))

def _get_capitalization_type(word):
    if not word or word.islower(): return 0
    if word.istitle(): return 1
    if word.isupper(): return 3
    if any(c.isupper() for c in word[1:]): return 2
    return 0

def _get_punctuation_type(sentence, start, end):
    before = sentence[start - 1] if start > 0 else ""
    after = sentence[end] if end < len(sentence) else ""
    has_apertura = (before in '¿¡')
    has_cierre = (after in '?!')
    if has_apertura and has_cierre: return 0
    if has_apertura: return 1
    if has_cierre: return 2
    if '.' in (before, after): return 3
    if ',' in (before, after): return 4
    return 5

def procesar_oracion_sintetizado(sentence, word_to_index):
    matches = list(re.finditer(r'\b\w+\b', sentence))
    if not matches:
        return []

    total_words = len(matches)
    cleaned_words = [unicodedata.normalize('NFD', m.group(0)).encode('ascii', 'ignore').decode('utf-8').lower() for m in matches]
    tokens = [word_to_index.get(cw, -1) for cw in cleaned_words]

    return [{
        "word": cleaned_words[i],
        "token": tokens[i],
        "prev_token": tokens[i - 1] if i > 0 else -1,
        "next_token": tokens[i + 1] if i < total_words - 1 else -1,
        "has_accent": 1 if _tiene_acento(match.group(0)) else 0,
        "position": round(i / (total_words - 1), 2) if total_words > 1 else 0.0,
        "punctuation_type": _get_punctuation_type(sentence, match.start(), match.end()),
        "capitalization_type": _get_capitalization_type(match.group(0)),
    } for i, match in enumerate(matches)]

In [284]:
def reconstruct_input(oracion, clf_cap, clf_punc):
    processed = procesar_oracion_sintetizado(oracion, word_to_index)
    # , item['position']
    X = [
        [item['token'], item['prev_token'], item['next_token'], item['has_accent']]
        for item in processed
    ]
    caps = clf_cap.predict(X)
    puncs = clf_punc.predict(X)

    def cap2(word):
        return (word[0].upper() + word[1:-1].lower() + word[-1].upper()) if len(word) > 1 else word.upper()

    cap_funcs = {
        0: str.lower,
        1: str.capitalize,
        2: cap2,
        3: str.upper,
    }

    punc_templates = {
        0: lambda w: f"¿{w}?",
        1: lambda w: f"¿{w}",
        2: lambda w: f"{w}?",
        3: lambda w: f"{w}.",
        4: lambda w: f"{w},",
    }

    palabras = []
    for item, c, p in zip(processed, caps, puncs):
        word = cap_funcs.get(c, lambda w: w)(item['word'])
        word = punc_templates.get(p, lambda w: w)(word)
        palabras.append(word)

    return " ".join(palabras)


In [285]:
def undersample_classes(X, y, targets, seed=None):
    if seed is not None:
        random.seed(seed)

    groups = defaultdict(list)
    for xi, yi in zip(X, y):
        groups[yi].append((xi, yi))

    new_pairs = []
    for cls, items in groups.items():
        if cls in targets:
            n = min(len(items), targets[cls])
            new_pairs.extend(random.sample(items, n))
        else:
            new_pairs.extend(items)

    random.shuffle(new_pairs)
    X_new, y_new = zip(*new_pairs)
    return list(X_new), list(y_new)


## Armamos el dataset


In [286]:
curated_dataset = []

for sentence in oraciones_raw:
    curated_dataset.append(procesar_oracion_sintetizado(sentence, word_to_index))

master_dataset = [item for sublist in curated_dataset for item in sublist]

print(f"Tamaño de dataset: {len(master_dataset)}")

Tamaño de dataset: 91365


## RF

In [287]:
# , item['position']
X = [[item['token'], item['prev_token'], item['next_token'], item['has_accent']] for item in master_dataset]

y_capitalization = [item['capitalization_type'] for item in master_dataset]
y_punctuation = [item['punctuation_type'] for item in master_dataset]

In [288]:
# manejo desbalanceo en puntuacion

targets = {5: 20000, 1: 10000, 2: 10000}
X_punctuation, y_punctuation = undersample_classes(X, y_punctuation, targets, seed=42)

Counter(y_punctuation)

Counter({5: 20000, 1: 7447, 2: 5475, 3: 3277, 4: 2259, 0: 48})

In [289]:
# manejo desbalanceo en capitalizacion
targets = {0: 50000}
X_cap, y_cap = undersample_classes(X, y_capitalization, targets, seed=42)


Counter(y_cap)

Counter({0: 50000, 1: 17009, 3: 917, 2: 178})

In [290]:
# ARBOL PARA CAPITALIZACION
X_train, X_test, y_train, y_test = train_test_split(X_cap, y_cap, test_size=0.2, random_state=42)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

score = clf.score(X_test, y_test)
print(f"Accuracy: {score}")

Accuracy: 0.8939138095587695


In [291]:
# ARBOL PARA PUNTUACION
X_train, X_test, y_train, y_test = train_test_split(X_punctuation, y_punctuation, test_size=0.2, random_state=42)
clf_punctuation = RandomForestClassifier(n_estimators=100, random_state=42)

clf_punctuation.fit(X_train, y_train)

score = clf_punctuation.score(X_test, y_test)
print(f"Accuracy: {score}")

Accuracy: 0.8631524279407946


In [307]:
resultado = reconstruct_input("chau llamaron de la nasa para ver cómo sacar plata de paypal", clf, clf_punctuation)
print(resultado)

chau. ¿llamaron de la NASA para ver como sacar plata de PaypaL.


# Ejericio b)

In [8]:
device = torch.device('cpu')
if torch.cuda.is_available():
    device = torch.device('cuda')

torch.set_default_device(device)
print(f"Using device = {torch.get_default_device()}")

Using device = cuda:0


In [85]:
import torch.nn.functional as F

class SimpleAttention(nn.Module):
    def __init__(self, feature_dim):
        super(SimpleAttention, self).__init__()
        self.feature_dim = feature_dim
        self.query_proj = nn.Linear(feature_dim, feature_dim)
        self.key_proj = nn.Linear(feature_dim, feature_dim)
        self.value_proj = nn.Linear(feature_dim, feature_dim)

    def forward(self, query, keys, values):
        q = self.query_proj(query)
        k = self.key_proj(keys)
        v = self.value_proj(values)

        d_k = self.feature_dim
        attention_scores = (q @ k.transpose(-2, -1)) / torch.sqrt(d_k)
        attention_weights = F.softmax(attention_scores, dim=-1)

        context_vector = torch.matmul(attention_weights, v)
        return context_vector, attention_weights

In [None]:
class CapitalizationRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(CapitalizationRNN, self).__init__()

        self.model_name = "bert-base-multilingual-cased"
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
        self.bert = BertModel.from_pretrained(self.model_name)
        self.bert.eval()

        self.rnn = nn.LSTM(
            input_size=self.bert.config.hidden_size,
            hidden_size=hidden_size,
            num_layers=4,
            batch_first=True,
            bidirectional=False
        )

        self.attention = SimpleAttention(hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input_ids):
        batch_size, seq_len = input_ids.size()
        device = input_ids.device

        with torch.no_grad():
            bert_out = self.bert(input_ids, attention_mask=mask)
            embeddings = bert_out.last_hidden_state
        h0 = torch.zeros(4, batch_size, self.rnn.hidden_size).to(device)
        c0 = torch.zeros(4, batch_size, self.rnn.hidden_size).to(device)

        rnn_out, _ = self.rnn(embeddings, (h0, c0))

        context_out, _ = self.attention(rnn_out, rnn_out, rnn_out)

        out = self.fc(context_out) 
        return out


In [10]:
class CapitalizationDataset(Dataset):
    def __init__(self, tokenized_sentences, labels, tokenizer, pad_label=-100):
        """
        tokenized_sentences: lista de listas de tokens BERT
        labels: lista de listas de ints (misma longitud que cada oración)
        """
        self.tokenizer = tokenizer
        self.sentences = tokenized_sentences
        self.labels = labels
        self.pad_token_id = tokenizer.pad_token_id
        self.pad_label = pad_label

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        tokens = self.sentences[idx]
        label_seq = self.labels[idx]

        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        return input_ids, label_seq

In [11]:
def collate_batch(batch, pad_token_id, pad_label=-100):
    input_ids, labels = zip(*batch)
    max_len = max(len(x) for x in input_ids)

    padded_ids = []
    padded_labels = []

    for ids, lbls in zip(input_ids, labels):
        pad_len = max_len - len(ids)
        padded_ids.append(ids + [pad_token_id] * pad_len)
        padded_labels.append(lbls + [pad_label] * pad_len)

    return torch.tensor(padded_ids), torch.tensor(padded_labels)


## Armo el dataset

In [73]:
tokenized_sentences = []
labels_list = []

for oracion in oraciones_raw:
    if len(oracion.split(" ")) > 6:
        continue
    palabras = oracion.split()
    tokens_por_palabra = [tokenizer.tokenize(palabra) for palabra in palabras]
    cap_labels = [_get_capitalization_type(palabra) for palabra in palabras]

    tokenized = [token for sublist in tokens_por_palabra for token in sublist]
    labels = [label for label, tokens in zip(cap_labels, tokens_por_palabra) for _ in tokens]

    if len(tokenized) > 512:
        print(f"⚠️ Oración demasiado larga (descartada): {oracion}")
        continue

    minority_ratio = sum(lbl != 0 for lbl in cap_labels) / len(cap_labels)

    tokenized_sentences.append(tokenized)
    labels_list.append(labels)

In [75]:
from collections import Counter
import torch

# Aplanar todas las etiquetas
all_labels = [label for seq in labels_list for label in seq]
counts = Counter(all_labels)
print("Label counts:", counts)


Label counts: Counter({0: 9286, 1: 7568, 2: 412, 3: 344})


In [86]:
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
dataset = CapitalizationDataset(tokenized_sentences, labels_list, tokenizer)
dataloader = DataLoader(
    dataset,
    batch_size=16,
    shuffle=True,
    generator=torch.Generator(device=device),
    collate_fn=lambda x: collate_batch(x, tokenizer.pad_token_id)
)

model = CapitalizationRNN(hidden_size=128, output_size=4)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
import torch
import torch.nn.functional as F

# Definición de focal loss
def focal_loss(inputs, targets, alpha, gamma=2, ignore_index=-100):
    """
    inputs: Tensor [N, C] logits sin softmax
    targets: Tensor [N] con labels en {0,…,C-1} o ignore_index
    alpha: Tensor [C] pesos por clase
    gamma: coeficiente de focalización
    """
    # Calculamos la CE por elemento (sin reducción)
    ce = F.cross_entropy(inputs, targets,
                         reduction='none',
                         weight=alpha,
                         ignore_index=ignore_index)
    # Probabilidad del target
    p_t = torch.exp(-ce)
    # Focal term
    loss = (1 - p_t)**gamma * ce
    # Ignoramos posiciones de padding en el promedio
    valid_mask = (targets != ignore_index).float()
    return (loss * valid_mask).sum() / valid_mask.sum()

# — Antes de entrenar, calculas alpha (pesos de clase) como ya tienes:
all_labels = [lab for seq in labels_list for lab in seq]
counts = Counter(all_labels)
total = sum(counts.values())
eps = 1e-6
class_weights = [ total / (counts[i] + eps) for i in range(4) ]
alpha = torch.tensor(class_weights, dtype=torch.float).to(device)

# Reemplazamos criterion y scheduler si lo quisieras:
optimizer = optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(1, 41):
    model.train()
    epoch_loss = 0.0

    for input_ids, targets in dataloader:
        input_ids = input_ids.to(device)          # [B, S]
        targets   = targets.to(device)            # [B, S]

        logits = model(input_ids)                 # [B, S, C]
        logits = logits.view(-1, 4)               # [B*S, C]
        targets = targets.view(-1)                # [B*S]

        loss = focal_loss(logits, targets,
                          alpha=alpha,
                          gamma=2,
                          ignore_index=-100)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    print(f"Epoch {epoch:02d} — Loss: {epoch_loss/len(dataloader):.4f}")


TypeError: sqrt(): argument 'input' (position 1) must be Tensor, not int

In [78]:
def predict_capitalization(model, sentence, tokenizer, device):
    model.eval()
    words = sentence.strip().split()
    
    tokens_per_word = [tokenizer.tokenize(word) for word in words]
    flat_tokens = [tok for toks in tokens_per_word for tok in toks]

    input_ids = tokenizer.convert_tokens_to_ids(flat_tokens)
    input_tensor = torch.tensor([input_ids], dtype=torch.long).to(device)

    with torch.no_grad():
        outputs = model(input_tensor)
        probs = torch.round(torch.softmax(outputs, dim=-1)).squeeze(0)
        print(f"outputs: {probs}")
        predictions = torch.argmax(outputs, dim=-1).squeeze(0).tolist()

    index = 0
    result = []
    for word, toks in zip(words, tokens_per_word):
        length = len(toks)
        pred = predictions[index:index+length]
        most_common = max(set(pred), key=pred.count)
        result.append((word, most_common))
        index += length

    return result


In [84]:
sample = "españa es un país de europa"
preds = predict_capitalization(model, sample, tokenizer, device)
for word, label in preds:
    print(f"{word}: {label}")


outputs: tensor([[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.]], device='cuda:0')
españa: 0
es: 0
un: 0
país: 0
de: 0
europa: 0


In [124]:
from sklearn.metrics import classification_report

all_preds = []
all_targets = []

model.eval()
with torch.no_grad():
    for input_ids, targets in dataloader:
        input_ids, targets = input_ids.to(device), targets.to(device)
        outputs = model(input_ids)
        preds = torch.argmax(outputs, dim=-1)

        # Aplanar y filtrar padding
        flat_preds = preds.view(-1).cpu().tolist()
        flat_targets = targets.view(-1).cpu().tolist()
        for p, t in zip(flat_preds, flat_targets):
            if t != -100:
                all_preds.append(p)
                all_targets.append(t)

# Mostrar reporte
print(classification_report(all_targets, all_preds, labels=[0, 1, 2, 3]))


              precision    recall  f1-score   support

           0       0.05      0.73      0.09        26
           1       0.17      0.92      0.28        25
           2       0.00      0.00      0.00         0
           3       1.00      0.92      0.96      6702

    accuracy                           0.92      6753
   macro avg       0.30      0.64      0.33      6753
weighted avg       0.99      0.92      0.95      6753



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


# By Flor (GPT)

GPT lo limpió un poco y lo completó con lo que faltaba....

In [None]:
from transformers import BertTokenizer, BertModel
import unicodedata, re, random
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix
from collections import Counter

In [None]:
# ## 2. Load Tokenizer and (optional) BERT Embeddings
model_name = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)
# (Later: extract embeddings if desired)

In [None]:
# ## 3. Text Cleaning and Feature Extraction Functions

def limpiar_texto(texto):
    texto = texto.replace('"', '').replace("'", '')
    texto = re.sub(r'FIN[\s\S]*$', '', texto.strip())
    return texto.strip()

# Detect accent

def tiene_acento(word):
    w = re.sub(r"ñ", "n", word)
    normalized = unicodedata.normalize("NFD", w)
    cleaned = re.sub(r"[^\w\s]", "", normalized)
    return w != cleaned

# Capitalization type

def get_capitalization_type(word):
    if word.islower(): return 0
    elif word.istitle(): return 1
    elif word.isupper(): return 3
    elif any(c.isupper() for c in word[1:]): return 2
    else: return 0

# Punctuation type around token

def get_punctuation_type(sentence, start, end):
    before = sentence[start-1] if start > 0 else ''
    after  = sentence[end]     if end < len(sentence) else ''
    if before=='¿' and after=='?': return 0
    if before=='¿':                return 1
    if after=='?':                 return 2
    if before=='.' or after=='.':  return 3
    if before==',' or after==',':  return 4
    return 5

# Process one sentence into list of feature dicts

def procesar_oracion(sentence, word_to_index):
    original = [(m.group(), m.start(), m.end()) for m in re.finditer(r'\b\w+\b', sentence)]
    cleaned = unicodedata.normalize("NFD", sentence).encode("ascii","ignore").decode("utf-8")
    cleaned = re.sub(r"[^a-zA-Z\s]+", "", cleaned).lower().strip()
    words = cleaned.split()
    N = len(words)
    output = []
    used = set()
    for i, w in enumerate(words):
        token_id = word_to_index.get(w, -1)
        prev_id  = word_to_index.get(words[i-1], -1) if i>0 else -1
        next_id  = word_to_index.get(words[i+1], -1) if i<N-1 else -1
        punct_type = 5
        orig_word = ''
        for j, (ow, s, e) in enumerate(original):
            if j in used: continue
            ow_clean = unicodedata.normalize("NFD", ow).encode("ascii","ignore").decode("utf-8")
            ow_clean = re.sub(r"[^\w\s]","", ow_clean).lower()
            if ow_clean == w:
                used.add(j)
                orig_word = ow
                punct_type = get_punctuation_type(sentence, s, e)
                break
        accent = 1 if tiene_acento(orig_word) else 0
        pos_norm = round(i/(N-1), 2) if N>1 else 0.0
        cap_type = get_capitalization_type(orig_word)
        output.append({
            'token': token_id,
            'prev_token': prev_id,
            'next_token': next_id,
            'has_accent': accent,
            'position': pos_norm,
            'punctuation_type': punct_type,
            'capitalization_type': cap_type
        })
    return output

In [None]:
# ## 4. Build Vocabulary from Corpus
# Load CSV corpus
df = pd.read_csv('./full_corpus.csv')
# Clean text
sentences = df['text'].apply(limpiar_texto).tolist()
# Build blob
blob = " ".join(sentences)
blob = unicodedata.normalize("NFD", blob).encode("ascii","ignore").decode("utf-8")
blob = re.sub(r"[^a-zA-Z]+"," ", blob).lower().strip()
# Token-to-index map
def build_tokenizer(blob):
    vocab, idx = {}, 0
    for w in blob.split():
        if w not in vocab:
            vocab[w] = idx; idx += 1
    return vocab
word_to_index = build_tokenizer(blob)

In [None]:
# ## 5. Extract Features for All Tokens
all_features = []
for sent in sentences:
    all_features.extend(procesar_oracion(sent, word_to_index))

In [None]:
# ## 6. Prepare X and y for Three Tasks
# Features for model
X = [[f['token'], f['has_accent'], f['position']] for f in all_features]
# Labels
y_init = [1 if f['punctuation_type']==1 else 0 for f in all_features]  # apertura
y_final = [1 if f['punctuation_type']==2 else 0 for f in all_features] # cierre
# Multi-class for final punctuation: 0=ambas,1=apertura,2=cierre,3=punto,4=coma,5=ninguna
y_punc_multi = [f['punctuation_type'] for f in all_features]
# Capitalization classes
y_cap = [f['capitalization_type'] for f in all_features]

In [None]:
# ## 7. Train/Test Split for Each Task
X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(X, y_init, test_size=0.2, random_state=42)
X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(X, y_final, test_size=0.2, random_state=42)
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(X, y_punc_multi, test_size=0.2, random_state=42)
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X, y_cap, test_size=0.2, random_state=42)


In [None]:
# %%
# ## 8. Train Random Forests
clf_init  = RandomForestClassifier(n_estimators=100, random_state=42)
clf_final = RandomForestClassifier(n_estimators=100, random_state=42)
clf_multi = RandomForestClassifier(n_estimators=100, random_state=42)
clf_cap   = RandomForestClassifier(n_estimators=100, random_state=42)

clf_init.fit(X_train_i, y_train_i)
clf_final.fit(X_train_f, y_train_f)
clf_multi.fit(X_train_m, y_train_m)
clf_cap.fit(X_train_c, y_train_c)

In [None]:
# ## 9. Evaluation with F1-macro and Confusion Matrices
for name, clf, X_t, y_t in [
    ('Apertura', clf_init, X_test_i, y_test_i),
    ('Cierre',  clf_final, X_test_f, y_test_f),
    ('Multi-Punc',clf_multi, X_test_m, y_test_m),
    ('Capitalización',clf_cap, X_test_c, y_test_c)
]:
    y_pred = clf.predict(X_t)
    f1 = f1_score(y_t, y_pred, average='macro')
    print(f"F1-macro {name}: {f1:.4f}")
    cm = confusion_matrix(y_t, y_pred)
    print(f"Confusion matrix {name}:\n", cm)

Esto reúne las predicciones de árbol:

In [None]:
# ### Pipeline function (optional)
def pipeline_full(sentence):
    feats = procesar_oracion(sentence, word_to_index)
    X_in = [[f['token'], f['has_accent'], f['position']] for f in feats]
    pi = clf_init.predict(X_in); pf = clf_final.predict(X_in)
    pm = clf_multi.predict(X_in); pc = clf_cap.predict(X_in)
    out = []
    for f, i_lbl, f_lbl, c_lbl in zip(feats, pi, pm, pc):
        w = f.get('word', '')
        # apply capitalization
        if c_lbl==1: w = w.capitalize()
        elif c_lbl==2 and len(w)>1: w = w[0].upper()+w[1:-1].lower()+w[-1].upper()
        elif c_lbl==3: w = w.upper()
        else: w = w.lower()
        # apply punctuation (multi-label)
        if f_lbl==1: w = '¿'+w
        if i_lbl==1: w = w+'?'
        out.append(w)
    return ' '.join(out)