# Tema 5: Transformers

In [None]:
from pathlib import Path

# Data files are read from a `data` directory that is a sibling of the
# current working directory.
PATH_DATA = Path.cwd().parent.joinpath('data')

## Ejercicio 1
Entrenar un encoder transformer simplificado para análisis de sentimientos en español.

### Apartado a
Importar librerías y definir datos de ejemplo.

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, MultiHeadAttention, LayerNormalization, Embedding, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Toy corpus: every sentence is stored next to its sentiment label
# (1: positive, 0: negative) so the two cannot drift out of step.
labeled_sentences = [
    ('Estoy un poco harto del día a día, nada mejora', 0),
    ('Hoy es un buen día', 1),
    ('No se te ve satisfecho con el trabajo', 0),
    ('Este paisaje es hermoso y bonito', 1),
]
sentences = [text for text, _ in labeled_sentences]
labels = [label for _, label in labeled_sentences]

### Apartado b
Preprocesamiento del texto.

In [None]:
vocab_size = 1000  # cap on the token ids the tokenizer emits; also the embedding table size
max_length = 10    # every sequence is padded/truncated to this many tokens

# Learn the word index from the training sentences, then map each sentence
# to a list of integer token ids.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

# pad_sequences pads (and truncates) at the front by default; id 0 is padding.
X = pad_sequences(sequences, maxlen=max_length)

# Token ids are indices into the embedding table, so they are kept as
# integers rather than float32.
X = tf.convert_to_tensor(X, dtype=tf.int32)
# Binary targets stay float to match the sigmoid output / binary cross-entropy.
labels = tf.convert_to_tensor(labels, dtype=tf.float32)

### Apartado c
Crear modelo transformer encoder.

In [None]:
def create_transformer_classifier(vocab_size, max_length):
    """Build a one-block transformer encoder for binary sentence classification.

    Architecture: token embedding (32 dims) -> 2-head self-attention ->
    residual connection + layer normalization -> average pooling over the
    sequence axis -> a single sigmoid unit.

    No positional information is added to the embeddings, so the model does
    not see token order.

    Args:
        vocab_size: Size of the embedding table (number of distinct token ids).
        max_length: Number of token ids in each padded input sequence.

    Returns:
        An uncompiled Keras ``Model`` mapping ``(batch, max_length)`` token
        ids to ``(batch, 1)`` probabilities.
    """
    # Token ids index the embedding table, so the input is declared integer.
    inputs = Input(shape=(max_length,), dtype="int32")

    embedded = Embedding(vocab_size, 32)(inputs)

    # Self-attention: query, value and key are all the embedded sequence.
    attended = MultiHeadAttention(num_heads=2, key_dim=32)(embedded, embedded, embedded)

    # Residual connection followed by layer normalization.
    x = LayerNormalization()(attended + embedded)

    # Collapse the sequence axis into one vector per sentence.
    x = GlobalAveragePooling1D()(x)
    outputs = Dense(1, activation='sigmoid')(x)

    return Model(inputs, outputs)


# Instantiate the classifier and configure it for binary classification
# (sigmoid output paired with binary cross-entropy).
model = create_transformer_classifier(vocab_size=vocab_size, max_length=max_length)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

### Apartado d
Entrenar el modelo.

In [None]:
# Ten epochs over the labelled sentences, two sentences per batch.
history = model.fit(x=X, y=labels, batch_size=2, epochs=10, verbose=1)

### Apartado e
Evaluar con nuevas frases.

In [None]:
test_sentences = [
    'No fui al estreno de la película porque nadie me quería acompañar',
    'Envidio de buena manera a los que tienen la oportunidad de ir mañana al estadio',
    'Se nos está volviendo costumbre del domingo por la noche, ver el episodio anterior de SNL y eso me hace recibir el lunes con mejor humor',
    'Al final decidí no ir al cine porque estaba cansada'
]

# Reuse the tokenizer fitted on the training sentences. It was created
# without an OOV token, so words it has never seen are dropped, and
# pad_sequences trims anything longer than `max_length`.
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, maxlen=max_length)
# Token ids are embedding indices, so they are passed as integers.
test_padded = tf.convert_to_tensor(test_padded, dtype=tf.int32)

# One row per sentence; each row holds a single sigmoid probability.
predictions = model.predict(test_padded)

for sentence, prediction in zip(test_sentences, predictions):
    # Pull out the scalar so the threshold test does not rely on the truth
    # value of a one-element array.
    probability = float(prediction[0])
    sentiment = "Positivo" if probability > 0.5 else "Negativo"
    print(f"Frase: '{sentence}'")
    print(f"Sentimiento: {sentiment} (probabilidad: {probability:.2f})")
    print()

## Ejercicio 2
Crear una arquitectura transformer tipo encoder para detectar fake news en español. Comparar versiones con y sin positional embeddings.

### Apartado a
Importar librerías, fijar semillas y cargar los datos.

In [None]:
import os
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from pathlib import Path
from keras_nlp.layers import PositionEmbedding
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, GlobalAveragePooling1D, LayerNormalization, MultiHeadAttention
from tensorflow.keras.optimizers import Adam

PATH_DATA = Path.cwd().parent / 'data'

# Seed Python, NumPy and TensorFlow random number generators.
SEED = 42
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

df = pd.read_excel(str(PATH_DATA / 'train.xlsx'), engine="openpyxl")

# Both columns are forced to str before being turned into plain lists.
texts = df["Text"].astype(str).tolist()
labels = df["Category"].astype(str).tolist()

# Replace category names with integer class ids; `label_encoder.classes_`
# keeps the names for the classification report.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

# A notebook cell only renders its last expression, so the preview goes at
# the end of the cell instead of the middle, where it produced no output.
df.head()

### Apartado b
División del dataset y tokenización.

In [None]:
# 80 % of the samples go to training; the held-out 20 % is then halved into
# validation and test sets. Both splits reuse the same seed.
X_train_texts, X_temp_texts, y_train, y_temp = train_test_split(
    texts, labels, random_state=SEED, test_size=0.2
)
X_val_texts, X_test_texts, y_val, y_test = train_test_split(
    X_temp_texts, y_temp, random_state=SEED, test_size=0.5
)

In [None]:
VOCAB_SIZE = 10000
MAX_LEN = 200

# The vocabulary is learned from the training split only; words unknown to
# it are mapped to the "<OOV>" token in every split.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train_texts)

# Text -> list of token ids, one split at a time (train, validation, test).
X_train_seq, X_val_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split_texts)
    for split_texts in (X_train_texts, X_val_texts, X_test_texts)
)

# Token-id lists -> fixed-width arrays with MAX_LEN columns.
X_train, X_val, X_test = (
    pad_sequences(split_seq, maxlen=MAX_LEN)
    for split_seq in (X_train_seq, X_val_seq, X_test_seq)
)

### Apartado c
Definir bloque transformer encoder.

In [None]:
def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0.1):
    """Apply one transformer encoder block to ``inputs``.

    The block is self-attention followed by a two-layer feed-forward
    network; each sub-layer has dropout, a residual connection and layer
    normalization.

    Args:
        inputs: Sequence tensor; its last dimension is the model width.
        head_size: ``key_dim`` of every attention head.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden feed-forward layer.
        dropout: Dropout rate used after attention and after the feed-forward net.

    Returns:
        A tensor whose last dimension equals that of ``inputs``.
    """
    # Self-attention sub-layer: `inputs` is both query and value.
    attn_out = MultiHeadAttention(key_dim=head_size, num_heads=num_heads)(inputs, inputs)
    attn_out = Dropout(dropout)(attn_out)
    attn_norm = LayerNormalization(epsilon=1e-6)(attn_out + inputs)

    # Feed-forward sub-layer: expand to ff_dim, project back to the model
    # width so the residual addition is well-defined.
    model_dim = inputs.shape[-1]
    hidden = Dense(ff_dim, activation="relu")(attn_norm)
    projected = Dense(model_dim)(hidden)
    projected = Dropout(dropout)(projected)
    return LayerNormalization(epsilon=1e-6)(attn_norm + projected)

### Apartado d
Construir modelo parametrizado (con y sin positional embeddings).

In [None]:
def build_model(use_positional_embedding):
    """Build and compile the fake-news classifier.

    Args:
        use_positional_embedding: When true, a learned ``PositionEmbedding``
            is added to the token embeddings before the encoder block.

    Returns:
        A compiled Keras ``Model`` taking ``MAX_LEN`` int32 token ids and
        producing one sigmoid probability. Its summary is printed as a side
        effect.
    """
    token_ids = Input(shape=(MAX_LEN,), dtype="int32")

    features = Embedding(input_dim=VOCAB_SIZE, output_dim=64)(token_ids)

    if use_positional_embedding:
        # Position embeddings are summed with the token embeddings.
        features = features + PositionEmbedding(sequence_length=MAX_LEN)(features)

    features = transformer_encoder(features, head_size=64, num_heads=2, ff_dim=128)

    # Classification head: pool over the sequence, one hidden layer, sigmoid.
    features = GlobalAveragePooling1D()(features)
    features = Dropout(0.1)(features)
    features = Dense(64, activation="relu")(features)
    features = Dropout(0.1)(features)
    probability = Dense(1, activation="sigmoid")(features)

    classifier = Model(token_ids, probability)
    classifier.compile(optimizer=Adam(1e-4), loss="binary_crossentropy", metrics=["accuracy"])
    classifier.summary()
    return classifier

### Apartado e
Entrenar y evaluar ambos modelos.

In [None]:
def train_and_evaluate(use_positional_embedding):
    """Train one model variant and print its test-set classification report.

    Args:
        use_positional_embedding: Forwarded to ``build_model``.

    Returns:
        The trained Keras model.
    """
    if use_positional_embedding:
        name = "CON_POSITIONAL"
    else:
        name = "SIN_POSITIONAL"
    print(f"\n Entrenando modelo: {name}\n")

    trained = build_model(use_positional_embedding)
    trained.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=5, batch_size=32, verbose=2,
    )

    # Threshold the sigmoid outputs at 0.5 and flatten to a 1-D label vector.
    test_probs = trained.predict(X_test)
    test_labels = (test_probs > 0.5).astype(int).reshape(-1)

    print(f"\n Reporte para {name}")
    print(classification_report(y_test, test_labels, target_names=label_encoder.classes_))
    return trained

In [None]:
# Train the variant with positional embeddings first, then the one without.
model_with_pos, model_without_pos = (
    train_and_evaluate(use_positional_embedding=flag) for flag in (True, False)
)

## Ejercicio 3
Utilizar un decoder transformer para predicción autorregresiva de tokens.

### Apartado a
Importar librerías y definir datos de ejemplo.

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, MultiHeadAttention, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Training corpus for the decoder: one four-word sentence per line.
sentences = """
El perro corre feliz
El gato salta ágil
La tortuga camina lenta
El caballo trota fuerte
El perro ladra ruidoso
El gato duerme tranquilo
La tortuga nada lenta
El caballo galopa veloz
El perro juega contento
El gato observa curioso
La tortuga descansa pacífica
El caballo relincha bravo
El perro huele atento
El gato maúlla suave
La tortuga explora cautelosa
El caballo corre elegante
""".strip().splitlines()

### Apartado b
Preprocesamiento del texto.

In [None]:
vocab_size = 40
max_length = 10
embedding_dim = 32

tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

# Next-token training pairs: the input drops the last token of each
# sentence, the target drops the first, so target[i] follows input[i].
X = [tokens[:-1] for tokens in sequences]
y = [tokens[1:] for tokens in sequences]

# Pad both to `max_length` and store them as int32 tensors.
X = tf.convert_to_tensor(pad_sequences(X, maxlen=max_length), dtype=tf.int32)
y = tf.convert_to_tensor(pad_sequences(y, maxlen=max_length), dtype=tf.int32)

### Apartado c
Crear modelo transformer decoder con causal masking.

In [None]:
def create_transformer_decoder(vocab_size, max_length):
    """Build a one-block, decoder-only transformer for next-token prediction.

    Architecture: token embedding -> 2-head causally masked self-attention ->
    residual connection + layer normalization -> per-position dense layer
    producing unnormalized scores (logits) over the vocabulary.

    The embedding width and attention ``key_dim`` are read from the
    module-level ``embedding_dim``, which must be defined before calling.

    Args:
        vocab_size: Size of the embedding table and of the output layer.
        max_length: Number of token ids in each padded input sequence.

    Returns:
        An uncompiled Keras ``Model`` mapping ``(batch, max_length)`` token
        ids to ``(batch, max_length, vocab_size)`` logits.
    """
    # Token ids index the embedding table, so the input is declared integer
    # (the training tensors are int32 as well).
    inputs = Input(shape=(max_length,), dtype="int32")

    embedding_layer = tf.keras.layers.Embedding(vocab_size, embedding_dim)(inputs)

    # The causal mask stops each position from attending to later positions.
    attention = MultiHeadAttention(num_heads=2, key_dim=embedding_dim)(
        embedding_layer, embedding_layer, embedding_layer, use_causal_mask=True
    )

    # Residual connection followed by layer normalization.
    x = LayerNormalization()(attention + embedding_layer)

    # No activation: the loss is expected to consume logits.
    outputs = Dense(vocab_size)(x)

    return Model(inputs, outputs)


# Instantiate the decoder; its outputs are raw logits, hence from_logits=True.
model = create_transformer_decoder(vocab_size=vocab_size, max_length=max_length)
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'],
)

model.summary()

### Apartado d
Entrenar el modelo.

In [None]:
# Ten epochs over the shifted input/target pairs, two sequences per batch.
history = model.fit(x=X, y=y, batch_size=2, epochs=10, verbose=1)

### Apartado e
Predicción de tokens.

In [None]:
def predict_next_token(model, tokenizer, sentence, max_length):
    """Predict the word that follows ``sentence``.

    Args:
        model: Model returning per-position logits over the vocabulary.
        tokenizer: Fitted tokenizer used to encode ``sentence`` and to map
            the predicted id back to a word.
        sentence: Text prefix to continue.
        max_length: Sequence length the model expects.

    Returns:
        The word whose id has the highest logit at the last position, or
        ``None`` when that id has no entry in ``tokenizer.index_word``
        (for example the padding id 0).
    """
    seq = tokenizer.texts_to_sequences([sentence])[0]
    # pad_sequences pads at the front by default, so the most recent token
    # always sits at the last position.
    seq = pad_sequences([seq], maxlen=max_length)

    # verbose=0: this runs once per generated token, so a progress bar per
    # call would flood the cell output.
    predictions = model.predict(seq, verbose=0)

    last_pred_logits = predictions[0, -1]
    # Cast the numpy integer to a plain int before the dictionary lookup.
    next_token_id = int(np.argmax(last_pred_logits))

    return tokenizer.index_word.get(next_token_id)


# Single-step demo: ask for the word that follows a two-word prefix.
sentence = "El caballo"
next_token = predict_next_token(
    model=model, tokenizer=tokenizer, sentence=sentence, max_length=max_length
)
print(f"Sentence: '{sentence}' --> Siguiente token predicho: '{next_token}'")

In [None]:
def predict_all_tokens(model, tokenizer, sentence, max_length, max_new_tokens=None):
    """Greedily extend ``sentence`` one predicted word at a time.

    Generation stops when ``predict_next_token`` returns ``None`` or after
    ``max_new_tokens`` predictions, whichever comes first. Without the cap
    the loop would never end if the model kept predicting valid words.

    Args:
        model: Model passed through to ``predict_next_token``.
        tokenizer: Fitted tokenizer passed through to ``predict_next_token``.
        sentence: Text prefix to continue.
        max_length: Sequence length the model expects.
        max_new_tokens: Upper bound on the number of predictions. Defaults
            to ``max_length``.

    Returns:
        The predicted words in order. When the model stopped by itself the
        list ends with ``None``; when the cap was hit it does not.
    """
    if max_new_tokens is None:
        max_new_tokens = max_length

    predicted_tokens = []
    for _ in range(max_new_tokens):
        next_token = predict_next_token(model, tokenizer, sentence, max_length)
        predicted_tokens.append(next_token)
        if next_token is None:
            break
        # Feed the prediction back in as part of the prefix.
        sentence += " " + next_token

    return predicted_tokens


# Multi-step demo: keep generating from the same two-word prefix.
sentence = "El caballo"
predicted_tokens = predict_all_tokens(
    model=model, tokenizer=tokenizer, sentence=sentence, max_length=max_length
)
print(f"Sentence: '{sentence}' --> Tokens predichos: {predicted_tokens}")

## Ejercicio 4
Crear una arquitectura completa de transformers (encoder y decoder) para traducción del inglés al español.

### Apartado a
Importar librerías y definir datos de ejemplo.

In [None]:
import os
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, LayerNormalization, MultiHeadAttention
from tensorflow.keras.optimizers import Adam

# Parallel corpus kept as (English, Spanish) pairs so that source and
# translation stay aligned.
translation_pairs = [
    ("hello world", "hola mundo"),
    ("goodbye world", "adiós mundo"),
    ("hello everyone", "hola a todos"),
]

sentences_english = [english for english, _ in translation_pairs]
sentences_spanish = [spanish for _, spanish in translation_pairs]

### Apartado b
Preparación de datos y tokenización.

In [None]:
# Seed Python, NumPy and TensorFlow random number generators.
SEED = 42
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Wrap every target sentence in begin/end markers. Sentences that already
# carry both markers are left alone, so re-running this cell does not wrap
# them a second time.
sentences_spanish = [
    sent if sent.startswith("<bos> ") and sent.endswith(" <eos>") else f"<bos> {sent} <eos>"
    for sent in sentences_spanish
]

In [None]:
# --- Source side (English) ---
# filters='' turns off the tokenizer's character filtering.
encoder_tokenizer = Tokenizer(filters='', oov_token='<OOV>')
encoder_tokenizer.fit_on_texts(sentences_english)
encoder_seq = encoder_tokenizer.texts_to_sequences(sentences_english)

# Word ids start at 1, so one extra slot is reserved for id 0 (padding).
encoder_vocab_size = 1 + len(encoder_tokenizer.word_index)
encoder_max_len = max(map(len, encoder_seq))

encoder_inputs = pad_sequences(encoder_seq, padding='post', maxlen=encoder_max_len)

# --- Target side (Spanish) ---
# With filtering off, the "<bos>"/"<eos>" markers are kept as tokens.
decoder_tokenizer = Tokenizer(filters='', oov_token='<OOV>')
decoder_tokenizer.fit_on_texts(sentences_spanish)
decoder_seq = decoder_tokenizer.texts_to_sequences(sentences_spanish)

decoder_vocab_size = 1 + len(decoder_tokenizer.word_index)
decoder_max_len = max(map(len, decoder_seq))

decoder_all = pad_sequences(decoder_seq, padding='post', maxlen=decoder_max_len)

# Shift by one position: the decoder is fed every column but the last and
# is trained to output every column but the first.
decoder_inputs = decoder_all[:, :-1]
decoder_targets = decoder_all[:, 1:]

### Apartado c
Definir bloques transformer encoder y decoder.

In [None]:
def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0.1):
    """Apply one transformer encoder block to ``inputs``.

    Same structure as the encoder block defined for Ejercicio 2:
    self-attention and a two-layer feed-forward network, each followed by
    dropout, a residual connection and layer normalization.

    Args:
        inputs: Sequence tensor; its last dimension is the model width.
        head_size: ``key_dim`` of every attention head.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden feed-forward layer.
        dropout: Dropout rate used after attention and after the feed-forward net.

    Returns:
        A tensor whose last dimension equals that of ``inputs``.
    """
    # Self-attention sub-layer: query, value and key are all `inputs`.
    attn_out = MultiHeadAttention(key_dim=head_size, num_heads=num_heads)(
        query=inputs, value=inputs, key=inputs
    )
    attn_out = Dropout(dropout)(attn_out)
    attn_norm = LayerNormalization(epsilon=1e-6)(attn_out + inputs)

    # Feed-forward sub-layer, projected back to the model width.
    model_dim = inputs.shape[-1]
    hidden = Dense(ff_dim, activation="relu")(attn_norm)
    projected = Dense(model_dim)(hidden)
    projected = Dropout(dropout)(projected)

    return LayerNormalization(epsilon=1e-6)(attn_norm + projected)


def transformer_decoder(inputs, encoder_output, head_size, num_heads, ff_dim, dropout=0.1):
    """Apply one transformer decoder block.

    Three sub-layers, each followed by dropout, a residual connection and
    layer normalization: causally masked self-attention over ``inputs``,
    cross-attention from the decoder states to ``encoder_output``, and a
    two-layer feed-forward network.

    Args:
        inputs: Decoder-side sequence tensor.
        encoder_output: Encoder output used as key and value in cross-attention.
        head_size: ``key_dim`` of every attention head.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden feed-forward layer.
        dropout: Dropout rate applied after each sub-layer.

    Returns:
        A tensor with the same last dimension as the cross-attention output.
    """
    # 1) Masked self-attention: a position cannot attend to later positions.
    self_attn = MultiHeadAttention(key_dim=head_size, num_heads=num_heads)(
        query=inputs, value=inputs, key=inputs, use_causal_mask=True
    )
    self_attn = Dropout(dropout)(self_attn)
    self_norm = LayerNormalization(epsilon=1e-6)(inputs + self_attn)

    # 2) Cross-attention: decoder states query the encoder output.
    cross_attn = MultiHeadAttention(key_dim=head_size, num_heads=num_heads)(
        query=self_norm, value=encoder_output, key=encoder_output
    )
    cross_attn = Dropout(dropout)(cross_attn)
    cross_norm = LayerNormalization(epsilon=1e-6)(self_norm + cross_attn)

    # 3) Position-wise feed-forward network, projected back to the model width.
    model_dim = cross_norm.shape[-1]
    hidden = Dense(ff_dim, activation='relu')(cross_norm)
    projected = Dense(model_dim)(hidden)
    projected = Dropout(dropout)(projected)

    return LayerNormalization(epsilon=1e-6)(cross_norm + projected)

### Apartado d
Construir modelo encoder-decoder.

In [None]:
# Hyperparameters shared by the encoder and decoder blocks.
embedding_dim = 64
head_size = 64
num_heads = 2
ff_dim = 128

# Two variable-length int32 token-id inputs: one feeds the encoder, the
# other feeds the decoder.
en_in = Input(shape=(None,), dtype="int32")
dec_in = Input(shape=(None,), dtype="int32")

# Encoder side: embed the source tokens and run a single encoder block.
# NOTE(review): no positional embedding is added on either side.
enc_embed = Embedding(encoder_vocab_size, embedding_dim)(en_in)
enc_out = transformer_encoder(enc_embed, head_size=head_size, num_heads=num_heads, ff_dim=ff_dim)

# Decoder side: embed the target tokens and run a single decoder block that
# also receives the encoder output.
dec_embed = Embedding(decoder_vocab_size, embedding_dim)(dec_in)
dec_out = transformer_decoder(dec_embed, enc_out, head_size=head_size, num_heads=num_heads, ff_dim=ff_dim)

# Per-position scores over the target vocabulary; no activation is applied,
# so these are logits.
outputs = Dense(decoder_vocab_size)(dec_out)
model = Model([en_in, dec_in], outputs)

# from_logits=True matches the activation-free output layer above.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

model.summary()

### Apartado e
Entrenar el modelo.

In [None]:
# Teacher forcing: the decoder is fed the target shifted by one position
# and trained to predict the unshifted target.
model.fit(x=[encoder_inputs, decoder_inputs], y=decoder_targets, epochs=5)

### Apartado f
Traducir nuevas frases.

In [None]:
# Integer ids the decoder tokenizer assigned to the sentence delimiters.
bos_token, eos_token = (
    decoder_tokenizer.word_index[marker] for marker in ('<bos>', '<eos>')
)


def translate_sentence(input_sentence):
    """Greedily decode a translation of ``input_sentence``.

    Starting from ``<bos>``, the model is queried once per output position
    and the highest-scoring token is appended. Decoding stops at ``<eos>``,
    at a token id with no entry in ``decoder_tokenizer.index_word`` (such as
    the padding id 0), or after ``decoder_max_len - 1`` steps.

    Relies on the module-level ``model``, ``encoder_tokenizer``,
    ``decoder_tokenizer``, ``encoder_max_len``, ``decoder_max_len``,
    ``bos_token`` and ``eos_token``.

    Args:
        input_sentence: Source-language sentence.

    Returns:
        The predicted words joined by single spaces, without the
        ``<bos>``/``<eos>`` markers. May be an empty string.
    """
    encoder_input = encoder_tokenizer.texts_to_sequences([input_sentence])
    encoder_input = pad_sequences(encoder_input, maxlen=encoder_max_len, padding='post')

    decoder_output = [bos_token]

    for _ in range(decoder_max_len - 1):
        # Right-pad the tokens generated so far to the width used in training.
        decoder_input = pad_sequences([decoder_output], maxlen=decoder_max_len-1, padding='post')

        pred = model.predict([encoder_input, decoder_input], verbose=0)
        # Read the distribution at the position of the newest token.
        next_token = int(np.argmax(pred[0, len(decoder_output)-1]))

        # Padding (id 0) is an unmasked training target, so the model can
        # emit it. It has no word in `index_word`; treat it, like <eos>, as
        # the end of the sentence instead of failing on the lookup below.
        if next_token == eos_token or next_token not in decoder_tokenizer.index_word:
            break
        decoder_output.append(next_token)

    output_tokens = [
        decoder_tokenizer.index_word[token]
        for token in decoder_output
        if token != eos_token and token != bos_token
    ]
    return ' '.join(output_tokens)


# Smoke test on a sentence taken from the training data.
print(translate_sentence(input_sentence="hello world"))