# Transformer 

Este notebook implementa um modelo **Transformer** para tradução (Pt→En).



## Overview

**Objetivos do Step 1**

- Preparar o ambiente e checar a GPU
- Baixar e preparar o dataset `ted_hrlr_translate/pt_to_en` (TFDS)
- Tokenizar com `TextVectorization` (simples e robusto)
- Criar pipelines `tf.data`

In [None]:
import os, time, math, random
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

# tensorflow_text is optional: record whether the import succeeded.
try:
    import tensorflow_text as tf_text
except Exception:
    HAS_TF_TEXT = False
else:
    HAS_TF_TEXT = True

# Report the runtime environment.
print("TensorFlow:", tf.__version__)
print("GPU disponível:", tf.config.list_physical_devices('GPU'))
print("tensorflow_text disponível:", HAS_TF_TEXT)

# Basic reproducibility: seed the Python, NumPy and TensorFlow generators.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
# Data hyperparameters
MAX_VOCAB = 20000
MAX_TOKENS = 100  # maximum sequence length (in tokens)
BATCH_SIZE = 64
BUFFER_SIZE = 20000

# Load the TED HRLR dataset (Portuguese → English)
# Splits: 'train', 'validation', 'test'
ds_train, ds_val, ds_test = tfds.load(
    "ted_hrlr_translate/pt_to_en",
    split=["train", "validation", "test"],
    as_supervised=True  # yields (pt, en) tuples
)

def to_numpy_text(ds, take=-1):
    """Materialize a dataset of (pt, en) pairs into two lists of Python strings.

    Args:
        ds: iterable of ``(pt, en)`` pairs; each element must expose
            ``.numpy()`` returning UTF-8 encoded bytes.
        take: maximum number of pairs to read; any value <= 0 reads everything.

    Returns:
        A ``(pt_texts, en_texts)`` tuple of equally long lists of ``str``.
    """
    capped = take > 0
    pt_texts = []
    en_texts = []
    for count, pair in enumerate(ds, start=1):
        pt_item, en_item = pair
        pt_texts.append(pt_item.numpy().decode("utf-8"))
        en_texts.append(en_item.numpy().decode("utf-8"))
        # Stop as soon as the requested number of pairs has been collected.
        if capped and count >= take:
            break
    return pt_texts, en_texts

# Optional cap on the number of training pairs; None reads every example.
LIMIT = None
if LIMIT is None:
    xpt_train, yen_train = to_numpy_text(ds_train)
    xpt_val,  yen_val  = to_numpy_text(ds_val)
else:
    # The validation cap is max(1000, LIMIT // 10).
    xpt_train, yen_train = to_numpy_text(ds_train, take=LIMIT)
    xpt_val,  yen_val  = to_numpy_text(ds_val,  take=max(1000, LIMIT//10))

print(f"Amostras de treino: {len(xpt_train)}, validação: {len(xpt_val)}")

In [None]:
from tensorflow.keras.layers import TextVectorization

# Special tokens wrapped around every target sentence
START_TOKEN = "[START]"
END_TOKEN = "[END]"


def add_special_tokens(texts, start=START_TOKEN, end=END_TOKEN):
    """Wrap each sentence as ``"<start> <stripped sentence> <end>"``.

    Args:
        texts: iterable of strings.
        start: marker placed before each sentence.
        end: marker placed after each sentence.

    Returns:
        A new list with one wrapped string per input sentence.
    """
    prefix = f"{start} "
    suffix = f" {end}"
    wrapped = []
    for sentence in texts:
        # Surrounding whitespace is removed before the markers are attached.
        wrapped.append(prefix + sentence.strip() + suffix)
    return wrapped

import re
import string

# Punctuation removed from target text: every ASCII punctuation character
# except the square brackets. The built-in "lower_and_strip_punctuation"
# mode would turn "[START]"/"[END]" into the ordinary words "start"/"end",
# making the markers indistinguishable from real vocabulary.
_TGT_STRIP_CHARS = string.punctuation.replace("[", "").replace("]", "")
_TGT_STRIP_PATTERN = f"[{re.escape(_TGT_STRIP_CHARS)}]"


def tgt_standardize(text):
    """Lowercase ``text`` and strip punctuation while keeping ``[`` and ``]``.

    Args:
        text: string tensor handed over by ``TextVectorization``.

    Returns:
        String tensor of the same shape with the standardized text.
    """
    lowered = tf.strings.lower(text)
    return tf.strings.regex_replace(lowered, _TGT_STRIP_PATTERN, "")


# Separate vectorizers for PT (source) and EN (target)
src_vectorizer = TextVectorization(
    max_tokens=MAX_VOCAB,
    output_mode="int",
    output_sequence_length=MAX_TOKENS,
    standardize="lower_and_strip_punctuation"
)
tgt_vectorizer = TextVectorization(
    max_tokens=MAX_VOCAB,
    output_mode="int",
    output_sequence_length=MAX_TOKENS,
    # Custom standardization so the markers survive as "[start]" / "[end]".
    standardize=tgt_standardize
)

# Fit the vocabularies on the training text
src_vectorizer.adapt(xpt_train)
tgt_vectorizer.adapt(add_special_tokens(yen_train))  # target includes [START]/[END]

vocab_src = src_vectorizer.get_vocabulary()
vocab_tgt = tgt_vectorizer.get_vocabulary()
PAD_ID = 0  # TextVectorization uses id 0 for padding by default

print("Vocabulário (src) tamanho:", len(vocab_src))
print("Vocabulário (tgt) tamanho:", len(vocab_tgt))
print("Exemplo vocab tgt (0..10):", vocab_tgt[:10])

In [None]:
# Build a tf.data pipeline of ((enc_inputs, dec_inputs), dec_targets, sample_weights)
def make_dataset(x_src_raw, y_tgt_raw, batch_size=BATCH_SIZE):
    """Vectorize raw sentence pairs and wrap them in a shuffled, batched dataset.

    Args:
        x_src_raw: sequence of source sentences (strings).
        y_tgt_raw: sequence of target sentences (strings), without markers.
        batch_size: number of examples per batch.

    Returns:
        A ``tf.data.Dataset`` whose elements are
        ``((enc_inputs, dec_inputs), dec_targets, sample_weights)``.
    """
    tagged_targets = add_special_tokens(y_tgt_raw)  # [START] ... [END]
    source_ids = src_vectorizer(np.array(x_src_raw))
    target_ids = tgt_vectorizer(np.array(tagged_targets))

    # Teacher forcing: the decoder input drops the last position and the
    # labels drop the first one, so labels are the input shifted by one.
    decoder_in = target_ids[:, :-1]
    labels = target_ids[:, 1:]

    # Loss weights: 1.0 where the label is a real token, 0.0 on padding.
    loss_weights = tf.cast(tf.not_equal(labels, PAD_ID), tf.float32)

    shuffle_buffer = min(len(x_src_raw), BUFFER_SIZE)
    pipeline = tf.data.Dataset.from_tensor_slices(
        ((source_ids, decoder_in), labels, loss_weights)
    )
    pipeline = pipeline.shuffle(shuffle_buffer)
    pipeline = pipeline.batch(batch_size)
    return pipeline.prefetch(tf.data.AUTOTUNE)

# Build the training and validation pipelines from the raw sentence lists.
train_ds = make_dataset(xpt_train, yen_train)
val_ds   = make_dataset(xpt_val,  yen_val)

# Sanity check: print the shapes of a single training batch.
# a = (enc_inputs, dec_inputs), b = dec_targets, w = sample_weights
for (a, b, w) in train_ds.take(1):
    print("Shapes enc_in, dec_in:", a[0].shape, a[1].shape)
    print("Shapes y, w:", b.shape, w.shape)

## Model & Train

- Construir um Transformer Encoder-Decoder (Keras)
- Compilar com otimizador Adam e loss de entropia cruzada 
- Treinar por poucas épocas e registrar tempos

In [None]:
from tensorflow.keras import layers, Model, optimizers

# Sinusoidal positional encoding
def positional_encoding(maxlen, d_model):
    """Build the fixed sinusoidal position table.

    Args:
        maxlen: number of positions to encode.
        d_model: size of each position vector.

    Returns:
        A float32 tensor of shape ``(1, maxlen, d_model)`` with sine values in
        the even columns and cosine values in the odd columns.
    """
    positions = np.arange(maxlen)[:, np.newaxis]
    dims = np.arange(d_model)[np.newaxis, :]
    # Columns 2k and 2k+1 share the same frequency, hence the ``dims // 2``.
    inv_freq = 1 / np.power(10000, (2*(dims//2)) / np.float32(d_model))
    angles = positions * inv_freq

    table = np.zeros((maxlen, d_model))
    table[:, 0::2] = np.sin(angles[:, 0::2])
    table[:, 1::2] = np.cos(angles[:, 1::2])
    return tf.cast(table[np.newaxis, ...], tf.float32)  # shape: (1, maxlen, d_model)

class TokenAndPositionEmbedding(layers.Layer):
    """Token embedding with a fixed sinusoidal positional encoding added."""

    def __init__(self, vocab_size, d_model, maxlen):
        """Create the embedding table and precompute ``maxlen`` position vectors."""
        super().__init__()
        self.tok_emb = layers.Embedding(vocab_size, d_model, mask_zero=True)
        # Constant tensor of shape (1, maxlen, d_model); it is not a weight.
        self.pos_enc = positional_encoding(maxlen, d_model)

    def call(self, x):
        """Embed token ids ``x`` of shape (B, L) and add the position vectors."""
        seq_len = tf.shape(x)[1]
        embedded = self.tok_emb(x)  # (B, L, d_model)
        # Only the first ``seq_len`` rows of the table are needed.
        positions = self.pos_enc[:, :seq_len, :]
        return embedded + positions

class TransformerEncoder(layers.Layer):
    """One encoder block: self-attention, then a feed-forward network.

    Each sub-layer is followed by dropout, a residual connection and
    layer normalization.
    """

    def __init__(self, d_model, num_heads, dff, dropout=0.1):
        """Create the sub-layers.

        Args:
            d_model: width of the block's output and of the residual stream.
            num_heads: number of attention heads.
            dff: hidden width of the feed-forward network.
            dropout: dropout rate for attention and both residual branches.
        """
        super().__init__()
        self.att = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=d_model, dropout=dropout
        )
        self.ffn = tf.keras.Sequential([
            layers.Dense(dff, activation='relu'),
            layers.Dense(d_model),
        ])
        self.norm1 = layers.LayerNormalization(epsilon=1e-6)
        self.norm2 = layers.LayerNormalization(epsilon=1e-6)
        self.drop1 = layers.Dropout(dropout)
        self.drop2 = layers.Dropout(dropout)

    def call(self, x, training, padding_mask=None):
        """Run the block on ``x``; ``padding_mask`` is passed to the attention layer."""
        # Self-attention sub-layer: query and value are both ``x``.
        attended = self.att(x, x, attention_mask=padding_mask, training=training)
        attended = self.drop1(attended, training=training)
        x = self.norm1(x + attended)

        # Position-wise feed-forward sub-layer.
        projected = self.drop2(self.ffn(x), training=training)
        return self.norm2(x + projected)

class TransformerDecoder(layers.Layer):
    """One decoder block: causal self-attention, cross-attention, feed-forward.

    Each sub-layer is followed by dropout, a residual connection and
    layer normalization.
    """

    def __init__(self, d_model, num_heads, dff, dropout=0.1):
        """Create the sub-layers.

        Args:
            d_model: width of the block's output and of the residual stream.
            num_heads: number of attention heads.
            dff: hidden width of the feed-forward network.
            dropout: dropout rate for attention and the residual branches.
        """
        super().__init__()
        self.self_att = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=d_model, dropout=dropout
        )
        self.cross_att = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=d_model, dropout=dropout
        )
        self.ffn = tf.keras.Sequential([
            layers.Dense(dff, activation='relu'),
            layers.Dense(d_model),
        ])
        self.norm1 = layers.LayerNormalization(epsilon=1e-6)
        self.norm2 = layers.LayerNormalization(epsilon=1e-6)
        self.norm3 = layers.LayerNormalization(epsilon=1e-6)
        self.drop1 = layers.Dropout(dropout)
        self.drop2 = layers.Dropout(dropout)
        self.drop3 = layers.Dropout(dropout)

    def call(self, x, enc_out, training, padding_mask=None, look_ahead_mask=None):
        """Run the block on decoder states ``x`` attending over ``enc_out``.

        ``look_ahead_mask`` is passed to the self-attention layer and
        ``padding_mask`` to the cross-attention layer.
        """
        # Causal self-attention over the decoder sequence.
        self_attended = self.self_att(
            x, x,
            attention_mask=look_ahead_mask,
            use_causal_mask=True,
            training=training,
        )
        x = self.norm1(x + self.drop1(self_attended, training=training))

        # Cross-attention: decoder states query the encoder output.
        cross_attended = self.cross_att(
            x, enc_out, attention_mask=padding_mask, training=training
        )
        x = self.norm2(x + self.drop2(cross_attended, training=training))

        # Position-wise feed-forward sub-layer.
        projected = self.drop3(self.ffn(x), training=training)
        return self.norm3(x + projected)

def create_padding_mask(seq):
    """Build an attention mask that hides the padding positions of ``seq``.

    ``MultiHeadAttention`` documents ``attention_mask`` as ``(B, T, S)``.
    A rank-2 ``(B, L)`` mask is not expanded to ``(B, 1, 1, L)``: the layer
    inserts the missing axes in front, so the batch axis ends up aligned with
    the query axis. Returning ``(B, 1, L)`` keeps the batch axis in place and
    lets the mask broadcast over heads and query positions.

    Args:
        seq: integer token ids of shape (B, L); ``PAD_ID`` marks padding.

    Returns:
        float32 tensor of shape (B, 1, L): 1.0 on real tokens, 0.0 on padding.
    """
    keep = tf.cast(tf.not_equal(seq, PAD_ID), tf.float32)
    return tf.expand_dims(keep, axis=1)

# Model hyperparameters (small so the notebook runs quickly)
D_MODEL = 128
NUM_HEADS = 4
DFF = 512
NUM_LAYERS = 2
DROPOUT = 0.1

VOCAB_SRC = len(vocab_src)
VOCAB_TGT = len(vocab_tgt)

# Inputs: variable-length sequences of token ids
enc_inputs = layers.Input(shape=(None,), dtype="int32", name="enc_inputs")
dec_inputs = layers.Input(shape=(None,), dtype="int32", name="dec_inputs")

# Embeddings + positional encoding
enc_emb = TokenAndPositionEmbedding(VOCAB_SRC, D_MODEL, MAX_TOKENS)(enc_inputs)
dec_emb = TokenAndPositionEmbedding(VOCAB_TGT, D_MODEL, MAX_TOKENS)(dec_inputs)

# NOTE: `training` is deliberately NOT passed to the layer calls below.
# A literal training=True would be baked into the functional graph and keep
# dropout active during validation and inference; leaving it out lets Keras
# forward the flag given to fit() / model(..., training=...).

# Encoder stack
enc_out = enc_emb
enc_mask = create_padding_mask(enc_inputs)
for _ in range(NUM_LAYERS):
    enc_out = TransformerEncoder(D_MODEL, NUM_HEADS, DFF, DROPOUT)(enc_out, padding_mask=enc_mask)

# Decoder stack (the encoder padding mask is reused for cross-attention)
dec_out = dec_emb
for _ in range(NUM_LAYERS):
    dec_out = TransformerDecoder(D_MODEL, NUM_HEADS, DFF, DROPOUT)(dec_out, enc_out, padding_mask=enc_mask)

# Final projection to target-vocabulary logits
logits = layers.Dense(VOCAB_TGT, name="classifier")(dec_out)  # (B, L, vocab_tgt)

model = Model([enc_inputs, dec_inputs], logits, name="tiny_transformer_pt_en")
model.summary()

In [None]:
# Compile
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none")
optimizer = optimizers.Adam(learning_rate=2e-4)


def masked_loss(y_true, y_pred):
    """Per-timestep cross-entropy, left unreduced so Keras can mask padding.

    The dataset supplies a ``sample_weight`` tensor of shape (B, L) that is
    0 on padding. Keras multiplies the value returned here by that weight
    before reducing, so the loss must keep its (B, L) shape. Reducing to a
    scalar first would average padding positions into the loss and turn the
    weights into a mere scale factor.

    Args:
        y_true: integer labels of shape (B, L).
        y_pred: logits of shape (B, L, V).

    Returns:
        Tensor of shape (B, L) with one loss value per timestep.
    """
    return loss_fn(y_true, y_pred)


# `weighted_metrics` (unlike `metrics`) receives the sample weights, so the
# reported accuracy ignores padding positions as well.
model.compile(
    optimizer=optimizer,
    loss=masked_loss,
    weighted_metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
)

# Train for a few epochs for timing comparison (adjust as needed);
# the EPOCHS environment variable overrides the default of 3.
EPOCHS = int(os.environ.get("EPOCHS", 3))

# Wall-clock time around the whole fit() call, validation passes included.
t0 = time.perf_counter()
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    verbose=1
)
t1 = time.perf_counter()

train_seconds = t1 - t0
print(f"Tempo total de treino: {train_seconds:.2f} s em {EPOCHS} épocas.")
# Keep for later use in Step 3 (the timing report)
TRAIN_SECONDS = train_seconds
EPOCHS_RAN = EPOCHS

## Evaluate & CPU vs GPU Timing

- Implementar inferência (decodificação greedy) para exemplos Pt→En
- (Opcional) Calcular BLEU com `sacrebleu`
- Registrar números de tempo de treino para comparação CPU vs GPU


In [None]:
# Lookup table for detokenization: position = token id, value = token string
inv_vocab_tgt = np.array(vocab_tgt)

def detokenize_tgt(ids):
    """Turn target token ids back into a plain sentence.

    Start/end markers and the empty padding token are dropped; the remaining
    tokens are joined with single spaces.
    """
    markers = ("[start]", "[end]", START_TOKEN.lower(), END_TOKEN.lower())
    words = []
    for token in inv_vocab_tgt[ids]:
        # Skip padding ("") and the sequence markers.
        if token == "" or token in markers:
            continue
        words.append(token)
    sentence = " ".join(words)
    return sentence.replace("  ", " ").strip()

def greedy_decode(pt_sentence, max_len=MAX_TOKENS):
    """Translate one Portuguese sentence with greedy (argmax) decoding.

    Args:
        pt_sentence: source sentence as a Python string.
        max_len: length the decoder input is padded to; at most
            ``max_len - 1`` tokens are generated.

    Returns:
        The decoded sentence as a string, without start/end markers.
    """
    # Tokenize the source sentence -> (1, MAX_TOKENS)
    src = src_vectorizer(np.array([pt_sentence]))

    # Resolve the marker ids by running the markers through the target
    # vectorizer itself. Whatever standardization it applies, these are the
    # ids the model was trained with. A lookup of the literal "[start]"
    # string silently falls back to [UNK] when the vectorizer rewrites it.
    marker_ids = tgt_vectorizer(np.array([f"{START_TOKEN} {END_TOKEN}"]))[0, :2].numpy()
    start_id, end_id = int(marker_ids[0]), int(marker_ids[1])

    dec_ids = [start_id]
    for _ in range(max_len - 1):
        # Decoder input: tokens generated so far, right-padded to max_len.
        padding = [PAD_ID] * (max_len - len(dec_ids))
        dec_in = tf.constant([dec_ids + padding], dtype=tf.int32)
        logits = model([src, dec_in], training=False)  # (1, L, vocab)
        # The prediction for the next token sits at the last filled position.
        next_id = int(tf.argmax(logits[0, len(dec_ids) - 1]))
        if next_id == end_id:
            break
        dec_ids.append(next_id)

    # Drop the leading start marker before converting ids back to text.
    return detokenize_tgt(dec_ids[1:])

# Quick smoke test on a few validation samples
samples = 5
xpt_val_s, yen_val_s = xpt_val[:samples], yen_val[:samples]
# zip() stops at the shorter list, so a validation set with fewer than
# `samples` rows no longer raises IndexError.
for src, ref in zip(xpt_val_s, yen_val_s):
    pred = greedy_decode(src)
    print(f"PT: {src}\nPRED EN: {pred}\nREF  EN: {ref}\n" + "-"*60)

In [None]:
# Optional BLEU on the small validation subset. Only a missing package is
# treated as "unavailable"; any other error is a real failure and must
# surface instead of being reported as a missing dependency.
try:
    import sacrebleu
except ImportError:
    print("sacrebleu indisponível. Pule esta célula ou instale sacrebleu.")
else:
    # corpus_bleu expects a list of reference streams, each a list of
    # strings aligned with the hypotheses; there is one stream here.
    refs = [yen_val_s]
    hyps = [greedy_decode(s) for s in xpt_val_s]
    bleu = sacrebleu.corpus_bleu(hyps, refs)
    print("BLEU (subset pequeno):", bleu.score)

In [None]:
# Report which device type TensorFlow can see in this run.
DEVICE = "GPU" if tf.config.list_physical_devices('GPU') else "CPU"
print("Dispositivo detectado nesta execução:", DEVICE)
print(f"Tempo total de treino nesta execução: {TRAIN_SECONDS:.2f} s para {EPOCHS_RAN} épocas.")

# Fill these in by hand to document the results in the repository README:
cpu_seconds = None   # e.g. 180.25
gpu_seconds = None   # e.g. 42.10

print("\n👉 Dica: preencha cpu_seconds/gpu_seconds acima e rode novamente para imprimir o comparativo.")

if cpu_seconds is None or gpu_seconds is None:
    print("Comparativo pendente — rode em ambos os modos (CPU e GPU) e registre os tempos.")
else:
    # Guard against division by zero when the GPU time is not positive.
    if gpu_seconds > 0:
        speedup = cpu_seconds / gpu_seconds
    else:
        speedup = float('inf')
    print(f"Comparativo: CPU={cpu_seconds:.2f}s, GPU={gpu_seconds:.2f}s, Speedup ≈ {speedup:.2f}x")