<a href="https://colab.research.google.com/github/nobertomaciel/PLN-ANIMA/blob/main/UA2/PLN_tradutor_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# =============================================================
# TRADUTOR SIMPLES COM CNN (SEQ2SEQ CONVOLUCIONAL) – COLAB
# =============================================================
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, Conv1D, Dense, Dropout
from tensorflow.keras.layers import GlobalMaxPooling1D, Concatenate, UpSampling1D
from tensorflow.keras.models import Model

# -------------------------------------------------------------
# 1) Mini-dataset EN → PT
# -------------------------------------------------------------
eng_sentences = [
    "hi", "how are you", "thanks", "i love you", "good morning", "good night",
    "see you soon", "what is your name", "i am happy", "let's go"
]

pt_sentences = [
    "olá", "como você está", "obrigado", "eu te amo", "bom dia", "boa noite",
    "até logo", "qual é o seu nome", "estou feliz", "vamos lá"
]

# Tokens especiais
pt_in  = ["<start> " + s for s in pt_sentences]
pt_out = [s + " <end>" for s in pt_sentences]

# -------------------------------------------------------------
# 2) Tokenização
# -------------------------------------------------------------
tok_en = Tokenizer()
tok_pt = Tokenizer()

tok_en.fit_on_texts(eng_sentences)
tok_pt.fit_on_texts(pt_in + pt_out)

seq_en = tok_en.texts_to_sequences(eng_sentences)
seq_pt_in  = tok_pt.texts_to_sequences(pt_in)
seq_pt_out = tok_pt.texts_to_sequences(pt_out)

max_en = max(len(s) for s in seq_en)
max_pt = max(len(s) for s in seq_pt_in)

seq_en = pad_sequences(seq_en, maxlen=max_en, padding='post')
seq_pt_in = pad_sequences(seq_pt_in, maxlen=max_pt, padding='post')
seq_pt_out = pad_sequences(seq_pt_out, maxlen=max_pt, padding='post')

vocab_en = len(tok_en.word_index) + 1
vocab_pt = len(tok_pt.word_index) + 1

# -------------------------------------------------------------
# 3) Construção do Modelo Seq2Seq com CNN
# -------------------------------------------------------------
embedding_dim = 64
filters = 128
kernel_size = 3

# ---------- Encoder CNN ----------
encoder_inputs = Input(shape=(max_en,))
enc_emb = Embedding(vocab_en, embedding_dim)(encoder_inputs)

# CNN para extrair características
conv1 = Conv1D(filters, kernel_size, activation='relu', padding='same')(enc_emb)
conv2 = Conv1D(filters, kernel_size, activation='relu', padding='same')(conv1)
pool = GlobalMaxPooling1D()(conv2)

# Vetor latente
latent = Dense(filters, activation='relu')(pool)

# ---------- Decoder CNN ----------
decoder_inputs = Input(shape=(max_pt,))
dec_emb = Embedding(vocab_pt, embedding_dim)(decoder_inputs)

# Replicar o vetor latente ao longo da sequência
repeat_latent = tf.keras.layers.RepeatVector(max_pt)(latent)

# Concatenar embeddings + informações do encoder
decoder_concat = Concatenate()([dec_emb, repeat_latent])

# CNN no decoder
dec_conv1 = Conv1D(filters, kernel_size, activation='relu', padding='same')(decoder_concat)
dec_conv2 = Conv1D(filters, kernel_size, activation='relu', padding='same')(dec_conv1)

# Previsão palavra a palavra
decoder_dense = Dense(vocab_pt, activation='softmax')
decoder_outputs = decoder_dense(dec_conv2)

# Modelo final
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

model.summary()

# -------------------------------------------------------------
# 4) Treinamento
# -------------------------------------------------------------
model.fit([seq_en, seq_pt_in], np.expand_dims(seq_pt_out, -1),
          batch_size=2, epochs=300, verbose=0)

print("\nTreinamento concluído!")

# -------------------------------------------------------------
# 5) Inferência (tradução)
# -------------------------------------------------------------
index_to_word_pt = {i: w for w, i in tok_pt.word_index.items()}

def translate(sentence):
    seq = tok_en.texts_to_sequences([sentence])
    seq = pad_sequences(seq, maxlen=max_en, padding='post')

    # encoder
    enc_pred = model.layers[5].output  # não utilizaremos diretamente

    # ultra-simplificação: usamos o modelo completo alimentando <start>
    tgt = ["<start>"]

    for _ in range(max_pt):
        seq_tgt = tok_pt.texts_to_sequences([" ".join(tgt)])
        seq_tgt = pad_sequences(seq_tgt, maxlen=max_pt, padding='post')

        preds = model.predict([seq, seq_tgt], verbose=0)
        token = np.argmax(preds[0][len(tgt)-1])

        word = index_to_word_pt.get(token, "")

        if word == "end":
            break
        tgt.append(word)

    return " ".join(tgt[1:])

# -------------------------------------------------------------
# 6) Testes
# -------------------------------------------------------------
tests = ["hi", "i love you", "how are you", "good night"]

print("\n===== TESTES DE TRADUÇÃO =====")
for s in tests:
    print(f"EN: {s}")
    print(f"PT: {translate(s)}\n")



Treinamento concluído!

===== TESTES DE TRADUÇÃO =====
EN: hi
PT: olá

EN: i love you
PT: eu

EN: how are you
PT: como

EN: good night
PT: obrigado

