<a href="https://colab.research.google.com/github/nobertomaciel/PLN-ANIMA/blob/main/UA2/PLN_tradutor_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# =============================================================
# TRADUTOR SIMPLES COM LSTM (SEQ2SEQ) – GOOGLE COLAB
# =============================================================

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

# -------------------------------------------------------------
# 1) Toy example dataset (EN → PT)
# -------------------------------------------------------------
# Each entry pairs an English sentence with its Portuguese translation.
sentence_pairs = [
    ("hi", "olá"),
    ("how are you", "como você está"),
    ("thanks", "obrigado"),
    ("i love you", "eu te amo"),
    ("good morning", "bom dia"),
    ("good night", "boa noite"),
    ("see you soon", "até logo"),
    ("what is your name", "qual é o seu nome"),
    ("i am happy", "estou feliz"),
    ("let's go", "vamos lá"),
]

eng_sentences = [english for english, _ in sentence_pairs]
pt_sentences = [portuguese for _, portuguese in sentence_pairs]

# Add the special tokens: the decoder input is prefixed with <start> and the
# decoder target is suffixed with <end>, so the target is the input shifted
# by one token.
pt_sentences_input = [f"<start> {sentence}" for sentence in pt_sentences]
pt_sentences_target = [f"{sentence} <end>" for sentence in pt_sentences]

# -------------------------------------------------------------
# 2) Tokenization
# -------------------------------------------------------------
# One vocabulary per language. The Portuguese tokenizer is fitted on both the
# decoder-input and decoder-target sentences so it sees both special tokens.
tokenizer_eng = Tokenizer()
tokenizer_eng.fit_on_texts(eng_sentences)

tokenizer_pt = Tokenizer()
tokenizer_pt.fit_on_texts(pt_sentences_input + pt_sentences_target)

# Turn every sentence into its list of integer word ids.
eng_ids = tokenizer_eng.texts_to_sequences(eng_sentences)
pt_input_ids = tokenizer_pt.texts_to_sequences(pt_sentences_input)
pt_target_ids = tokenizer_pt.texts_to_sequences(pt_sentences_target)

# Longest sequence on each side; used as the fixed padded length.
max_eng = max(map(len, eng_ids))
max_pt = max(map(len, pt_input_ids))

# Right-pad every sequence to the common length of its side.
seq_eng = pad_sequences(eng_ids, maxlen=max_eng, padding='post')
seq_pt_input = pad_sequences(pt_input_ids, maxlen=max_pt, padding='post')
seq_pt_target = pad_sequences(pt_target_ids, maxlen=max_pt, padding='post')

# Vocabulary sizes: number of known words plus one extra slot
# (presumably for id 0, which pad_sequences uses as its default fill value).
vocab_eng = len(tokenizer_eng.word_index) + 1
vocab_pt = len(tokenizer_pt.word_index) + 1

# -------------------------------------------------------------
# 3) Build the Seq2Seq model with LSTM
# -------------------------------------------------------------
embedding_dim = 64
latent_dim = 128

# Encoder: embed the source ids and keep only the final LSTM states.
encoder_inputs = Input(shape=(max_eng,))
enc_emb = Embedding(input_dim=vocab_eng, output_dim=embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(units=latent_dim, return_state=True)
_, h, c = encoder_lstm(enc_emb)

# The hidden and cell states are what gets handed over to the decoder.
encoder_states = [h, c]

# Decoder: embed the target-side input ids and run an LSTM that starts from
# the encoder states and emits one output vector per time step.
decoder_inputs = Input(shape=(max_pt,))
dec_emb_layer = Embedding(input_dim=vocab_pt, output_dim=embedding_dim)
dec_emb = dec_emb_layer(decoder_inputs)

decoder_lstm = LSTM(units=latent_dim, return_sequences=True, return_state=True)
decoder_lstm_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)

# Per-step softmax over the Portuguese vocabulary.
decoder_dense = Dense(units=vocab_pt, activation='softmax')
decoder_outputs = decoder_dense(decoder_lstm_outputs)

# Final model (training): source ids + decoder input ids -> per-step
# word distributions.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

model.summary()

# -------------------------------------------------------------
# 4) Training
# -------------------------------------------------------------
# The integer targets get a trailing axis of size 1 before being handed to
# the sparse categorical cross-entropy loss.
decoder_targets = seq_pt_target[..., np.newaxis]

model.fit(
    x=[seq_eng, seq_pt_input],
    y=decoder_targets,
    batch_size=2,
    epochs=300,
    verbose=0,
)

print("\nTreinamento concluído!")

# -------------------------------------------------------------
# 5) Inference models (translation)
# -------------------------------------------------------------

# Encoder for inference: maps a padded source sequence to the LSTM states
# [h, c] that seed the decoder.
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder for inference: reuses the trained decoder layers, but receives the
# previous LSTM states as explicit inputs so decoding can advance step by step.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Dedicated token input with an unconstrained time dimension. The training
# input `decoder_inputs` is declared with a fixed length of `max_pt`, which
# does not match the length-1 sequences fed during step-by-step decoding
# (Keras warns about, or rejects, that mismatch depending on the version).
# A `None` time dimension accepts any sequence length, including `max_pt`.
decoder_step_inputs = Input(shape=(None,))

dec_emb2 = dec_emb_layer(decoder_step_inputs)
decoder_outputs2, h2, c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_outputs2 = decoder_dense(decoder_outputs2)

# Inputs: [token ids, h, c]; outputs: [per-step word distributions, new h, new c].
decoder_model = Model(
    [decoder_step_inputs] + decoder_states_inputs,
    [decoder_outputs2, h2, c2]
)

# Reverse lookup: word id -> word, used to turn predictions back into text.
index_to_word_pt = {i: w for w, i in tokenizer_pt.word_index.items()}

def translate(sentence):
    """Translate an English sentence to Portuguese with greedy decoding.

    The sentence is tokenized and right-padded to ``max_eng``, encoded into
    the initial decoder states, and then decoded one token at a time, always
    picking the highest-scoring token, for at most ``max_pt`` steps.
    Decoding stops early when the predicted id is 0 or the predicted word
    is "end".

    Returns the predicted words joined by single spaces.
    """
    encoder_ids = pad_sequences(
        tokenizer_eng.texts_to_sequences([sentence]),
        maxlen=max_eng,
        padding='post',
    )

    # The encoder's final states seed the decoder.
    decoder_states = encoder_model.predict(encoder_ids, verbose=0)

    # Decoding starts from the start token. Its vocabulary key is "start",
    # without angle brackets (presumably because the Tokenizer's default
    # filters strip "<" and ">" -- confirm against the Tokenizer settings).
    prev_token = np.array([[tokenizer_pt.word_index["start"]]])

    words = []

    for _ in range(max_pt):
        probs, state_h, state_c = decoder_model.predict(
            [prev_token] + decoder_states, verbose=0
        )

        # Greedy choice: most probable word at the last time step.
        best_id = np.argmax(probs[0, -1, :])
        best_word = index_to_word_pt.get(best_id, "")

        # Stop on the padding id or on the end-of-sentence token.
        if best_id == 0 or best_word == "end":
            break

        words.append(best_word)

        # Feed the chosen token and the updated states into the next step.
        prev_token = np.array([[best_id]])
        decoder_states = [state_h, state_c]

    return " ".join(words)

# -------------------------------------------------------------
# 6) Test
# -------------------------------------------------------------
# Sentences taken from the training set, used as a quick sanity check.
test_sentences = [
    "hi",
    "i love you",
    "good night",
    "how are you",
]

print("\n===== TESTES DE TRADUÇÃO =====")
for source_sentence in test_sentences:
    print(f"EN: {source_sentence}")
    translation = translate(source_sentence)
    print(f"PT: {translation}\n")



Treinamento concluído!

===== TESTES DE TRADUÇÃO =====
EN: hi
PT: olá

EN: i love you
PT: eu te amo

EN: good night
PT: boa noite

EN: how are you
PT: como você está

