## Seq2Seq

Os modelos Sequence-to-Sequence (Seq2Seq) são um pilar no campo do processamento de linguagem natural, especialmente para tarefas que envolvem a geração de sequências a partir de outras sequências, como tradução automática, resumo de texto e resposta a perguntas.

Exemplo em uma tarefa de tradução:

In [None]:
# To get data
#!wget http://www.manythings.org/anki/fra-eng.zip
#!unzip fra-eng.zip

# Libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Load the dataset
with open('fra.txt', 'r', encoding='utf-8') as f:
    lines = f.read().split("\n")

# Extract sentence pairs
input_texts = []
target_texts = []

# Use "\t" as the start sequence character and "\n" as the end sequence character
for line in lines[:10000]:  # Using the first 10,000 sentence pairs for simplicity
    # Rows are tab-separated: source sentence, target sentence, and optionally
    # further fields that are ignored. Rows without both sentences (for
    # example a blank line) are skipped instead of raising on unpacking.
    fields = line.split("\t")
    if len(fields) < 2:
        continue
    input_text, target_text = fields[:2]
    target_text = "\t" + target_text + "\n"
    input_texts.append(input_text)
    target_texts.append(target_text)

# Tokenization (character level): every character is mapped to an integer id
tokenizer_in = Tokenizer(char_level=True)
tokenizer_in.fit_on_texts(input_texts)
encoder_input_data = tokenizer_in.texts_to_sequences(input_texts)
encoder_input_data = pad_sequences(encoder_input_data, padding='post')

tokenizer_out = Tokenizer(char_level=True)
tokenizer_out.fit_on_texts(target_texts)
decoder_input_data = tokenizer_out.texts_to_sequences(target_texts)
decoder_input_data = pad_sequences(decoder_input_data, padding='post')

# Teacher forcing: the target at step t is the decoder input at step t + 1.
# np.roll would wrap the first column (the "\t" start token) around into the
# last time step, so shift explicitly and leave the final step as padding (0).
decoder_target_data = np.zeros_like(decoder_input_data)
decoder_target_data[:, :-1] = decoder_input_data[:, 1:]

# Set model parameters.
# The class counts come from the tokenizers (vocabulary size + 1 for the
# padding id 0) rather than from whichever ids happen to occur in each array,
# so the decoder input and target encodings always have the same width.
num_encoder_tokens = len(tokenizer_in.word_index) + 1
num_decoder_tokens = len(tokenizer_out.word_index) + 1

# Convert to one-hot encoding
encoder_input_data = to_categorical(encoder_input_data, num_classes=num_encoder_tokens)
decoder_input_data = to_categorical(decoder_input_data, num_classes=num_decoder_tokens)
decoder_target_data = to_categorical(decoder_target_data, num_classes=num_decoder_tokens)

max_encoder_seq_length = encoder_input_data.shape[1]
max_decoder_seq_length = decoder_input_data.shape[1]

# Split data (80% train / 20% validation; the three arrays are split with the
# same row permutation so sentence pairs stay aligned)
(
    encoder_input_train, encoder_input_val,
    decoder_input_train, decoder_input_val,
    decoder_target_train, decoder_target_val,
) = train_test_split(
    encoder_input_data,
    decoder_input_data,
    decoder_target_data,
    test_size=0.2,
)

# Define the encoder: only its final hidden and cell states are handed on
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(units=256, return_state=True)
encoder_outputs, *encoder_states = encoder(encoder_inputs)
state_h, state_c = encoder_states

# Define the decoder: starts from the encoder states and emits, for every
# time step, a softmax distribution over the target characters
decoder_inputs = Input(shape=(None, num_decoder_tokens))
decoder_lstm = LSTM(units=256, return_sequences=True, return_state=True)
decoder_outputs = decoder_lstm(decoder_inputs, initial_state=encoder_states)[0]
decoder_dense = Dense(units=num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the Seq2Seq model (two inputs: source sequence and decoder input)
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)

# Compile the model
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Training
model.fit(
    x=[encoder_input_train, decoder_input_train],
    y=decoder_target_train,
    batch_size=64,
    epochs=30,  # Increase epochs for better results
    validation_data=([encoder_input_val, decoder_input_val], decoder_target_val),
)

# Define the encoder model for inference: input sequence -> [state_h, state_c]
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)

# Define the decoder model for inference. It reuses the trained decoder layers
# but takes the LSTM states as explicit inputs and returns the updated states,
# so decoding can be driven one step at a time.
decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, *decoder_states = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs
)
state_h, state_c = decoder_states
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    inputs=[decoder_inputs, *decoder_states_inputs],
    outputs=[decoder_outputs, *decoder_states],
)

# Create a function to decode sequences
def decode_sequence(input_seq):
    """Greedily decode one encoded input sentence into text.

    Args:
        input_seq: A batch holding a single one-hot encoded input sentence,
            in the form accepted by ``encoder_model.predict``.

    Returns:
        The decoded string, without the tab start marker and the newline
        end marker.
    """
    # Encode the input; the resulting LSTM states seed the decoder.
    states_value = encoder_model.predict(input_seq, verbose=0)

    # Begin with a length-1 target sequence that holds only the start marker.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, tokenizer_out.word_index['\t']] = 1.

    decoded_chars = []
    # Bound the loop by the number of decoding steps rather than by the length
    # of the decoded text: an id without a character (such as the padding id)
    # adds nothing to the text, so a length-based bound might never be reached.
    for _ in range(max_decoder_seq_length + 1):
        output_tokens, h, c = decoder_model.predict(
            [target_seq, *states_value], verbose=0
        )

        # Greedy choice: most probable character at the last time step.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_char = tokenizer_out.index_word.get(sampled_token_index, '')

        # Stop at the end marker, or at an id that maps to no character.
        if sampled_char == '\n' or not sampled_char:
            break
        decoded_chars.append(sampled_char)

        # Feed the sampled character and the updated states into the next step.
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.
        states_value = [h, c]

    return ''.join(decoded_chars)

# Predict: decode the first ten sentences and print each next to its source
sample_count = 10
for seq_index in range(sample_count):
    batch_end = seq_index + 1
    # Slicing (instead of indexing) keeps the leading batch dimension.
    input_seq = encoder_input_data[seq_index:batch_end]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)

## Attention Mechanisms

Os mecanismos de atenção revolucionaram o PLN (processamento de linguagem natural), oferecendo uma maneira mais eficiente e eficaz para os modelos processarem e relacionarem diferentes partes de uma sequência. Originalmente introduzidos no contexto da tradução automática neural, os mecanismos de atenção são agora onipresentes em várias tarefas de modelagem de sequências.

Exemplo de implementação em uma tarefa de tradução:

In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Concatenate, Attention
from tensorflow.keras.models import Model

# Sample encoder-decoder model with attention.
# NOTE: num_encoder_tokens and num_decoder_tokens are not defined in this cell;
# they must already exist in the session (the Seq2Seq example computes them).
latent_dim = 256  # LSTM hidden size; same value the Seq2Seq example uses

input_seq = Input(shape=(None, num_encoder_tokens))
# return_sequences=True is required here: the attention layer attends over one
# encoder output per time step, not just the final output.
encoder = LSTM(latent_dim, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder(input_seq)
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(None, num_decoder_tokens))
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)

# Dot-product attention: decoder outputs are the queries, encoder outputs the
# values (and keys). The result is one context vector per decoder time step.
attention = Attention()
attention_out = attention([decoder_outputs, encoder_outputs])

# Each decoder step predicts from its own output joined with its context vector.
decoder_concat = Concatenate(axis=-1)([decoder_outputs, attention_out])
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_concat)

model = Model([input_seq, decoder_inputs], decoder_outputs)

# Model compilation, training, and evaluation code would follow

NameError: name 'num_encoder_tokens' is not defined

## Transformers

Os Transformers representam um grande avanço nas arquiteturas de redes neurais: eles melhoraram significativamente o desempenho ao combinar blocos de atenção com redes neurais feed-forward simples, contornando os problemas associados às unidades RNN. Eles são a espinha dorsal dos grandes modelos de linguagem (LLMs) modernos, e entrarei em muitos detalhes sobre eles aqui (e também, mais especificamente sobre LLMs, em uma seção posterior).

In [2]:
from datasets import load_dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import numpy as np

# 1. Load a smaller subset of the IMDB dataset for quick training.
# A contiguous slice such as 'train[:10%]' takes the first rows as stored.
# NOTE(review): the hub copy of IMDB appears to be grouped by label, which
# would make such a slice single-class - confirm. Shuffling with a fixed seed
# before taking 10% gives a reproducible sample that does not depend on the
# storage order.
dataset = load_dataset('imdb', split={'train': 'train', 'test': 'test'})
dataset = dataset.shuffle(seed=42)
for split_name in ('train', 'test'):
    subset_size = len(dataset[split_name]) // 10
    dataset[split_name] = dataset[split_name].select(range(subset_size))

# 2. Load the DistilBert tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Function to tokenize the dataset
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch to a fixed length of 128 tokens.

    Shorter texts are padded up to 128 tokens and longer ones are truncated.
    """
    texts = examples['text']
    encoded = tokenizer(
        texts,
        truncation=True,
        max_length=128,
        padding='max_length',
    )
    return encoded

# Tokenize the entire dataset (batched, so the tokenizer sees many rows per call)
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 3. Load the DistilBert model for sequence classification (two output labels)
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
)

# 4. Training arguments
training_config = {
    'output_dir': './results',         # output directory for model checkpoints
    'num_train_epochs': 2,             # number of training epochs for demonstration
    'per_device_train_batch_size': 8,  # batch size for training
    'per_device_eval_batch_size': 8,   # batch size for evaluation
    'logging_dir': './logs',           # directory for storing logs
    'logging_steps': 10,
    'evaluation_strategy': "epoch",    # evaluate at the end of each epoch
}
training_args = TrainingArguments(**training_config)

# Function to compute accuracy
def compute_metrics(eval_pred):
    """Compute classification accuracy from logits and reference labels.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            row is the argmax of its logits over the last axis.

    Returns:
        A dict with a single key, ``'accuracy'``: the fraction of rows whose
        predicted class equals the label, as a float.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Computed with numpy: accuracy_score is not imported anywhere in this
    # file, so calling it here raised NameError.
    accuracy = float(np.mean(predictions == np.asarray(labels)))
    return {'accuracy': accuracy}

# 5. Initialize the Trainer with the model, its arguments, both data splits
# and the metric function used during evaluation
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Train the model
trainer.train()

# Evaluate the model; the returned metrics are kept in eval_results
eval_results = trainer.evaluate()

# 6. Perform inference on a new sentence
new_sentences = ["This movie is fantastic!", "I did not like this movie at all."]
new_inputs = tokenizer(new_sentences, padding=True, truncation=True, return_tensors="pt")
# Training may have moved the model to an accelerator; the inputs must live on
# the same device as the model's weights.
new_inputs = new_inputs.to(model.device)
model.eval()  # switch off dropout so predictions are deterministic
predictions = model(**new_inputs).logits
# .cpu() is needed before .numpy() when the logits are on an accelerator.
predicted_classes = np.argmax(predictions.detach().cpu().numpy(), axis=1)
print("Predictions:", predicted_classes)

ModuleNotFoundError: No module named 'datasets'