In [2]:
!pip install -q transformers datasets torch

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m53.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [13]:
import numpy as np

from datasets import load_dataset

from huggingface_hub import notebook_login

from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling

import warnings
warnings.filterwarnings('ignore')

In [5]:
# Download the raw dataset from the Hugging Face Hub; it provides train/validation/test
# splits whose rows carry a 'history' text and an 'action_label' target.
dataset = load_dataset("muriloms/tcc-dataset-mini")

README.md:   0%|          | 0.00/524 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/73.3k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/13.4k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/13.7k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/800 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

In [6]:
# Display the split names, column names and row counts of the loaded DatasetDict.
dataset

DatasetDict({
    train: Dataset({
        features: ['history', 'action_label'],
        num_rows: 800
    })
    validation: Dataset({
        features: ['history', 'action_label'],
        num_rows: 100
    })
    test: Dataset({
        features: ['history', 'action_label'],
        num_rows: 100
    })
})

In [9]:
# Display the training split on its own (columns and row count).
dataset['train']

Dataset({
    features: ['history', 'action_label'],
    num_rows: 800
})

In [15]:
# Model choice (e.g. "gpt2", "distilgpt2", "gpt2-medium", etc.)
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# GPT-2 has no padding token by default; reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Load the model for causal language modeling.
# NOTE(review): `model` is not referenced again in the cells visible here — confirm it is
# needed before paying for the weight download in a tokenization-only notebook.
model = AutoModelForCausalLM.from_pretrained(model_name)

In [16]:
# Measure the token length of every training example, built exactly as in preprocessing:
# 'history' followed by the "\nNextAction: " separator and the 'action_label'.
# The two columns are read once and walked in parallel, so each history is paired with
# its own label (a lookup by value would pick the first matching row for duplicated
# histories and rescan the whole column on every iteration).
train_split = dataset["train"]
lengths = [
    len(tokenizer(f"{history}\nNextAction: {label}")["input_ids"])
    for history, label in zip(train_split["history"], train_split["action_label"])
]

# Use the 95th percentile as the sequence length so that rare, very long examples
# do not force excessive padding on all the others.
max_seq_length = int(np.percentile(lengths, 95))
print("Max seq length (95th percentile):", max_seq_length)

Max seq length (95th percentile): 200


In [17]:
# Sequence length used for truncation/padding in preprocessing. 200 matches the
# 95th-percentile value printed by the previous cell; presumably hard-coded so the
# length stays fixed even if that cell is skipped or re-run — TODO confirm.
max_seq_length = 200

In [18]:
def preprocess_function(examples):
    """
    Turn a batch of raw rows into fixed-length causal-LM training examples.

    Each row's 'history' and 'action_label' are joined into a single string
    (history, a newline, the literal "NextAction: ", then the label), tokenized,
    truncated to ``max_seq_length`` and right-padded up to that same length.

    Args:
        examples: Batch mapping with parallel "history" and "action_label"
            sequences, as passed by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output ("input_ids", "attention_mask") with an added
        "labels" entry: a copy of "input_ids" in which every padded position
        (attention_mask == 0) is replaced by -100 so it is excluded from the loss.
    """
    # -100 is the target value ignored by PyTorch's cross-entropy loss.
    ignore_index = -100

    # Join 'history' and 'action_label' with the custom separator.
    texts = [
        f"{history}\nNextAction: {label}"
        for history, label in zip(examples["history"], examples["action_label"])
    ]

    # Tokenize with truncation and *fixed-length* padding up to max_seq_length
    # (switch to padding="longest" for per-batch dynamic padding).
    model_inputs = tokenizer(
        texts,
        max_length=max_seq_length,
        truncation=True,
        padding="max_length",
        return_attention_mask=True
    )

    # For causal LM the labels mirror input_ids. The pad token is the EOS token here,
    # so padding cannot be recognised by token id; use the attention mask instead and
    # blank out padded positions so they never contribute to the loss.
    model_inputs["labels"] = [
        [token_id if is_real else ignore_index for token_id, is_real in zip(ids, mask)]
        for ids, mask in zip(model_inputs["input_ids"], model_inputs["attention_mask"])
    ]

    return model_inputs

In [19]:
# Apply the preprocessing function to every split in batches (assumes `dataset` is already
# defined), dropping the raw text columns so only the tokenized fields remain.
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["history", "action_label"])

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [20]:
# Check that every split now exposes only input_ids, attention_mask and labels.
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 800
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
})


In [21]:
# Use a data collator suited to causal LM (mlm=False) to assemble examples into batches.
# NOTE(review): this collator is understood to rebuild `labels` from `input_ids` and mask
# pad-token positions; since pad == EOS here, real EOS tokens would be masked too —
# confirm against the transformers documentation.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [22]:
# Display the collator's configuration (tokenizer, mlm flag, padding options).
data_collator

DataCollatorForLanguageModeling(tokenizer=GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
), mlm=False, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='pt')

# Connect to the Hugging Face Hub

In [23]:
# Open the interactive Hugging Face login widget; the uploads in the following cells
# need an authenticated session.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [25]:
# Upload all tokenized splits to the Hub as the dataset repository "tcc-tokenized-dataset-mini".
tokenized_dataset.push_to_hub("tcc-tokenized-dataset-mini")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/muriloms/tcc-tokenized-dataset-mini/commit/c3dc77d024244dc5a790bea329a4742ec395a4d0', commit_message='Upload dataset', commit_description='', oid='c3dc77d024244dc5a790bea329a4742ec395a4d0', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/muriloms/tcc-tokenized-dataset-mini', endpoint='https://huggingface.co', repo_type='dataset', repo_id='muriloms/tcc-tokenized-dataset-mini'), pr_revision=None, pr_num=None)

In [24]:
# Upload the tokenizer (configured above with pad_token = EOS) to the Hub repository "tcc-token-mini".
tokenizer.push_to_hub("tcc-token-mini")

CommitInfo(commit_url='https://huggingface.co/muriloms/tcc-token-mini/commit/5206e995abc54a0fdfb458a0e0be19df2dee4667', commit_message='Upload tokenizer', commit_description='', oid='5206e995abc54a0fdfb458a0e0be19df2dee4667', pr_url=None, repo_url=RepoUrl('https://huggingface.co/muriloms/tcc-token-mini', endpoint='https://huggingface.co', repo_type='model', repo_id='muriloms/tcc-token-mini'), pr_revision=None, pr_num=None)