# Preprocesado del dataset

## Carga de los datos

En primer lugar, cargamos los datos a los que les hicimos la limpieza.

In [1]:
from datasets import load_from_disk

# Directory of the cleaned dataset that was previously saved to disk.
path = "data/opus100_croped_10"
# Load the saved dataset; the later cells index it by split name.
opus100 = load_from_disk(path)

Definimos los tokens de inicio (start), de fin (end) y de relleno (padding), y la longitud máxima de secuencia.

In [2]:
import tiktoken

encoder = tiktoken.get_encoding("cl100k_base")

# The control characters 1, 2 and 3 act as start / end / padding markers.
# Each marker is stored as the list of token ids the encoder produces for it,
# so it can be concatenated directly with encoded sentences.
start_token = encoder.encode(chr(1))
end_token = encoder.encode(chr(2))
padding_token = encoder.encode(chr(3))

# Fixed length of every stored sequence.
# NOTE(review): presumably 10 content tokens plus the start and end markers — confirm.
max_secuence_length = 10 + 2

Creamos una función que preprocesa el dataset

In [3]:
import torch
import tqdm

def _encode_fixed_length(text, start_token, end_token, padding_token, encoder, max_secuence_length):
    """Encode ``text`` into a tensor of exactly ``max_secuence_length`` token ids.

    The sequence is ``start_token + encoder.encode(text) + end_token``. Sequences
    that are too long are truncated while keeping ``end_token`` as the final
    token(s); shorter sequences are right-padded with ``padding_token``.

    Args:
        text: Sentence to encode.
        start_token: List of token ids prepended to the sentence.
        end_token: List of token ids appended to the sentence.
        padding_token: List of token ids used to fill up short sequences.
        encoder: Object exposing ``encode(str) -> list[int]``.
        max_secuence_length: Target length of the returned sequence.

    Returns:
        A 1-D ``torch.Tensor`` with ``max_secuence_length`` elements.
    """
    tokens = start_token + encoder.encode(text) + end_token

    if len(tokens) > max_secuence_length:
        # Truncate the content, not the end marker: a plain slice would drop
        # the end token from every over-long sequence.
        keep = max(max_secuence_length - len(end_token), 0)
        tokens = (tokens[:keep] + end_token)[:max_secuence_length]
    else:
        missing = max_secuence_length - len(tokens)
        # Slice so the result has exactly the target length even if the
        # padding marker is encoded as more than one token id.
        tokens = tokens + (padding_token * missing)[:missing]

    return torch.tensor(tokens)


def preprocess_dataset(dataset, split, start_token, end_token, padding_token, encoder, max_secuence_length):
    """Tokenize one split of a translation dataset into fixed-length tensors.

    Every example is read as ``example['translation']['en']`` (input) and
    ``example['translation']['es']`` (label). Both sentences are wrapped with
    the start/end tokens and truncated or padded to ``max_secuence_length``.

    Args:
        dataset: Mapping from split name to a sized iterable of examples.
        split: Name of the split to process.
        start_token: List of token ids prepended to each sentence.
        end_token: List of token ids appended to each sentence.
        padding_token: List of token ids used for right-padding.
        encoder: Object exposing ``encode(str) -> list[int]``.
        max_secuence_length: Length of every output sequence.

    Returns:
        Tuple ``(inputs, labels)`` of tensors with shape
        ``(len(dataset[split]), max_secuence_length)``. An empty split yields
        two empty tensors of shape ``(0, max_secuence_length)``.
    """
    examples = dataset[split]
    inputs = []
    labels = []

    # The context manager closes the progress bar even if encoding fails.
    with tqdm.tqdm(total=len(examples)) as progress_bar:
        for example in examples:
            translation = example['translation']
            inputs.append(_encode_fixed_length(
                translation['en'], start_token, end_token, padding_token, encoder, max_secuence_length))
            labels.append(_encode_fixed_length(
                translation['es'], start_token, end_token, padding_token, encoder, max_secuence_length))
            progress_bar.update(1)

    if not inputs:
        # torch.stack raises on an empty list; return well-shaped empty tensors instead.
        empty = torch.empty((0, max_secuence_length), dtype=torch.long)
        return empty, empty.clone()

    return torch.stack(inputs), torch.stack(labels)

Como vemos, lo que hace la función es coger cada muestra del dataset, codificarla, convertirla en un tensor y guardarla en un stack. Esto lo hacemos para que durante el entrenamiento no se pierda tiempo en codificar las secuencias, podamos entrenar más rápido y, por tanto, durante más épocas.

Después guardamos esos stacks en un fichero para poder cargarlos más tarde y no tener que volver a preprocesar el dataset.

In [4]:
# Preprocess the training split and persist both tensors for later reuse.
split = 'train'
inputs, labels = preprocess_dataset(
    opus100,
    split,
    start_token,
    end_token,
    padding_token,
    encoder,
    max_secuence_length,
)
torch.save(inputs, f"data/opus100_croped_10/{split}_inputs.pt")
torch.save(labels, f"data/opus100_croped_10/{split}_labels.pt")

100%|██████████| 463854/463854 [00:27<00:00, 16818.00it/s]


In [5]:
# Preprocess the test split and persist both tensors for later reuse.
split = 'test'
inputs, labels = preprocess_dataset(
    opus100,
    split,
    start_token,
    end_token,
    padding_token,
    encoder,
    max_secuence_length,
)
torch.save(inputs, f"data/opus100_croped_10/{split}_inputs.pt")
torch.save(labels, f"data/opus100_croped_10/{split}_labels.pt")

100%|██████████| 691/691 [00:00<00:00, 16724.07it/s]


In [6]:
# Preprocess the validation split and persist both tensors for later reuse.
split = 'validation'
# Pass `split` rather than a literal: the previous call preprocessed 'test',
# so the files saved as validation were a copy of the test split.
inputs, labels = preprocess_dataset(opus100, split, start_token, end_token, padding_token, encoder, max_secuence_length)
torch.save(inputs, f"data/opus100_croped_10/{split}_inputs.pt")
torch.save(labels, f"data/opus100_croped_10/{split}_labels.pt")

100%|██████████| 691/691 [00:00<00:00, 18527.90it/s]


In [7]:
# Switch to the second saved dataset; `path` and `opus100` are rebound here.
path = "data/opus100_croped_20"
opus100 = load_from_disk(path)

In [8]:
# Preprocess the training split of the dataset currently loaded from `path`.
split = 'train'
# NOTE(review): max_secuence_length is still the 10 + 2 defined for the first
# dataset — confirm that this is the intended length for this dataset.
inputs, labels = preprocess_dataset(opus100, split, start_token, end_token, padding_token, encoder, max_secuence_length)
# Save next to the dataset that was loaded; the hard-coded
# "data/opus100_croped_10" directory overwrote the first dataset's tensors.
torch.save(inputs, f"{path}/{split}_inputs.pt")
torch.save(labels, f"{path}/{split}_labels.pt")

100%|██████████| 741145/741145 [00:47<00:00, 15599.54it/s]


In [9]:
# Preprocess the test split of the dataset currently loaded from `path`.
split = 'test'
inputs, labels = preprocess_dataset(opus100, split, start_token, end_token, padding_token, encoder, max_secuence_length)
# Save next to the dataset that was loaded; the hard-coded
# "data/opus100_croped_10" directory overwrote the first dataset's tensors.
torch.save(inputs, f"{path}/{split}_inputs.pt")
torch.save(labels, f"{path}/{split}_labels.pt")

100%|██████████| 1311/1311 [00:00<00:00, 15443.28it/s]


In [10]:
# Preprocess the validation split of the dataset currently loaded from `path`.
split = 'validation'
# Pass `split` rather than a literal: the previous call preprocessed 'test',
# so the files saved as validation were a copy of the test split.
inputs, labels = preprocess_dataset(opus100, split, start_token, end_token, padding_token, encoder, max_secuence_length)
# Save next to the dataset that was loaded; the hard-coded
# "data/opus100_croped_10" directory overwrote the first dataset's tensors.
torch.save(inputs, f"{path}/{split}_inputs.pt")
torch.save(labels, f"{path}/{split}_labels.pt")

100%|██████████| 1311/1311 [00:00<00:00, 14806.26it/s]


In [11]:
# Switch to the third saved dataset; `path` and `opus100` are rebound here.
path = "data/opus100_croped"
opus100 = load_from_disk(path)

In [12]:
# Preprocess the training split of the dataset currently loaded from `path`.
split = 'train'
# NOTE(review): max_secuence_length is still the 10 + 2 defined for the first
# dataset — confirm that this is the intended length for this dataset.
inputs, labels = preprocess_dataset(opus100, split, start_token, end_token, padding_token, encoder, max_secuence_length)
# Save next to the dataset that was loaded; the hard-coded
# "data/opus100_croped_10" directory overwrote the first dataset's tensors.
torch.save(inputs, f"{path}/{split}_inputs.pt")
torch.save(labels, f"{path}/{split}_labels.pt")

100%|██████████| 983138/983138 [01:13<00:00, 13412.87it/s]


In [13]:
# Preprocess the test split of the dataset currently loaded from `path`.
split = 'test'
inputs, labels = preprocess_dataset(opus100, split, start_token, end_token, padding_token, encoder, max_secuence_length)
# Save next to the dataset that was loaded; the hard-coded
# "data/opus100_croped_10" directory overwrote the first dataset's tensors.
torch.save(inputs, f"{path}/{split}_inputs.pt")
torch.save(labels, f"{path}/{split}_labels.pt")

100%|██████████| 1955/1955 [00:00<00:00, 12452.92it/s]


In [14]:
# Preprocess the validation split of the dataset currently loaded from `path`.
split = 'validation'
# Pass `split` rather than a literal: the previous call preprocessed 'test',
# so the files saved as validation were a copy of the test split.
inputs, labels = preprocess_dataset(opus100, split, start_token, end_token, padding_token, encoder, max_secuence_length)
# Save next to the dataset that was loaded; the hard-coded
# "data/opus100_croped_10" directory overwrote the first dataset's tensors.
torch.save(inputs, f"{path}/{split}_inputs.pt")
torch.save(labels, f"{path}/{split}_labels.pt")

100%|██████████| 1955/1955 [00:00<00:00, 12194.48it/s]
