### Dataset

Descargamos el mismo dataset

In [1]:
from datasets import load_dataset

dataset = load_dataset("mteb/amazon_reviews_multi", "en")

Creamos una variable con el número de clases

In [2]:
num_classes = len(dataset['train'].unique('label'))
num_classes

5

Antes procesamos todo el dataset para crear un campo llamado `labels`, pero ahora no hace falta porque como vamos a programar nosotros todo nos adaptamos a cómo es el dataset

### Tokenizador

Creamos el tokenizador. Le asignamos el token de padding para que no nos de error como antes

In [3]:
from transformers import AutoTokenizer

checkpoint = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.pad_token = tokenizer.eos_token

Creamos una función para tokenizar el dataset

In [4]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=768, return_tensors="pt")

Lo tokenizamos. Eliminamos columnas que no nos hagan falta, pero ahora dejamos la del texto

In [5]:
dataset = dataset.map(tokenize_function, batched=True, remove_columns=['id', 'label_text'])

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 200000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 5000
    })
})

In [7]:
percentage = 0.00005
subset_train = dataset['train'].select(range(int(len(dataset['train']) * percentage)))
percentage = 0.001
subset_validation = dataset['validation'].select(range(int(len(dataset['validation']) * percentage)))
subset_test = dataset['test'].select(range(int(len(dataset['test']) * percentage)))
print(f"len subset_train: {len(subset_train)}, len subset_validation: {len(subset_validation)}, len subset_test: {len(subset_test)}")

len subset_train: 10, len subset_validation: 5, len subset_test: 5


### Modelo

Importamos los pesos y asignamos el token de padding

In [8]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_classes)
model.config.pad_token_id = model.config.eos_token_id

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at openai-community/gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Device

Creamos el dispositivo donde se va a ejecutar todo

In [9]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

De paso pasamos el modelo al dispositivo y de paso lo pasamos a FP16 para que ocupe menos memoria

In [10]:
model.half().to(device)
print()




### Pytorch Dataset

Creamos un dataset de pytorch

In [11]:
from torch.utils.data import Dataset

class ReviewsDataset(Dataset):
    def __init__(self, huggingface_dataset):
        self.dataset = huggingface_dataset

    def __getitem__(self, idx):
        label = self.dataset[idx]['label']
        input_ids = torch.tensor(self.dataset[idx]['input_ids'])
        attention_mask = torch.tensor(self.dataset[idx]['attention_mask'])
        return input_ids, attention_mask, label

    def __len__(self):
        return len(self.dataset)

Instanciamos los datasets

In [12]:
train_dataset = ReviewsDataset(subset_train)
validatation_dataset = ReviewsDataset(subset_validation)
test_dataset = ReviewsDataset(subset_test)

Vamos a ver una muestra

In [13]:
input_ids, at_mask, label = train_dataset[0]
input_ids.shape, at_mask.shape, label

(torch.Size([768]), torch.Size([768]), 0)

### Pytorch Dataloader

Creamos ahora un dataloader de pytorch

In [14]:
from torch.utils.data import DataLoader

BS = 1

train_loader = DataLoader(train_dataset, batch_size=BS, shuffle=True)
validation_loader = DataLoader(validatation_dataset, batch_size=BS)
test_loader = DataLoader(test_dataset, batch_size=BS)

Vamos a ver una muestra

In [15]:
input_ids, at_mask, labels = next(iter(train_loader))
input_ids.shape, at_mask.shape, labels

(torch.Size([1, 768]), torch.Size([1, 768]), tensor([0]))

Para ver que está todo bien pasamos la muestra al modelo para ver qué sale todo bien. Primero pasamos los tokens al dispositivo

In [16]:
input_ids = input_ids.to(device)
at_mask = at_mask.to(device)
labels = labels.to(device)

Ahora se los pasamos al modelo

In [17]:
output = model(input_ids=input_ids, attention_mask=at_mask, labels=labels)
output.keys()

odict_keys(['loss', 'logits', 'past_key_values'])

Como vemos nos da la loss y los logits

In [18]:
output['loss']

tensor(4.1055, device='cuda:0', dtype=torch.float16,
       grad_fn=<NllLossBackward0>)

In [19]:
output['logits']

tensor([[ 1.0137,  4.7852, -5.0664, -2.4297,  3.8047]], device='cuda:0',
       dtype=torch.float16, grad_fn=<IndexBackward0>)

### Métrica

Vamos a crear una función para obtener la métrica, que en este cáso va a ser el accuracy

In [20]:
def predicted_labels(logits):
    percent = torch.softmax(logits, dim=1)
    predictions = torch.argmax(percent, dim=1)
    return predictions

In [21]:
def compute_accuracy(logits, labels):
    predictions = predicted_labels(logits)
    correct = (predictions == labels).float()
    return correct.mean()

Vamos a ver si lo calcula bien

In [22]:
compute_accuracy(output['logits'], labels).item()

0.0

### Optimizador

Como vamos a necesitar un optimizador, creamos uno

In [23]:
from transformers import AdamW

LR = 2e-5
optimizer = AdamW(model.parameters(), lr=LR)



### Entrenamiento

Creamos el bucle de entrenamiento

In [24]:
from tqdm import tqdm

EPOCHS = 3

accuracy = 0

for epoch in range(EPOCHS):
    model.train()
    train_loss = 0
    progresbar = tqdm(train_loader, total=len(train_loader), desc=f'Epoch {epoch + 1}')
    for input_ids, at_mask, labels in progresbar:
        input_ids = input_ids.to(device)
        at_mask = at_mask.to(device)
        label = labels.to(device)

        output = model(input_ids=input_ids, attention_mask=at_mask, labels=label)

        loss = output['loss']
        train_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        progresbar.set_postfix({'train_loss': loss.item()})
    train_loss /= len(train_loader)
    progresbar.set_postfix({'train_loss': train_loss})

    model.eval()
    valid_loss = 0
    progresbar = tqdm(validation_loader, total=len(validation_loader), desc=f'Epoch {epoch + 1}')
    for input_ids, at_mask, labels in progresbar:
        input_ids = input_ids.to(device)
        at_mask = at_mask.to(device)
        labels = labels.to(device)

        output = model(input_ids=input_ids, attention_mask=at_mask, labels=labels)

        loss = output['loss']
        valid_loss += loss.item()

        step_accuracy = compute_accuracy(output['logits'], labels)
        accuracy += step_accuracy
        progresbar.set_postfix({'valid_loss': loss.item(), 'accuracy': step_accuracy.item()})
        
    valid_loss /= len(validation_loader)
    accuracy /= len(validation_loader)
    progresbar.set_postfix({'valid_loss': valid_loss, 'accuracy': accuracy})

Epoch 1: 100%|██████████| 10/10 [00:17<00:00,  1.79s/it, train_loss=0.000525]
Epoch 1: 100%|██████████| 5/5 [00:02<00:00,  1.73it/s, valid_loss=0.00115, accuracy=1] 
Epoch 2: 100%|██████████| 10/10 [00:17<00:00,  1.78s/it, train_loss=0.000551]
Epoch 2: 100%|██████████| 5/5 [00:02<00:00,  1.73it/s, valid_loss=0.000254, accuracy=1]
Epoch 3: 100%|██████████| 10/10 [00:17<00:00,  1.78s/it, train_loss=11.1]   
Epoch 3: 100%|██████████| 5/5 [00:02<00:00,  1.73it/s, valid_loss=0.000102, accuracy=1]


### Uso del modelo

Vamos a probar el modelo que hemos entrenado

Primero tokenizamos un texto

In [25]:
input_tokens = tokenize_function({"text": "I love this product. It is amazing."})
input_tokens['input_ids'].shape, input_tokens['attention_mask'].shape

(torch.Size([1, 768]), torch.Size([1, 768]))

Ahora se lo pasamos al modelo

In [26]:
output = model(input_ids=input_tokens['input_ids'].to(device), attention_mask=input_tokens['attention_mask'].to(device))
output['logits']

tensor([[10.9531, -0.4324, -2.3105, -0.8188,  1.2773]], device='cuda:0',
       dtype=torch.float16, grad_fn=<IndexBackward0>)

Vemos las predicciones de esos logits

In [27]:
predicted = predicted_labels(output['logits'])
predicted

tensor([0], device='cuda:0')