Source codes for Python Machine Learning By Example 4th Edition (Packt Publishing)

Chapter 13 Advancing language understanding and Generation with the Transformer models

Author: Yuxi (Hayden) Liu (yuxi.liu.ece@gmail.com)

# Understanding self-attention 

In [1]:
import torch

sentence = torch.tensor(
    [0, # python 
     8, # machine      
     1, # learning 
     6, # by 
     2] # example 
)

sentence

tensor([0, 8, 1, 6, 2])

In [2]:
torch.manual_seed(0)
embed = torch.nn.Embedding(10, 16)
sentence_embed = embed(sentence).detach()

In [3]:
sentence_embed

tensor([[-1.1258, -1.1524, -0.2506, -0.4339,  0.8487,  0.6920, -0.3160, -2.1152,
          0.3223, -1.2633,  0.3500,  0.3081,  0.1198,  1.2377,  1.1168, -0.2473],
        [-0.8834, -0.4189, -0.8048,  0.5656,  0.6104,  0.4669,  1.9507, -1.0631,
         -0.0773,  0.1164, -0.5940, -1.2439, -0.1021, -1.0335, -0.3126,  0.2458],
        [-1.3527, -1.6959,  0.5667,  0.7935,  0.5988, -1.5551, -0.3414,  1.8530,
          0.7502, -0.5855, -0.1734,  0.1835,  1.3894,  1.5863,  0.9463, -0.8437],
        [ 1.6459, -1.3602,  0.3446,  0.5199, -2.6133, -1.6965, -0.2282,  0.2800,
          0.2469,  0.0769,  0.3380,  0.4544,  0.4569, -0.8654,  0.7813, -0.9268],
        [-0.6136,  0.0316, -0.4927,  0.2484,  0.4397,  0.1124,  0.6408,  0.4412,
         -0.1023,  0.7924, -0.2897,  0.0525,  0.5229,  2.3022, -1.4689, -1.5867]])

In [4]:
d = sentence_embed.shape[1]
w_key = torch.rand(d, d)
w_query = torch.rand(d, d)
w_value = torch.rand(d, d)

In [5]:
token1_embed = sentence_embed[0]
key_1 = w_key.matmul(token1_embed)
query_1 = w_query.matmul(token1_embed)
value_1 = w_value.matmul(token1_embed)

In [6]:
key_1

tensor([-1.1371, -0.5677, -0.9324, -0.3195, -2.8886, -1.2679, -1.1153,  0.2904,
         0.3825,  0.3179, -0.4977, -3.8230,  0.3699, -0.3932, -1.8788, -3.3556])

In [7]:
keys = sentence_embed.matmul(w_key.T)

In [8]:
keys[0]

tensor([-1.1371, -0.5677, -0.9324, -0.3195, -2.8886, -1.2679, -1.1153,  0.2904,
         0.3825,  0.3179, -0.4977, -3.8230,  0.3699, -0.3932, -1.8788, -3.3556])

In [9]:
values = sentence_embed.matmul(w_value.T)

In [10]:
import torch.nn.functional as F
a1 = F.softmax(query_1.matmul(keys.T) / d ** 0.5, dim=0)

In [11]:
a1

tensor([3.2481e-01, 4.2515e-01, 6.8915e-06, 2.5002e-01, 1.5529e-05])

In [12]:
z1 = a1.matmul(values)
z1

tensor([-0.7136, -1.1795, -0.5726, -0.4959, -0.6838, -1.6460, -0.3782, -1.0066,
        -0.4798, -0.8996, -1.2138, -0.3955, -1.3302, -0.3832, -0.8446, -0.8470])

# Improving sentiment analysis with BERT and Transformers

## Fine-tuning a pre-trained BERT model for sentiment Analysis

In [13]:
from torchtext.datasets import IMDB

train_dataset = list(IMDB(split='train'))
test_dataset = list(IMDB(split='test'))

print(len(train_dataset), len(test_dataset))

25000 25000


In [14]:
train_texts = [train_sample[1] for train_sample in train_dataset]
train_labels = [train_sample[0] for train_sample in train_dataset]

test_texts = [test_sample[1] for test_sample in test_dataset]
test_labels = [test_sample[0] for test_sample in test_dataset]

In [15]:
import transformers
from transformers import DistilBertTokenizerFast

# tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased', local_files_only=True)

In [16]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [17]:
train_encodings[0] 

Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [18]:
class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor([0., 1.] if self.labels[idx] == 2 else [1., 0.])
        return item

    def __len__(self):
        return len(self.labels)


In [19]:
train_encoded_dataset = IMDbDataset(train_encodings, train_labels)
test_encoded_dataset = IMDbDataset(test_encodings, test_labels)


In [20]:
batch_size = 32
train_dl = torch.utils.data.DataLoader(train_encoded_dataset, batch_size=batch_size, shuffle=True)
test_dl = torch.utils.data.DataLoader(test_encoded_dataset, batch_size=batch_size, shuffle=False)

In [21]:
from transformers import DistilBertForSequenceClassification

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', local_files_only=True)
model.to(device)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [22]:
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

In [23]:
def train(model, dataloader, optimizer):
    model.train()
    total_loss = 0
    for batch in dataloader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs['loss'] 
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()*len(batch)

    return total_loss/len(dataloader.dataset)
    

In [24]:
def evaluate(model, dataloader):
    model.eval()
    total_acc = 0
    with torch.no_grad():
        for batch in dataloader:

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs['logits']

            pred = torch.argmax(logits, 1)
            total_acc += (pred == torch.argmax(labels, 1)).float().sum().item()

    return  total_acc/len(dataloader.dataset)
 

In [25]:
torch.manual_seed(0)
num_epochs = 1 
for epoch in range(num_epochs):
    train_loss = train(model, train_dl, optimizer)
    train_acc = evaluate(model, train_dl)
    print(f'Epoch {epoch+1} - loss: {train_loss:.4f} - accuracy: {train_acc:.4f}')

Epoch 1 - loss: 0.0244 - accuracy: 0.9646


In [26]:
test_acc = evaluate(model, test_dl)
print(f'Accuracy on test set: {100 * test_acc:.2f} %')

Accuracy on test set: 92.96 %


In [27]:
# torch.cuda.mem_get_info()

In [28]:
# torch.cuda.empty_cache()

In [29]:
# free up memory
del model 

## Using the Trainer API to train Transformer models 

In [30]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', local_files_only=True)
model.to(device)

optim = torch.optim.Adam(model.parameters(), lr=5e-5)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
# !conda install -c conda-forge accelerate -y

In [32]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results', 
    num_train_epochs=1,     
    per_device_train_batch_size=32, 
    logging_dir='./logs',
    logging_steps=50,
)


In [33]:
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_encoded_dataset,
#     optimizers=(optim, None)
# )


In [34]:
from datasets import load_metric
import numpy as np

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred  
    pred = np.argmax(logits, axis=-1)
    return metric.compute(predictions=pred, references=np.argmax(labels, 1))


  metric = load_metric("accuracy")


In [35]:
trainer = Trainer(
    model=model,
    compute_metrics=compute_metrics,
    args=training_args,
    train_dataset=train_encoded_dataset,
    eval_dataset=test_encoded_dataset,
    optimizers=(optim, None)
)

In [36]:
trainer.train()

Step,Training Loss
50,0.4469
100,0.3171
150,0.3298
200,0.2629
250,0.2573
300,0.2423
350,0.2584
400,0.2342
450,0.2258
500,0.251


TrainOutput(global_step=782, training_loss=0.2543304844585526, metrics={'train_runtime': 377.3168, 'train_samples_per_second': 66.257, 'train_steps_per_second': 2.073, 'total_flos': 3311684966400000.0, 'train_loss': 0.2543304844585526, 'epoch': 1.0})

In [37]:
print(trainer.evaluate())

{'eval_loss': 0.18691246211528778, 'eval_accuracy': 0.9292, 'eval_runtime': 123.198, 'eval_samples_per_second': 202.925, 'eval_steps_per_second': 25.366, 'epoch': 1.0}


---

Readers may ignore the next cell.

In [38]:
!jupyter nbconvert --to python ch13_part1.ipynb --TemplateExporter.exclude_input_prompt=True

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[NbConvertApp] Converting notebook ch13_part1.ipynb to python
[NbConvertApp] Writing 6092 bytes to ch13_part1.py
