In [1]:
import numpy as np
import torch 
import math

In [2]:
#tokenizer
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_basic_tokenization = True)

In [3]:
from torch.utils.data import Dataset, DataLoader

def data_collate(batch_dataset):
    arr = np.array(batch_dataset)
    inputs = tokenizer(text = arr.tolist(), padding = 'max_length', max_length = 512, truncation=True, return_tensors = 'pt')
    return inputs

class CreateDataset(Dataset):
    def __init__(self, src, tokenizer):
        #src = sentences 
        self.src = src
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.src)

    def __getitem__(self, idx):
        src = self.src[idx]
        return src

In [4]:
from datasets import load_dataset

data = load_dataset("cnn_dailymail", "2.0.0", split = 'train')

In [5]:
data[0]['article']

'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Details o

In [6]:
#check tokens
#max_length = 512 for BERT tokenizer

sample_input = np.array(data[0]['article'])
tokenizer_output = tokenizer(text = sample_input.tolist(), return_tensors = 'pt')
tokens = tokenizer_output['input_ids']

print(tokens)
print(len(tokens[0]))

Token indices sequence length is longer than the specified maximum sequence length for this model (587 > 512). Running this sequence through the model will result in indexing errors


tensor([[  101,  2414,  1010,  2563,  1006, 26665,  1007,  1011,  1011,  4302,
         10693,  2732,  3817, 22603, 12154,  3229,  2000,  1037,  2988, 21853,
          2692,  2454,  1006,  1002,  4601,  1012,  1015,  2454,  1007,  7280,
          2004,  2002,  4332,  2324,  2006,  6928,  1010,  2021,  2002, 16818,
          1996,  2769,  2180,  1005,  1056,  3459,  1037,  6297,  2006,  2032,
          1012,  3817, 22603,  2004,  4302, 10693,  1999,  1000,  4302, 10693,
          1998,  1996,  2344,  1997,  1996,  6708,  1000,  2000,  1996, 10520,
          1997, 13761, 13317,  2015,  2105,  1996,  2088,  1010,  1996,  2402,
          3364,  2758,  2002,  2038,  2053,  3488,  2000, 10424, 27100,  2099,
          2010,  5356,  2185,  2006,  3435,  3765,  1010,  4392,  1998,  8958,
          4243,  1012,  1000,  1045,  2123,  1005,  1056,  2933,  2000,  2022,
          2028,  1997,  2216,  2111,  2040,  1010,  2004,  2574,  2004,  2027,
          2735,  2324,  1010,  3402,  4965,  3209,  

In [7]:
import re

def filter_data(text):
    #remove last line
    text = re.sub(r"Copyright \d{4} Reuters. All rights reserved.*", "", text)
    
    #replace \'
    text = text.replace("\'", "")
    
    #replace 's
    text = re.sub(r"'s\b'", "", text)
    
    #remove extra white space
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

In [8]:
filter_text = filter_data(data[0]['article'])

print(filter_text)

sample_input = np.array(filter_text)
tokenizer_output = tokenizer(text = sample_input.tolist(), return_tensors = 'pt')
tokens = tokenizer_output['input_ids']

print(tokens)
print(len(tokens[0]))

LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money wont cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I dont plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I dont think Ill be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Details of how hel

In [9]:
#faster opration can be done using multithread 

from tqdm import tqdm 

train_data = []

for i in tqdm(range(len(data))):
    filter_d = filter_data(data[i]['article'])
    train_data.append(filter_d)

100%|████████████████████████████████████████████████████████████████████████| 287113/287113 [00:44<00:00, 6498.04it/s]


In [10]:
len(train_data)

287113

In [11]:
train_data = train_data[:10000]

In [12]:
train_data = CreateDataset(train_data, tokenizer)
dataloader = DataLoader(train_data, batch_size = 8, collate_fn = data_collate)

In [13]:
len(dataloader)

1250

In [14]:
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer

In [15]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [16]:
#model 
class TransformerModel(nn.Module):
    
    def __init__(self, ntokens, ninp, nhead, nhid, nlayers, dropout = 0.5):
        super(TransformerModel, self).__init__()
        self.model_type = "Transformer"
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layer = TransformerEncoderLayer(ninp, nhead, nhid, dropout, batch_first = True)
        self.transformer_encoder = TransformerEncoder(encoder_layer, nlayers)
        self.encoder = nn.Embedding(ntokens, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntokens)
        
        self.init_weights()
        
    def generate_square_subsequent_mask(self, sz):
        
        '''
        We generate the mask to prevent the transformer from seeing future tokens
        Square matrix is created with elements below the diagonal = 0
        Conver the mask to float, all zeros are replaced with -inf(indicating no access to elements) 
        and 1 with 0.0(this apporation does not changes the magnitude but influences the output)
        '''
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        
        return mask
    
    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)
        
    def forward(self, src, src_mask):
        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        
        output = self.transformer_encoder(src, src_mask)
        output = self.decoder(output)
        return output 

In [17]:
if not torch.backends.mps.is_available():
    if not torch.backends.mps.is_built():
        print("MPS not available because the current PyTorch install was not "
              "built with MPS enabled.")
    else:
        print("MPS not available because the current MacOS version is not 12.3+ "
              "and/or you do not have an MPS-enabled device on this machine.")

else:
    mps_device = torch.device("mps")

    ntokens = tokenizer.vocab_size 
    emsize = 512 # embedding dimension

    nhid = 100 # the dimension of the feedforward network model in nn.TransformerEncoder

    nlayers = 5 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder

    nhead = 4 # the number of heads in the multiheadattention models

    dropout = 0.2 # the dropout value

    model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(mps_device)

In [18]:
model

TransformerModel(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.2, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-4): 5 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=100, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=100, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
      )
    )
  )
  (encoder): Embedding(30522, 512)
  (decoder): Linear(in_features=512, out_features=30522, bias=True)
)

In [19]:
def train(model, dataloader):
    model.train()
    epochs = 50
    total_loss = 0
    criterion = nn.CrossEntropyLoss()
    optim = torch.optim.AdamW(model.parameters(), lr = 0.1)
    
    for epoch in tqdm(range(epochs)):
        for batch in tqdm(dataloader):
            optim.zero_grad()
            input = batch['input_ids'].clone()
            
            src_mask = model.generate_square_subsequent_mask(batch['input_ids'].size(1))
            
            rand_value = torch.rand(batch.input_ids.shape)
            rand_mask = (rand_value < 0.15) * (input != 101) * (input != 102) * (input != 0)
            
            mask_idx=(rand_mask.flatten() == True).nonzero().view(-1)
            
            input = input.flatten()
            input[mask_idx] = 103
            input = input.view(batch['input_ids'].size())
            
            out = model(input.to(mps_device), src_mask.to(mps_device))
            loss = criterion(out.view(-1, ntokens), batch['input_ids'].view(-1).to(mps_device))
            total_loss += loss
            loss.backward()
            optim.step()
            
        print("Epoch: {} -> loss: {}".format(epoch+1, total_loss/(len(dataloader)*epoch+1)))

In [None]:
torch.mps.empty_cache()
train(model, dataloader)

In [None]:
param_size = 0
for param in model.parameters():
    param_size += param.nelement() * param.element_size()
    
buffer_size = 0
for buffer in model.buffers():
    buffer_size += buffer.nelement() * buffer.element_size()

size_all_mb = (param_size + buffer_size) / 1024**2
print('model size: {:.3f}MB'.format(size_all_mb))

In [None]:
import torch 

one_mat = torch.ones(4, 4)

In [None]:
torch.triu(one_mat)

In [None]:
torch.triu(one_mat == 1)

In [None]:
torch.triu(one_mat == 1).transpose(0, 1)

In [None]:
mat = torch.triu(one_mat == 1).transpose(0, 1).float()

In [None]:
mat.masked_fill(mat == 0, float('-inf')).masked_fill(mat == 1, float(0.0))

## Optimizing GPU usage

In [20]:
from accelerate import Accelerator

In [32]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    per_device_train_batch_size=1,
    learning_rate = 0.1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    #fp16=True, # can only be done with CUDA 
    output_dir = "./model_output"
)

In [26]:
dataloader = DataLoader(train_data, batch_size=training_args.per_device_train_batch_size)

if training_args.gradient_checkpointing:
    model.gradient_checkpointing_enable()

accelerator = Accelerator()
model, optimizer, dataloader = accelerator.prepare(model, adam_bnb_optim, dataloader)

AttributeError: 'TransformerModel' object has no attribute 'gradient_checkpointing_enable'

In [27]:
from transformers.trainer_pt_utils import get_parameter_names

decay_parameters = get_parameter_names(model, [nn.LayerNorm])
print(decay_parameters)

['transformer_encoder.layers.0.self_attn.out_proj.weight', 'transformer_encoder.layers.0.self_attn.out_proj.bias', 'transformer_encoder.layers.0.self_attn.in_proj_weight', 'transformer_encoder.layers.0.self_attn.q_proj_weight', 'transformer_encoder.layers.0.self_attn.k_proj_weight', 'transformer_encoder.layers.0.self_attn.v_proj_weight', 'transformer_encoder.layers.0.self_attn.in_proj_bias', 'transformer_encoder.layers.0.linear1.weight', 'transformer_encoder.layers.0.linear1.bias', 'transformer_encoder.layers.0.linear2.weight', 'transformer_encoder.layers.0.linear2.bias', 'transformer_encoder.layers.1.self_attn.out_proj.weight', 'transformer_encoder.layers.1.self_attn.out_proj.bias', 'transformer_encoder.layers.1.self_attn.in_proj_weight', 'transformer_encoder.layers.1.self_attn.q_proj_weight', 'transformer_encoder.layers.1.self_attn.k_proj_weight', 'transformer_encoder.layers.1.self_attn.v_proj_weight', 'transformer_encoder.layers.1.self_attn.in_proj_bias', 'transformer_encoder.layers

In [28]:
decay_parameters = [name for name in decay_parameters if "bias" not in name]
print(decay_parameters)

['transformer_encoder.layers.0.self_attn.out_proj.weight', 'transformer_encoder.layers.0.self_attn.in_proj_weight', 'transformer_encoder.layers.0.self_attn.q_proj_weight', 'transformer_encoder.layers.0.self_attn.k_proj_weight', 'transformer_encoder.layers.0.self_attn.v_proj_weight', 'transformer_encoder.layers.0.linear1.weight', 'transformer_encoder.layers.0.linear2.weight', 'transformer_encoder.layers.1.self_attn.out_proj.weight', 'transformer_encoder.layers.1.self_attn.in_proj_weight', 'transformer_encoder.layers.1.self_attn.q_proj_weight', 'transformer_encoder.layers.1.self_attn.k_proj_weight', 'transformer_encoder.layers.1.self_attn.v_proj_weight', 'transformer_encoder.layers.1.linear1.weight', 'transformer_encoder.layers.1.linear2.weight', 'transformer_encoder.layers.2.self_attn.out_proj.weight', 'transformer_encoder.layers.2.self_attn.in_proj_weight', 'transformer_encoder.layers.2.self_attn.q_proj_weight', 'transformer_encoder.layers.2.self_attn.k_proj_weight', 'transformer_encod

In [29]:
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if n in decay_parameters],
        "weight_decay": training_args.weight_decay,
    },
    {
        "params": [p for n, p in model.named_parameters() if n not in decay_parameters],
        "weight_decay": 0.0,
    },
]

In [30]:
optimizer_kwargs = {
    "betas": (training_args.adam_beta1, training_args.adam_beta2),
    "eps": training_args.adam_epsilon,
}

In [34]:
import bitsandbytes as bnb

optimizer_kwargs["lr"] = training_args.learning_rate

adam_bnb_optim = bnb.optim.Adam8bit(
    optimizer_grouped_parameters,
    betas=(training_args.adam_beta1, training_args.adam_beta2),
    eps=training_args.adam_epsilon,
    lr=training_args.learning_rate,
)

ModuleNotFoundError: No module named 'bitsandbytes'