In [55]:
from datasets import load_dataset

# Load the "train" split of the CNN/DailyMail dataset, configuration '1.0.0'.
train_dataset = load_dataset("abisee/cnn_dailymail", '1.0.0', split="train")

In [56]:
train_dataset

Dataset({
    features: ['article', 'highlights', 'id'],
    num_rows: 287113
})

In [57]:
# Keep only shard 0 of 10, i.e. roughly a tenth of the rows
# (presumably to keep the experiment fast -- confirm).
# NOTE: this rebinds `train_dataset`, so re-running the cell shards the data again.
train_dataset = train_dataset.shard(num_shards=10, index=0)
len(train_dataset)

28712

In [58]:
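# One-time BPE tokenizer training, kept for reference. It writes "bpe_tokenizer.json",
# the file a later cell loads with Tokenizer.from_file; uncomment and run it if that file is missing.
# from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors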

# tokenizer = Tokenizer(models.BPE())
# tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
# trainer = trainers.BpeTrainer(vocab_size=30000, special_tokens=["<pad>", "<s>", "</s>", "<unk>", "<mask>"])

# dataset = [article['article'] for article in train_dataset]
# tokenizer.train_from_iterator(dataset, trainer)
# tokenizer.save("bpe_tokenizer.json")

In [59]:
def tokenize_and_pad(sentence, tokenizer, max_len):
    """Encode ``sentence`` and force the result to exactly ``max_len`` token ids.

    Args:
        sentence: text passed to ``tokenizer.encode``.
        tokenizer: object exposing ``encode(text).ids`` and ``token_to_id(token)``.
        max_len: target number of token ids.

    Returns:
        A tensor built from the token ids: right-padded with the id of
        ``"<pad>"`` when the encoding is shorter than ``max_len``, otherwise
        cut down to the first ``max_len`` ids.
    """
    ids = tokenizer.encode(sentence).ids
    shortfall = max_len - len(ids)

    if shortfall > 0:
        # Too short: append as many pad ids as are missing.
        ids = ids + [tokenizer.token_to_id("<pad>")] * shortfall
    else:
        # Long enough (or too long): keep only the leading max_len ids.
        ids = ids[:max_len]

    return torch.tensor(ids)

In [60]:
import torch
from tokenizers import Tokenizer
# Load the BPE tokenizer that was previously trained and saved to disk.
tokenizer = Tokenizer.from_file("bpe_tokenizer.json")

# Fixed sequence lengths, in tokens, for articles (model input) and summaries (target).
MAX_LEN = 512
SUMMARY_MAX_LEN = 128

# Smoke-test the tokenization helper on the first training example.
article = train_dataset[0]['article']
summary = train_dataset[0]['highlights']


article = tokenize_and_pad(article, tokenizer, MAX_LEN)
# Summaries have their own, shorter length budget; this line previously used MAX_LEN,
# which disagreed with how preprocess_batch builds the "labels" column.
summary = tokenize_and_pad(summary, tokenizer, SUMMARY_MAX_LEN)

In [61]:
def preprocess_batch(train_dataset):
    """Add tokenized ``input_ids`` and ``labels`` columns to a batch of examples.

    Args:
        train_dataset: mapping with an ``'article'`` and a ``'highlights'``
            entry, each an iterable of strings (one batch, not the whole
            dataset, when used with ``map(..., batched=True)``).

    Returns:
        The same mapping, mutated in place: ``'input_ids'`` holds each article
        encoded to ``MAX_LEN`` ids and ``'labels'`` each summary encoded to
        ``SUMMARY_MAX_LEN`` ids.
    """
    encoded_articles = []
    for text in train_dataset['article']:
        encoded_articles.append(tokenize_and_pad(text, tokenizer, MAX_LEN))
    train_dataset['input_ids'] = encoded_articles

    encoded_summaries = []
    for text in train_dataset['highlights']:
        encoded_summaries.append(tokenize_and_pad(text, tokenizer, SUMMARY_MAX_LEN))
    train_dataset['labels'] = encoded_summaries

    return train_dataset
    
# Tokenize the whole dataset; batched=True makes map() call preprocess_batch
# with a batch of rows at a time instead of a single example.
train_dataset = train_dataset.map(preprocess_batch, batched = True)


In [62]:
train_dataset

Dataset({
    features: ['article', 'highlights', 'id', 'input_ids', 'labels'],
    num_rows: 28712
})

In [63]:
# Have the dataset return the "input_ids" and "labels" columns as torch tensors when indexed.
train_dataset.set_format(type="torch", columns=["input_ids", "labels"])

In [64]:
train_dataset

Dataset({
    features: ['article', 'highlights', 'id', 'input_ids', 'labels'],
    num_rows: 28712
})

In [65]:
from torch.utils.data import DataLoader

# Number of examples per training batch.
BATCH_SIZE = 16

# Serve the tokenized dataset in shuffled batches of BATCH_SIZE examples.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)


In [66]:
import torch
import torch.nn as nn
import torch.optim as optim
import random

class Encoder(nn.Module):
    """Embeds source tokens and encodes them with two chained single-layer LSTMs.

    Args:
        vocab_size: number of rows in the embedding table.
        emb_dim: embedding width.
        enc_hidden_dim: hidden size of both LSTMs.
        dec_hidden_dim: width the final hidden state is projected to, so it
            can serve as the decoder's initial hidden state.
    """

    def __init__(self, vocab_size, emb_dim, enc_hidden_dim, dec_hidden_dim):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.rnn1 = nn.LSTM(emb_dim, enc_hidden_dim, batch_first=True)
        self.rnn2 = nn.LSTM(enc_hidden_dim, enc_hidden_dim, batch_first=True)
        self.fc = nn.Linear(enc_hidden_dim, dec_hidden_dim)

    def forward(self, src):
        """Encode a batch of token ids.

        Args:
            src: integer token ids; assumed to be (batch, src_len) since the
                LSTMs are batch-first -- confirm against the caller.

        Returns:
            A pair ``(outputs, hidden)``: the second LSTM's per-step outputs,
            and its final hidden state passed through ``fc`` and ``tanh``.
            The cell states and the first LSTM's final state are discarded.
        """
        token_vectors = self.embedding(src)
        first_pass, _ = self.rnn1(token_vectors)
        second_pass, (last_hidden, _) = self.rnn2(first_pass)
        projected = torch.tanh(self.fc(last_hidden))
        return second_pass, projected

In [67]:
class Attention(nn.Module):
    """Scores every encoder time step against the current decoder hidden state.

    Args:
        enc_hidden_dim: feature width of the encoder outputs.
        dec_hidden_dim: feature width of the decoder hidden state.
    """

    def __init__(self, enc_hidden_dim, dec_hidden_dim):
        super().__init__()

        self.attn = nn.Linear(enc_hidden_dim + dec_hidden_dim, dec_hidden_dim)
        # Learned vector that collapses each energy vector to a single score.
        self.v = nn.Parameter(torch.rand(dec_hidden_dim))

    def forward(self, hidden, encoder_outputs):
        """Return attention weights over the source positions.

        Args:
            hidden: decoder hidden state; assumed (1, batch, dec_hidden_dim),
                as returned by a single-layer LSTM -- confirm.
            encoder_outputs: (batch, src_len, enc_hidden_dim) encoder outputs.

        Returns:
            (batch, src_len) tensor whose rows are softmax-normalised.
        """
        batch_size = encoder_outputs.shape[0]
        src_len = encoder_outputs.shape[1]

        # Copy the decoder state once per source position so it can be
        # concatenated with every encoder output: (batch, src_len, dec_hidden_dim).
        tiled_hidden = hidden.repeat(src_len, 1, 1).transpose(0, 1)

        combined = torch.cat((tiled_hidden, encoder_outputs), dim=2)
        # (batch, dec_hidden_dim, src_len) so that v can be multiplied in from the left.
        energy = torch.tanh(self.attn(combined)).transpose(1, 2)

        score_vec = self.v.repeat(batch_size, 1).unsqueeze(1)
        scores = torch.bmm(score_vec, energy).squeeze(1)
        return scores.softmax(dim=1)

In [68]:
class Decoder(nn.Module):
    """One-step attention decoder built from two chained single-layer LSTMs.

    Args:
        vocab_size: size of the output vocabulary (and of the embedding table).
        emb_dim: embedding width.
        enc_hidden_dim: feature width of the encoder outputs.
        dec_hidden_dim: hidden size of both decoder LSTMs.
        attention: callable ``attention(hidden, encoder_outputs)`` returning
            (batch, src_len) weights.
    """

    def __init__(self, vocab_size, emb_dim, enc_hidden_dim, dec_hidden_dim, attention):
        super().__init__()

        self.attention = attention
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.rnn1 = nn.LSTM(enc_hidden_dim + emb_dim, dec_hidden_dim, batch_first=True)
        self.rnn2 = nn.LSTM(dec_hidden_dim, dec_hidden_dim, batch_first=True)
        self.fc_out = nn.Linear(enc_hidden_dim + dec_hidden_dim + emb_dim, vocab_size)

    def forward(self, input_, hidden, encoder_outputs):
        """Decode a single time step.

        Args:
            input_: (batch,) token ids fed in at this step.
            hidden: previous hidden state; a tensor is paired with an all-zero
                cell state, anything else is passed to the first LSTM as is.
            encoder_outputs: (batch, src_len, enc_hidden_dim) encoder outputs.

        Returns:
            ``(prediction, hidden2)``: (batch, vocab_size) scores and the
            second LSTM's final hidden state.

        NOTE(review): both LSTMs start every step from a zero cell state and
        the cell states they produce are dropped; the second LSTM is seeded
        with the first LSTM's new hidden state. Confirm this is intended.
        """
        # (batch,) -> (batch, 1): a sequence of length one per example.
        step_tokens = input_.unsqueeze(1)
        embedded = self.embedding(step_tokens)

        # Attention-weighted sum of the encoder outputs: (batch, 1, enc_hidden_dim).
        weights = self.attention(hidden, encoder_outputs)
        context = torch.bmm(weights.unsqueeze(1), encoder_outputs)

        rnn_input = torch.cat((embedded, context), dim=2)

        first_state = (hidden, torch.zeros_like(hidden)) if isinstance(hidden, torch.Tensor) else hidden
        output1, (hidden1, _) = self.rnn1(rnn_input, first_state)

        # hidden1 always comes back from nn.LSTM as a tensor, so it is always
        # paired with a fresh zero cell state here.
        second_state = (hidden1, torch.zeros_like(hidden1))
        output2, (hidden2, _) = self.rnn2(output1, second_state)

        features = torch.cat(
            (output2.squeeze(1), context.squeeze(1), embedded.squeeze(1)), dim=1)
        return self.fc_out(features), hidden2


# https://discuss.pytorch.org/t/the-difference-and-use-of-output-and-hidden-state-of-an-rnn/15108

In [69]:
class Seq2Seq(nn.Module):
    """Ties an encoder and a step-wise decoder into one trainable module.

    Args:
        encoder: module returning ``(encoder_outputs, hidden)`` for a source batch.
        decoder: module called as ``decoder(input_, hidden, encoder_outputs)``
            and exposing ``fc_out.out_features`` (the output vocabulary size).
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg.shape[1] - 1`` steps and collect the scores of each step.

        Args:
            src: source batch; its first dimension is the batch size.
            trg: (batch, trg_len) target token ids; column 0 seeds the decoder.
            teacher_forcing_ratio: probability, drawn per step for the whole
                batch, of feeding the ground-truth token rather than the
                decoder's own argmax as the next input.

        Returns:
            (batch, trg_len, vocab) tensor of scores; position 0 along the
            time axis is never written and stays zero.
        """
        encoder_outputs, hidden = self.encoder(src)

        n_steps = trg.shape[1]
        n_examples = src.shape[0]
        vocab_size = self.decoder.fc_out.out_features
        outputs = torch.zeros(n_examples, n_steps, vocab_size).to(src.device)

        # The first target column is the decoder's initial input.
        input_ = trg[:, 0]

        for t in range(1, n_steps):
            step_scores, hidden = self.decoder(input_, hidden, encoder_outputs)
            outputs[:, t, :] = step_scores

            greedy = step_scores.argmax(1)
            use_ground_truth = random.random() < teacher_forcing_ratio
            if use_ground_truth:
                input_ = trg[:, t]
            else:
                input_ = greedy

        return outputs

## LSTM Explanation
time step - Single point in a sequence of data 
layer - depth of LSTM in terms of stacked LSTM layers 
output - Output of the LSTM for each time step in the input sequence, i.e. the hidden state at every time step. With stacked layers (number of layers > 1; the default is 1), output contains the hidden states of the last layer only 
hidden - hidden state at the last time step
cell - memory state of LSTM at last time step

In [70]:
from tqdm import tqdm

# Model and training hyperparameters.
# Articles and summaries share one tokenizer, so input and output vocabularies have the same size.
INPUT_DIM = OUTPUT_DIM = len(tokenizer.get_vocab())
EMB_DIM = ENC_HIDDEN_DIM = DEC_HIDDEN_DIM = 512
LEARNING_RATE = 0.001
# Token budgets for articles and summaries.
MAX_LEN, SUMMARY_MAX_LEN = 512, 128

# Build the network parts; the attention module is handed to the decoder.
encoder = Encoder(INPUT_DIM, EMB_DIM, ENC_HIDDEN_DIM, DEC_HIDDEN_DIM)
attention = Attention(ENC_HIDDEN_DIM, DEC_HIDDEN_DIM)
decoder = Decoder(OUTPUT_DIM, EMB_DIM, ENC_HIDDEN_DIM, DEC_HIDDEN_DIM, attention)

# Training function
def train(model, iterator, clip, device):
    """Train ``model`` for one pass over ``iterator`` and return the mean batch loss.

    A fresh Adam optimizer (learning rate ``LEARNING_RATE``) and a
    cross-entropy criterion that ignores the ``"<pad>"`` token id are created
    on every call.

    Args:
        model: module called as ``model(src, trg)`` that returns batch-first
            scores of shape (batch, trg_len, vocab).
        iterator: sized iterable of batches; each batch is a mapping with
            ``'input_ids'`` and ``'labels'`` tensors.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.
        device: device the batch tensors are moved to.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.train()
    epoch_loss = 0
    # Define optimizer and loss function
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.token_to_id("<pad>"))

    # Iterating the loader directly (no enumerate) lets tqdm read its length
    # and display a total; the batch index was never used.
    for batch in tqdm(iterator):
        src = batch['input_ids'].to(device)
        trg = batch['labels'].to(device)

        optimizer.zero_grad()
        output = model(src, trg)

        # Tensors are batch-first, so the first *time step* (the decoder's seed
        # token, for which the model writes no prediction) is dropped along
        # dim 1. Slicing dim 0 would instead discard the first example of the
        # batch and keep the all-zero step-0 scores in the loss.
        # reshape() is required because the time-sliced tensors are not contiguous.
        output_dim = output.shape[-1]
        output = output[:, 1:, :].reshape(-1, output_dim)
        trg = trg[:, 1:].reshape(-1)

        loss = criterion(output, trg)
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)



# Data parallelization

In [71]:
import matplotlib.pyplot as plt
plt.switch_backend('Agg')
import numpy as np
import timeit


# single GPU

# Number of timed repetitions; with a single run the standard deviation computed below is 0.
num_repeat = 1
# Maximum gradient norm handed to train() for clipping.
CLIP = 1
# Timed statement: one full call of train() over train_loader.
stmt = '''
device = 'cuda:0'
train(model, train_loader, CLIP, device)
'''

# Setup executed before each timed run: wrap the shared encoder/decoder
# in a Seq2Seq model and move it to the first GPU.
setup_1 = '''
device = 'cuda:0'
model = Seq2Seq(encoder, decoder).to(device)
'''

# globals=globals() lets the timed code see the notebook's names (train, train_loader, encoder, ...).
sg_run_times = timeit.repeat(
    stmt, setup_1, number=1, repeat=num_repeat, globals=globals())
sg_mean, sg_std = np.mean(sg_run_times), np.std(sg_run_times)

1795it [49:56,  1.67s/it]


In [72]:
sg_mean

2996.7662307778373

In [73]:
# data parallelization 

# Number of timed repetitions; with a single run the standard deviation computed below is 0.
num_repeat = 1
# Maximum gradient norm handed to train() for clipping.
CLIP = 1
# Timed statement: one full call of train() over train_loader.
stmt = '''
device = 'cuda'
train(model, train_loader, CLIP, device)
'''

# Setup executed before each timed run: build the Seq2Seq model and wrap it in
# nn.DataParallel when more than one CUDA device is visible.
# NOTE(review): `encoder` and `decoder` are the same module objects used by any
# earlier run in this kernel, so this benchmark does not start from fresh weights.
setup_2 = '''
device = 'cuda'

model = Seq2Seq(encoder, decoder)

if torch.cuda.device_count() > 1:
  model = nn.DataParallel(model)

model.to(device)
'''

# globals=globals() lets the timed code see the notebook's names (train, train_loader, encoder, ...).
dp_run_times = timeit.repeat(
    stmt, setup_2, number=1, repeat=num_repeat, globals=globals())
dp_mean, dp_std = np.mean(dp_run_times), np.std(dp_run_times)

1795it [42:00,  1.40s/it]


In [74]:
dp_mean

2520.746455488028

In [75]:
def plot(means, stds, labels, fig_name):
    """Save a bar chart of execution times with error bars.

    Args:
        means: bar heights, one per configuration.
        stds: error-bar sizes, aligned with ``means``.
        labels: x-axis tick labels, aligned with ``means``.
        fig_name: path the figure is written to.
    """
    positions = np.arange(len(means))

    fig, ax = plt.subplots()
    bar_style = dict(align='center', alpha=0.5, ecolor='red', capsize=10, width=0.6)
    ax.bar(positions, means, yerr=stds, **bar_style)

    ax.set_ylabel('Model Execution Time')
    ax.set_xticks(positions)
    ax.set_xticklabels(labels)
    ax.yaxis.grid(True)

    fig.tight_layout()
    fig.savefig(fig_name)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)


# Chart the two mean timings side by side (error bars = standard deviations) and save it as 'sg_vs_dp.png'.
plot([sg_mean, dp_mean],
     [sg_std, dp_std],
     ['Single GPU', 'Data Parallelization'],
     'sg_vs_dp.png')