In [2]:
import numpy as np
import tiktoken
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch.optim as optim

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device set to: {device}')

Device set to: cpu


In [4]:
class PositionalEncoding(nn.Module):

    def __init__(self, context_size, d_model):
        super().__init__()

        self.encoding = torch.zeros(context_size, d_model)

        pos = torch.arange(0, context_size).unsqueeze(dim=1)
        dim = torch.arange(
            0, d_model, 2)  # dim is i in the positional encoding formula
        self.encoding[:, 0::2] = torch.sin(pos / (10000**(2 * dim / d_model)))
        self.encoding[:, 1::2] = torch.cos(pos / (10000**(2 * dim / d_model)))

    def forward(self, x):
        seq_len = x.size(1)
        return self.encoding[:seq_len, :]

In [5]:
class PositionwiseFeedForward(nn.Module):

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)
        return x

In [6]:
class DecoderBlock(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout_rate = 0.1):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, num_heads, batch_first=True)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        hidden_states, _ = self.self_attn(x, x, x)
        x = self.norm1(x + self.dropout(hidden_states)).to(device)
        ff_output = self.feed_forward(x).to(device)
        x = self.norm2(x + self.dropout(ff_output)).to(device)
        return x

In [7]:
class Decoder(nn.Module):
    def __init__(self, output_size, context_size,
                 d_model, d_ff, num_heads, n_blocks):
        super().__init__()
        self.embedding = nn.Embedding(output_size, d_model)
        self.pos_embedding = PositionalEncoding(context_size, d_model)

        self.blocks = nn.ModuleList([
            DecoderBlock(
                d_model=d_model,
                num_heads=num_heads,
                d_ff=d_ff,
            )
            for _ in range(n_blocks)
        ])

        self.out = nn.Linear(d_model, output_size)

    def forward(self, x):
        x = self.embedding(x).to(device) + self.pos_embedding(x).to(device)

        for block in self.blocks:
            x = block(x)

        output = self.out(x)
        return output

In [8]:
class Transformer(nn.Module):

    def __init__(self, vocab_size, context_size,
                 d_model, d_ff, num_heads, n_blocks):
        super().__init__()

        self.decoder = Decoder(
            vocab_size,
            context_size,
            d_model,
            d_ff,
            num_heads,
            n_blocks
        )

    def forward(self, x):
        output = self.decoder(x)  # input_decoder shape - (64, 99)
        return output

### Data Import

In [9]:
## Reading and processing text

# Download 'The Mysterious Island' from
# https://www.gutenberg.org/cache/epub/1268/pg1268.txt
with open('data/1268-0.txt', 'r', encoding="utf8") as fp:
    text=fp.read()

start_indx = text.find('THE MYSTERIOUS ISLAND')
end_indx = text.find('End of the Project Gutenberg')

text = text[start_indx:end_indx]

In [10]:
enc = tiktoken.get_encoding("cl100k_base")

text_encoded = enc.encode(text)

### Train model from scratch

In [11]:
CONTEXT_SIZE = 40
chunk_size = CONTEXT_SIZE  + 1

# n chunks where each next chunk is 1 word offset from the previous chunk
token_chunks = [
    text_encoded[i:i + chunk_size]
    for i in range(len(text_encoded) - chunk_size + 1)
]

In [12]:
class TextDataset(Dataset):
    def __init__(self, text_chunks):
        self.text_chunks = text_chunks

    def __len__(self):
        return len(self.text_chunks)

    def __getitem__(self, idx):
        text_chunk = self.text_chunks[idx]
        return text_chunk[:-1].long(), text_chunk[1:].long()

seq_dataset = TextDataset(torch.tensor(token_chunks).to(device))

In [13]:
BATCH_SIZE = 50
seq_dl = DataLoader(seq_dataset,
                    batch_size=BATCH_SIZE,
                    shuffle=True,
                    drop_last=True)

In [14]:
VOCAB_SIZE = 104000
D_MODEL = 500
D_FF = 20
NUM_HEADS = 10
N_BLOCKS = 10

In [15]:
model = Transformer(
    vocab_size=VOCAB_SIZE,
    context_size=CONTEXT_SIZE,
    d_model=D_MODEL,
    d_ff=D_FF,  # internal dimension of the feed forward network
    num_heads=NUM_HEADS,
    n_blocks=N_BLOCKS)

model.to(device)

Transformer(
  (decoder): Decoder(
    (embedding): Embedding(104000, 500)
    (pos_embedding): PositionalEncoding()
    (blocks): ModuleList(
      (0-9): 10 x DecoderBlock(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=500, out_features=500, bias=True)
        )
        (feed_forward): PositionwiseFeedForward(
          (linear1): Linear(in_features=500, out_features=20, bias=True)
          (linear2): Linear(in_features=20, out_features=500, bias=True)
          (relu): ReLU()
        )
        (norm1): LayerNorm((500,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((500,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (out): Linear(in_features=500, out_features=104000, bias=True)
  )
)

In [16]:
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

model.train()

  from .autonotebook import tqdm as notebook_tqdm


Transformer(
  (decoder): Decoder(
    (embedding): Embedding(104000, 500)
    (pos_embedding): PositionalEncoding()
    (blocks): ModuleList(
      (0-9): 10 x DecoderBlock(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=500, out_features=500, bias=True)
        )
        (feed_forward): PositionwiseFeedForward(
          (linear1): Linear(in_features=500, out_features=20, bias=True)
          (linear2): Linear(in_features=20, out_features=500, bias=True)
          (relu): ReLU()
        )
        (norm1): LayerNorm((500,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((500,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (out): Linear(in_features=500, out_features=104000, bias=True)
  )
)

In [17]:
NUM_EPOCHS = 10

In [18]:
# loss_avg = [0] * NUM_EPOCHS
# for epoch in range(NUM_EPOCHS):
#     for src_data, tgt_data in seq_dl:
#         output = model(src_data)
#         loss = criterion(output.view(-1, VOCAB_SIZE), tgt_data.view(-1))
#         loss.backward()
#         optimizer.step()
#         optimizer.zero_grad()
#         loss_avg[epoch] += loss.item()
#     loss_avg[epoch] /= BATCH_SIZE
#     print(f"Epoch: {epoch+1}, Loss: {loss_avg[epoch]}")

#     torch.save(model.state_dict(), f"model_epoch_{epoch+7}.pth")

### Predict on New Data

In [19]:
# Load model weights

# trained on GPU but performing inference locally on CPU
MODEL_PATH = "data/model_epoch_10.pth"
model.load_state_dict(torch.load(MODEL_PATH, map_location=torch.device('cpu')))

<All keys matched successfully>

In [73]:
# prediction_start = "And then they decided to embark on a new adventure.  They began to make preparations"

# prediction_start = "Sheila noticed her emotions"

#prediction_start = "The sea was calm that "

prediction_start = "Harding wondered "

In [74]:
model.eval()

Transformer(
  (decoder): Decoder(
    (embedding): Embedding(104000, 500)
    (pos_embedding): PositionalEncoding()
    (blocks): ModuleList(
      (0-9): 10 x DecoderBlock(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=500, out_features=500, bias=True)
        )
        (feed_forward): PositionwiseFeedForward(
          (linear1): Linear(in_features=500, out_features=20, bias=True)
          (linear2): Linear(in_features=20, out_features=500, bias=True)
          (relu): ReLU()
        )
        (norm1): LayerNorm((500,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((500,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (out): Linear(in_features=500, out_features=104000, bias=True)
  )
)

#### Greedy search prediction

In [75]:
tokenized_input = torch.tensor(enc.encode(prediction_start)).to(device)
tokenized_input = tokenized_input[None,:] # add dimension

In [76]:
# Greedy search

with torch.no_grad():
  for _ in range(15):
    res = model(tokenized_input)
    next_token = torch.argmax(res[:,-1,:], dim=1)
    tokenized_input = torch.cat([tokenized_input, next_token[None,:]], dim=-1)

In [77]:
enc.decode(tokenized_input.squeeze().tolist())

'Harding wondered  that  that  was  was  was  wouldChapter Chapter '

#### Multinomial prediction

In [87]:
tokenized_input = torch.tensor(enc.encode(prediction_start)).to(device)
tokenized_input = tokenized_input[None,:] # add dimension

m = torch.nn.Softmax(dim=2)

with torch.no_grad():
  for _ in range(10):
    res = model(tokenized_input)
    res_probs = m(res)
    next_token = torch.multinomial(res_probs[:,-1,:], 1)
    tokenized_input = torch.cat([tokenized_input, next_token], dim=-1)

In [88]:
enc.decode(tokenized_input.squeeze().tolist())

'Harding wondered mouth resigned“And Your litter intercepted condemn rigging discovery'