In [48]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import math


## 1. Encoding Language into Word Embeddings and Positional Encodings

In [36]:
sentence = "This is an awesome jupyter notebook for begginers"
# A simple word to id mapping
word2id = {word: i for i,word in enumerate(set(sentence.split()))}

In [37]:
word2id

{'an': 0,
 'for': 1,
 'begginers': 2,
 'notebook': 3,
 'jupyter': 4,
 'awesome': 5,
 'This': 6,
 'is': 7}

In [38]:
# Converting text to indices
input_ids = torch.tensor([word2id[word] for word in sentence.split()])

In [39]:
input_ids

tensor([6, 7, 0, 5, 4, 3, 1, 2])

In [40]:
def get_word_embeddings(input_ids, embedding_size):
  embedding_layer = nn.Embedding(input_ids.max()+1, embedding_size)
  return embedding_layer(input_ids)

embedding_size = 16
word_embeddings = get_word_embeddings(input_ids, embedding_size)

In [41]:
word_embeddings

tensor([[-6.7732e-01, -1.7875e+00,  3.2842e-01, -6.9614e-01, -4.8572e-01,
          7.3380e-01, -7.1722e-01, -3.5299e-01,  3.4001e-01, -1.2634e-02,
          9.9393e-01, -1.0153e+00, -1.7011e-01,  1.2828e+00, -6.2329e-02,
          1.0504e+00],
        [ 1.0578e+00,  1.4519e+00,  2.3196e+00, -6.9701e-01,  2.9474e-01,
         -9.4667e-01,  1.8872e+00,  1.7656e-01, -1.2878e+00,  1.5229e+00,
         -3.4224e-01, -2.1905e-01, -1.4975e+00, -4.7021e-01,  6.1198e-01,
         -1.2806e-01],
        [-8.2940e-01, -8.6052e-01,  1.4146e-01, -9.0389e-01,  6.4459e-01,
         -1.2700e+00, -1.3968e+00, -1.2418e+00, -6.1304e-01, -1.2733e+00,
         -3.2748e-01, -1.5087e+00,  7.6330e-01,  1.1904e+00, -1.0604e+00,
          2.0609e-01],
        [-8.2693e-01, -1.9949e+00, -5.3386e-01, -1.1536e+00,  6.0652e-01,
         -7.5105e-01,  8.8081e-01,  8.7497e-01,  9.5998e-01,  7.2323e-02,
         -2.5221e-01,  1.4166e-03,  2.5746e-01,  1.1236e+00,  1.0645e+00,
          8.3294e-01],
        [ 5.0048e-01

In [42]:
# Defining a function to generate positional encodings
def get_positional_encodings(max_seq_len, d_model):
  position = np.arange(max_seq_len)[:, np.newaxis]
  div_term = np.exp(np.arange(0, d_model, 2) * -(np.log(10000.0)/ d_model))
  positional_encoding = np.zeros((max_seq_len, d_model))
  positional_encoding[:, 0::2] = np.sin(position * div_term)
  positional_encoding[:, 1::2] = np.cos(position * div_term)
  return torch.tensor(positional_encoding, dtype=torch.float)

In [45]:
max_seq_len = len(sentence.split())
d_model = embedding_size # same size of the word embeddings
positional_encodings = get_positional_encodings(max_seq_len, d_model)

#Adding word embeddings and positional encodings
final_embeddings = word_embeddings + positional_encodings


In [46]:
final_embeddings.size()

torch.Size([8, 16])

## 2. Decoder from Scratch

In [47]:
class DecoderBlock(nn.Module):
  def __init__(self, d_model, num_heads, ff_hidden_dim, dropout):
    super(DecoderBlock, self).__init__()
    # d_model : The dimension of the inputer vector, our case it is  dimension of of word embeddings
    # num_heads : the number of heads in the multihead attention mechanism
    # ff_hidden_dim: the dimension of the feed forward hidden layer

    self.self_attention = nn.MultiheadAttention(d_model, num_heads, dropout=dropout)
    self.norm1 = nn.LayerNorm(d_model)
    self.dropout1 = nn.Dropout(dropout)
    self.linear1 = nn.Linear(d_model, ff_hidden_dim)
    self.linear2 = nn.Linear(ff_hidden_dim, d_model)
    self.norm2 = nn.LayerNorm(d_model)
    self.dropout2 = nn.Dropout(dropout)

# x: input tensor
# tgt_mask: masks to prevent attention to certain positions
  def forward(self, x, tgt_mask):
    attn_output, _ = self.self_attention(x, x, x, attn_mask=tgt_mask)
    x = x + self.dropout1(attn_output)
    x = self.norm1(x)
    ff_output = self.linear2(F.relu(self.linear1(x)))
    x = x + self.dropout2(ff_output)
    x = self.norm2(x)
    return x



In [53]:
class PositionalEncoding(nn.Module):
  def __init__(self, d_model, dropout=0.1, max_len=5000):
    super(PositionalEncoding, self).__init__()
    self.dropout = nn.Dropout(p=dropout)

    pe = torch.zeros(max_len, d_model)
    position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
    pe[: ,0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    pe = pe.unsqueeze(0).transpose(0,1)
    self.register_buffer('pe', pe)

  def forward(self, x):
    x = x + self.pe[:x.size(0), :]
    return self.dropout(x)

In [50]:
class TransformerDecoder(nn.Module):
  def __init__(self, vocab_size, d_model, num_heads, ff_hidden_dim, dropout):
    super(TransformerDecoder, self).__init__()

    self.embedding = nn.Embedding(vocab_size, d_model)
    self.pos_encoder = PositionalEncoding(d_model, dropout)
    self.transformer_block = DecoderBlock(d_model, num_heads, ff_hidden_dim, dropout)
    self.linear = nn.Linear(d_model, vocab_size)
    self.softmax = nn.LogSoftmax(dim=-1)

  def forward(self, x):
    x = self.embedding(x)
    x = self.pos_encoder(x)
    tgt_mask = generate_square_subsequent_mask(x.size(0))
    x = self.transformer_block(x, tgt_mask)
    output = self.linear(x)
    output = self.softmax(output)
    return output

In [51]:
def generate_square_subsequent_mask(sz):
  """Generate a mask to prevent attention to future positions"""
  mask = (torch.triu(torch.ones(sz, sz))==1).transpose(0,1)
  mask = mask.float().masked_fill(mask ==0, float('-inf')).masked_fill(mask ==1, float(0.0))
  return mask

In [54]:
vocab_size = 1000
d_model = 512
num_heads = 1
ff_hidden_dim = 2*d_model
dropout = 0.1
num_layers = 10
context_length = 50
batch_size = 1

model = TransformerDecoder(vocab_size, d_model, num_heads, ff_hidden_dim, dropout)

In [55]:
model

TransformerDecoder(
  (embedding): Embedding(1000, 512)
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_block): DecoderBlock(
    (self_attention): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
    )
    (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (linear1): Linear(in_features=512, out_features=1024, bias=True)
    (linear2): Linear(in_features=1024, out_features=512, bias=True)
    (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (linear): Linear(in_features=512, out_features=1000, bias=True)
  (softmax): LogSoftmax(dim=-1)
)

In [56]:
# Create a tensor representing a batch of 1 sequence of length 10
input_tensor = torch.randint(0, vocab_size, (context_length, batch_size))

# Forward pass through the model

output = model.forward(input_tensor)


In [57]:
output.shape

torch.Size([50, 1, 1000])

In [58]:
predicted_indices = output.argmax(dim=-1)

print(predicted_indices.shape)

torch.Size([50, 1])


In [61]:
def count_parameters(model):
  return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 3,127,784 trainable parameters


## 3. Multi-layer Decoder

In [62]:
class MultiLayerTransformer(nn.Module):
  def __init__(self, vocab_size, d_model, num_heads, ff_hidden_dim, dropout, num_layers):
    super(MultiLayerTransformer, self).__init__()

    self.embedding = nn.Embedding(vocab_size, d_model)
    self.pos_encoder = PositionalEncoding(d_model, dropout)
    self.transformer_blocks = nn.ModuleList([
        DecoderBlock(d_model,num_heads, ff_hidden_dim, dropout)
        for _ in range(num_layers)
    ])
    self.linear = nn.Linear(d_model, vocab_size)
    self.softmax = nn.LogSoftmax(dim=-1)


  def forward(self, x):
    x = self.embedding(x)
    x = self.pos_encoder(x)
    for transformer_block in self.transformer_blocks:
      tgt_mask = generate_square_subsequent_mask(x.size(0))
      x = transformer_block(x, tgt_mask)
    output = self.linear(x)
    output = self.softmax(output)
    return output





In [63]:
vocab_size = 1000
d_model = 2048
num_heads = 1
ff_hidden_dim = 4*d_model
dropout = 0.1
num_layers = 10
context_length = 100
batch_size = 1

input_tensor = torch.randint(0, vocab_size, (context_length, batch_size))

model = MultiLayerTransformer(vocab_size, d_model, num_heads, ff_hidden_dim, dropout, num_layers)

In [64]:
def count_parameters(model):
  return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 507,679,720 trainable parameters


In [65]:
output = model(input_tensor)

distribution = torch.exp(output[0, 0, :])

distribution = distribution.detach().numpy()

In [66]:
model

MultiLayerTransformer(
  (embedding): Embedding(1000, 2048)
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): ModuleList(
    (0-9): 10 x DecoderBlock(
      (self_attention): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=2048, out_features=2048, bias=True)
      )
      (norm1): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (linear1): Linear(in_features=2048, out_features=8192, bias=True)
      (linear2): Linear(in_features=8192, out_features=2048, bias=True)
      (norm2): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
      (dropout2): Dropout(p=0.1, inplace=False)
    )
  )
  (linear): Linear(in_features=2048, out_features=1000, bias=True)
  (softmax): LogSoftmax(dim=-1)
)

# Real Voocabulary Model

In [67]:
d_model = 100
num_heads = 1
ff_hidden_dim = 4*d_model
dropout = 0.1
num_layers = 4
context_length = 5
batch_size = 1

vocab = [
    "natural",
    "language",
    "processing",
    "machine",
    "learning",
    "algorithm",
    "data",
    "analysis",
    "text",
    "corpus",
    "tokenization",
    "sentiment",
    "classification",
    "entity",
    "recognition",
    "lemmatization",
    "stemming",
    "word2vec",
    "embedding",
    "part-of-speech",
    "syntax",
    "semantic",
    "context",
    "vector",
    "feature",
    "stopword",
    "n-gram",
    "bigram",
    "trigram",
    "preprocessing",
    "post-processing",
    "token",
    "word",
    "document",
    "sentence",
    "grammar",
    "parsing",
    "model",
    "neural",
    "network",
    "RNN",
    "LSTM",
    "attention",
    "transformer",
    "BERT",
    "GPT",
    "evaluation",
    "metrics",
    "accuracy",
]

vocab_size = len(vocab)

In [77]:
word2id = {word:id for id,word in enumerate(vocab)}

id2word = {id:word for id, word in enumerate(vocab)}


In [79]:
model = MultiLayerTransformer(vocab_size, d_model, num_heads, ff_hidden_dim, dropout, num_layers)

In [80]:
sequence = ['evaluation', 'metrics', 'accuracy', 'analysis', 'processing'][:context_length]

input_tensor = torch.tensor([[word2id[word] for word in sequence]])

In [81]:
import time

generated_words = []
for i in range(10):
  output = model(input_tensor)
  predicted_index = output.argmax(dim=-1)[0, -1] # Take the last word in sequence
  predicted_word = id2word[predicted_index.item()]
  print(predicted_word, end=' ')
  generated_words.append(predicted_word)
  input_tensor = torch.cat([input_tensor, predicted_index.unsqueeze(0).unsqueeze(0)], dim=-1)
  time.sleep(0.5)

data preprocessing evaluation machine evaluation algorithm sentiment word2vec evaluation language 

## Using a trained decoder and real-world vocabulary

In [83]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m18.3 MB/s[0m eta [36m0:00:0

In [84]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer