In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm.notebook import tqdm
from transformers import GPT2Tokenizer

In [3]:
# Load the pretrained GPT-2 tokenizer from the "gpt2" checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Reuse the end-of-text token as the padding token, so padded positions carry
# the same id as the end-of-text marker.
tokenizer.pad_token = tokenizer.eos_token

In [4]:
# Toy corpus: questions[i] is paired with answers[i].
questions = [
    "What is AI?",
    "How does deep learning work?",
    "What is NLP?",
]
answers = [
    "AI stands for Artificial Intelligence.",
    "Deep learning uses neural networks.",
    "NLP stands for Natural Language Processing.",
]

In [5]:
# Render one question/answer pair as a single string and terminate it with the
# tokenizer's end-of-text marker.
# NOTE(review): this template has a space before the newline (' \nanswer: ');
# the TextDataset template further down uses '\nanswer: ' without it.
idx = 0
sample = ''.join([
    'question: ',
    questions[idx],
    ' \nanswer: ',
    answers[idx],
    tokenizer.eos_token,
])

sample


'question: What is AI? \nanswer: AI stands for Artificial Intelligence.<|endoftext|>'

# We have to choose a fixed context length for the language model

In [6]:
# Context length (in tokens) used by the exploration cells that follow.
sequence_length = 50

In [7]:
# Number of token ids the sample encodes to when no padding is requested.
len(tokenizer(sample).input_ids)

17

In [8]:
# The token ids themselves, without padding.
tokenizer(sample).input_ids

[25652,
 25,
 1867,
 318,
 9552,
 30,
 220,
 198,
 41484,
 25,
 9552,
 6296,
 329,
 35941,
 9345,
 13,
 50256]

In [9]:
# Pad the ids up to sequence_length. No truncation is requested here, so this
# only lengthens short samples — presumably a longer sample would keep its full
# length; verify against the tokenizer docs.
len(tokenizer(sample, max_length=sequence_length, padding='max_length' ).input_ids)

50

In [10]:
# Same call, showing the ids: padded positions carry the end-of-text id because
# pad_token was set to eos_token.
tokenizer(sample, max_length=sequence_length, padding='max_length' ).input_ids

[25652,
 25,
 1867,
 318,
 9552,
 30,
 220,
 198,
 41484,
 25,
 9552,
 6296,
 329,
 35941,
 9345,
 13,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256]

In [11]:
# Full tokenizer output: input_ids plus an attention_mask that is 1 on real
# tokens and 0 on padding.
tokenizer(sample, max_length=sequence_length, padding='max_length' )

{'input_ids': [25652, 25, 1867, 318, 9552, 30, 220, 198, 41484, 25, 9552, 6296, 329, 35941, 9345, 13, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [12]:
# Encode to sequence_length + 1 positions: slicing off the last token (inputs)
# or the first token (targets) then leaves sequence_length positions each.
tokenizer_output = tokenizer(
    sample,
    max_length=sequence_length + 1,
    padding='max_length',
    return_tensors='pt',
)

tokenized_text = tokenizer_output.input_ids
token_mask = tokenizer_output.attention_mask



In [13]:
# Model inputs: every position of the encoded sample except the last one.
input_tensor = tokenized_text[:,:-1]
print(input_tensor)

tensor([[25652,    25,  1867,   318,  9552,    30,   220,   198, 41484,    25,
          9552,  6296,   329, 35941,  9345,    13, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256]])


In [19]:
# Targets are the token ids shifted left by one position. Wherever the
# attention mask is not 1 (padding) the target is replaced by -100, which is
# the default ignore_index of nn.CrossEntropyLoss. Without the masked_fill the
# targets would simply be tokenized_text[:,1:].

target_tensor = tokenized_text[:,1:].masked_fill(token_mask[:,1:].ne(1),-100)
print(target_tensor)

tensor([[   25,  1867,   318,  9552,    30,   220,   198, 41484,    25,  9552,
          6296,   329, 35941,  9345,    13, 50256,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100]])


In [15]:
class TransformerLanguageModel(nn.Module):
    """Single-layer Transformer that emits vocabulary logits for every position.

    Token embeddings are summed with a learned positional parameter, passed
    through one ``nn.TransformerEncoderLayer`` and projected to ``vocab_size``
    logits by a linear layer.

    Args:
        vocab_size: Number of embedding rows and size of the logit dimension.
        embed_size: Embedding width (``d_model`` of the encoder layer).
        num_heads: Number of attention heads in the encoder layer.
        hidden_dim: Width of the encoder layer's feed-forward sub-layer.
        seq_length: Maximum number of positions the model can embed.
    """

    def __init__(self, vocab_size, embed_size, num_heads, hidden_dim, seq_length):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Learned positional table, initialised to zeros; one row per position.
        self.positional_encoding = nn.Parameter(torch.zeros(1, seq_length, embed_size))

        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_size,
            nhead=num_heads,
            dim_feedforward=hidden_dim,
            batch_first=True
        )
        self.fc = nn.Linear(embed_size, vocab_size)

    def forward(self, x, mask=None):
        """Compute per-position vocabulary logits.

        Args:
            x: Integer token ids indexed as (batch, sequence).
            mask: Optional attention mask forwarded to the encoder layer as
                ``src_mask``. When it is None no mask is applied, so callers
                doing next-token prediction must pass a causal mask themselves.

        Returns:
            Tensor of logits indexed as (batch, sequence, vocab_size).

        Raises:
            ValueError: If the sequence is longer than the positional table.
        """
        seq_length = x.size(1)
        max_length = self.positional_encoding.size(1)
        # Fail early with a clear message; otherwise the addition below raises
        # an opaque broadcasting error (or silently broadcasts when the table
        # has a single row).
        if seq_length > max_length:
            raise ValueError(
                f"input sequence length {seq_length} exceeds the model's "
                f"maximum of {max_length} positions"
            )
        x = self.embedding(x) + self.positional_encoding[:, :seq_length, :]
        x = self.encoder_layer(x, src_mask=mask)
        return self.fc(x)


# Hyperparameters for the exploration model. Only embed_size, num_heads and
# hidden_dim are consumed in this cell; num_epochs, batch_size and
# learning_rate are not used here.
embed_size = 64
num_heads = 4
hidden_dim = 128
num_epochs = 1000
batch_size = 4
learning_rate = 0.01

# Instantiate the model with one positional slot per token of the exploration
# context length (sequence_length).
model = TransformerLanguageModel(
    vocab_size=tokenizer.vocab_size,
    embed_size=embed_size,
    num_heads=num_heads,
    hidden_dim=hidden_dim,
    seq_length=sequence_length,
)

In [20]:
# Forward pass on the single example. No attention mask is passed, so this call
# does not restrict attention to earlier positions.
output = model(input_tensor)
print(output.shape)
# Highest-scoring token id at every position.
print(output.argmax(-1))

torch.Size([1, 50, 50257])
tensor([[ 8065, 24110, 29770, 46598, 42345,  1936,  3874,  4587,  8924, 24110,
         42345,  8220, 29580, 24270, 19134, 30019, 11160, 33144, 38888, 11160,
         11160, 18239, 33144, 33144, 33144, 11160, 11160, 38888, 11160, 28064,
         33144, 38888, 38888, 33144, 33144, 11160, 38888, 38888, 11160, 33144,
         38888, 33144, 33144, 33144, 18239, 38888, 38888, 38888, 33144, 28064]])


In [21]:
# reduction='sum' adds the per-position losses together instead of averaging them.
criterion = nn.CrossEntropyLoss(reduction='sum')

In [22]:
# Flatten the logits to (positions, vocab_size) and the targets to (positions,)
# before handing them to the loss.
loss = criterion(output.view(-1, tokenizer.vocab_size),target_tensor.view(-1 ))
print(loss)

tensor(178.4055, grad_fn=<NllLossBackward0>)


In [23]:


# Step 1: Prepare the Dataset
class TextDataset(Dataset):
    """Question/answer pairs rendered as fixed-length next-token examples.

    Each item is the string ``'question: <q>\\nanswer: <a><eos>'`` encoded to
    ``seq_length + 1`` token ids and split into an input sequence (all but the
    last id) and a target sequence (all but the first id).

    Args:
        questions: Sequence of question strings.
        answers: Sequence of answer strings; ``answers[i]`` pairs with
            ``questions[i]``.
        seq_length: Number of positions in every returned input/target tensor.
        tokenizer: Callable tokenizer exposing ``eos_token`` and ``vocab_size``
            and returning ``input_ids`` / ``attention_mask`` tensors.
    """

    def __init__(self, questions, answers, seq_length, tokenizer):
        self.seq_length = seq_length
        self.tokenizer = tokenizer
        self.questions = questions
        self.answers = answers
        self.vocab_size = tokenizer.vocab_size

    def __len__(self):
        """Return the number of question/answer pairs."""
        return len(self.questions)

    def __getitem__(self, idx):
        """Return ``(input_ids, target_ids)`` for pair ``idx``.

        Both tensors are one-dimensional with ``seq_length`` entries. Target
        positions whose attention mask is not 1 (padding) are set to -100.
        """
        question = self.questions[idx]
        answer = self.answers[idx]

        # Use the tokenizer handed to the constructor, not a module-level one.
        sample = 'question: ' + question + '\nanswer: ' + answer + self.tokenizer.eos_token
        encoded = self.tokenizer(
            sample,
            max_length=self.seq_length + 1,
            padding='max_length',
            # Without truncation an over-long sample would yield a longer
            # tensor than the others and could not be batched.
            truncation=True,
            return_tensors='pt',
        )
        token_ids = encoded.input_ids
        token_mask = encoded.attention_mask

        # Shift by one: position t of the input predicts token t + 1.
        target = token_ids[:, 1:].masked_fill(token_mask[:, 1:].ne(1), -100)
        input_sequence = token_ids[:, :-1]
        # Drop only the leading batch axis added by return_tensors='pt'.
        return input_sequence.squeeze(0), target.squeeze(0)

# Step 2: Define the Model
class TransformerLanguageModel(nn.Module):
    """Single-layer Transformer that emits vocabulary logits for every position.

    Token embeddings are summed with a learned positional parameter, passed
    through one ``nn.TransformerEncoderLayer`` and projected to ``vocab_size``
    logits by a linear layer.

    Args:
        vocab_size: Number of embedding rows and size of the logit dimension.
        embed_size: Embedding width (``d_model`` of the encoder layer).
        num_heads: Number of attention heads in the encoder layer.
        hidden_dim: Width of the encoder layer's feed-forward sub-layer.
        seq_length: Maximum number of positions the model can embed.
    """

    def __init__(self, vocab_size, embed_size, num_heads, hidden_dim, seq_length):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Learned positional table, initialised to zeros; one row per position.
        self.positional_encoding = nn.Parameter(torch.zeros(1, seq_length, embed_size))

        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_size,
            nhead=num_heads,
            dim_feedforward=hidden_dim,
            batch_first=True
        )
        self.fc = nn.Linear(embed_size, vocab_size)

    def forward(self, x, mask=None):
        """Compute per-position vocabulary logits.

        Args:
            x: Integer token ids indexed as (batch, sequence).
            mask: Optional attention mask forwarded to the encoder layer as
                ``src_mask``. When it is None no mask is applied, so callers
                doing next-token prediction must pass a causal mask themselves.

        Returns:
            Tensor of logits indexed as (batch, sequence, vocab_size).

        Raises:
            ValueError: If the sequence is longer than the positional table.
        """
        seq_length = x.size(1)
        max_length = self.positional_encoding.size(1)
        # Fail early with a clear message; otherwise the addition below raises
        # an opaque broadcasting error (or silently broadcasts when the table
        # has a single row).
        if seq_length > max_length:
            raise ValueError(
                f"input sequence length {seq_length} exceeds the model's "
                f"maximum of {max_length} positions"
            )
        x = self.embedding(x) + self.positional_encoding[:, :seq_length, :]
        x = self.encoder_layer(x, src_mask=mask)
        return self.fc(x)

# Step 3: Training

# Hyperparameters
questions = [
    "What is AI?",
    "How does deep learning work?",
    "What is NLP?",
]
answers = [
    "AI stands for Artificial Intelligence.",
    "Deep learning uses neural networks.",
    "NLP stands for Natural Language Processing.",
]
seq_length = 30
embed_size = 64
num_heads = 4
hidden_dim = 128
num_epochs = 1000
batch_size = 4
learning_rate = 0.01

# Train on the GPU when one is available, otherwise fall back to the CPU so the
# cell also runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed torch so weight initialisation and batch shuffling are repeatable.
torch.manual_seed(0)

# Initialize tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Prepare data
dataset = TextDataset(questions, answers, seq_length, tokenizer)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Model, loss, optimizer
model = TransformerLanguageModel(
    vocab_size=dataset.vocab_size,
    embed_size=embed_size,
    num_heads=num_heads,
    hidden_dim=hidden_dim,
    seq_length=seq_length
).to(device)
# The default ignore_index (-100) skips the padded target positions.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Causal mask: -inf strictly above the diagonal, 0 elsewhere, so a position
# cannot attend to later ones. Every batch is padded to seq_length, so the mask
# is built once instead of once per batch.
mask = torch.triu(
    torch.full((seq_length, seq_length), float('-inf'), device=device),
    diagonal=1,
)

# Training loop
model.train()
for epoch in range(num_epochs):
    running_loss = []
    for inputs, targets in dataloader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        output = model(inputs, mask)
        # Flatten to (positions, vocab) vs. (positions,) for the loss.
        loss = criterion(output.view(-1, dataset.vocab_size), targets.view(-1))
        loss.backward()
        optimizer.step()
        running_loss.append(loss.item())

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {np.mean(running_loss[-10:]):.4f}")




Epoch 1/1000, Loss: 10.8633
Epoch 2/1000, Loss: 9.4694
Epoch 3/1000, Loss: 8.2564
Epoch 4/1000, Loss: 7.1153
Epoch 5/1000, Loss: 5.9836
Epoch 6/1000, Loss: 4.8883
Epoch 7/1000, Loss: 3.8125
Epoch 8/1000, Loss: 2.7811
Epoch 9/1000, Loss: 1.8687
Epoch 10/1000, Loss: 1.1844
Epoch 11/1000, Loss: 0.7982
Epoch 12/1000, Loss: 0.5392
Epoch 13/1000, Loss: 0.3807
Epoch 14/1000, Loss: 0.3061
Epoch 15/1000, Loss: 0.2503
Epoch 16/1000, Loss: 0.2216
Epoch 17/1000, Loss: 0.1892
Epoch 18/1000, Loss: 0.1776
Epoch 19/1000, Loss: 0.1628
Epoch 20/1000, Loss: 0.1635
Epoch 21/1000, Loss: 0.1449
Epoch 22/1000, Loss: 0.1215
Epoch 23/1000, Loss: 0.1047
Epoch 24/1000, Loss: 0.1079
Epoch 25/1000, Loss: 0.1269
Epoch 26/1000, Loss: 0.1039
Epoch 27/1000, Loss: 0.0973
Epoch 28/1000, Loss: 0.1068
Epoch 29/1000, Loss: 0.1100
Epoch 30/1000, Loss: 0.1085
Epoch 31/1000, Loss: 0.0833
Epoch 32/1000, Loss: 0.0951
Epoch 33/1000, Loss: 0.0829
Epoch 34/1000, Loss: 0.0759
Epoch 35/1000, Loss: 0.0776
Epoch 36/1000, Loss: 0.0761


In [40]:
model.eval()

# The prompt stops right after 'answer:' so the model has to produce the answer.
sample = 'question: '+ questions[2] + '\nanswer:'
input_sequence = tokenizer(sample,return_tensors='pt').input_ids

# Run on whichever device the model's parameters live on.
device = next(model.parameters()).device
# Longest window the model's positional table can embed.
max_context = model.positional_encoding.size(1)
max_new_tokens = 20

generated_text = input_sequence.clone().to(device)

# Greedy decoding: append the highest-scoring next token until end-of-text.
for _ in range(max_new_tokens):
    window = generated_text[:, -max_context:]
    steps = window.size(1)
    # Same causal mask as in training (-inf strictly above the diagonal), so
    # inference sees the attention pattern the model was trained with.
    causal_mask = torch.triu(
        torch.full((steps, steps), float('-inf'), device=device),
        diagonal=1,
    )
    with torch.no_grad():
        output = model(window, causal_mask)
    next_token = output.argmax(-1)[:, -1:]
    generated_text = torch.cat((generated_text, next_token), dim=1)
    if next_token.item() == tokenizer.eos_token_id:
        print('*** reached end of the sentence ***')
        break
print(tokenizer.batch_decode(generated_text)[0])

*** reached end of the sentence ***
question: What is NLP?
answer: NLP stands for Natural Language Processing.<|endoftext|>


In [32]:
# Decode the highest-scoring token at the final position of the current `output`.
tokenizer.decode(output.argmax(-1)[:,-1])

' AI'

In [8]:
# Decode input_sequence back to text to check what prompt was encoded.
tokenizer.batch_decode(input_sequence)

['question: What is NLP?\n answer: ']