# Reading in a short story as text sample into Python.
# Step 1: Creating Tokens

In [None]:
import torch
from IPython.core.pylabtools import figsize
from matplotlib.lines import lineStyles

with open("the-verdict.txt", "r", encoding="utf-8") as file:
    raw_text = file.read()
print("Total number of character:", len(raw_text))

In [None]:
import re

result = re.split(r'(\s|--|[^a-zA-Z-])', raw_text)
result = [item for item in result if item.strip()]
print(result[:30])
print(len(result))

# Step 2:Creating Token IDs
each unique taken is mapped to a unique integer called takenID

In [None]:
all_words = sorted(set(result))
vocab_size = len(all_words)
print(vocab_size)

vocab = {token: integer for integer, token in enumerate(all_words)}

In [None]:
class SimpleTokenizerV1(object):
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_vocab = {i: s for s, i in vocab.items()}

    def encode(self, text):
        preprocessed = re.split(r'(\s|--|[^a-zA-Z-])', text)
        preprocessed = [item for item in preprocessed if item.strip()]
        ids = [self.str_to_int[item] for item in preprocessed]
        return ids

    def decode(self, ids):
        text = " ".join([self.int_to_vocab[i] for i in ids])
        # Replace spaces before the specified punctuations
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [None]:
tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know,"
          Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

print(tokenizer.decode(ids))

# ADDING SPECIAL CONTEXT TOKENS

In [None]:
all_tokens = sorted(list(set(result)))
all_tokens.extend(['<|unk|>', '<|endoftext|>'])

vocab = {token: integer for integer, token in enumerate(all_tokens)}
len(vocab)
print(list(vocab.items())[-5:])

In [None]:
class SimpleTokenizerV2(object):
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_vocab = {i: s for s, i in vocab.items()}

    def encode(self, text):
        preprocessed = re.split(r'(\s|--|[^a-zA-Z-])', text)
        preprocessed = [item for item in preprocessed if item.strip()]
        preprocessed = [item if item in self.str_to_int else "<|unk|>" for item in preprocessed]
        ids = [self.str_to_int[item] for item in preprocessed]
        return ids

    def decode(self, ids):
        text = " ".join([self.int_to_vocab[i] for i in ids])
        # Replace spaces before the specified punctuations
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

the tokenizer used for GPT models also doesn't use an <|unk|> token for out-of-vocabulary words.
Instead, GPT use type pair encoding

# BYTE PAIR ENCODING (BPE)
BPE is a subword tokenization algorithm

In [None]:
import tiktoken
import importlib

print("tiktoken version:", importlib.metadata.version("tiktoken"))

In [None]:
tokenizer = tiktoken.get_encoding("gpt2")

text = ("Hello, do you like tea? <|endoftext|> In the sunlit terraces"
        "of someunknownPlace.")

integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(integers)

In [None]:
strings = tokenizer.decode(integers)

print(strings)

In [None]:
integers = tokenizer.encode("f i n e")
print(integers)

# CREATING INPUT-TARGET PAIRS

In [None]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

In [None]:
enc_sample = enc_text[50:]

In [None]:
context_size = 4

x = enc_sample[:context_size]
y = enc_sample[1:context_size + 1]

for i in range(1, context_size + 1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(context, "----->", desired)

# IMPLEMENTING A DATA LOADER

In [31]:
from tiktoken import Encoding
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer: Encoding, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i: i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

In [30]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True, num_workers=0):
    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

In [None]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

import torch

print("PyTorch version:", torch.__version__)

dataloader = create_dataloader_v1(raw_text, batch_size=1, max_length=4, stride=1, shuffle=False)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)


# IMPORT TRAINED MODEL

In [None]:
import gensim.downloader as gensim_api

model = gensim_api.load("word2vec-google-news-300")  # download the model and return as an object ready to use

In [None]:
print(model['computer'])
word_vectors = model
# Example of using most_similar
print(word_vectors.most_similar(positive=["king", "woman"], negative=["man"], topn=10))

# Example of calculating similarity
print(word_vectors.similarity('woman', 'man'))
print(word_vectors.similarity('king', 'queen'))
print(word_vectors.similarity('boy', 'girl'))
print(word_vectors.similarity('uncle', 'aunt'))
print(word_vectors.similarity('nephew', 'niece'))
print(word_vectors.similarity('paper', 'water'))

In [None]:
input_ids = torch.tensor([2, 3, 5, 1])

vocab_size = 6
output_dim = 3

torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

print(embedding_layer.weight)


In [None]:
print(embedding_layer(torch.tensor([3])))

In [None]:
print(embedding_layer(input_ids))

print(torch.nn.Embedding(4, 5).weight)

# POSITIONAL EMBEDDING

In [None]:
vocab_size = 50257
output_dim = 256

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [None]:
max_length = 4
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=max_length, stride=max_length, shuffle=False)

data_iter = iter(dataloader)
inputs, target = next(data_iter)
print("Token IDs:\n", inputs)
print("shape:\n", inputs.shape)


In [None]:
token_embeddings: torch.Tensor = token_embedding_layer(inputs)
print(token_embeddings.shape)

In [None]:
context_length = max_length
context_size = 4
pos_embedding_layer = torch.nn.Embedding(context_size, output_dim)

pos_embeddings: torch.Tensor = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings)

In [None]:
input_embedding = token_embeddings + pos_embeddings
print(input_embedding.shape)

# IMPLEMENTING A SIMPLIFIED ATTENTION MECHANISM

In [None]:
import torch

inputs = torch.tensor([
    [0.43, 0.15, 0.89],  # Your
    [0.55, 0.87, 0.66],  # journey
    [0.57, 0.85, 0.64],  # starts
    [0.22, 0.58, 0.33],  # with
    [0.77, 0.25, 0.10],  # one
    [0.05, 0.80, 0.55]  # step
])

In [None]:
query = inputs[1]
attn_scores_2 = torch.empty(inputs.shape[0])
for i, x_i in enumerate(inputs):
    attn_scores_2[i] = torch.dot(x_i, query)
print(attn_scores_2)

attn_weights_2_tmp = attn_scores_2 / attn_scores_2.sum()
print(attn_weights_2_tmp)

In [None]:
%%sql


In [None]:
def softmax_naive(x):
    return torch.exp(x) / torch.exp(x).sum(dim=0)


# this naive softmax implementation may encounter numerical instability issues, such as overflow (for very large values) and underflow (for very small values)
attn_weights_2_naive = softmax_naive(attn_scores_2)
print(attn_weights_2_naive)

In [None]:
attn_weights_2 = torch.softmax(attn_scores_2, dim=0)
print(attn_weights_2)
print(attn_weights_2.sum())

In [None]:
query = inputs[1]

context_vec_2 = torch.zeros(query.shape)
for i, i_x in enumerate(inputs):
    context_vec_2 += attn_weights_2[i] * i_x

print(context_vec_2)


In [None]:
attn_scores = torch.zeros(inputs.shape[0], inputs.shape[0])
print(attn_scores)

for i, i_x in enumerate(inputs):
    for j, j_x in enumerate(inputs):
        attn_scores[i, j] = torch.dot(i_x, j_x)
print(attn_scores)


In [None]:
attn_scores = inputs @ inputs.T
print(attn_scores)
attn_weights = torch.softmax(attn_scores, dim=-1)
print(attn_weights)

In [None]:
all_context_vecs = attn_weights @ inputs
print(all_context_vecs)

# IMPLEMENTING SELF ATTENTION WITH TRAINABLE WEIGHTS

In [None]:
import torch

inputs = torch.tensor([
    [0.43, 0.15, 0.89],  # Your
    [0.55, 0.87, 0.66],  # journey
    [0.57, 0.85, 0.64],  # starts
    [0.22, 0.58, 0.33],  # with
    [0.77, 0.25, 0.10],  # one
    [0.05, 0.80, 0.55]  # step
])

In [None]:
torch.manual_seed(123)
d_in = inputs.shape[1]
d_out = 2
W_query = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_key = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_value = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)

In [None]:
x_2 = inputs[1]
query_2 = x_2 @ W_query
key_2 = x_2 @ W_key
value_2 = x_2 @ W_value
print(query_2)

In [None]:
queries = inputs @ W_query
keys = inputs @ W_key
values = inputs @ W_value

print(queries)
print(keys)
print(values)

In [None]:
query_2 = queries[1]
attn_scores_2 = query_2 @ keys.T
print(attn_scores_2)

In [None]:
attn_scores = queries @ keys.T
print(attn_scores)

In [None]:
d_k = keys.shape[-1]

attn_weights = torch.softmax(attn_scores / d_k ** 0.5, dim=-1)
print(attn_weights)

# WHY DIVIDE BY SQRT (DIMENSION)

In [None]:
import torch

# reason 1: For stability in learning

tensor = torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])

softmax_result = torch.softmax(tensor, dim=-1)
print(softmax_result)

# BUT WHY SQRT?

In [None]:
# Reason 2: To make the variance of the dot product stable close to 1

import numpy as np


# Function to compute variance before and after scaling
def compute_variance(dim, num_trails=1000):
    dot_products = []
    scaled_dot_products = []

    # Generate multiple random vectors and compute dot products
    for _ in range(num_trails):
        q = np.random.randn(dim)
        k = np.random.randn(dim)
        print(q, k)
        # Compute dot product
        dot_product = np.dot(q, k)
        dot_products.append(dot_product)

        # Scale the dot product by sqrt(dim)
        scale_dot_product = dot_product / np.sqrt(dim)
        scaled_dot_products.append(scale_dot_product)

    variance_before_scaling = np.var(dot_products)
    variance_after_scaling = np.var(scaled_dot_products)
    return variance_before_scaling, variance_after_scaling


variance_before_scaling_5, variance_after_scaling_5 = compute_variance(5)
variance_before_scaling_100, variance_after_scaling_100 = compute_variance(100)


# COMPUTE CONTEXT VALUE

In [None]:
context_vec_2 = attn_weights_2 @ values
print(context_vec_2)

In [None]:
context_vec = attn_weights @ values
print(context_vec)

In [None]:
# IMPLEMENTING A COMPACT SELF ATTENTION PYTHON CLASS

In [None]:
import torch.nn as nn


class SelfAttention_v1(nn.Module):

    def __init__(self, d_in, d_out):
        super().__init__()
        self.W_query = nn.Parameter(torch.rand(d_in, d_out))
        self.W_key = nn.Parameter(torch.rand(d_in, d_out))
        self.W_value = nn.Parameter(torch.rand(d_in, d_out))

    def forward(self, x):
        keys = x @ self.W_key
        queries = x @ self.W_query
        values = x @ self.W_value
        attn_scores = queries @ keys.T
        attn_weights = torch.softmax(attn_scores / keys.shape[-1] ** 0.5, dim=-1)
        context_vec = attn_weights @ values
        return context_vec

In [None]:
torch.manual_seed(123)
sa_v1 = SelfAttention_v1(d_in, d_out)
print(sa_v1(inputs))

In [None]:
import torch.nn as nn


class SelfAttention_v2(nn.Module):
    def __init__(self, d_in, d_out, qkv_bias=False):
        super().__init__()
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        # nn.Linear has an optimized weight initialization scheme, contributing to more stable and effective model training.

    def forward(self, x):
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)
        attn_scores = queries @ keys.T
        attn_weights = torch.softmax(attn_scores / keys.shape[-1] ** 0.5, dim=-1)
        context_vec = attn_weights @ values
        return context_vec

In [None]:
inputs = torch.tensor([
    [0.43, 0.15, 0.89],  # Your
    [0.55, 0.87, 0.66],  # journey
    [0.57, 0.85, 0.64],  # starts
    [0.22, 0.58, 0.33],  # with
    [0.77, 0.25, 0.10],  # one
    [0.05, 0.80, 0.55]  # step
])
torch.manual_seed(789)
sa_v2 = SelfAttention_v2(d_in, d_out)
print(sa_v2(inputs))

# HIDING FUTURE WORDS WITH CAUSAL ATTENTION

In [None]:
torch.manual_seed(789)
queries = sa_v2.W_query(inputs)
keys = sa_v2.W_key(inputs)
attn_scores: torch.Tensor = queries @ keys.T
attn_weights = torch.softmax(attn_scores / inputs.shape[-1] ** 0.5, dim=-1)
print(attn_weights)


In [None]:
context_length = inputs.shape[0]
mask_simple = torch.tril(torch.ones(context_length, context_length))
print(mask_simple)

In [None]:
masked_simple = attn_weights * mask_simple
print(masked_simple)

In [None]:
row_sums = masked_simple.sum(dim=1, keepdim=True)
masked_simple_norm = masked_simple / row_sums
print(masked_simple_norm)

In [None]:
mask = torch.triu(torch.ones(context_length, context_length), diagonal=1)
masked = attn_scores.masked_fill(mask.bool(), -torch.inf)
attn_weights = torch.softmax(masked / inputs.shape[-1] ** 0.5, dim=1)
print(attn_weights)

# MASKING ADDITIONAL ATTENTION WEIGHTS WITH DROPOUT

In [None]:
# Dropout is a deep learning technique where randomly selected hidden layer unit s are ignored during training
# This prevents overfitting and improves generalization performance
# applied in 2 specific areas  one is after calculating attention scores and the other is after applying attention weights to value vectors
torch.manual_seed(123)
dropout = torch.nn.Dropout(0.5)
example = torch.ones(6, 6)
print(example)
print(dropout(example))

In [None]:
print(attn_weights)
print(dropout(attn_weights))

# IMPLEMENTING A COMPACT CAUSAL ATTENTION CLASS

In [None]:
# self test not the lecture content
class MyAttention_V3(nn.Module):
    def __init__(self, d_in, d_out):
        super().__init__()
        self.W_query = nn.Linear(d_in, d_out, bias=False)
        self.W_key = nn.Linear(d_in, d_out, bias=False)
        self.W_value = nn.Linear(d_in, d_out, bias=False)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        queries = self.W_query(x)
        keys = self.W_key(x)
        values = self.W_value(x)
        attn_scores = queries @ keys.T
        context_length = queries.shape[0]
        mask = torch.triu(torch.ones(context_length, context_length), diagonal=1)
        attn_scores_masked = attn_scores.masked_fill(mask.bool(), -torch.inf)
        attn_weights = torch.softmax(attn_scores_masked / queries.shape[-1] ** 0.5, dim=1)
        attn_weights = self.dropout(attn_weights)
        context_vec = attn_weights @ values
        return context_vec


inputs = torch.tensor([
    [0.43, 0.15, 0.89],  # Your
    [0.55, 0.87, 0.66],  # journey
    [0.57, 0.85, 0.64],  # starts
    [0.22, 0.58, 0.33],  # with
    [0.77, 0.25, 0.10],  # one
    [0.05, 0.80, 0.55]  # step
])

torch.manual_seed(123)
mav = MyAttention_V3(3, 2)
print(mav(inputs))


In [None]:
batch = torch.stack((inputs, inputs), dim=0)
print(batch.shape)

In [None]:
class CausalAttention(nn.Module):

    def __init__(self, d_in, d_out, context_length, dropout, qkv_bias=False):
        super().__init__()
        self.d_out = d_out
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))

    def forward(self, x):
        b, num_tokens, d_in = x.shape
        keys: torch.Tensor = self.W_key(x)
        queries: torch.Tensor = self.W_query(x)
        values: torch.Tensor = self.W_value(x)
        attn_scores = queries @ keys.transpose(1, 2)
        attn_scores.masked_fill(self.mask.bool()[:num_tokens, :num_tokens], -torch.inf)
        attn_weights = torch.softmax(attn_scores / num_tokens ** 0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)
        context_vec = attn_weights @ values
        return context_vec


torch.manual_seed(123)
context_length = batch.shape[1]
d_in = 3
d_out = 2
ca = CausalAttention(d_in, d_out, context_length, 0.0)
context_vecs = ca(batch)
print(context_vecs)

# EXTENDING SINGLE HEAD ATTENTION TO MULTI-HEAD ATTENTION

In [None]:
class MultiHeadAttentionWrapper(nn.Module):

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        self.heads = nn.ModuleList(
            [CausalAttention(d_in, d_out, context_length, dropout, qkv_bias) for _ in range(num_heads)])

    def forward(self, x):
        return torch.cat([head(x) for head in self.heads], dim=-1)


torch.manual_seed(123)
context_length = batch.shape[1]
d_in = 3
d_out = 2
ca = MultiHeadAttentionWrapper(d_in, d_out, context_length, 0.0, 2)
context_vecs = ca(batch)
print(context_vecs)

# IMPLEMENTING MULTI-HEAD ATTENTION WITH WEIGHT SPLITS

In [None]:
class MultiHeadAttention(nn.Module):

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer("mask",
                             torch.triu(torch.ones(context_length, context_length), diagonal=1))

    def forward(self, x):
        b, num_tokens, d_in = x.shape

        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)

        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        attn_scores: torch.Tensor = queries @ keys.transpose(2, 3)

        attn_scores.masked_fill_(self.mask.bool()[:num_tokens, :num_tokens], -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1] ** 0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        context_vec: torch.Tensor = (attn_weights @ values).transpose(1, 2)

        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec)  # optional
        return context_vec


inputs = torch.tensor([
    [0.43, 0.15, 0.89],  # Your
    [0.55, 0.87, 0.66],  # journey
    [0.57, 0.85, 0.64],  # starts
    [0.22, 0.58, 0.33],  # with
    [0.77, 0.25, 0.10],  # one
    [0.05, 0.80, 0.55]  # step
])
torch.manual_seed(123)
batch = torch.stack((inputs, inputs), dim=0)
print(batch.shape)
context_length = batch.shape[1]
d_in = 3
d_out = 4
ca = MultiHeadAttention(d_in, d_out, context_length, 0.0, 2)
context_vecs = ca(batch)
print(context_vecs)

# IMPLEMENTING A GPT MODEL FROM SCRATCH TO GENERATE TEXT

In [None]:
GPT_CONFIG_124M = {
    'vocab_size': 50257,
    'context_length': 1024,
    'emb_dim': 768,
    'n_heads': 12,
    'n_layers': 12,
    'drop_rate': 0.1,
    'qkv_bias': False
}

# GPT ARCHITECTURE PART 1: DUMMY GPT MODEL CLASS

In [None]:
import torch
import torch.nn as nn


class DummyGPTModel(nn.Module):

    def __init__(self, cfg):
        super().__init__()
        self.token_emb = nn.Embedding(cfg['vocab_size'], cfg['emb_dim'])
        self.pos_emb = nn.Embedding(cfg['context_length'], cfg['emb_dim'])
        self.drop_emb = nn.Dropout(cfg['drop_rate'])

        self.trf_blocks = nn.Sequential(*[DummyTransformerBlock(cfg) for _ in range(cfg['n_layers'])])
        self.final_norm = DummyLayerNorm(cfg['emb_dim'])

        self.out_head = nn.Linear(cfg['emb_dim'], cfg['vocab_size'], bias=False)

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.token_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits


class DummyTransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        # simple placeholder

    def forward(self, x):
        return x


class DummyLayerNorm(nn.Module):
    def __init__(self, normalized_shape, eps=1e-5):
        super().__init__()
        # simple placeholder mean of zero and variance of one
        # help stability neural network training and reduce the problem of internal covariate shift

    def forward(self, x):
        return x


# STEP 1: TOKENIZATION

In [None]:
import torch
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")
text1 = "Every effort moves you"
text2 = "Every day holds a"
batch = []
batch.append(torch.tensor(tokenizer.encode(text1)))
batch.append(torch.tensor(tokenizer.encode(text2)))
batch = torch.stack(batch, dim=0)
print(batch)

# STEP 2: CREATE AN INSTANCE OF DUMMYGPTMODEL

In [None]:
torch.manual_seed(123)
model = DummyGPTModel(GPT_CONFIG_124M)
print(model(batch).shape)

# GPT ARCHITECTURE PART 2: LAYER NORMALIZATION

In [None]:
import torch

torch.manual_seed(123)
batch_example = torch.randn(2, 5)
print(batch_example)
layer = nn.Sequential(nn.Linear(5, 6), nn.ReLU())
out: torch.Tensor = layer(batch_example)
print(out)


In [None]:
mean = out.mean(dim=-1, keepdim=True)
var = out.var(dim=-1, keepdim=True)
out = (out - mean) / torch.sqrt(var)
torch.set_printoptions(sci_mode=True)
print(out.mean(dim=-1, keepdim=True))

In [None]:
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x: torch.Tensor):
        mean = x.mean(dim=-1, keepdim=True)
        # bessel's correction , the embedding dimension is too large, so the unbiased=True which calculating the var by dividing n-1 instead of n is negligible.
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

In [None]:
ln = LayerNorm(emb_dim=5)
torch.manual_seed(123)
batch_example = torch.randn(2, 5)
out_ln = ln(batch_example)
torch.set_printoptions(sci_mode=False)
print(out_ln)
print(out_ln.mean(dim=-1, keepdim=True))
print(out_ln.var(dim=-1, keepdim=True, unbiased=False))
# available hardware dictates batch size

# GPT ARCHITECTURE PART 3: FEEDFORWARD NEURAL NETWORK WITH GELU ACTIVATION

In [None]:
import torch.nn as nn


class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(torch.sqrt(torch.tensor(2.0 / torch.pi)) * (x + 0.044715 * torch.pow(x, 3))))

In [None]:
class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(nn.Linear(cfg['emb_dim'], 4 * cfg['emb_dim']), GELU(),
                                    nn.Linear(4 * cfg['emb_dim'], cfg['emb_dim']))

    def forward(self, x):
        return self.layers(x)

In [None]:
import torch

ffn = FeedForward(GPT_CONFIG_124M)
x = torch.rand(2, 3, 768)
out = ffn(x)
print(out.shape)

# GPT ARCHITECTURE PART 4: SHORTCUT CONNECTIONS

In [None]:
class ExampleDeepNeuralNetwork(nn.Module):
    def __init__(self, layer_sizes, use_shortcut):
        super().__init__()
        self.use_shortcut = use_shortcut
        self.layers = nn.ModuleList([
            nn.Sequential(nn.Linear(layer_sizes[0], layer_sizes[1]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[1], layer_sizes[2]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[2], layer_sizes[3]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[3], layer_sizes[4]), GELU()),
            nn.Sequential(nn.Linear(layer_sizes[4], layer_sizes[5]), GELU()),
        ])

    def forward(self, x):
        for layer in self.layers:
            layer_output = layer(x)
            if self.use_shortcut and x.shape == layer_output.shape:
                x = x + layer_output
            else:
                x = layer_output
        return x

In [None]:
layer_sizes = [3, 3, 3, 3, 3, 1]
sample_input = torch.tensor([[1., 0., -1.]])
print(sample_input)
torch.manual_seed(123)
model_without_shortcut = ExampleDeepNeuralNetwork(layer_sizes, False)

In [None]:
def print_gradients(model, x):
    output = model(x)
    target = torch.tensor([[0.]])
    loss = nn.MSELoss()
    loss = loss(output, target)
    loss.backward()

    for name, param in model.named_parameters():
        if 'weight' in name:
            print(f"{name} has gradient mean of {param.grad.abs().mean().item()}")

In [None]:
print_gradients(model_without_shortcut, sample_input)

In [None]:
torch.manual_seed(123)
model_with_shortcut = ExampleDeepNeuralNetwork(layer_sizes, True)
print_gradients(model_with_shortcut, sample_input)

# GPT ARCHITECTURE PART 5: CODING ATTENTION AND LINEAR LAYERS IN A TRANSFORMER BLOCK

In [None]:
GPT_CONFIG_124M = {
    'vocab_size': 50257,
    'context_length': 1024,
    'emb_dim': 768,
    'n_heads': 12,
    'n_layers': 12,
    'drop_rate': 0.1,
    'qkv_bias': False
}

In [None]:
import torch
import torch.nn as nn


class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.scale = nn.Parameter(torch.ones(emb_dim))  # trainable
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True)
        return self.scale * ((x - mean) / torch.sqrt(var)) + self.shift


class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(torch.sqrt(torch.tensor(2.0 / torch.pi)) * (x + 0.044715 * torch.pow(x, 3))))


class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg['emb_dim'], 4 * cfg['emb_dim']),
            GELU(),
            nn.Linear(4 * cfg['emb_dim'], cfg['emb_dim'])
        )

    def forward(self, x):
        return self.layers(x)

In [None]:
class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.att = MultiHeadAttention(
            d_in=cfg['emb_dim'],
            d_out=cfg['emb_dim'],
            context_length=cfg['context_length'],
            num_heads=cfg['n_heads'],
            dropout=cfg['drop_rate'],
            qkv_bias=cfg['qkv_bias']
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg['emb_dim'])
        self.norm2 = LayerNorm(cfg['emb_dim'])
        self.drop_shortcut = nn.Dropout(cfg['drop_rate'])

    def forward(self, x):
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)
        x = self.drop_shortcut(x)
        x += shortcut

        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_shortcut(x)
        x += shortcut
        return x

In [None]:
torch.manual_seed(123)
x = torch.rand(2, 4, 768)
block = TransformerBlock(GPT_CONFIG_124M)
output = block(x)
print(output.shape)

# GPT ARCHITECTURE PART 6: ENTIRE GPT MODEL ARCHITECTURE IMPLEMENTATION

In [None]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 1024,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False
}

In [None]:
import torch
import torch.nn as nn


class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.token_emb = nn.Embedding(cfg['vocab_size'], cfg['emb_dim'])
        self.pos_emb = nn.Embedding(cfg['context_length'], cfg['emb_dim'])
        self.dropout = nn.Dropout(cfg['drop_rate'])

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg['n_layers'])]
        )

        self.final_norm = LayerNorm(cfg['emb_dim'])
        self.out_head = nn.Linear(cfg['emb_dim'], cfg['vocab_size'], bias=False)

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.token_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds
        x = self.dropout(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits


class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.att = MultiHeadAttention(
            d_in=cfg['emb_dim'],
            d_out=cfg['emb_dim'],
            context_length=cfg['context_length'],
            dropout=cfg['drop_rate'],
            num_heads=cfg['n_heads'],
            qkv_bias=cfg['qkv_bias']
        )
        self.norm1 = LayerNorm(cfg['emb_dim'])
        self.norm2 = LayerNorm(cfg['emb_dim'])
        self.dropout = nn.Dropout(cfg['drop_rate'])
        self.ff = FeedForward(cfg)

    def forward(self, x):
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)
        x = self.dropout(x)
        x += shortcut

        shortcut = x
        x = self.norm1(x)
        x = self.ff(x)
        x = self.dropout(x)
        x += shortcut

        return x


class LayerNorm(nn.Module):
    def __init__(self, normalized_shape, eps=1e-5):
        super().__init__()
        # simple placeholder mean of zero and variance of one
        # help stability neural network training and reduce the problem of internal covariate shift
        self.eps = eps
        self.scale = nn.Parameter(torch.ones(normalized_shape))
        self.shift = nn.Parameter(torch.zeros(normalized_shape))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True)
        return self.scale * ((x - mean) / torch.sqrt(var + self.eps)) + self.shift

In [None]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")
text1 = "Every effort moves you"
text2 = "Every day holds a"
batch = []
batch.append(torch.tensor(tokenizer.encode(text1)))
batch.append(torch.tensor(tokenizer.encode(text2)))
batch = torch.stack(batch, dim=0)

torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
out = model(batch)
print(out.shape)

In [None]:
params = sum([p.numel() for p in model.parameters()])
print(f"{params:,}")
total_size_types = params * 4
total_size_mb = total_size_types / (1024 * 1024)
print(total_size_mb)


# GPT ARCHITECTURE PART 7: GENERATING TEXT GROM OUTPUT TOKENS

In [35]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    # idx is (batch, n_tokens) array of indices in the current context
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]
        probas = torch.softmax(logits, dim=-1)
        idx_next = torch.argmax(probas, dim=-1, keepdim=True)
        idx = torch.cat((idx, idx_next), dim=-1)
    return idx

In [None]:
start_text = "hello, I am"
tokenizer = tiktoken.get_encoding("gpt2")
encoded = tokenizer.encode(start_text)
encoded_tensor = torch.tensor(encoded).unsqueeze(0)

In [None]:
model.eval()  # bypass disable norm layer, dropout layer
out = generate_text_simple(model=model, idx=encoded_tensor, max_new_tokens=6,
                           context_size=GPT_CONFIG_124M['context_length'])
print(out)
print(out.shape)
print(tokenizer.decode(out.squeeze(0).tolist()))

In [None]:
import torch

GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 256,  # Shortened context length (orig: 1024)
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False
}

torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.eval()

In [11]:
import tiktoken


def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0)  # this unsqueeze(0) just add batch dimension
    return encoded_tensor


def token_ids_to_text(token_ids, tokenizer):
    flat: torch.Tensor = token_ids.squeeze(0)  # remove batch dimension
    return tokenizer.decode(flat.tolist())

In [None]:
start_context = "Every effort moves you"
tokenizer = tiktoken.get_encoding("gpt2")
token_ids = text_to_token_ids(start_context, tokenizer)
predicts = generate_text_simple(model, token_ids, 10, GPT_CONFIG_124M['context_length'])
print(token_ids_to_text(predicts, tokenizer))

# CALCULATING THE TEXT GENERATION LOSS: CROSS-ENTROPY AND PERPLEXITY

In [None]:
inputs = torch.tensor([[16833, 3626, 6100], [40, 1170, 588]])
targets = torch.tensor([[3626, 6100, 345], [1170, 588, 11311]])

In [None]:
with torch.no_grad():
    logits = model(inputs)

probas = torch.softmax(logits, dim=-1)
print(probas.shape)

In [None]:
token_ids = torch.argmax(probas, dim=-1, keepdim=True)  # argmax return the indices of the maximum value
print(token_ids)

In [None]:
print(token_ids_to_text(token_ids[0].flatten(), tokenizer))
print(token_ids_to_text(targets[0], tokenizer))

# CROSS ENTROPY LOSS

In [None]:
text_idx = 0
target_probas_1 = probas[text_idx, [0, 1, 2], targets[text_idx]]
text_idx = 1
target_probas_2 = probas[text_idx, [0, 1, 2], targets[text_idx]]
print(target_probas_1, target_probas_2)

In [None]:
log_probas = torch.log(torch.cat((target_probas_1, target_probas_2)))
print(log_probas)

In [None]:
avg_log_probas = log_probas.mean()
print(avg_log_probas)

In [None]:
neg_avg_log_probas = avg_log_probas * -1
print(neg_avg_log_probas)

In [None]:
logits_flat = logits.flatten(0, 1)
print(logits_flat)
target_flat = targets.flatten(0, 1)
print(targets)

In [None]:
pos_avg_log_probas = nn.functional.cross_entropy(logits_flat, target_flat)

# PERPLEXITY

In [None]:
# lower perplexity score = better predictions
perplexity = torch.exp(pos_avg_log_probas)
print(perplexity)
# this means model is roughly as uncertain as if it had to choose the next token randomly from about 51492 tokens in the vocabulary

# CALCULATING THE TRAINING AND VALIDATION SET LOSSES

In [27]:
with open("the-verdict.txt", "r", encoding="utf-8") as file:
    text_data = file.read()

In [28]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

In [32]:
train_ratio = 0.9
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]

torch.manual_seed(123)

train_loader = create_dataloader_v1(train_data, batch_size=2, max_length=GPT_CONFIG_124M['context_length'],
                                    stride=GPT_CONFIG_124M['context_length'], drop_last=True,
                                    shuffle=True, num_workers=0)

val_loader = create_dataloader_v1(val_data, batch_size=2, max_length=GPT_CONFIG_124M['context_length'],
                                  stride=GPT_CONFIG_124M['context_length'], drop_last=False,
                                  shuffle=False, num_workers=0)

In [None]:
# Sanity check

In [None]:
for v in train_loader:
    print(x.shape, y.shape)
    # print(x[-1, -10:])
    # print(tokenizer.decode(x.flatten(0, 1).tolist()))
print("-" * 50)
for x, y in val_loader:
    print(x.shape, y.shape)
    # print(tokenizer.decode(x.flatten(0, 1).tolist()))

In [24]:
def calc_loss_batch(input_batch: torch.Tensor, target_batch, model, device):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten(0, 1))
    return loss


def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0.
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))

    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches

Note:
Uncommenting the following lines will allow the code to run on Apple silicon chips, if applicable
which is approximately 2x faster than on an Apple CPU (as measured on an M3 MacBook Air).
However, the resulting loss values may be slightly different.

In [None]:
# if torch.cuda.is_available():
#     device = torch.device("cuda")
# elif torch.backends.mps.is_available():
#     device = torch.device("mps")
# else:
#     device = torch.device("cpu")

In [33]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)

torch.manual_seed(123)

with torch.no_grad():  # Disable gradient tracking for efficiency because we are not training yet.
    train_loss = calc_loss_loader(train_loader, model, device)
    val_loss = calc_loss_loader(val_loader, model, device)

print("Traning loss:", train_loss)
print("Validation loss:", train_loss)

Traning loss: 10.9893217086792
Validation loss: 10.9893217086792


# TRAINING LOOP FOR THE LLM

In [20]:
def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs, eval_freq,
                       eval_iter, start_context, tokenizer):
    # Initialize lists to track losses and tokens seen
    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen, global_step = 0, -1

    # Main training loop
    for epoch in range(num_epochs):
        model.train()

        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()  # Reset loss gradients from previous batch iteration
            loss: torch.Tensor = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward()
            optimizer.step()  # Update model weights using loss gradients
            tokens_seen += input_batch.numel()  # Returns the total number of elements (or tokens) in the input_batch
            global_step += 1

            # Optional evaluation step
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(f"Ep {epoch + 1} (Step {global_step:06d}):"
                      f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

        # Print a sample text after each epoch
        generate_and_print_sample(model, tokenizer, device, start_context)
    return train_losses, val_losses, track_tokens_seen



In [21]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    model.train()
    return train_loss, val_loss

In [22]:
def generate_and_print_sample(model, tokenizer, device, start_context):
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text_simple(model=model, idx=encoded, max_new_tokens=50, context_size=context_size)
    decoded_text = token_ids_to_text(token_ids, tokenizer)
    print(decoded_text.replace("\n", " "))  # Compact print format
    model.train()

In [36]:
import time

start_time = time.time()

torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

num_epochs = 10
train_losses, val_losses, tokens_seen = train_model_simple(model, train_loader, val_loader, optimizer, device,
                                                           num_epochs=num_epochs, eval_freq=5, eval_iter=5,
                                                           start_context="Every effort moves you", tokenizer=tokenizer)

end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

Ep 1 (Step 000000):Train loss 9.782, Val loss 9.934
Ep 1 (Step 000005):Train loss 8.112, Val loss 8.340
Every effort moves you,,,,,,,,,,,,.                                     
Ep 2 (Step 000010):Train loss 6.662, Val loss 7.049
Ep 2 (Step 000015):Train loss 5.961, Val loss 6.616
Every effort moves you, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and,, and, and,
Ep 3 (Step 000020):Train loss 5.734, Val loss 6.605
Ep 3 (Step 000025):Train loss 5.195, Val loss 6.342
Every effort moves you, and I had been.                                            
Ep 4 (Step 000030):Train loss 4.412, Val loss 6.277
Ep 4 (Step 000035):Train loss 4.065, Val loss 6.225
Every effort moves you know the                          "I he had the donkey and I had the and I had the donkey and down the room, I had
Ep 5 (Step 000040):Train loss 3.730, Val loss 6.160
Every effort moves you know it was not that the picture--I had the fact by the last I had be

In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator


def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):
    fig, ax1 = plt.subplots(figsize=(5, 3))

    ax1.plot(epochs_seen, train_losses, label="Training loss")
    ax1.plot(epochs_seen, val_losses, label="Validation loss", linestyle="-.")
    ax1.set_xlabel("Epochs")
    ax1.set_ylabel("Loss")
    ax1.legend(loc="upper right")
    ax1.xaxis.set_major_locator(MaxNLocator(integer=True))  # Only show integer labels on x-axis

    # Create a second x-axis for tokens seen
    ax2 = ax1.twiny()  # Create a second x-axis that shares the same y-axis
    ax2.plot(tokens_seen, train_losses, alpha=0)  # Invisible plot for aligning ticks
    ax2.set_xlabel("Tokens seen")

    fig.tight_layout()  # adjust layout to make room
    plt.savefig("loss-plot.pdf")
    plt.show()


epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))
plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses)

In [None]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 256,  # Shortened context length (orig: 1024)
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False
}
model = GPTModel(GPT_CONFIG_124M)
model.to("cpu")
model.eval()

# MERGE TEMPERATURE SCALING AND TOP-K SAMPLING

In [44]:
def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]

        if top_k is not None:
            top_logits, _ = torch.topk(logits, top_k)
            min_val = top_logits[:, -1]
            logits = torch.where(logits < min_val, torch.tensor(float("-inf")).to(logits.device), logits)
        if temperature > 0.0:
            logits = logits / temperature

            probas = torch.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probas, num_samples=1)

        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)

        if idx_next == eos_id:
            break
        idx = torch.cat((idx, idx_next), dim=-1)
    return idx

In [45]:
torch.manual_seed(123)

token_ids = generate(model, text_to_token_ids("Every effort moves you", tokenizer), 15,
                     GPT_CONFIG_124M['context_length'], 1.4, 25)

print(token_ids_to_text(token_ids, tokenizer))

tensor([[18250,   772]])
tensor([[464, 670]])
tensor([[  11, 5975]])
tensor([[ 286, 5739]])
tensor([[764,  13]])
tensor([[383, 402]])
tensor([[ 271, 1617]])
tensor([[261, 286]])
tensor([[262, 607]])
tensor([[438,  12]])
tensor([[12239, 49903]])
tensor([[262,   0]])
tensor([[ 887, 1375]])
tensor([[2045, 3521]])
tensor([[470, 523]])
Every effort moves youlit evenThe work, surprise of frame .. The Gisrafton of the her---piececolour the! But She looking couldn't so
