In [2]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [3]:
torch.manual_seed(1337)  # Seed for reproducibility

<torch._C.Generator at 0x15044416b0f0>

In [4]:
# Load and process the text data
with open('input2.txt', 'r', encoding='utf-8') as f:
    text = f.read()

print("Length of dataset in characters: ", len(text))

# Let's look at the first 1000 characters
print(text[:1000])

Length of dataset in characters:  5427
I am happy to join with you today in what will go down in history as the greatest demonstration for
freedom in the history of our nation.
Five score years ago a great American in whose symbolic shadow we stand today signed the
Emancipation Proclamation. This momentous decree is a great beacon light of hope to millions of Negro
slaves who had been seared in the flames of withering injustice. It came as a joyous daybreak to end the
long night of their captivity. But 100 years later the Negro still is not free. One hundred years later the
life of the Negro is still badly crippled by the manacles of segregation and the chains of discrimination.
One hundred years later the Negro lives on a lonely island of poverty in the midst of a vast ocean of
material prosperity. One hundred years later the Negro is still languished in the corners of American
society and finds himself in exile in his own land. So we’ve come here today to dramatize a shameful
conditi

In [5]:
# Extracting and organizing the unique characters from the text
chars = sorted(list(set(text)))  # Extracts unique characters, converts to a list, and sorts it
vocab_size = len(chars)  # Counts the number of unique characters to determine vocabulary size

# Using f-strings for better output formatting
print('Unique Characters:', ''.join(chars))  # Prints all unique characters as a single string
print(f'Vocabulary Size: {vocab_size}')  # Prints the vocabulary size we will be using

# Creating mappings between characters and integers
stoi = {ch: i for i, ch in enumerate(chars)}  # Mapping from characters to integers
itos = {i: ch for i, ch in enumerate(chars)}  # Mapping from integers to characters

# Defining encoding and decoding functions
encode = lambda s: [stoi[c] for c in s]  # Encodes a string into a list of integers
decode = lambda l: ''.join([itos[i] for i in l])  # Decodes a list of integers back into a string

# Testing the encoding and decoding functions
encoded_str = encode("hii there")  # Encoding a sample string
print(f'Encoded: {encoded_str}')  # Printing the encoded representation

decoded_str = decode(encoded_str)  # Decoding the encoded string
print(f'Decoded: {decoded_str}')  # Printing the decoded string

Unique Characters: 
 ,-.0124:?ABCDEFGHIJLMNOPRSTWYabcdefghijklmnopqrstuvwxyz©—’“”
Vocabulary Size: 62
Encoded: [38, 39, 39, 1, 50, 38, 35, 48, 35]
Decoded: hii there


In [6]:
# Convert the entire text into a tensor of encoded integers
data = torch.tensor(encode(text), dtype=torch.long)  # Encodes the text and converts it to a PyTorch tensor

print(data.shape, data.dtype)
print(data[:1000]) # Encoding of the the 1000 characters printed above

torch.Size([5427]) torch.int64
tensor([19,  1, 31, 43,  1, 38, 31, 46, 46, 55,  1, 50, 45,  1, 40, 45, 39, 44,
         1, 53, 39, 50, 38,  1, 55, 45, 51,  1, 50, 45, 34, 31, 55,  1, 39, 44,
         1, 53, 38, 31, 50,  1, 53, 39, 42, 42,  1, 37, 45,  1, 34, 45, 53, 44,
         1, 39, 44,  1, 38, 39, 49, 50, 45, 48, 55,  1, 31, 49,  1, 50, 38, 35,
         1, 37, 48, 35, 31, 50, 35, 49, 50,  1, 34, 35, 43, 45, 44, 49, 50, 48,
        31, 50, 39, 45, 44,  1, 36, 45, 48,  0, 36, 48, 35, 35, 34, 45, 43,  1,
        39, 44,  1, 50, 38, 35,  1, 38, 39, 49, 50, 45, 48, 55,  1, 45, 36,  1,
        45, 51, 48,  1, 44, 31, 50, 39, 45, 44,  4,  0, 16, 39, 52, 35,  1, 49,
        33, 45, 48, 35,  1, 55, 35, 31, 48, 49,  1, 31, 37, 45,  1, 31,  1, 37,
        48, 35, 31, 50,  1, 11, 43, 35, 48, 39, 33, 31, 44,  1, 39, 44,  1, 53,
        38, 45, 49, 35,  1, 49, 55, 43, 32, 45, 42, 39, 33,  1, 49, 38, 31, 34,
        45, 53,  1, 53, 35,  1, 49, 50, 31, 44, 34,  1, 50, 45, 34, 31, 55,  1,
        4

In [7]:
# Determine the split point for training and validation sets
n = int(0.9 * len(data))  # Calculates 90% of the data length to use as the training set size

# Split the data into training and validation sets
train_data = data[:n]  # Assigns the first 90% of the data to training
val_data = data[n:]  # Assigns the remaining 10% to validation

block_size = 8
train_data[:block_size+1] #block elements + 1 character sampled from the data

x = train_data[:block_size]

y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1] #varying contect
    target = y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([19]) the target: 1
when input is tensor([19,  1]) the target: 31
when input is tensor([19,  1, 31]) the target: 43
when input is tensor([19,  1, 31, 43]) the target: 1
when input is tensor([19,  1, 31, 43,  1]) the target: 38
when input is tensor([19,  1, 31, 43,  1, 38]) the target: 31
when input is tensor([19,  1, 31, 43,  1, 38, 31]) the target: 46
when input is tensor([19,  1, 31, 43,  1, 38, 31, 46]) the target: 46


In [8]:
batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    
    ix = torch.randint(len(data) - block_size, (batch_size,)) #randomizing data selection
    
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    
    return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape) #4x8 tensor
print(xb)

print('----')

print('targets:')
print(yb.shape) #4x8 tensor
print(yb)

print('----')

for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[38, 35, 48, 35,  1, 50, 45, 34],
        [34,  1, 50, 38, 35,  0, 15, 43],
        [49,  1, 43, 51, 49, 50,  1, 32],
        [45, 43, 43, 51, 44, 39, 50, 55]])
----
targets:
torch.Size([4, 8])
tensor([[35, 48, 35,  1, 50, 45, 34, 31],
        [ 1, 50, 38, 35,  0, 15, 43, 31],
        [ 1, 43, 51, 49, 50,  1, 32, 35],
        [43, 43, 51, 44, 39, 50, 55,  1]])
----
when input is [38] the target: 35
when input is [38, 35] the target: 48
when input is [38, 35, 48] the target: 35
when input is [38, 35, 48, 35] the target: 1
when input is [38, 35, 48, 35, 1] the target: 50
when input is [38, 35, 48, 35, 1, 50] the target: 45
when input is [38, 35, 48, 35, 1, 50, 45] the target: 34
when input is [38, 35, 48, 35, 1, 50, 45, 34] the target: 31
when input is [34] the target: 1
when input is [34, 1] the target: 50
when input is [34, 1, 50] the target: 38
when input is [34, 1, 50, 38] the target: 35
when input is [34, 1, 50, 38, 35] the target: 0
when input is 

In [8]:
# Hyperparameters and environment setup
batch_size = 16  # Number of sequences processed in parallel
block_size = 32  # Maximum context length for predictions

max_iters = 5000  # Number of training iterations

eval_interval = 100  # Interval for evaluating the model
learning_rate = 1e-3  # Learning rate for optimizer
eval_iters = 200  # Number of iterations for evaluation
n_embd = 64  # Size of each embedding vector
n_head = 4  # Number of attention heads
n_layer = 4  # Number of transformer layers
dropout = 0.0  # Dropout rate


device = 'cuda' if torch.cuda.is_available() else 'cpu'  # Use GPU if available

In [9]:
# Function to get a batch of data
def get_batch(split):
    # Choose the appropriate dataset based on the 'split' argument
    data = train_data if split == 'train' else val_data  

    # Randomly select starting indices for sequences in the batch
    ix = torch.randint(len(data) - block_size, (batch_size,))

    # Create input sequences (x) using the selected indices
    x = torch.stack([data[i:i + block_size] for i in ix])

    # Create target output sequences (y), which are offset by one character
    y = torch.stack([data[i + 1:i + block_size + 1] for i in ix])

    # Move the input and target tensors to the specified device (GPU or CPU)
    return x.to(device), y.to(device)

In [10]:
@torch.no_grad()  # Disables gradient calculations for efficiency
def estimate_loss():
    out = {}  # Dictionary to store the average losses for training and validation
    model.eval()  # Set the model to evaluation mode

    # Iterate over both training and validation datasets
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)  # Initialize an array to store individual losses

        # Compute loss over a number of iterations
        for k in range(eval_iters):
            X, Y = get_batch(split)  # Get a batch of data
            logits, loss = model(X, Y)  # Forward pass through the model to compute loss
            losses[k] = loss.item()  # Store the loss for this iteration

        out[split] = losses.mean()  # Calculate the average loss for this dataset

    model.train()  # Set the model back to training mode
    return out  # Return the average losses

In [11]:
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        # Linear layers to transform input into key, query, and value vectors
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)

        # Lower triangular matrix for masking in attention calculation
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        # Dropout for regularization
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Extract batch size (B), sequence length (T), and embedding dimension (C)
        B, T, C = x.shape

        # Compute key, query, and value vectors
        k = self.key(x)   # Key vector
        q = self.query(x) # Query vector

        # Calculate attention scores
        wei = q @ k.transpose(-2, -1) * C**-0.5  # Scaled dot-product attention
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # Masking to prevent attending to future positions
        wei = F.softmax(wei, dim=-1)  # Apply softmax to get attention weights
        wei = self.dropout(wei)  # Apply dropout

        # Compute weighted sum of values
        v = self.value(x)  # Value vector
        out = wei @ v  # Weighted sum of values

        return out

In [12]:
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        # Creating multiple attention heads
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # Linear layer for projecting the concatenated output
        self.proj = nn.Linear(n_embd, n_embd)
        # Dropout layer for regularization
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Concatenating outputs from all attention heads
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        # Projecting concatenated output to original embedding size and applying dropout
        out = self.dropout(self.proj(out))
        return out

In [13]:
class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        # Defining the feedforward network
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),  # First linear layer, expands dimension
            nn.ReLU(),                      # Non-linear activation function
            nn.Linear(4 * n_embd, n_embd),  # Second linear layer, projects back to original dimension
            nn.Dropout(dropout),            # Dropout for regularization
        )

    def forward(self, x):
        # Passes the input through the feedforward network
        return self.net(x)

In [14]:
class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head  # Determining the size of each attention head
        self.sa = MultiHeadAttention(n_head, head_size)  # Multi-head self-attention layer
        self.ffwd = FeedFoward(n_embd)                    # Feedforward network
        self.ln1 = nn.LayerNorm(n_embd)                  # Layer normalization after self-attention
        self.ln2 = nn.LayerNorm(n_embd)                  # Layer normalization after feedforward network

    def forward(self, x):
        # Apply self-attention and add the result to the original input (residual connection)
        x = x + self.sa(self.ln1(x))
        # Apply the feedforward network and add the result to the above output (residual connection)
        x = x + self.ffwd(self.ln2(x))
        return x

In [15]:
class BigramLanguageModel(nn.Module):
    # Inherits from the PyTorch Module class
    def __init__(self):
        super().__init__()  # Initialize the superclass (nn.Module)
        # Embedding layer to convert token indices to embeddings
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        # Embedding layer for positional encodings
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        # Sequential container of Transformer blocks
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        # Layer normalization applied after the Transformer blocks
        self.ln_f = nn.LayerNorm(n_embd)
        # Linear layer to map from embedding dimension to vocabulary size
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        # Extracting batch size (B) and sequence length (T) from input indices
        B, T = idx.shape

        # Getting token embeddings for input indices
        tok_emb = self.token_embedding_table(idx)  # Embedding lookup for tokens (B,T,C)
        # Generating position embeddings for each position in the sequence
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # Positional encoding (T,C)
        # Combining token and position embeddings
        x = tok_emb + pos_emb  # Summing token and position embeddings (B,T,C)

        # Passing the combined embeddings through the Transformer blocks
        x = self.blocks(x)  # Processed by Transformer blocks (B,T,C)
        # Applying layer normalization to the output of the Transformer blocks
        x = self.ln_f(x)  # Normalized output (B,T,C)

        # Projecting the output to vocabulary size to get logits for next token prediction
        logits = self.lm_head(x)  # Output logits (B,T,vocab_size)

        # Compute loss if target tokens are provided
        if targets is not None:
            # Reshaping for loss computation
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            # Calculating cross-entropy loss
            loss = F.cross_entropy(logits, targets)
        else:
            loss = None  # No loss computation if targets are not provided

        return logits, loss  # Returning logits and the loss

    def generate(self, idx, max_new_tokens):
        # Generate text for a given number of tokens
        for _ in range(max_new_tokens):
            # Keeping only the last 'block_size' tokens
            idx_cond = idx[:, -block_size:]
            # Get predictions for the current sequence
            logits, _ = self(idx_cond)
            # Focus on the last time step for prediction
            logits = logits[:, -1, :]
            # Convert logits to probabilities
            probs = F.softmax(logits, dim=-1)
            # Sample the next token from the probability distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # Append the sampled token to the sequence
            idx = torch.cat((idx, idx_next), dim=1)
        return idx  # Return the generated sequence of tokens

model = BigramLanguageModel()  # Instantiate the model
m = model.to(device)  # Move the model to the specified device (CPU or GPU)


In [16]:
# Summing up the number of elements (parameters) in each parameter tensor of the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

0.209342 M parameters


In [17]:
# Creating an optimizer using the AdamW algorithm for the model's parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [18]:
for iter in range(max_iters):
    # Looping through the training iterations

    # Conditional to evaluate model performance periodically
    if iter % eval_interval == 0 or iter == max_iters - 1:
        # Estimate loss on training and validation datasets
        losses = estimate_loss()
        # Print the current step and the estimated losses
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # Sampling a batch of data for training
    xb, yb = get_batch('train')

    # Forward pass: compute predicted outputs by passing inputs to the model
    logits, loss = model(xb, yb)
    # Zeroing the gradients of all optimized tensors
    optimizer.zero_grad(set_to_none=True)
    # Backward pass: compute gradient of the loss with respect to model parameters
    loss.backward()
    # Perform a single optimization step (parameter update)
    optimizer.step()

step 0: train loss 4.2804, val loss 4.3020
step 100: train loss 2.5206, val loss 2.6041
step 200: train loss 2.3421, val loss 2.4581
step 300: train loss 2.1823, val loss 2.3070
step 400: train loss 1.9277, val loss 2.2070
step 500: train loss 1.7422, val loss 2.1280
step 600: train loss 1.5069, val loss 2.0817
step 700: train loss 1.3336, val loss 2.1217
step 800: train loss 1.1177, val loss 2.1974
step 900: train loss 0.9154, val loss 2.1888
step 1000: train loss 0.7547, val loss 2.3044
step 1100: train loss 0.6302, val loss 2.4746
step 1200: train loss 0.5251, val loss 2.5164
step 1300: train loss 0.4613, val loss 2.6419
step 1400: train loss 0.4169, val loss 2.7466
step 1500: train loss 0.3822, val loss 2.8236
step 1600: train loss 0.3501, val loss 2.9377
step 1700: train loss 0.3465, val loss 2.9975
step 1800: train loss 0.3406, val loss 3.1250
step 1900: train loss 0.3230, val loss 3.0770
step 2000: train loss 0.3133, val loss 3.1518
step 2100: train loss 0.3002, val loss 3.1376


In [19]:
# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))


join with you today in what will go down in history as the greatest demonstration for
freedom ring from the snowners was lons of Neee’vel, come come tour nation’s capital to cash a check. When the architects of our Republic wrote
the magnificent words votels of civil rights, “Whiten will you be
satisfied?” We can nation sigull, bert  thate areatele ons of the highways and the hotels of the cities.
We cannot be satisfied as long as the Negro’s basic mobility is crippline. Only This note was a promise that all men—yes, thatens  weltel wo ring injustice. It came as a joyous daybreak to end the
long night of their captivity. But 100 years later the Negro is the videncotence. . . . The marvely come richteat of American
society and finds himself in exile ily is tiew
ll all men are created equalittle in exile ily is tied up
with our destiny.
. . . We cannot walk alone. And as we walk we must make the ple, cannot gur beack. Then are those who are. Let freedom ring from the snowcapped Rockies 

In [1]:
#2

In [43]:
# Hyperparameters and environment setup
batch_size = 32  # Number of sequences processed in parallel
block_size = 64  # Maximum context length for predictions

max_iters = 5000  # Number of training iterations

eval_interval = 100  # Interval for evaluating the model
learning_rate = 1e-3  # Learning rate for optimizer
eval_iters = 200  # Number of iterations for evaluation
n_embd = 64  # Size of each embedding vector
n_head = 4  # Number of attention heads
n_layer = 4  # Number of transformer layers
dropout = 0.0  # Dropout rate


device = 'cuda' if torch.cuda.is_available() else 'cpu'  # Use GPU if available

In [44]:
# Function to get a batch of data
def get_batch(split):
    # Choose the appropriate dataset based on the 'split' argument
    data = train_data if split == 'train' else val_data  

    # Randomly select starting indices for sequences in the batch
    ix = torch.randint(len(data) - block_size, (batch_size,))

    # Create input sequences (x) using the selected indices
    x = torch.stack([data[i:i + block_size] for i in ix])

    # Create target output sequences (y), which are offset by one character
    y = torch.stack([data[i + 1:i + block_size + 1] for i in ix])

    # Move the input and target tensors to the specified device (GPU or CPU)
    return x.to(device), y.to(device)

In [45]:
@torch.no_grad()  # Disables gradient calculations for efficiency
def estimate_loss():
    out = {}  # Dictionary to store the average losses for training and validation
    model.eval()  # Set the model to evaluation mode

    # Iterate over both training and validation datasets
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)  # Initialize an array to store individual losses

        # Compute loss over a number of iterations
        for k in range(eval_iters):
            X, Y = get_batch(split)  # Get a batch of data
            logits, loss = model(X, Y)  # Forward pass through the model to compute loss
            losses[k] = loss.item()  # Store the loss for this iteration

        out[split] = losses.mean()  # Calculate the average loss for this dataset

    model.train()  # Set the model back to training mode
    return out  # Return the average losses

In [46]:
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        # Linear layers to transform input into key, query, and value vectors
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)

        # Lower triangular matrix for masking in attention calculation
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        # Dropout for regularization
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Extract batch size (B), sequence length (T), and embedding dimension (C)
        B, T, C = x.shape

        # Compute key, query, and value vectors
        k = self.key(x)   # Key vector
        q = self.query(x) # Query vector

        # Calculate attention scores
        wei = q @ k.transpose(-2, -1) * C**-0.5  # Scaled dot-product attention
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # Masking to prevent attending to future positions
        wei = F.softmax(wei, dim=-1)  # Apply softmax to get attention weights
        wei = self.dropout(wei)  # Apply dropout

        # Compute weighted sum of values
        v = self.value(x)  # Value vector
        out = wei @ v  # Weighted sum of values

        return out

In [47]:
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        # Creating multiple attention heads
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # Linear layer for projecting the concatenated output
        self.proj = nn.Linear(n_embd, n_embd)
        # Dropout layer for regularization
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Concatenating outputs from all attention heads
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        # Projecting concatenated output to original embedding size and applying dropout
        out = self.dropout(self.proj(out))
        return out

In [48]:
class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        # Defining the feedforward network
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),  # First linear layer, expands dimension
            nn.ReLU(),                      # Non-linear activation function
            nn.Linear(4 * n_embd, n_embd),  # Second linear layer, projects back to original dimension
            nn.Dropout(dropout),            # Dropout for regularization
        )

    def forward(self, x):
        # Passes the input through the feedforward network
        return self.net(x)

In [49]:
class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head  # Determining the size of each attention head
        self.sa = MultiHeadAttention(n_head, head_size)  # Multi-head self-attention layer
        self.ffwd = FeedFoward(n_embd)                    # Feedforward network
        self.ln1 = nn.LayerNorm(n_embd)                  # Layer normalization after self-attention
        self.ln2 = nn.LayerNorm(n_embd)                  # Layer normalization after feedforward network

    def forward(self, x):
        # Apply self-attention and add the result to the original input (residual connection)
        x = x + self.sa(self.ln1(x))
        # Apply the feedforward network and add the result to the above output (residual connection)
        x = x + self.ffwd(self.ln2(x))
        return x

In [50]:
class BigramLanguageModel(nn.Module):
    # Inherits from the PyTorch Module class
    def __init__(self):
        super().__init__()  # Initialize the superclass (nn.Module)
        # Embedding layer to convert token indices to embeddings
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        # Embedding layer for positional encodings
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        # Sequential container of Transformer blocks
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        # Layer normalization applied after the Transformer blocks
        self.ln_f = nn.LayerNorm(n_embd)
        # Linear layer to map from embedding dimension to vocabulary size
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        # Extracting batch size (B) and sequence length (T) from input indices
        B, T = idx.shape

        # Getting token embeddings for input indices
        tok_emb = self.token_embedding_table(idx)  # Embedding lookup for tokens (B,T,C)
        # Generating position embeddings for each position in the sequence
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # Positional encoding (T,C)
        # Combining token and position embeddings
        x = tok_emb + pos_emb  # Summing token and position embeddings (B,T,C)

        # Passing the combined embeddings through the Transformer blocks
        x = self.blocks(x)  # Processed by Transformer blocks (B,T,C)
        # Applying layer normalization to the output of the Transformer blocks
        x = self.ln_f(x)  # Normalized output (B,T,C)

        # Projecting the output to vocabulary size to get logits for next token prediction
        logits = self.lm_head(x)  # Output logits (B,T,vocab_size)

        # Compute loss if target tokens are provided
        if targets is not None:
            # Reshaping for loss computation
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            # Calculating cross-entropy loss
            loss = F.cross_entropy(logits, targets)
        else:
            loss = None  # No loss computation if targets are not provided

        return logits, loss  # Returning logits and the loss

    def generate(self, idx, max_new_tokens):
        # Generate text for a given number of tokens
        for _ in range(max_new_tokens):
            # Keeping only the last 'block_size' tokens
            idx_cond = idx[:, -block_size:]
            # Get predictions for the current sequence
            logits, _ = self(idx_cond)
            # Focus on the last time step for prediction
            logits = logits[:, -1, :]
            # Convert logits to probabilities
            probs = F.softmax(logits, dim=-1)
            # Sample the next token from the probability distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # Append the sampled token to the sequence
            idx = torch.cat((idx, idx_next), dim=1)
        return idx  # Return the generated sequence of tokens

model = BigramLanguageModel()  # Instantiate the model
m = model.to(device)  # Move the model to the specified device (CPU or GPU)


In [51]:
# Summing up the number of elements (parameters) in each parameter tensor of the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

0.21139 M parameters


In [52]:
# Creating an optimizer using the AdamW algorithm for the model's parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [53]:
for iter in range(max_iters):
    # Looping through the training iterations

    # Conditional to evaluate model performance periodically
    if iter % eval_interval == 0 or iter == max_iters - 1:
        # Estimate loss on training and validation datasets
        losses = estimate_loss()
        # Print the current step and the estimated losses
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # Sampling a batch of data for training
    xb, yb = get_batch('train')

    # Forward pass: compute predicted outputs by passing inputs to the model
    logits, loss = model(xb, yb)
    # Zeroing the gradients of all optimized tensors
    optimizer.zero_grad(set_to_none=True)
    # Backward pass: compute gradient of the loss with respect to model parameters
    loss.backward()
    # Perform a single optimization step (parameter update)
    optimizer.step()

step 0: train loss 4.2224, val loss 4.2079
step 100: train loss 2.4728, val loss 2.5442
step 200: train loss 2.3185, val loss 2.4138
step 300: train loss 2.1517, val loss 2.3698
step 400: train loss 1.9202, val loss 2.2575
step 500: train loss 1.6052, val loss 2.2271
step 600: train loss 1.2250, val loss 2.2409
step 700: train loss 0.8427, val loss 2.3793
step 800: train loss 0.5431, val loss 2.5915
step 900: train loss 0.3674, val loss 2.8513
step 1000: train loss 0.2908, val loss 3.0762
step 1100: train loss 0.2476, val loss 3.2044
step 1200: train loss 0.2162, val loss 3.3692
step 1300: train loss 0.2116, val loss 3.3895
step 1400: train loss 0.1890, val loss 3.5368
step 1500: train loss 0.1784, val loss 3.7319
step 1600: train loss 0.1770, val loss 3.7772
step 1700: train loss 0.1601, val loss 3.7662
step 1800: train loss 0.1563, val loss 3.8538
step 1900: train loss 0.1588, val loss 3.9478
step 2000: train loss 0.1488, val loss 3.9286
step 2100: train loss 0.1523, val loss 3.9675


In [54]:
# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))


California.
But not only that. Let freedom ring from Stone Mountain of Georgia. Let freedom ring from the mighty
mountains of New York. Let freedom ring from the heightening Alleghenies of Pennsylvania. Let
freedom ring from the snowcapped Rockies of Coloramath ahersorad the
lamen—ons alletenie. We can never be satisfied as long as the Negro is the victim of the unspeakable horrors of
police brutality.
We can never be satisfied as long as our bodies, heavy with the fatigue of travel, cannot gain lodging in
the motels of the highways and the hotels of the cities.
We cannot be satisfied as long as the Negro is the victim of the unspeakable horrors of
police brutality.
We can never be satisfied as long as the Negro in Mississippi cannot vote and the Negro in New York
believes he has nothing for which to vote.
No, no, we are not satisfied, and we will not be satisfied until justice rolls down like waters and
righteousness like a mighty stream. . . .
I say to you today, my friends, thze he