# Assignment 2

In this assignment, you will continue working with the Bigram Language Model from the lecture by implementing its training loop and inference.

## Importing Libraries

In [1]:
import os
import math
from dataclasses import dataclass
import torch
from torch.nn import functional as F

import os
import random
import numpy as np
import torch
def set_seed(seed: int):
    """
    Seed every random number generator used in this notebook.

    Seeds Python's `random` module, NumPy and PyTorch (CPU and CUDA), exports
    the value as `PYTHONHASHSEED`, and sets the cuDNN flags to deterministic
    mode with benchmarking turned off.

    Args:
        seed (int): Value handed to each generator.
    """
    # Python-level randomness
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)

    # Numerical libraries
    np.random.seed(seed)
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    # cuDNN flags
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    print(f"Random seed set to {seed}")


def configure_device() -> torch.device:
    """
    Pick the compute device, preferring CUDA, then MPS, then CPU.

    The chosen device is also reported on stdout.

    Returns:
        torch.device: The selected device.
    """
    if torch.cuda.is_available():
        device = torch.device("cuda")
        num_gpu = torch.cuda.device_count()
        print(f"Running on {num_gpu} {torch.cuda.get_device_name()} GPU(s)")
        return device

    # No CUDA: fall back to Apple's MPS backend when present, otherwise the CPU.
    backend = "mps" if torch.backends.mps.is_available() else "cpu"
    device = torch.device(backend)
    print(f"Running on {device}")
    return device


def load_text(file_path: str, encoding: str = 'utf-8') -> str:
    """
    Read a whole text file into a string.

    Args:
        file_path (str): Location of the text file.
        encoding (str, optional): Encoding used to decode the file.
            Defaults to 'utf-8'.

    Returns:
        str: Everything the file contains.

    Raises:
        FileNotFoundError: If `file_path` is not an existing regular file.
    """
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding=encoding) as handle:
            content = handle.read()
        print(f"Loaded text data from {file_path} (length: {len(content)} characters).")
        return content

    # Report the missing file on stdout as well as through the exception.
    print(f"File not found: {file_path}")
    raise FileNotFoundError(f"File not found: {file_path}")

## Configuration

In [2]:
@dataclass
class BigramConfig:
    """Settings shared by the cells of this notebook."""

    # Paths: `root_dir + dataset_path` is the file handed to `load_text`.
    root_dir: str = f"{os.getcwd()}/../../"
    dataset_path: str = "data/names.txt"

    # Tokenizer
    vocab_size: int = 0  # placeholder, assigned once the vocabulary is built

    # Reproducibility
    seed: int = 101


config = BigramConfig()

## Reproducibility

In [3]:
# Seed the RNGs before any random weights are created or names are sampled.
set_seed(config.seed)

Random seed set to 101


## Dataset

In [4]:
# Read the dataset file and split it into one list entry per line.
names = load_text(config.root_dir + config.dataset_path).splitlines()

Loaded text data from /mnt/c/Users/whald/LLM101n/notebooks/Assignments/../../data/names.txt (length: 228145 characters).


## Preprocessing

In [5]:
# Add special token
# "." marks both the start and the end of a name. Any "." already sitting on
# either end is stripped first, so re-running this cell does not stack extra
# tokens onto names that were wrapped by a previous run.
names = ["." + name.strip(".") + "." for name in names]

## Tokenizer

In [6]:
# Vocabulary: the "." special token at index 0, followed by the letters a-z.
chars = ["."] + [chr(code) for code in range(ord("a"), ord("z") + 1)]
config.vocab_size = len(chars)

# Lookup tables between characters and their integer ids.
idx2str = dict(enumerate(chars))
str2idx = {char: idx for idx, char in idx2str.items()}

## Model

In [7]:
# Initialize weights
# W has shape (vocab_size, vocab_size) and b has shape (vocab_size,). Both are
# filled by torch.randn and created with requires_grad=True so that autograd
# records operations on them.
W = torch.randn(config.vocab_size, config.vocab_size, requires_grad=True)
b = torch.randn(config.vocab_size, requires_grad=True)
# NOTE(review): `params` is not referenced by the cells visible in this notebook.
params = [W, b]

## Training

#### Task 1: Train Bigram Language Model (Neural Network Approach)

Make the training loop for the Bigram Language Model.

In [8]:
# Set of Input, Target pairs
# Every pair of adjacent characters in a name gives one training example:
# the first character is the input and the character after it is the target.
# The loop variables avoid the name `input`, which would shadow Python's
# builtin `input()` for the rest of the kernel session.
input_ids, target_ids = [], []
for name in names:
    for cur_char, nxt_char in zip(name, name[1:]):
        input_ids.append(str2idx[cur_char])
        target_ids.append(str2idx[nxt_char])

# Convert to tensor
# The index lists keep their own names so `inputs` / `targets` only ever refer
# to tensors.
inputs = torch.tensor(input_ids, dtype=torch.long)
targets = torch.tensor(target_ids, dtype=torch.long)

In [9]:
# Sanity check: report how many pairs were built, the tensor shapes, and the
# first two (input, target) index pairs.
print(f"Number of Input, Target pairs: {len(inputs)}")
print(f"Input shape: {inputs.shape}")
print(f"Target shape: {targets.shape}")
print(f"First (Input, Target): ({inputs[0]}, {targets[0]})")
print(f"Second (Input, Target): ({inputs[1]}, {targets[1]})")

Number of Input, Target pairs: 228146
Input shape: torch.Size([228146])
Target shape: torch.Size([228146])
First (Input, Target): (0, 5)
Second (Input, Target): (5, 13)


In [10]:
################################################################################
# TODO:                                                                        #
# One-hot encode the input tensor.                                             #
################################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
# Each index becomes a vector of length vocab_size with a single 1 at that index.
inputs_encoded = F.one_hot(inputs, num_classes = config.vocab_size)
# NOTE(review): targets_encoded is not used by the training loop in this
# notebook, which indexes the log-probabilities with the integer `targets`.
targets_encoded = F.one_hot(targets, num_classes = config.vocab_size)
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

# Convert data type to float
# (the encoded inputs are later matrix-multiplied with the float weights W)
inputs_encoded = inputs_encoded.float()

In [11]:
# Training Loop
# Full-batch gradient descent: every step uses all (input, target) pairs.
steps = 500
lr = 1  # learning rate (original note: "logscale it")

for step in range(1, steps + 1):
    # Forward pass
    ################################################################################
    # TODO:                                                                        #
    # Implement the forward pass.                                                  #
    ################################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    # Multiplying a one-hot row by W picks out the matching row of W, so each
    # example's logits are that row of W plus the bias.
    logits = inputs_encoded @ W + b
    # Row-wise softmax. F.softmax replaces the hand-written
    # exp(logits) / sum(exp(logits)), which evaluated the exponential twice and
    # overflows to inf/NaN once a logit grows large.
    probs = F.softmax(logits, dim=-1)
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    # loss
    # Mean negative log-probability assigned to the correct next character.
    log_probs = torch.log(probs + 1e-9)  # Add small value to prevent log(0)
    loss = -log_probs[torch.arange(len(targets)), targets].mean()

    # Backward pass
    ################################################################################
    # TODO:                                                                        #
    # Implement the backward pass.                                                 #
    ################################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    loss.backward()
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    # Update weights
    ################################################################################
    # TODO:                                                                        #
    # Update the weights of the model using the gradients.                         #
    ################################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    # The in-place updates run under no_grad so they are not recorded by
    # autograd; the gradients are then cleared because backward() accumulates
    # into .grad.
    with torch.no_grad():
        W -= lr*W.grad
        b -= lr*b.grad
        W.grad.zero_()
        b.grad.zero_()
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    if step % 10 == 0:
        print(f"Step {step}, Loss {loss.item():.4f}")

Step 10, Loss 3.7275
Step 20, Loss 3.3789
Step 30, Loss 3.2139
Step 40, Loss 3.1160
Step 50, Loss 3.0472
Step 60, Loss 2.9946
Step 70, Loss 2.9523
Step 80, Loss 2.9172
Step 90, Loss 2.8873
Step 100, Loss 2.8613
Step 110, Loss 2.8385
Step 120, Loss 2.8181
Step 130, Loss 2.7999
Step 140, Loss 2.7833
Step 150, Loss 2.7683
Step 160, Loss 2.7546
Step 170, Loss 2.7419
Step 180, Loss 2.7303
Step 190, Loss 2.7195
Step 200, Loss 2.7094
Step 210, Loss 2.7001
Step 220, Loss 2.6914
Step 230, Loss 2.6832
Step 240, Loss 2.6756
Step 250, Loss 2.6684
Step 260, Loss 2.6616
Step 270, Loss 2.6553
Step 280, Loss 2.6493
Step 290, Loss 2.6436
Step 300, Loss 2.6382
Step 310, Loss 2.6331
Step 320, Loss 2.6283
Step 330, Loss 2.6237
Step 340, Loss 2.6193
Step 350, Loss 2.6152
Step 360, Loss 2.6112
Step 370, Loss 2.6074
Step 380, Loss 2.6038
Step 390, Loss 2.6004
Step 400, Loss 2.5971
Step 410, Loss 2.5939
Step 420, Loss 2.5909
Step 430, Loss 2.5880
Step 440, Loss 2.5852
Step 450, Loss 2.5826
Step 460, Loss 2.58

## Inference

#### Task 2: Generate a Name

Create a function to generate a name using the trained Bigram Language Model.

In [12]:
# Create a function to generate a name
def generate_name():
    """
    Sample one name, character by character, from the bigram model.

    Generation starts from the "." special token. Each step turns the current
    token into a next-token distribution using W and b, samples from it, and
    stops once "." is sampled again.

    Returns:
        str: The sampled characters, including the terminating ".".
    """
    new_name = []
    start_idx = str2idx["."]

    while True:
        ################################################################################
        # TODO:                                                                        #
        # 1. Forward pass                                                              #
        # 2. Sample the next token                                                     #
        # 3. Decode the token                                                          #
        # 4. Update the start_idx                                                      #
        # 5. Break if the next character is "."                                        #
        ################################################################################
        # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        # Sampling needs no gradients, so the forward pass runs under no_grad
        # instead of building an autograd graph on every step.
        with torch.no_grad():
            # Forward pass
            start_encode = F.one_hot(torch.tensor([start_idx]), num_classes=config.vocab_size).float()
            logits = start_encode @ W + b

            # Sample
            # F.softmax replaces the hand-written exp(logits) / sum(exp(logits)),
            # which overflows once a logit grows large.
            probs = F.softmax(logits, dim=-1)
            next_idx = torch.multinomial(probs, num_samples=1).item()

        # Decode
        next_char = idx2str[next_idx]
        new_name.append(next_char)

        # Update
        start_idx = next_idx

        # Break if "."
        if start_idx == str2idx['.']:
            break
        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return ''.join(new_name)

# Generate 5 names
for _ in range(5):
    print(generate_name())

aymienzeta.
dhchatisrnearasan.
tadeleanion.
greeema.
mzlymaoshyif.


## Extra Credit

We have already made our own custom auto-grad Tensor class. Let's use it!

Train the Bigram Language Model using our custom auto-grad Tensor class.

**Do not use any built-in PyTorch functions.** (other deep learning libraries are also prohibited)

In [None]:
class Tensor:
    """
    Numeric value that remembers how it was computed, so gradients can be
    propagated backwards through the expression that produced it.

    Attributes:
        data: The wrapped numeric value.
        gradient: After `backward()` has been called on some result, the
            derivative of that result with respect to this tensor; 0 before.
    """

    def __init__(self, data, _children=(), _operation=''):
        self.data = data
        self._prev = set(_children)       # operands this tensor was computed from
        self._operation = _operation      # label of the producing operation
        self.gradient = 0
        self._backward = lambda: None     # leaf tensors have nothing to propagate

    def __repr__(self):
        return f"tensor=({self.data})"

    @staticmethod
    def _wrap(value):
        """Return `value` as-is if it is a Tensor, otherwise wrap it in one."""
        return value if isinstance(value, Tensor) else Tensor(value)

    def __add__(self, other):  # self + other
        other = Tensor._wrap(other)
        output = Tensor(self.data + other.data, (self, other), '+')
        def _backward():
            # Accumulate (+=) rather than assign: a tensor that feeds several
            # operations, or both operands of one, must sum every contribution.
            self.gradient += 1 * output.gradient
            other.gradient += 1 * output.gradient
        output._backward = _backward
        return output

    def __radd__(self, other):  # other + self, for plain numbers on the left
        return self + other

    def __mul__(self, other):  # self * other
        other = Tensor._wrap(other)
        output = Tensor(self.data * other.data, (self, other), '*')
        def _backward():
            self.gradient += other.data * output.gradient
            other.gradient += self.data * output.gradient
        output._backward = _backward
        return output

    def __rmul__(self, other):  # other * self, for plain numbers on the left
        return self * other

    def tanh(self):  # tanh(self)
        output = Tensor(math.tanh(self.data), (self,), 'tanh')
        def _backward():
            # d/dx tanh(x) = 1 - tanh(x)^2; output.data already holds tanh(x).
            self.gradient += (1.0 - output.data ** 2) * output.gradient
        output._backward = _backward
        return output

    def __pow__(self, power):  # self ** power
        assert isinstance(power, (int, float)), "Power must be an int or a float"
        output = Tensor(self.data ** power, (self,), f'**{power}')
        def _backward():
            self.gradient += power * (self.data ** (power - 1)) * output.gradient
        output._backward = _backward
        return output

    def backward(self):
        """
        Compute the derivative of this tensor with respect to every tensor it
        was built from and store it in each tensor's `gradient`.

        Gradients of all tensors in the graph are reset first, so calling
        `backward()` again gives the same values instead of adding to the
        previous ones.
        """
        # Post-order depth-first traversal with an explicit stack, so a long
        # chain of operations cannot hit Python's recursion limit.
        topo = []
        visited = set()
        stack = [(self, False)]
        while stack:
            node, children_done = stack.pop()
            if children_done:
                topo.append(node)
                continue
            if node in visited:
                continue
            visited.add(node)
            stack.append((node, True))  # revisit once all operands are emitted
            for child in node._prev:
                if child not in visited:
                    stack.append((child, False))

        for node in topo:
            node.gradient = 0
        self.gradient = 1
        # Reverse topological order: a tensor's gradient is complete before it
        # is pushed further back to its operands.
        for node in reversed(topo):
            node._backward()

    def __neg__(self): # -self
        return self * Tensor(-1.0)

    def __sub__(self, other): # self - other
        return self + (-other)

    def __rsub__(self, other):  # other - self, for plain numbers on the left
        return Tensor._wrap(other) - self

In [None]:
################################################################################
# TODO:                                                                        #
################################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****