In [1]:
%matplotlib inline
import torch
import numpy as np
import matplotlib.pyplot as plt
from dataclasses import dataclass
from torch import nn
import torch.nn.functional as F

In [2]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-07-08 00:59:26--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: 'input.txt'


2024-07-08 00:59:26 (31.0 MB/s) - 'input.txt' saved [1115394/1115394]



In [3]:
# For the bigram model, let's use the first 1000 characters for the data

with open('input.txt', 'r') as f:
    text = f.read()
text = text[:5000]

In [4]:
text

"First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you know Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us kill him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be done: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citizens, the patricians good.\nWhat authority surfeits on would relieve us: if they\nwould yield us but the superfluity, while it were\nwholesome, we might guess they relieved us humanely;\nbut they think we are too dear: the leanness that\nafflicts us, the object of our misery, is as an\ninventory to particularise their abundance; our\nsufferance is a gain to them Let us revenge this with\nour pikes, ere we become rakes: for the gods know I\nspeak this in hunger 

In [5]:
# Get unique characters from the text
# To have unique values, we can use the set data structure
# Reference https://stackoverflow.com/questions/13902805/list-of-all-unique-characters-in-a-string
chars = list(set(text))

In [6]:
len(chars)

53

In [7]:

def encode(string: str) -> list[int]:
    """
    Given a string, encode returns a list of integers that represent the characters
    in the string.
    """
    encodedChars = []

    for s in string:
      encodedChars.append(chars.index(s))
    return encodedChars

def decode(ids: list[int]) -> str:
    """
    Given a list of integers, decode returns the characters in the list as a string.
    """
    decodedChars = [];
    for id in ids:
      decodedChars.append(chars[id])
    return "".join(decodedChars)

# Testing the encode and decode functions

encoded = encode('hello')
decoded = decode(encoded)
print(encoded)
print(decoded)

[47, 33, 1, 1, 36]
hello


## Generative Pretrained Transformer

For this part, it is best to use a gpu. In the settings at the top go to Runtime -> Change Runtime Type and select T4 GPU

In [9]:
# Parameters
vocab_size = len(chars)  # Adjust based on your dataset
n_embd = 768        # Embedding dimension
n_head = 1         # Number of attention heads
n_block = 4         # Number of Transformer blocks
block_size = 128    # Length of the sequence window
batch_size = 64     # Number of sequences per batch
max_iters = 5000    # Total iterations for training
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [10]:
print(device)

cuda


In [11]:
# run nvidia-smi to check gpu usage
!nvidia-smi

Mon Jul  8 01:00:41 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   43C    P8             10W /   70W |       3MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                       Off |   00

In [12]:
# For the gpt model, let's use the full text

with open('input.txt', 'r') as f:
    text = f.read()

Implement a character level tokenization function.

1. Create a list of unique characters in the string. (1 points)
2. Implement a function `encode(s: str) -> list[int]` that takes a string and returns a list of ids (1 point)
3. Implement a function `decode(ids: list[int]) -> str` that takes a list of ids (ints) and returns a string (1 point)


In [14]:
chars = []

# List of unique characters in the string
for char in text:
    if char not in chars:
        chars.append(char)

def encode(s: str) -> list[int]:
    """
    Takes in a list of characters and returns a list of ids (ints)
    """
    return [chars.index(char) for char in s]

def decode(ids: list[int]) -> str:
    """
    Takes in a list of ids (ints) and returns a string
    """
    return ''.join([chars[id] for id in ids])

In [15]:
print(f"Length of characters : {len(chars)}")
print(f"First 10 characters : {chars[:10]}")
print(f"Encoded text : {encode('hello')}")
print(f"Decoded text : {decode(encode('hello'))}")

Length of characters : 65
First 10 characters : ['F', 'i', 'r', 's', 't', ' ', 'C', 'z', 'e', 'n']
Encoded text : [22, 8, 28, 28, 14]
Decoded text : hello


> cuda(device=None, non_blocking=False, memory_format=torch.preserve_format) -> Tensor
Returns a copy of this object in CUDA memory.
If this object is already in CUDA memory and on the correct device,
then no copy is performed and the original object is returned.

In [17]:
# Converting the input data to a tensor
encoded_text = encode(text)
print(f"First 10 chars '{text[0:10]}'")
print(f"encoded text first 10 chars {encoded_text[0:10]}")
print(f"Total character count {len(text)}")

First 10 chars 'First Citi'
encoded text first 10 chars [0, 1, 2, 3, 4, 5, 6, 1, 4, 1]
Total character count 1115394


In [19]:
data = torch.tensor(encode(text), dtype=torch.long).to(device)

In [20]:
data.shape

torch.Size([1115394])

In [21]:
# We can think of this as a window of characters that we use as the prefix to predict the next character
data[:block_size+1] # first 17 entities in the tensor ( characters )

tensor([ 0,  1,  2,  3,  4,  5,  6,  1,  4,  1,  7,  8,  9, 10, 11, 12,  8, 13,
        14,  2,  8,  5, 15,  8,  5, 16,  2, 14, 17,  8,  8, 18,  5, 19,  9, 20,
         5, 13, 21,  2,  4, 22,  8,  2, 23,  5, 22,  8, 19,  2,  5, 24,  8,  5,
         3, 16,  8, 19, 25, 26, 11, 11, 27, 28, 28, 10, 11, 29, 16,  8, 19, 25,
        23,  5,  3, 16,  8, 19, 25, 26, 11, 11,  0,  1,  2,  3,  4,  5,  6,  1,
         4,  1,  7,  8,  9, 10, 11, 30, 14, 21,  5, 19,  2,  8,  5, 19, 28, 28,
         5,  2,  8,  3, 14, 28, 31,  8, 18,  5,  2, 19,  4, 22,  8,  2,  5,  4,
        14,  5, 18], device='cuda:0')

To train a transformer, we feed the model `n` tokens (context) and try to predict the `n+1`th token (target) in the sequence.



In [23]:
# Here letss just pick the first block of size `block_size` and try to
# visuazlie how the transformer learns to predict the next character
# We design the system to only learn from tokens that are before the token that has to be predicted
temp_blick_size = 16
x = data[:temp_blick_size]
y = data[1:temp_blick_size+1]
for t in range(temp_blick_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target is: {target}")

when input is tensor([0], device='cuda:0') the target is: 1
when input is tensor([0, 1], device='cuda:0') the target is: 2
when input is tensor([0, 1, 2], device='cuda:0') the target is: 3
when input is tensor([0, 1, 2, 3], device='cuda:0') the target is: 4
when input is tensor([0, 1, 2, 3, 4], device='cuda:0') the target is: 5
when input is tensor([0, 1, 2, 3, 4, 5], device='cuda:0') the target is: 6
when input is tensor([0, 1, 2, 3, 4, 5, 6], device='cuda:0') the target is: 1
when input is tensor([0, 1, 2, 3, 4, 5, 6, 1], device='cuda:0') the target is: 4
when input is tensor([0, 1, 2, 3, 4, 5, 6, 1, 4], device='cuda:0') the target is: 1
when input is tensor([0, 1, 2, 3, 4, 5, 6, 1, 4, 1], device='cuda:0') the target is: 7
when input is tensor([0, 1, 2, 3, 4, 5, 6, 1, 4, 1, 7], device='cuda:0') the target is: 8
when input is tensor([0, 1, 2, 3, 4, 5, 6, 1, 4, 1, 7, 8], device='cuda:0') the target is: 9
when input is tensor([0, 1, 2, 3, 4, 5, 6, 1, 4, 1, 7, 8, 9], device='cuda:0') the

Revisiting some basics:

Terms:
- Block Size: The number of characters that the system has been trained to take into consideration while learning to predict the next character

In [25]:
def get_batch(data, block_size, batch_size):
    """
    This function is responsible for creating a batch of batch_size
    For training a GPT model

    """
    # Here we generate a tensor `ix` containing `batch_size` random
    # indices within the range `0` to `len(data) - block_size`
    # we substract `block_size` from the end so that the last
    # selected block stays within the list of available characters
    # in the text
    ix = torch.randint(len(data) - block_size, (batch_size,))

    # Here we create a stack of tensors (batch size) each of length
    # block_size that start from the
    # above picked random indices
    x = torch.stack([data[i:i+block_size] for i in ix])

    # creates the target tensor y similarly, but shifted
    # one position to the right, representing the next
    # character to predict for each position in x.
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y


### Clarifying some more terms before we proceed to the next step:

### Block Size
- **Block Size** in a GPT model training code refers to the length of each segment of the input data that each training example consists of. This is directly analogous to what you might think of as "sequence length" in other contexts but is specifically termed "block size" in training scenarios for models like GPT.
- In the context of the Transformer attention head depicted in the image below, each "sequence" or input processed through the attention head would be of a fixed length equivalent to the block size. In the image, while not specified as "block size," the dimension that would correspond to this term is the middle dimension of the tensors, which is 32 in your example (`(8, 32, 64)`).

### Sequence Length
- **Sequence Length** in more general contexts refers to the total length of sequences being processed, which may vary unless specifically pre-processed to be uniform. In models like Transformers (and as seen in the attention head diagram), sequence length is typically fixed to a specific size for each training or inference pass. This fixed length is crucial for attention calculations across the entire sequence uniformly.
- In the Transformer attention head diagram, this "sequence length" is manifest in each stage of the attention mechanism. It represents the number of positions (or tokens) in each sequence that the model processes simultaneously, marked as the second dimension in the tensors.

### How It Relates to the Diagram
- In the attention head diagram present below, all tensors maintain a consistent second dimension (32 in this case), reflecting the fixed sequence length or block size used for the calculations. This consistent dimensionality across layers and operations ensures that each token in the sequence can be related to every other token via the attention mechanism, a key feature enabling the model to capture complex dependencies across the input.
- The operations like matrix multiplication (`matmul`) between the transposed keys and queries, and subsequent operations like softmax and dropout, all depend on this fixed sequence length to compute the attention scores and ultimately the output sequence. This fixed length, as used in your GPT model training, allows the Transformer to utilize positional relationships effectively.

### Summary
In summary, in the Transformer model, as depicted by the attention head image below, block size and sequence length can be considered equivalent, referring to the fixed size of the input sequences used for training and inference. This term varies in usage depending on the model architecture but is crucial for models like Transformers that depend on a fixed dimension to compute relationships between all pairs of inputs within a sequence effectively.

1. **Batch Size**: This is the number of samples processed before the model is updated. For exaple if we are are dealing with (8, 32, 64), the first dimension "8" typically represents the batch size. This means that the model processes 8 samples at a time.

2. **Sequence Length**: This is the length of the input sequences each sample in a batch contains. In the example tensor, "32" represents the sequence length / block size, indicating each sample consists of 32 sequential elements or **tokens**. ***For instance, in natural language processing, this could represent 32 words in a sentence. In this example since we will be predicting characters, each token represents a character from the list of unique characters available.***

3. **Feature Dimension**: This indicates the number of features each element of the sequence holds. The "64" in the example tensor suggests that each token or element of the sequence is represented by a vector of 64 features. These could be embeddings that encapsulate the token's meaning in a dense vector.

4. **Block Size**: This term isn't explicitly shown in the diagram but is related to how data is structured or processed in blocks during certain operations. For instance, in memory management or in GPU computation, operations might be optimized by processing data in "blocks." In the context of transformers or deep learning, block size might refer to the dimensionality of sub-parts of the model such as in splitting matrices for parallel processing, but typically it's not a term used to describe tensor dimensions directly.

Thus, in the below shown transformer model diagram:
- Batch size: 8 (number of samples processed together)
- Sequence length / block size: 32 (number of tokens, items, or steps per sample)
- Feature dimension: 64 (features per token or step in each sequence)

### Single Self Attention Head (5 points)
![](https://i.ibb.co/GWR1XG0/head.png)

### Explaining the above artention head set up

This diagram depicts the computational flow in a typical attention head of a Transformer neural network, commonly used in models like GPT and BERT. Here’s a breakdown of the operations and their significance:

1. **Input Tensor**: The input tensor has the shape (8, 32, 64), where 8 represents the batch size, 32 the sequence length / block size, and 64 the feature dimension of each token in the sequence.

2. **Linear Layers**: Three parallel linear transformations are applied to the input tensor. Each layer outputs a tensor of shape (8, 32, 16). These transformations typically generate the queries (Q), keys (K), and values (V) which are used in the attention mechanism.

3. **Transpose Operation**: The output of one of the linear layers (presumably representing keys, K) is transposed to change its shape from (8, 32, 16) to (8, 16, 32). This operation is necessary for matrix multiplication with the queries (Q). Here we notice that the transpose is only done across the second and third dimensions as the first dimension only represents the batch size

4. **Matrix Multiplication (matmul)**: The output of the transpose operation (K transposed) is matrix-multiplied with another linear output (Q). ***This results in a shape of (8, 32, 32), representing the raw attention scores before they are normalized.***

5. **Multiplication (mul)**: This operation might be an element-wise multiplication used as part of scaling the attention scores by the square root of the dimension of the keys to stabilize gradients during training, although the typical square root scaling is not explicitly shown here. More on why we scale the attention values below ...

6. **Masked Fill**: This operation is used to apply masks to the attention scores. Masks are often used to ignore (or mask) padding tokens or future tokens during training in sequence models. The operation doesn't change the shape of the tensor. The upper right half of the matrix is made as zeros in this step.

7. **Softmax**: The softmax function is applied across the last dimension (32) to normalize the attention scores to a probability distribution.

8. **Dropout**: Dropout is a regularization technique where random elements of the tensor are zeroed out during training to prevent overfitting. The shape remains unchanged.

9. **Matrix Multiplication (matmul)**: The normalized and possibly masked attention scores are then matrix-multiplied with the third linear output (V, values), resulting in an output tensor of shape (8, 32, 16). This operation computes the weighted sum of the values based on the attention scores.

10. **Output Tensor**: The final output tensor is generated with the shape (8, 32, 16), likely to be fed into subsequent layers of the Transformer or processed further depending on the specific architecture and task.

This detailed flow illustrates how attention mechanisms selectively focus on different parts of the input sequence, weighting input features by relevance, which is central to the success of Transformer models in handling various sequence-based tasks in natural language processing.

### The scaling of the attention scores

The scaling of the attention scores based on the dimension of the keys in the Transformer architecture addresses a specific challenge in training deep learning models that use softmax to calculate probabilities.

### Background on Dot Products and Their Scale
The attention mechanism computes the dot products between the query and all keys in the sequence. These dot products are a critical component because they determine the attention scores that indicate how much each part of the input should contribute to the output at each position. However, the magnitude of the dot products depends on the dimensionality of the keys and queries. Here's why:

- The dot product of two vectors increases with the number of dimensions. Specifically, if each component of the vectors is drawn from a distribution with a constant variance, the variance of the dot product is proportional to the dimensionality of the vectors.
- As the dimension of the keys (and queries, since they are usually of the same dimension) increases, the average value of the dot products becomes larger. This can lead to extremely large values, especially when working with high-dimensional data, which is common in models like Transformers.

### Impact on Softmax
The softmax function, which is used to convert these dot products into probabilities (or attention scores), is highly sensitive to changes in input values. Specifically:
- Large values in the softmax input can lead to a situation where the softmax function's output is close to zero for all inputs except the largest one (a phenomenon often referred to as the softmax function "saturating"). This saturation can significantly slow down learning, as it leads to very small gradients during backpropagation — essentially, the network is less able to learn from the input data.

### Why Scale by Square Root of Dimension?
The scaling factor used, \(\sqrt{d_k}\) (where \(d_k\) is the dimension of the keys), helps mitigate these effects:
- **Normalization**: By dividing the dot products by \(\sqrt{d_k}\), you effectively normalize them, bringing their variance back to a more manageable scale. This normalization helps maintain a more uniform scale across different model sizes and configurations.
- **Gradient Stability**: By keeping the dot products (and thus the inputs to the softmax) at a reasonable magnitude, the scaling prevents gradients under the softmax from becoming too small. This is crucial for efficient learning, as it ensures that each update step during training is informative enough to guide the model towards better performance without being too noisy or too minimal.

### Conclusion
Scaling by the square root of the dimension of the keys is a practical approach to ensuring that the attention mechanism operates effectively across different settings and model scales, facilitating stable and efficient training. This method is particularly vital in deep learning architectures like Transformers, where models often deal with high-dimensional data and require careful handling of numerical stability during training operations.

### Terms relevant to constructing the SelfAttention Head:

In the context of Transformer architectures, the **head size** in an attention head refers to the dimension of the vectors used for each of the queries (Q), keys (K), and values (V) within a single attention head. This is a key parameter that defines how much information each attention head can capture.

### Definition and Calculation

- **Head Size**: The head size is essentially the dimensionality of the Q, K, and V vectors within each specific attention head. It is typically derived by dividing the total dimension of the model's embeddings (\(d_{\text{model}}\)) by the number of attention heads (\(\text{num\_heads}\)). This allows the model to distribute the embedding information across multiple heads, each focusing on different features or relationships in the data.

### Formula
The head size for queries and keys (\(d_k\) and \(d_q\)) is often the same and can be calculated as:
\[ d_k = d_q = \frac{d_{\text{model}}}{\text{num\_heads}} \]
For values (\(d_v\)), it is usually the same as \(d_k\) and \(d_q\), though this can vary depending on specific model architectures or design choices:
\[ d_v = \frac{d_{\text{model}}}{\text{num\_heads}} \]

### Example
If a Transformer model uses an embedding dimension (\(d_{\text{model}}\)) of 512 and has 8 attention heads:
\[ d_k = d_q = d_v = \frac{512}{8} = 64 \]
Thus, each head processes vectors of size 64 for queries, keys, and values.

### Importance
The choice of head size affects how finely the model can focus on different aspects of the input data. Each head can potentially learn to attend to different parts of the sequence or different types of relationships:
- **Smaller head sizes** can lead to a more focused and granular attention mechanism, where each head might specialize more distinctly.
- **Larger head sizes** provide more capacity to each head, which can be useful for capturing more complex patterns or dependencies, but may reduce the diversity of what different heads can learn.

Adjusting the head size is a balance between computational efficiency, capacity, and the diversity of information that the attention heads can capture. It's an important aspect of model tuning, especially in tasks requiring nuanced understanding of context or relationships within the data.

### How does this translate to code

Following this example: If a Transformer model uses an embedding dimension (\(d_{\text{model}}\)) of 512 and has 8 attention heads:
\[ d_k = d_q = d_v = \frac{512}{8} = 64 \]
Thus, each head processes vectors of size 64 for queries, keys, and values.

Here each layer q, k and v, instead of processing all the tokens that are a part of the embedding, will only process tokens that are passed into this head.

Hence the q, k and v linear layers will be of shape batch_size x head_size


### The Mask

The code snippet provided below involves creating a triangular mask and then applying it to an attention matrix in a Transformer model, typically used in tasks like text processing or sequence modeling. Let’s break down the two lines to understand what’s happening:

### Line 1: Creating the Mask
```python
mask = torch.tril(torch.ones(timesteps, timesteps))
```
- **`torch.ones(timesteps, timesteps)`**: This function creates a 2D tensor (square matrix) filled with the value `1`, where the dimensions of the matrix are both `timesteps`. `timesteps` could be the length of a sequence being processed, such as the number of words in a sentence.
- **`torch.tril()`**: This function takes a tensor and returns a lower triangular part of the matrix. It zeroes out all elements above the main diagonal. The main diagonal and the elements below remain as they were, which in this case, are all `1`s due to the `torch.ones()` function. This triangular matrix is typically used in attention mechanisms to ensure that the attention calculation for a given timestep only considers that timestep and the ones before it (i.e., ensuring causality in models like GPT).

### Line 2: Applying the Mask to the Attention Matrix
```python
masked_attention = attention.masked_fill(mask == 0, float('-inf'))
```
- **`mask == 0`**: This operation compares each element of the `mask` tensor to `0`. Since `mask` is a lower triangular matrix with `1`s in the lower triangle and `0`s elsewhere, this operation generates a Boolean tensor where `True` corresponds to the positions where the mask had `0`s (i.e., the upper triangular part of the matrix) and `False` everywhere else (i.e., the lower triangular part).
- **`masked_fill()`**: This method is called on the `attention` tensor. It takes two arguments: a mask and a value to fill. The mask here is the Boolean tensor from `mask == 0`. Wherever the mask is `True`, the `attention` tensor is filled with `float('-inf')`. This effectively applies the mask by setting the attention scores in the upper triangle (those that should not be considered due to causality) to negative infinity.

### Why `float('-inf')`?
In attention mechanisms, especially when followed by a softmax operation, setting values to negative infinity before softmax ensures that those values have zero probability. When softmax is applied to a vector containing negative infinity, the exponential of negative infinity is zero, hence those positions do not contribute to the output of the softmax.

### Summary
- The `mask` tensor is used to enforce causality in the attention mechanism by preventing the model from attending to future timesteps in the sequence. This is essential in models like GPT where predictions for a given position should only depend on previous positions.
- The `mask == 0` operation identifies positions that should be ignored (in this context, future timesteps), and `masked_fill` applies this by setting such positions in the attention matrix to negative infinity, effectively removing them from consideration during attention normalization (softmax).

In [26]:
class SelfAttentionHead(nn.Module):
  """
  This class implements a single self attention head
  For the input dimensions we have batch size , sequence length , feature dimension

  First we need to implement the K, Q and V layers
  These are three linear layers - we can refer to
  https://pytorch.org/docs/stable/generated/torch.nn.Linear.html#torch.nn.Linear

  For a
  """
  def __init__(self, num_heads, n_embed):
    super().__init__()
    self.k = nn.Linear(n_embed, n_embed, bias=False)
    self.q = nn.Linear(n_embed, n_embed, bias=False)
    self.v = nn.Linear(n_embed, n_embed, bias=False)

  def forward(self, x):
    """
    The forward step contains the following steps
    1. Pass x through the linear layers
    2. Perform transpose operation on k
    3. Calculate the bidirectional attention value q.k
    4. Scaling the attention value
    5. Masked fil
    6. Softmax
    7. Dropout
    8. Matrix multiplication with v

    Batch, Tokens, Chanels
    """
    # print(x.shape)
    B, T, C = x.shape

    # print shape of k
    # print(f"K shape {self.k.weight.shape}")
    # print shape of q
    k = self.k(x)
    q = self.q(x)
    v = self.v(x)

    # print(k.shape)
    # print(q.shape)
    # print(v.shape)

    # Transpose the dimensions
    k = k.transpose(1, 2) # Here we transpose dimensions 1 and 2
    # k is now of dimension B, C, T

    # print(f"kShape after transpose {k.shape}")
    # Here we calculate the bidirectional attention value q.k
    attention = torch.matmul(q, k) # B, T, C * B, C, T = B, T, T

    # print(f"Attention shape {attention.shape}")
    # Scaling the attention value by the sq root of the chanels / features
    attention = attention * C**-0.5 # this will be of dimension B, T, T


    # print(f"Attention shape {attention.shape}")
    # Masked fill
    mask = torch.tril(torch.ones(T, T)).to(device) # We create a mask of dimensions C, C
    # print shape of mask
    # print(f"Mask shape {mask.shape}")

    # We apply the mask to the attention matrix
    # float('-inf') is applied to all positions where the mask value is 0
    masked_attention = attention.masked_fill(mask == 0, float('-inf'))

    # print(f"Masked attention shape {masked_attention.shape}")

    # Softmax
    # the dimension value has to be set to -1 to indicate the last dimension
    attention = F.softmax(masked_attention, dim=-1)

    # print(f"Softmax attention shape {attention.shape}")

    # Dropout
    attention = F.dropout(attention, p=0.1)

    # print(f"Dropout attention shape {attention.shape}")

    # Matrix multiplication with v
    output = torch.matmul(attention, v)
    # print(f"Attention shape {attention.shape}")
    # print(f"V shape {v.shape}")
    # print(f"Output shape {output.shape}")

    return output

### Multihead Self Attention (5 points)

`constructor`

- Create 4 `SelfAttentionHead` instances. Consider using `nn.ModuleList`
- Create a linear layer with n_embd input dim and n_embd output dim

`forward`

In the forward implementation, pass `x` through each head, then concatenate all the outputs along the feature dimension, then pass the concatenated output through the linear layer

![](https://i.ibb.co/y5SwyZZ/multihead.png)

The image and the accompanying task outline depict the implementation steps for a Multihead Self-Attention mechanism, which is a core component of models like the Transformer. Here’s a breakdown of each step in the process:

### Constructor
1. **SelfAttentionHead Instances**:
   - **Purpose**: Each SelfAttentionHead is responsible for capturing different aspects of the input data through separate attention mechanisms. This diversification allows the model to attend to different features or parts of the input simultaneously.
   - **Implementation**: Using `nn.ModuleList` to create and manage these instances is efficient because it automatically handles the forward pass for each module within a list, facilitating easier batch processing and gradient updates.
   - **Configuration**: You are instructed to create four such heads, which means the input will be divided and processed by these four heads independently in parallel.

2. **Linear Layer**:
   - **Purpose**: After processing the input through multiple attention heads, the outputs are concatenated. This linear layer then projects the concatenated output back to the desired embedding dimension (`n_embd`). It integrates information from all the attention heads.
   - **Specification**: The linear layer should have an input dimension and output dimension equal to `n_embd`, which matches the input's embedding size to ensure dimensional consistency across the network.

### Forward Method
1. **Pass Input Through Each Head**:
   - **Process**: The input tensor `x` is passed through each of the four SelfAttentionHead instances. Since each head may focus on different features or relations within the data, they may produce varying outputs.
   - **Output Dimension**: Assuming each head preserves the dimensionality of its output to be smaller or equal to the input dimension divided by the number of heads (for simplicity, let's assume it's divided equally), each head would output a tensor with a shape of `[batch_size, seq_length, n_embd/4]`.

2. **Concatenate Outputs**:
   - **Function**: The outputs from all heads are concatenated along the feature dimension (last dimension). This step combines the different "views" or "attentions" the heads have calculated into a single tensor.
   - **Resulting Dimension**: After concatenation, the dimension of the output tensor would be `[batch_size, seq_length, n_embd]` because each head's output is concatenated to form the full embedding dimension.

3. **Pass Through Linear Layer**:
   - **Purpose**: The concatenated tensor is then passed through the linear layer created in the constructor. This layer acts as a transformation that can mix information across the different attention mechanisms, potentially aiding in better information integration.
   - **Output**: The output of this linear layer is a tensor of the same shape as the input to the layer, `[batch_size, seq_length, n_embd]`, thus ensuring the output matches the input dimension of the sequence in terms of embedding size.

4. **Dropout Application**:
   - **Consideration**: Although not detailed in your task outline, typically, a dropout layer would follow the linear layer to prevent overfitting by randomly setting a fraction of input units to 0 at each update during training. This step helps in making the model more robust.

### Summary
In summary, the multihead self-attention mechanism processes the input through multiple attention heads, each potentially focusing on different features. The outputs of these heads are then integrated via concatenation and further transformed by a linear layer to ensure that the model can leverage information across various heads effectively. This architecture is crucial for complex sequence modeling tasks where different parts of the input sequence carry different types of information relevant to the task.

In [27]:
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size, n_embed):
        super().__init__()
        # print(f"Num heads {num_heads}")
        # print(f"Head size {head_size}")
        # print(f"n_embed {n_embed}")
        # Self attention head instances
        self.heads = nn.ModuleList([SelfAttentionHead(num_heads, n_embed) for _ in range(num_heads)])
        # Linear layer
        # The linear layer should have an input dimension and output dimension
        # equal to n_embd, which matches the input's embedding size to ensure
        # dimensional consistency across the network.
        self.proj = nn.Linear(n_embed, n_embed)

    def forward(self, x):
        """
        The forward step consists of the following steps
        1. Pass x through each head
        2. Concatenate all the outputs along the feature dimension
        3. Pass the concatenated output through the linear layer
        """
        # print(f"Inside multi head attention")
        # Pass x through each head
        heads = [head(x) for head in self.heads] # (B, t, C)[]

        # print output dimension of each head
        # print(f"Head output dimension {heads[0].shape}")

        # print(f"After passing through heads")
        # Concatenate all the outputs along the feature dimension
        x = torch.cat(heads, dim=-1) # B, T, C
        # print shape of x
        # print(f"X shape {x.shape}")

        # print shape of projection
        # print(f"Projection shape {self.proj.weight.shape}")
        # print(f"After concatenating")
        # Pass the concatenated output through the linear layer
        x = self.proj(x) # B, T, C

        # print(f"After passing through linear projection layer")

        return x




## MLP (2 points)
Implement a 2 layer MLP


![](https://i.ibb.co/C0DtrF5/ff.png)

The diagram provided illustrates a sequence of operations that form a typical Feedforward Neural Network (FFN) module, commonly used within the architecture of a Transformer model. This specific sequence of operations represents the "position-wise feedforward network" component of the Transformer's architecture. Here’s a breakdown of each step and its function:

### 1. **Linear Layer**
- **Input**: \( (8, 32, 64) \)
- **Output**: \( (8, 32, 256) \)
- **Description**: This layer is a fully connected neural network layer that projects each input feature from a 64-dimensional space to a 256-dimensional space. The input tensor's first dimension typically represents the batch size (8), the second dimension the sequence length (32), and the third dimension the feature size (64). The expansion in feature size allows the network to learn more complex features at each position of the sequence.

### 2. **ReLU Activation**
- **Input/Output**: \( (8, 32, 256) \)
- **Description**: The ReLU (Rectified Linear Unit) activation function is applied element-wise. It introduces non-linearity into the model, which is essential for learning complex patterns. ReLU is defined as \( f(x) = max(0, x) \), effectively setting all negative values to zero and keeping positive values unchanged. This operation does not alter the dimensions of the data.

### 3. **Second Linear Layer**
- **Input**: \( (8, 32, 256) \)
- **Output**: \( (8, 32, 64) \)
- **Description**: Another fully connected layer that projects the features back from the 256-dimensional space to the original 64-dimensional space. This step is crucial for matching the dimensions of the output with other components in a Transformer model, such as the self-attention outputs, allowing for subsequent operations like residual connections.

### 4. **Dropout**
- **Input/Output**: \( (8, 32, 64) \)
- **Description**: Dropout is a regularization technique used to prevent overfitting in neural networks. It randomly sets a fraction of the input units to zero during training at each update step, which helps to make the model robust and less likely to rely on any small set of neurons. The dropout rate (the probability of setting a value to zero) is a hyperparameter that can be tuned. The operation does not change the dimensions of the tensor.

### 5. **Output Tensor**
- **Dimension**: \( (8, 32, 64) \)
- **Description**: The final output tensor retains the dimensions of the input tensor to the entire module. This output is typically fed back into the main flow of the Transformer model, often added to the input tensor through a residual connection before being normalized and passed on to the next layer or operation in the model.

### Summary
This feedforward network within a Transformer model performs crucial transformation and re-projection of features, enhancing the model's ability to capture and manipulate information. The use of non-linearity (ReLU) and regularization (Dropout) helps in learning non-trivial patterns and generalizing better to unseen data. Each component is dimensionally coordinated to maintain consistency throughout the model, allowing for seamless integration with other modules such as multi-head attention.

In [28]:
class MLP(nn.Module):
    def __init__(self, embed_dim, scale_up_factor):
        super().__init__()
        self.fc1 = nn.Linear(embed_dim, embed_dim * scale_up_factor, bias=False)
        self.fc2 = nn.Linear(embed_dim * scale_up_factor, embed_dim, bias=False)

    def forward(self, x: torch.tensor) -> torch.tensor:
        """
        The forward step of the MLP has the following steps
        1. Pass x through the first linear layer
        2. Apply ReLU activation
        3. Pass the output through the second linear layer
        4. Apply dropout
        """
        x = F.relu(self.fc1(x)) # B, T, C*scale_up
        x = self.fc2(x) # B, T, C
        x = F.dropout(x, p=0.1) # B, T, C
        return x

## Transformer block (20 points)

Layer normalization help training stability by normalizing the outputs of neurons within a single layer across all features for each individual data point, not across a full batch or a specific feature.

Dropout is a form of regularization to prevent overfitting.

This is the diagram of a transformer block:

![](https://i.ibb.co/X85C473/block.png)

### The transformed block
The diagram provided outlines the structure of a typical Transformer block, which is a fundamental component of Transformer models widely used in natural language processing tasks. Here’s a breakdown of each step and component shown in the diagram:

### 1. **Input Tensor**
- **Shape**: (8, 32, 64)
- **Description**: This tensor represents the input to the Transformer block, where 8 could be the batch size, 32 the sequence length, and 64 the dimensionality of each vector in the sequence.

### 2. **Layer Normalization**
- **Input/Output Shape**: (8, 32, 64)
- **Depth**: 1
- **Description**: The first operation is layer normalization. Layer normalization is applied within each sample independently and normalizes the data across the feature dimension (across the 64 features of each of the sequence's 32 positions). This helps in stabilizing the learning process by normalizing the inputs to have zero mean and unit variance, which can improve training speed and stability.

### 3. **Multi-Head Attention**
- **Input/Output Shape**: (8, 32, 64)
- **Depth**: 1
- **Description**: This component uses multiple sets of attention mechanisms (heads) to process the input. Each head can attend to different parts of the input sequence, allowing the model to capture various aspects of the sequence in parallel. The output of this module is typically the same size as the input, allowing for residual connections.

### 4. **Addition (Residual Connection)**
- **Input**: 2 x (8, 32, 64)
- **Output Shape**: (8, 32, 64)
- **Depth**: 1
- **Description**: The output from the multi-head attention is added to the original input (prior to layer normalization). This is known as a residual connection and helps in mitigating the vanishing gradient problem by allowing gradients to flow directly through the network.

### 5. **Second Layer Normalization**
- **Input/Output Shape**: (8, 32, 64)
- **Depth**: 1
- **Description**: Another layer normalization step is applied after the residual connection and before the feedforward network. This normalizes the data again, preparing it for further processing and maintaining stability in deeper layers.

### 6. **FeedForward Network**
- **Input/Output Shape**: (8, 32, 64)
- **Depth**: 1
- **Description**: This is typically a position-wise feedforward neural network, which means it applies the same neural network to each position independently. It usually consists of two linear transformations with a nonlinear activation function in between. This network can expand (and later compress) the internal representation, allowing the model to mix features before passing them to the next layer.

### 7. **Addition (Second Residual Connection)**
- **Input**: 2 x (8, 32, 64)
- **Output Shape**: (8, 32, 64)
- **Depth**: 1
- **Description**: Similar to the earlier addition, this step adds the output of the feedforward network to the input of the feedforward network (the output from the previous layer normalization). This second residual connection further helps in preserving information throughout layers and aids in training deeper networks.

### 8. **Output Tensor**
- **Shape**: (8, 32, 64)
- **Depth**: 0
- **Description**: The final output tensor of the Transformer block maintains the same shape as the input tensor, ensuring that multiple such blocks can be stacked without dimension mismatch.

### Summary
This diagram presents a classic configuration of a Transformer block, which utilizes normalization, attention mechanisms, residual connections, and feedforward networks to process sequential data effectively. Each component plays a crucial role in ensuring the model learns effectively and generalizes well across different tasks and data distributions. The design of the Transformer block with layer normalization and dropout is specifically tailored to enhance training stability and prevent overfitting, making it highly effective for tasks requiring the handling of complex sequential data.

In [29]:
class Block(nn.Module):
    def __init__(self, n_embd: int, n_head: int, sequence_length: int):
        """
        Architecture of each transformer block
        1. Layer norm
        2. Multihead attention
        3. Layer norm
        4. Feedforward network
        """
        super().__init__()
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)
        self.attn = MultiHeadAttention(n_head, sequence_length//n_head, n_embd)
        self.mlp = MLP(n_embd, 4)

    def forward(self, x):
        """
        The forward pass of a transformer block consists of the following steps
        1. Layer norm
        2. Multihead attention
        3. Addition (residual connection)
        4. Layer norm
        5. Feedforward network
        6. Addition (second residual connection)
        """
        # print(f"First line of transformer block")
        # Adding layer normalization to each sample individually
        norm = self.ln1(x)

        # print(f"In transformer block")
        # print(f"After first layer norm")

        # Calculating the attention values
        attn = self.attn(norm)

        # print(f"After calculating attention")

        # Adding the residual connection
        x = x + attn

        # Adding layer normalization after the residual connection
        norm = self.ln2(x)
        # print(f"After second layer norm")
        # Calculating the feedforward network
        mlp = self.mlp(norm)
        # print(f"After feedforward network")
        # Adding the second residual connection
        x = x + mlp
        # print(f"After second residual connection")
        # Returning the output of the transformer block
        return x

## GPT

`constructor` (5 points)

1. create the token embedding table and the position embedding table
2. create variable `self.blocks` that is a series of 4 `Block`s. The data will pass through each block sequentially. Consider using `nn.Sequential`
3. create a layer norm layer
4. create a linear layer for predicting the next token

`forward(self, idx, targets=None)`. (5 points)

`forward` takes a batch of context ids as input of size (B, T) and returns the logits and the loss, if targets is not None. If targets is None, return the logits and None.
1. get the token by using the token embedding table created in the constructor
2. create the position embeddings
3. sum the token and position embeddings to get the model input
4. pass the model through the blocks, the layernorm layer, and the final linear layer
5. compute the loss

`generate(start_char, max_new_tokens, top_p, top_k, temperature) -> str` (5 points)
1. implement top p, top_k, and temperature for sampling



![](https://i.ibb.co/n8sbQ0V/Screenshot-2024-01-23-at-8-59-08-PM.png)

### Explannation of concepts

***Positional Encoding***

Positional embeddings are a crucial component in Transformer-based models like GPT, where the architecture lacks any inherent mechanism to track the sequence order of the input tokens. Let's delve into how positional embeddings work and clarify their function.

### Purpose of Positional Embeddings
Positional embeddings provide the model with information about the relative or absolute position of tokens in the sequence. Since models like GPT process all input tokens simultaneously without recurrence or convolution, they rely on these embeddings to incorporate the order of the sequence into their calculations. This positional information is essential for tasks that depend on the order of words or characters, such as language understanding and generation.

### How Positional Embeddings are Implemented
Here's a step-by-step explanation:

1. **Token Embeddings**:
   - `token = self.token_embedding(idx)` retrieves embeddings for each token in the batch based on their indices (`idx`). These embeddings capture the semantic meaning of each token.
   - Output shape: `[B, T, C]` where `B` is the batch size, `T` is the sequence length, and `C` is the embedding dimension.

2. **Generating Positional Indices**:
   - `position = torch.arange(0, idx.shape[1])` generates a tensor of positions from `0` to `T-1` (where `T` is the sequence length). This tensor is used to fetch positional embeddings.
   - This line creates a tensor that effectively acts as an index for each position in the sequence, where `idx.shape[1]` corresponds to the sequence length `T`.

3. **Positional Embeddings Lookup**:
   - `position = self.position_embedding(position)` uses the position indices to retrieve the positional embeddings from a defined embedding table (`self.position_embedding`). This table is usually initialized randomly and then learned during training.
   - It's important to note that there might be an error in this part of the code because the positional indices need to be expanded for all batches in the input. Typically, you would need to repeat or expand these indices to match the batch size `B`. Here's how you could do it:
     ```python
     position = self.position_embedding(position.unsqueeze(0).repeat(B, 1, 1))
     ```
   - After correction, `position` will have the same shape as `token`, i.e., `[B, T, C]`.

4. **Combining Token and Positional Embeddings**:
   - `x = token + position` sums the token embeddings and positional embeddings element-wise. The addition operation allows the model to consider both the semantic meaning of each token and its position in the sequence simultaneously.
   - The resulting tensor `x` is then used as the input for the subsequent layers of the model. This combined embedding retains the shape `[B, T, C]`.

### Summary
Positional embeddings are critical for providing the necessary context of token order to Transformer models, enabling them to effectively process sequences of data. The combination of token and positional embeddings gives the model a comprehensive understanding of both the meaning of tokens and their positional relationships within sequences.

In [30]:
class GPT(nn.Module):
    def __init__(self, n_embd, n_head, vocab_size, sequence_length):
        super().__init__()
        # Embedding tables
        self.token_embedding = nn.Embedding(vocab_size, n_embd)
        self.position_embedding = nn.Embedding(sequence_length, n_embd)

        # Transformer Blocks
        self.blocks = nn.Sequential(
            Block(n_embd, n_head, sequence_length),
            Block(n_embd, n_head, sequence_length),
            Block(n_embd, n_head, sequence_length),
            Block(n_embd, n_head, sequence_length)
        )

        # Layer Norm layer
        self.ln = nn.LayerNorm(n_embd)

        # Linear layer
        # This will output values for the possible n number of tokens
        self.linear = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """
        forward takes `idx`,  a batch of context ids as input of size (B, T)
        If targets is None, return the logits and None.
        Else returns the logits and the loss


        1. get the token by using the token embedding table created in the constructor
        2. create the position embeddings
        3. sum the token and position embeddings to get the model input
        4. pass the model through the blocks, the layernorm layer, and the final linear layer
        5. compute the loss
        """

        # Get the token by using the token embedding table
        token = self.token_embedding(idx) # B, T, C


        # Create the position indices
        # position = torch.arange(0, idx.size(1), device=idx.device)  # [T]

        # Expand position indices to match the batch size
        # and retrieve the corresponding positional embeddings
        # position = self.position_embedding(position)[None, :, :].repeat(idx.size(0), 1, 1)  # [B, T, C]

        # Improved way of calculating position embedding using torch's broadcasting function
        position = self.position_embedding(torch.arange(idx.size(1), device=idx.device)).expand(idx.size(0), -1, -1)

        # Sum the token and position embeddings to get the model input
        x = token + position
        # print(f"After encoding")
        # Pass the model through the blocks, the layernorm layer, and the final linear layer
        x = self.blocks(x)
        # print(f"After transformer blocks")
        x = self.ln(x)
        # print(f"After layer norm")

        logits = self.linear(x)
        # print(f"After linear layer")

        if targets is None:
            return logits, None

        # Compute the loss
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)

        return logits, loss

In [31]:
# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

def generate(model, start_char, max_new_tokens, top_p, top_k, temperature):
    """
    implement top p, top_k, and temperature for sampling
    Generate text starting from start_char with controlled sampling
    """

    start_token_index = encode([start_char])
    print("Encoded indices:", start_token_index)

    # Check if indices are within the expected range
    if any(idx >= len(chars) or idx < 0 for idx in start_token_index):
        raise ValueError("Encoded index out of bounds. Check your vocabulary and encoding function.")


    input_ids = torch.tensor([start_token_index], dtype=torch.long).to(model.token_embedding.weight.device)

    model.eval()  # Put the model in evaluation mode
    generated_text = [start_char]  # List to collect generated characters

    # print input ids
    # print(input_ids)

    for _ in range(max_new_tokens):
      # Extracts the logits form the last batch, last token
      # Here since we are working with characters, we only have one entry in a batch with one
      # element in the sequence
      # So we extract the logits generated for next character based on the input sequence
      output = model.forward(input_ids)[0]
      print(output.shape)
      logits = output[-1, -1, :]

      # Apply temperature
      logits = logits / temperature

      # Filter the logits with top k sampling
      top_k_values, top_k_indices = torch.topk(logits, top_k)
      logits = torch.zeros_like(logits).scatter_(0, top_k_indices, top_k_values)

      # Apply softmax to convert to probabilities
      probs = F.softmax(logits, dim=-1)

      # Filter the logits with top p sampling
      sorted_indices = torch.argsort(probs, descending=True)
      sorted_probs = torch.sort(probs, descending=True)[0]
      cumulative_probs = torch.cumsum(sorted_probs, dim=0)
      sorted_indices_to_remove = cumulative_probs > top_p
      # Shift the mask to the right to keep at least one token
      sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].clone()
      sorted_indices_to_remove[0] = 0

      probs[sorted_indices[sorted_indices_to_remove]] = 0

      # Renormalize the probabilities
      probs /= probs.sum()

      # Sample from the modified distribution
      next_token_id = torch.multinomial(probs, 1).item()

      # print(next_token_id)
      # print(decode([next_token_id]))

      # Add the sampled token to the input sequence
      # input_ids = torch.cat([input_ids, torch.tensor([next_token_id], dtype=torch.long).to(input_ids.device)], dim=-1)

      # Create a new tensor for next_token_id with an extra dimension
      # Using 'unsqueeze' to add a sequence length dimension
      next_token_id_tensor = torch.tensor([next_token_id], dtype=torch.long).to(input_ids.device).unsqueeze(0)

      # Concatenate along the sequence length dimension (dim=1)
      input_ids = torch.cat([input_ids, next_token_id_tensor], dim=-1)


      next_char = decode([next_token_id])
      generated_text.append(next_char)

    return ''.join(generated_text)

### Training loop (15 points)

implement training loop

In [32]:
model = GPT(n_embd, n_head, 65 ,block_size).to(device) # make you are running this on the GPU

In [33]:
from torch.utils.data import DataLoader
from torch.optim import Adam

# Setup the optimizer
optimizer = Adam(model.parameters(), lr=0.001)

# Define the loss function
loss_fn = nn.CrossEntropyLoss()

In [34]:
data = encode(text)  # Convert entire text to indices
data = torch.tensor(data, dtype=torch.long)  #

In [35]:
# Assume 'data' is already a long tensor of encoded text
# Randomly split data into training and validation sets
train_size = int(0.9 * len(data))
train_data, val_data = data[:train_size], data[train_size:]

In [36]:
train_data.shape, val_data.shape

(torch.Size([1003854]), torch.Size([111540]))

In [None]:
from torch.utils.tensorboard import SummaryWriter

# TensorBoard for monitoring
writer = SummaryWriter()

val_loss_list = []
train_loss_list = []

for iter in range(max_iters):
    model.train()
    x, y = get_batch(train_data, block_size, batch_size)
    # print(x.shape, y.shape)
    # Forward pass
    logits, _ = model(x)

    # print(logits.shape, y.shape)

    # Reshape logits to [batch_size * sequence_length, vocab_size]
    # and y to [batch_size * sequence_length]
    logits = logits.view(-1, logits.size(-1))  # logits.size(-1) is 65 here
    y = y.view(-1)

    # Compute loss
    loss = loss_fn(logits, y)

    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Logging
    if iter % 100 == 0:
        print(f"Iteration {iter}: Loss {loss.item()}")
        train_loss_list.append(loss.item())
        writer.add_scalar('Training Loss', loss.item(), iter)

        # Validation step
        with torch.no_grad():
            model.eval()
            x_val, y_val = get_batch(val_data, block_size, batch_size)
            logits_val, _ = model(x_val)

            # Reshape logits to [batch_size * sequence_length, vocab_size]
            # and y to [batch_size * sequence_length]
            logits_val = logits_val.view(-1, logits_val.size(-1))  # logits.size(-1) is 65 here
            y_val = y_val.view(-1)

            # Compute loss
            val_loss = loss_fn(logits_val, y_val)
            val_loss_list.append(val_loss.item())
            # val_loss = loss_fn(logits_val.view(-1, vocab_size), y_val.view(-1))
            print(f"Validation Loss: {val_loss.item()}")
            writer.add_scalar('Validation Loss', val_loss.item(), iter)

writer.close()

torch.save(model.state_dict(), 'gpt_model.pth')


2024-07-08 01:04:55.556684: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 01:04:55.556809: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-08 01:04:55.710912: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Iteration 0: Loss 4.338671684265137
Validation Loss: 5.4599690437316895
Iteration 100: Loss 2.5094008445739746
Validation Loss: 2.537811279296875
Iteration 200: Loss 2.4577643871307373
Validation Loss: 2.4633443355560303
Iteration 300: Loss 2.1233811378479004
Validation Loss: 2.2226247787475586
Iteration 400: Loss 2.025768756866455
Validation Loss: 2.1447715759277344
Iteration 500: Loss 1.9679816961288452
Validation Loss: 2.1172282695770264
Iteration 600: Loss 1.992120623588562
Validation Loss: 2.1238784790039062
Iteration 700: Loss 1.9585719108581543
Validation Loss: 2.091581106185913
Iteration 800: Loss 2.0059618949890137
Validation Loss: 2.0743470191955566
Iteration 900: Loss 1.9871422052383423
Validation Loss: 2.1302638053894043
Iteration 1000: Loss 2.028229236602783
Validation Loss: 2.104797124862671
Iteration 1100: Loss 2.0085501670837402
Validation Loss: 2.115905284881592
Iteration 1200: Loss 2.0216920375823975
Validation Loss: 2.131403684616089
Iteration 1300: Loss 2.0424470901

In [73]:
# show model summary from sumary writer
# dir(writer)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_check_caffe2_blob',
 '_encode',
 '_get_file_writer',
 'add_audio',
 'add_custom_scalars',
 'add_custom_scalars_marginchart',
 'add_custom_scalars_multilinechart',
 'add_embedding',
 'add_figure',
 'add_graph',
 'add_histogram',
 'add_histogram_raw',
 'add_hparams',
 'add_image',
 'add_image_with_boxes',
 'add_images',
 'add_mesh',
 'add_onnx_graph',
 'add_pr_curve',
 'add_pr_curve_raw',
 'add_scalar',
 'add_scalars',
 'add_tensor',
 'add_text',
 'add_video',
 'all_writers',
 'close',
 'default_bins',
 'file_writer',
 'filename_suffix',
 'flush',
 'flush_secs',
 'get_logdir',
 'log_dir',
 'max_

In [23]:
# model = torch.load('gpt_model.pth')
# Load the weights
# model.load_state_dict(torch.load('gpt_model.pth'))

RuntimeError: Error(s) in loading state_dict for GPT:
	size mismatch for token_embedding.weight: copying a param with shape torch.Size([65, 768]) from checkpoint, the shape in current model is torch.Size([53, 768]).
	size mismatch for position_embedding.weight: copying a param with shape torch.Size([16, 768]) from checkpoint, the shape in current model is torch.Size([128, 768]).
	size mismatch for linear.weight: copying a param with shape torch.Size([65, 768]) from checkpoint, the shape in current model is torch.Size([53, 768]).
	size mismatch for linear.bias: copying a param with shape torch.Size([65]) from checkpoint, the shape in current model is torch.Size([53]).

### Generate text


print some text that your model generates

In [11]:
# # Example settings
# start_char = 'H'  # Starting character for generation
# max_new_tokens = 10  # Generate 100 characters after the start_char
# top_p = 0.9  # Use top-p sampling with p=0.9
# top_k = 50  # Use top-k sampling with k=50
# temperature = 1.0  # Set temperature to 1 for standard randomness

# # Assuming 'model' is your trained GPT-like model instance
# # Call the generate function
# generated_text = generate(model,start_char, max_new_tokens, top_p, top_k, temperature)

# # Print the generated text
# print("Generated Text:", generated_text)

Encoded indices: [6]


AttributeError: 'collections.OrderedDict' object has no attribute 'token_embedding'