In [None]:
def forward(self, idx, targets=None):
    """Run the GPT forward pass and, when targets are given, compute the loss.

    Args:
        idx: tensor of token ids with shape (b, t); t may not exceed
            ``self.config.block_size``.
        targets: optional tensor of target token ids aligned with ``idx``
            position for position. Entries equal to -1 are ignored by the
            loss.

    Returns:
        A ``(logits, loss)`` tuple. With targets, ``logits`` covers every
        position and ``loss`` is the cross-entropy over the flattened
        positions. Without targets, ``logits`` holds only the last position
        (the time dimension is kept, with length 1) and ``loss`` is ``None``.

    Raises:
        AssertionError: if the sequence is longer than the block size.
    """
    _, t = idx.size()
    assert (
        t <= self.config.block_size
    ), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"

    transformer = self.transformer

    # one position index per time step, created on the same device as the input
    positions = torch.arange(t, dtype=torch.long, device=idx.device)  # shape (t)

    # token embeddings (b, t, n_embd) plus position embeddings (t, n_embd);
    # the position term broadcasts over the batch dimension
    hidden = transformer.wte(idx) + transformer.wpe(positions)
    hidden = transformer.drop(hidden)

    # run the stack of transformer blocks in order, then the final layer norm
    for layer in transformer.h:
        hidden = layer(hidden)
    hidden = transformer.ln_f(hidden)

    if targets is None:
        # inference-time shortcut: only the last position needs logits;
        # indexing with the list [-1] keeps the time dimension
        last_hidden = hidden[:, [-1], :]
        return self.lm_head(last_hidden), None

    # training / evaluation: logits for every position plus the loss
    logits = self.lm_head(hidden)
    flat_logits = logits.view(-1, logits.size(-1))
    flat_targets = targets.view(-1)
    loss = F.cross_entropy(flat_logits, flat_targets, ignore_index=-1)
    return logits, loss


# Layers of nanoGPT:

* B = batch size
* T = sequence length (block_size)
* C = embedding dimension (n_embd)
* V = vocabulary size (vocab_size)

## Word Token Embedding (wte):
* maps token ID to dense vector representation of specific dimension (n_embd)
* Input (B, T) -> Output (B, T, C)
* #learned parameters = V * C

## Word Positional Embedding:
* incorporates positional information into the token embeddings
* Input (T) -> Output (T, C)
* #learned parameters = T*C

## Block:

### Layer Norm:
* normalizes each token's activations across the feature dimension (n_embd); self.weight scales the normalized values and self.bias shifts them
* Input (B, T, C) -> Output (B, T, C)
* #learned parameters = 2*C (if bias is used) / C (if bias is not used)

```python
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with an optional bias.

    Args:
        ndim: size of the trailing dimension that is normalized.
        bias: when truthy, a learnable shift is added; otherwise the module
            has only the learnable scale and ``self.bias`` is ``None``.
    """

    def __init__(self, ndim, bias):
        super().__init__()
        # scale starts at one and shift at zero, so the module initially
        # applies plain normalization
        self.weight = nn.Parameter(torch.ones(ndim))
        if bias:
            self.bias = nn.Parameter(torch.zeros(ndim))
        else:
            self.bias = None

    def forward(self, input):
        """Normalize ``input`` over its last dimension, then scale and shift."""
        normalized_shape = self.weight.shape
        return F.layer_norm(
            input,
            normalized_shape,
            weight=self.weight,
            bias=self.bias,
            eps=1e-5,
        )
```

### Causal Self-Attention

*  allows the model to focus on different parts of the input sequence to generate contextually relevant representations while ensuring that each position can only attend to positions before or at its own position (to maintain the causality required for autoregressive models)

```python
def forward(self, x):
        # First part of the causal self-attention forward pass: project the
        # input to queries, keys and values and split them into heads.
        B, T, C = (
            x.size()
        )  # batch size, sequence length, embedding dimensionality (n_embd)

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        # the projection output is cut along the last dim into chunks of width
        # n_embd, which are unpacked as q, k and v
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        # each of k, q, v: split the embedding dim into n_head heads of size
        # C // n_head, then swap the time and head axes so the heads sit next
        # to the batch dim
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(
            1, 2
        )  # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(
            1, 2
        )  # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(
            1, 2
        )  # (B, nh, T, hs)
```

Project the input into queries, keys and values using a single linear layer that outputs a tensor of shape (B, T, 3*C), which is then split into three (B, T, C) tensors.

```python
self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
```

After that, q, k and v are each reshaped to (B, T, n_head, head_dim), where head_dim = C // n_head. They are then transposed to shape (B, n_head, T, head_dim), which allows parallel computation across the attention heads.

```python
        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        if self.flash:
            # efficient attention using Flash Attention CUDA kernels
            # no explicit mask is passed; is_causal=True requests causal masking
            # inside the fused call, and dropout is only active while training
            y = torch.nn.functional.scaled_dot_product_attention(
                q,
                k,
                v,
                attn_mask=None,
                dropout_p=self.dropout if self.training else 0,
                is_causal=True,
            )
        else:
            # manual implementation of attention
            # raw scores q @ k^T, scaled by 1 / sqrt(head size)
            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            # NOTE(review): self.bias is defined outside this excerpt; presumably
            # a precomputed causal mask buffer - confirm. Positions where it is 0
            # are set to -inf so that softmax assigns them zero weight.
            att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float("-inf"))
            # normalize over the last (key) dimension
            att = F.softmax(att, dim=-1)
            att = self.attn_dropout(att)
            y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
```

We have two different implementations of the Scaled Dot-Product Attention. Both do the same thing: 
* Compute the attention scores using the dot product of Q and K, scaled by the square root of the head dimension. 
* Apply a causal mask to ensure that each position can only attend to previous positions (and itself), ensuring causality
* Use softmax to convert these scores into attention probabilities: att=softmax(att)
* Dropout is applied to the attention probabilities to prevent overfitting.
* The attention probabilities are used to compute a weighted sum of the values (V), resulting in an output tensor of shape (B, n_head, T, head_dim)

```python
        # (B, nh, T, hs) -> (B, T, nh, hs) -> (B, T, C); contiguous() is called
        # after the transpose so that view() can reinterpret the memory layout
        y = (
            y.transpose(1, 2).contiguous().view(B, T, C)
        )  # re-assemble all head outputs side by side

        # output projection
        # c_proj is applied first, then resid_dropout on its result
        y = self.resid_dropout(self.c_proj(y))
        return y
```

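Now the output tensor is transposed back to (B, T, n_head, head_dim) and reshaped to (B, T, C), which concatenates the outputs of all heads side by side. Finally, the output projection (c_proj) and the residual dropout are applied.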