<a href="https://colab.research.google.com/github/naidu199/LLM-workshop/blob/main/Transformers_Design.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [None]:
class Head(nn.Module):
  """Single head of causal (masked) scaled dot-product self-attention.

  Args:
    emb_size: size C of each input embedding.
    head_size: size Hs of the query/key/value projections and of the output.
    seq_size: longest sequence length the causal mask is built for.
    dropout: dropout probability applied to the attention weights.
  """

  def __init__(self, emb_size: int, head_size: int, seq_size: int, dropout: float = 0.1):
    super().__init__()
    self.query = nn.Linear(emb_size, head_size, bias=False)
    self.key = nn.Linear(emb_size, head_size, bias=False)
    self.value = nn.Linear(emb_size, head_size, bias=False)
    # Lower-triangular causal mask, registered as a buffer so it is stored on
    # the module without being a trainable parameter.
    self.register_buffer('tril', torch.tril(torch.ones(seq_size, seq_size)))
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    """Attend over `x` of shape (B, T, C) with T <= seq_size; returns (B, T, Hs)."""
    _, T, _ = x.shape
    k = self.key(x)    # (B, T, Hs)
    q = self.query(x)  # (B, T, Hs)
    # Scaled attention scores: (B, T, Hs) @ (B, Hs, T) -> (B, T, T)
    w = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5
    # Slice the mask to the actual sequence length so inputs shorter than
    # seq_size work; position i may only attend to positions <= i.
    w = w.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
    # Normalise over the key axis so every query row sums to 1.
    w = F.softmax(w, dim=-1)
    w = self.dropout(w)

    v = self.value(x)  # (B, T, Hs)
    return w @ v       # (B, T, T) @ (B, T, Hs) -> (B, T, Hs)


In [None]:
class MultiHeadAttention(nn.Module):
  """Several `Head` modules run in parallel, concatenated and projected.

  Args:
    num_heads: number of attention heads.
    emb_size: size C of the input embeddings and of the output.
    head_size: output size of each individual head.
    seq_size: maximum sequence length, forwarded to every head.
    dropout: dropout probability used in the heads and after the projection.
  """

  def __init__(self, num_heads: int, emb_size: int, head_size: int, seq_size: int, dropout=0.1):
    super().__init__()
    self.heads = nn.ModuleList([Head(emb_size, head_size, seq_size, dropout) for _ in range(num_heads)])
    # The concatenated head outputs are num_heads * head_size wide, which only
    # equals emb_size when head_size == emb_size // num_heads.
    self.proj = nn.Linear(num_heads * head_size, emb_size)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    """Map `x` of shape (B, T, C) to a tensor of shape (B, T, emb_size)."""
    # Concatenate along the feature axis: (B, T, num_heads * head_size)
    out = torch.cat([h(x) for h in self.heads], dim=-1)
    return self.dropout(self.proj(out))


class FeedForward(nn.Module):
  """Position-wise MLP: Linear -> ReLU -> Linear -> Dropout.

  The hidden layer is four times wider than the embedding.

  Args:
    emb_size: size of the input and output embeddings.
    dropout: dropout probability applied to the output; must lie in [0, 1].
  """

  def __init__(self, emb_size, dropout=0.1):
    super().__init__()
    self.net = nn.Sequential(
      nn.Linear(emb_size, 4 * emb_size),
      nn.ReLU(),
      nn.Linear(4 * emb_size, emb_size),
      nn.Dropout(dropout),
    )

  def forward(self, x):
    """Apply the MLP to the last dimension of `x`; the shape is preserved."""
    return self.net(x)

class Block(nn.Module):
  """Transformer block: multi-head self-attention followed by a feed-forward net.

  Each sub-layer is wrapped in a residual connection and then layer-normalised
  (normalisation is applied after the residual sum).

  Args:
    emb_size: embedding size; must be divisible by `num_heads`.
    num_heads: number of attention heads.
    seq_size: maximum sequence length, forwarded to the attention heads.
    dropout: dropout probability used by both sub-layers.
  """

  def __init__(self, emb_size, num_heads, seq_size, dropout=0.1):
    super().__init__()
    assert emb_size%num_heads==0, "embedding size is not a multiple of number of heads"
    head_size = emb_size // num_heads
    self.mha = MultiHeadAttention(num_heads, emb_size, head_size, seq_size, dropout)
    # Normalise over the embedding dimension only: this accepts sequences
    # shorter than seq_size and keeps each position's statistics independent
    # of the other time steps.
    self.layer_norm1 = nn.LayerNorm(emb_size)
    self.layer_norm2 = nn.LayerNorm(emb_size)
    self.ff = FeedForward(emb_size, dropout)

  def forward(self, x):
    """Transform `x` of shape (B, T, emb_size); the output has the same shape."""
    x = self.layer_norm1(x + self.mha(x))
    return self.layer_norm2(x + self.ff(x))

class GPTModel(nn.Module):
  """Decoder-only transformer language model.

  Args:
    num_layers: number of stacked `Block` modules.
    vocab_size: number of distinct tokens.
    emb_size: embedding size shared by all layers.
    num_heads: attention heads per block.
    seq_size: maximum context length the model can attend over.
    dropout: dropout probability forwarded to every block.
  """

  def __init__(self, num_layers, vocab_size, emb_size, num_heads, seq_size, dropout):
    super().__init__()
    # Kept so that generate() can crop the running sequence to the context size.
    self.seq_size = seq_size
    self.context_embedding = nn.Embedding(vocab_size, emb_size)
    self.position_embedding = nn.Embedding(seq_size, emb_size)
    self.blocks = nn.Sequential(*[Block(emb_size, num_heads, seq_size, dropout) for _ in range(num_layers)])
    self.ln = nn.LayerNorm(emb_size)  # final normalisation over the embedding dimension
    self.linear = nn.Linear(emb_size, vocab_size)

  def forward(self, x, targets=None):
    """Compute next-token logits and, optionally, the cross-entropy loss.

    Args:
      x: integer token ids of shape (B, T) with T <= seq_size.
      targets: optional integer token ids of shape (B, T).

    Returns:
      (logits, loss). Without targets, logits has shape (B, T, vocab_size) and
      loss is None. With targets, logits is flattened to (B*T, vocab_size) and
      loss is the scalar cross-entropy.
    """
    _, T = x.shape
    token_emb = self.context_embedding(x)  # (B, T, C)
    # Positions are the indices 0..T-1, not the token ids themselves.
    positions = torch.arange(T, device=x.device)
    pos_emb = self.position_embedding(positions)  # (T, C), broadcast over the batch
    x = token_emb + pos_emb
    x = self.blocks(x)
    x = self.ln(x)
    logits = self.linear(x)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, inputs, max_new_tokens):
    """Sample `max_new_tokens` tokens autoregressively and append them to `inputs`.

    Args:
      inputs: integer token ids of shape (B, T).
      max_new_tokens: number of tokens to sample.

    Returns:
      Tensor of shape (B, T + max_new_tokens).
    """
    for _ in range(max_new_tokens):
      # crop the inputs to the last seq_size tokens
      inputs_trimmed = inputs[:, -self.seq_size:]

      # get the predictions
      logits, _ = self(inputs_trimmed)  # B, T, C

      # focus only on the last time step
      logits = logits[:, -1, :]  # B, C

      # probs by softmax
      probs = F.softmax(logits, dim=-1)  # B, C

      # sample from the distribution
      next_pred = torch.multinomial(probs, num_samples=1)  # B, 1

      # append sampled pred to the running sequence
      inputs = torch.cat([inputs, next_pred], dim=1)

    return inputs



In [None]:
# Placeholder cell: GPTModel.__init__ requires num_layers, vocab_size, emb_size,
# num_heads, seq_size and dropout, so this two-argument call raises TypeError
# until the remaining hyperparameters replace the `...`.
model = GPTModel(5, ...) #TODO

In [None]:
# Two independent random tensors of shape (2, 3), drawn in the order t, then p.
t, p = (torch.rand(2, 3) for _ in range(2))

In [None]:
# Join t and p side by side along their last dimension.
x = torch.cat((t, p), dim=-1)

In [None]:
# Inspect the shape after concatenation along the last dimension.
x.shape

torch.Size([2, 6])

In [None]:
def s(x, y):
    """Return `x + y`; a small two-argument helper for trying out call syntax."""
    return x + y

In [None]:
# Call s with two explicit positional arguments.
s(1, 2)

3

In [None]:
# Same call, with the arguments unpacked from a list by `*` (the same syntax
# GPTModel uses to hand a list of blocks to nn.Sequential).
s(*[1,2])

3

In [None]:
import numpy as np

def cosine_distance(vector1, vector2):
    """Return the cosine of the angle between `vector1` and `vector2`.

    Despite the function's name, the value is the cosine *similarity*: the dot
    product divided by the product of the two L2 norms, so vectors pointing
    the same way give 1.0 rather than 0.0.
    """
    # Product of the two magnitudes (L2 norms)
    magnitude_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)

    return np.dot(vector1, vector2) / magnitude_product

In [None]:
import math
def radian(deg):
    """Convert an angle from degrees to radians."""
    return (math.pi / 180) * deg

In [None]:
# Clock periods: every positive divisor of 360, in ascending order.
all_clocks = [period for period in range(1, 361) if 360 % period == 0]

In [None]:
def posemb(number, clocks=(12, 10, 24)):
    """Encode `number` as hand positions on clocks of different periods.

    For every period `c` in `clocks`, `number % c` is turned into an angle of
    `(number % c) * (360 / c)` degrees, and the pair (cos, sin) of that angle
    is appended to the encoding.

    Args:
        number: the value (time step) to encode.
        clocks: iterable of clock periods; defaults to (12, 10, 24).

    Returns:
        1-D numpy array with two entries per clock, laid out as
        [cos_0, sin_0, cos_1, sin_1, ...].
    """
    positions = []

    for c in clocks:
        # convert number (time) to degrees then convert to radians
        angle = math.radians((number % c) * (360 / c))
        # cosine and sine together uniquely identify a point on the circle
        positions.extend((math.cos(angle), math.sin(angle)))

    return np.array(positions)

In [None]:
# Encode the positions 0..12 using every clock period in all_clocks.
v = [posemb(position, all_clocks) for position in range(13)]

In [None]:
# Pairwise comparison of all encodings: entry [i][j] is cosine_distance(v[i], v[j]).
cos_dist_matrix = [
    [cosine_distance(row_vec, col_vec) for col_vec in v]
    for row_vec in v
]

In [None]:
# Print with one decimal place and without scientific notation
np.set_printoptions(precision=1, suppress=True)

# A numpy array prints as an aligned grid, which reads better than a nested list
cos_dist_matrix_np = np.array(cos_dist_matrix)
print(cos_dist_matrix_np)

[[1.  0.7 0.5 0.4 0.4 0.2 0.3 0.1 0.2 0.2 0.2 0.1 0.2]
 [0.7 1.  0.7 0.5 0.4 0.4 0.2 0.3 0.1 0.2 0.2 0.2 0.1]
 [0.5 0.7 1.  0.7 0.5 0.4 0.4 0.2 0.3 0.1 0.2 0.2 0.2]
 [0.4 0.5 0.7 1.  0.7 0.5 0.4 0.4 0.2 0.3 0.1 0.2 0.2]
 [0.4 0.4 0.5 0.7 1.  0.7 0.5 0.4 0.4 0.2 0.3 0.1 0.2]
 [0.2 0.4 0.4 0.5 0.7 1.  0.7 0.5 0.4 0.4 0.2 0.3 0.1]
 [0.3 0.2 0.4 0.4 0.5 0.7 1.  0.7 0.5 0.4 0.4 0.2 0.3]
 [0.1 0.3 0.2 0.4 0.4 0.5 0.7 1.  0.7 0.5 0.4 0.4 0.2]
 [0.2 0.1 0.3 0.2 0.4 0.4 0.5 0.7 1.  0.7 0.5 0.4 0.4]
 [0.2 0.2 0.1 0.3 0.2 0.4 0.4 0.5 0.7 1.  0.7 0.5 0.4]
 [0.2 0.2 0.2 0.1 0.3 0.2 0.4 0.4 0.5 0.7 1.  0.7 0.5]
 [0.1 0.2 0.2 0.2 0.1 0.3 0.2 0.4 0.4 0.5 0.7 1.  0.7]
 [0.2 0.1 0.2 0.2 0.2 0.1 0.3 0.2 0.4 0.4 0.5 0.7 1. ]]
