In [1]:
%%bash
# Record which Python interpreter and CUDA compiler this notebook ran against.
which python
python --version
nvcc --version

/mnt/vstor/CSE_CSDS_VXC204/mxh1029/envs/conda/conda-dir/envs/g124/bin/python
Python 3.12.3
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Wed_Apr_17_19:19:55_PDT_2024
Cuda compilation tools, release 12.5, V12.5.40
Build cuda_12.5.r12.5/compiler.34177558_0


In [2]:
import os
import random
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

import numpy as np
import torch


@dataclass
class Config:
    """Mutable bundle of run-wide settings returned by `init`.

    The dataclass is not frozen, so fields may be reassigned after creation.
    """

    # Torch device selected for computation.
    device: torch.device
    # Seed that was passed to `init` (None when no seeding was requested).
    seed: int
    # Directory intended for cached artifacts.
    cache_dir: Path
    # Base directory used to build project-relative paths.
    base_dir: Path


def init(seed: Optional[int] = None) -> Config:
    """
    Initialize the environment settings for a machine learning project.

    Besides building the returned configuration, this function has side effects:
    it prints CUDA availability, sets ``HF_HUB_DISABLE_TELEMETRY``, executes the
    Python file ``$CS_BASH/.env.py``, optionally seeds the random number
    generators, and changes the working directory to ``$CS_HOME/notebooks``.

    Args:
        seed (int, optional): The seed for random number generators to ensure
            reproducibility. Defaults to None, in which case nothing is seeded.

    Returns:
        Config: A mutable dataclass containing the configuration settings.

    Raises:
        EnvironmentError: If ``CS_BASH`` or ``CS_HOME`` is unset or empty, or if
            ``$CS_HOME/notebooks`` is not a directory.
        FileNotFoundError: If ``$CS_BASH/.env.py`` does not exist.
    """
    # Check if CUDA is available (queried once and reused for seeding below)
    cuda_available = torch.cuda.is_available()
    if cuda_available:
        device = torch.device("cuda")
        print("CUDA is available")
        print("Device name:", torch.cuda.get_device_name(0))
        print("Device count:", torch.cuda.device_count())
    else:
        device = torch.device("cpu")
        print("CUDA is not available")

    # Set Hugging Face environment variables
    hf_telemetry = 1  # Set to 1 to disable telemetry
    os.environ["HF_HUB_DISABLE_TELEMETRY"] = str(hf_telemetry)

    # Ensure required environment variables are set
    cs_bash = os.getenv("CS_BASH")
    cs_home = os.getenv("CS_HOME")
    if not cs_bash:
        raise EnvironmentError("Environment variable CS_BASH is not set")
    if not cs_home:
        raise EnvironmentError("Environment variable CS_HOME is not set")

    # Run the environment script (expected to set things such as the Hugging Face token)
    env_path = Path(cs_bash) / ".env.py"
    if not env_path.is_file():
        raise FileNotFoundError(f"Environment file not found: {env_path}")
    # Read fully first so the file handle is closed before the script runs.
    env_script = env_path.read_text()
    # SECURITY: this executes arbitrary Python from disk with the notebook's
    # privileges; only point CS_BASH at a directory you control.
    # Compiling with the real filename makes tracebacks point at the script.
    exec(compile(env_script, str(env_path), "exec"))

    cache_dir = Path(cs_home) / ".cache/misc"

    # Set random seed for reproducibility if provided
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if cuda_available:
            torch.cuda.manual_seed(seed)
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False

    notebook_path = Path(cs_home) / "notebooks"
    if not notebook_path.is_dir():
        raise EnvironmentError(f"Notebook directory not found: {notebook_path}")

    # Relative paths used later in the notebook resolve against this directory.
    os.chdir(notebook_path)

    return Config(device=device, seed=seed, cache_dir=cache_dir, base_dir=notebook_path)


# Build the shared run configuration; no seed is passed, so the RNGs are left unseeded.
config = init()

CUDA is available
Device name: NVIDIA A100-SXM4-80GB
Device count: 1


In [3]:
# Point base_dir at $CS_HOME, i.e. the parent of the `notebooks` directory that
# init() stored. The value is derived from the environment rather than from
# config.base_dir itself, so re-running this cell is idempotent instead of
# climbing one directory higher on every execution.
config.base_dir = Path(os.environ["CS_HOME"])

In [4]:
from dataclasses import dataclass
from typing import Optional


@dataclass
class ModelArgs:
    """Hyperparameters used to build the Transformer and its sub-modules."""

    # Embedding / residual stream width.
    dim: int = 4096
    # Number of TransformerBlock layers.
    n_layers: int = 32
    # Number of query heads.
    n_heads: int = 32
    # Number of key/value heads; None means "same as n_heads".
    n_kv_heads: Optional[int] = None
    vocab_size: int = -1  # Placeholder; must be overwritten before building the model
    # The feed-forward hidden width is rounded up to a multiple of this value.
    multiple_of: int = 256
    # Optional scale applied to the feed-forward hidden width before rounding.
    ffn_dim_multiplier: Optional[float] = None
    # Epsilon passed to the RMSNorm layers.
    norm_eps: float = 1e-5

    # Needed for KV cache
    max_batch_size: int = 32
    max_seq_len: int = 2048

    # Device the model is intended for; None when unspecified.
    # NOTE: load_config passes a torch.device object here, not only strings.
    device: Optional[str] = None


In [5]:
import math
from dataclasses import dataclass
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F


class RMSNorm(nn.Module):
    """Root-mean-square normalization over the last axis with a learnable gain."""

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        # Learnable per-feature gain (gamma), initialised to ones.
        self.weight = nn.Parameter(torch.ones(dim))
        # Added to the mean square before taking the reciprocal square root.
        self.eps = eps

    def _norm(self, x: torch.Tensor):
        """Scale `x` by the reciprocal RMS of its last axis."""
        # (B, Seq_Len, Dim) -> (B, Seq_Len, 1)
        mean_square = x.pow(2).mean(-1, keepdim=True)
        # (B, Seq_Len, Dim) * (B, Seq_Len, 1) = (B, Seq_Len, Dim)
        return x * torch.rsqrt(mean_square + self.eps)

    def forward(self, x: torch.Tensor):
        """Normalize in float32, cast back to the dtype of `x`, then apply the gain."""
        normalized = self._norm(x.float()).type_as(x)
        # (Dim) * (B, Seq_Len, Dim) = (B, Seq_Len, Dim)
        return self.weight * normalized


def apply_rotary_embeddings(x: torch.Tensor, freqs_complex: torch.Tensor, device: str):
    """Rotate consecutive value pairs of `x` by the given complex frequencies.

    Args:
        x: Tensor whose last axis holds (real, imag) pairs laid out consecutively,
            e.g. (B, Seq_Len, H, Head_Dim).
        freqs_complex: Complex rotations, e.g. (Seq_Len, Head_Dim/2). A batch axis
            and a head axis are inserted so it broadcasts against `x`.
        device: Device the result is moved to.

    Returns:
        Tensor with the shape and dtype of `x`, placed on `device`.
    """
    # (B, Seq_Len, H, Head_Dim) -> (B, Seq_Len, H, Head_Dim/2, 2), computed in float32
    pairs = x.float().reshape(*x.shape[:-1], -1, 2)
    # Each pair becomes one complex number: (B, Seq_Len, H, Head_Dim/2)
    as_complex = torch.view_as_complex(pairs)
    # (Seq_Len, Head_Dim/2) -> (1, Seq_Len, 1, Head_Dim/2)
    rotation = freqs_complex[None, :, None]
    # Complex multiplication performs the rotation;
    # view_as_real then yields (B, Seq_Len, H, Head_Dim/2, 2)
    rotated = torch.view_as_real(as_complex * rotation)
    # (B, Seq_Len, H, Head_Dim/2, 2) -> (B, Seq_Len, H, Head_Dim)
    restored = rotated.reshape(*x.shape)
    return restored.type_as(x).to(device)


def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
    """Repeat every key/value head `n_rep` times along the head axis.

    Args:
        x: Tensor of shape (B, Seq_Len, N_KV_Heads, Head_Dim).
        n_rep: Number of copies of each head; 1 returns `x` itself.

    Returns:
        Tensor of shape (B, Seq_Len, N_KV_Heads * N_Rep, Head_Dim) in which the
        copies of one head are adjacent.
    """
    batch_size, seq_len, n_kv_heads, head_dim = x.shape
    if n_rep != 1:
        # (B, Seq_Len, N_KV_Heads, 1, Head_Dim) -> (B, Seq_Len, N_KV_Heads, N_Rep, Head_Dim)
        tiled = x.unsqueeze(3).expand(batch_size, seq_len, n_kv_heads, n_rep, head_dim)
        # Merge the two head axes: (B, Seq_Len, N_KV_Heads * N_Rep, Head_Dim)
        return tiled.reshape(batch_size, seq_len, n_kv_heads * n_rep, head_dim)
    return x


class SelfAttention(nn.Module):
    """Self-attention with rotary position embeddings and a key/value cache.

    Keys and values may use fewer heads than the queries (`n_kv_heads`); each
    key/value head is then shared by `n_rep` query heads. Keys and values of
    every processed position are written into fixed-size caches so that later
    calls can attend to earlier positions without recomputing them.
    """

    def __init__(self, args: ModelArgs):
        super().__init__()

        # Indicates the number of heads for the Keys and Values
        self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads
        # Indicates the number of heads for the Queries
        self.n_heads_q = args.n_heads
        # Indicates how many times the Keys and Values should be repeated
        self.n_rep = self.n_heads_q // self.n_kv_heads
        # Indicates the dimension of each head, that is, the part of the embedding that each head will be responsible for
        self.head_dim = args.dim // args.n_heads

        self.wq = nn.Linear(args.dim, args.n_heads * self.head_dim, bias=False)
        self.wk = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False)
        self.wv = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False)
        self.wo = nn.Linear(args.n_heads * self.head_dim, args.dim, bias=False)

        # The caches are registered as buffers so that Module.to()/cuda()/half()
        # move and cast them together with the weights. persistent=False keeps
        # them out of the state dict, so checkpoints load with strict=True.
        cache_shape = (
            args.max_batch_size,
            args.max_seq_len,
            self.n_kv_heads,
            self.head_dim,
        )
        self.register_buffer("cache_k", torch.zeros(cache_shape), persistent=False)
        self.register_buffer("cache_v", torch.zeros(cache_shape), persistent=False)

    def forward(self, x: torch.Tensor, start_pos: int, freqs_complex: torch.Tensor):
        """Attend from the tokens in `x` to all cached positions up to and including them.

        Args:
            x: Input of shape (B, Seq_Len, Dim). Seq_Len is 1 for token-by-token decoding.
            start_pos: Cache position at which the first token of `x` is stored.
            freqs_complex: Complex rotary frequencies, broadcast against
                (B, Seq_Len, H, Head_Dim/2) by `apply_rotary_embeddings`.

        Returns:
            Tensor of shape (B, Seq_Len, Dim).

        Side effects:
            Overwrites cache rows [:B, start_pos : start_pos + Seq_Len] in place.
        """
        batch_size, seq_len, _ = x.shape  # (B, Seq_Len, Dim)
        kv_len = start_pos + seq_len

        # (B, Seq_Len, Dim) -> (B, Seq_Len, H_Q * Head_Dim)
        xq = self.wq(x)
        # (B, Seq_Len, Dim) -> (B, Seq_Len, H_KV * Head_Dim)
        xk = self.wk(x)
        # (B, Seq_Len, Dim) -> (B, Seq_Len, H_KV * Head_Dim)
        xv = self.wv(x)

        # (B, Seq_Len, H_Q * Head_Dim) -> (B, Seq_Len, H_Q, Head_Dim)
        xq = xq.view(batch_size, seq_len, self.n_heads_q, self.head_dim)
        # (B, Seq_Len, H_KV * Head_Dim) -> (B, Seq_Len, H_KV, Head_Dim)
        xk = xk.view(batch_size, seq_len, self.n_kv_heads, self.head_dim)
        # (B, Seq_Len, H_KV * Head_Dim) -> (B, Seq_Len, H_KV, Head_Dim)
        xv = xv.view(batch_size, seq_len, self.n_kv_heads, self.head_dim)

        # Rotary embeddings are applied to queries and keys only; shapes are unchanged
        xq = apply_rotary_embeddings(xq, freqs_complex, device=x.device)
        xk = apply_rotary_embeddings(xk, freqs_complex, device=x.device)

        # Replace the entry in the cache
        self.cache_k[:batch_size, start_pos:kv_len] = xk
        self.cache_v[:batch_size, start_pos:kv_len] = xv

        # (B, Seq_Len_KV, H_KV, Head_Dim)
        keys = self.cache_k[:batch_size, :kv_len]
        # (B, Seq_Len_KV, H_KV, Head_Dim)
        values = self.cache_v[:batch_size, :kv_len]

        # Since every group of Q shares the same K and V heads, just repeat the K and V heads for every Q in the same group.
        # (B, Seq_Len_KV, H_KV, Head_Dim) --> (B, Seq_Len_KV, H_Q, Head_Dim)
        keys = repeat_kv(keys, self.n_rep)
        values = repeat_kv(values, self.n_rep)

        # (B, Seq_Len, H_Q, Head_Dim) -> (B, H_Q, Seq_Len, Head_Dim)
        xq = xq.transpose(1, 2)
        # (B, Seq_Len_KV, H_Q, Head_Dim) -> (B, H_Q, Seq_Len_KV, Head_Dim)
        keys = keys.transpose(1, 2)
        values = values.transpose(1, 2)

        # (B, H_Q, Seq_Len, Head_Dim) @ (B, H_Q, Head_Dim, Seq_Len_KV) -> (B, H_Q, Seq_Len, Seq_Len_KV)
        scores = torch.matmul(xq, keys.transpose(2, 3)) / math.sqrt(self.head_dim)

        if seq_len > 1:
            # Several tokens in one call: query i sits at absolute position
            # start_pos + i and must not see keys at later positions. With a
            # single token every cached key is already in the past, so no mask
            # is needed and the result is identical to the unmasked computation.
            query_pos = torch.arange(start_pos, kv_len, device=scores.device).unsqueeze(1)
            key_pos = torch.arange(kv_len, device=scores.device).unsqueeze(0)
            # (Seq_Len, Seq_Len_KV) boolean mask, broadcast over batch and heads
            scores = scores.masked_fill(key_pos > query_pos, float("-inf"))

        # Softmax in float32 for numerical stability, then back to the working dtype
        scores = F.softmax(scores.float(), dim=-1).type_as(xq)

        # (B, H_Q, Seq_Len, Seq_Len_KV) @ (B, H_Q, Seq_Len_KV, Head_Dim) -> (B, H_Q, Seq_Len, Head_Dim)
        output = torch.matmul(scores, values)
        # (B, H_Q, Seq_Len, Head_Dim) -> (B, Seq_Len, H_Q, Head_Dim) -> (B, Seq_Len, Dim)
        output = output.transpose(1, 2).contiguous().view(batch_size, seq_len, -1)
        return self.wo(output)  # (B, Seq_Len, Dim) -> (B, Seq_Len, Dim)


class FeedForward(nn.Module):
    """Gated feed-forward layer computing w2(silu(w1(x)) * w3(x))."""

    def __init__(self, args: ModelArgs):
        super().__init__()

        # Start from two thirds of 4 * dim, truncated to an integer
        width = int(2 * (4 * args.dim) / 3)
        if args.ffn_dim_multiplier is not None:
            width = int(args.ffn_dim_multiplier * width)
        # Round the width UP to the next multiple of the multiple_of parameter
        n_chunks = (width + args.multiple_of - 1) // args.multiple_of
        width = args.multiple_of * n_chunks

        self.w1 = nn.Linear(args.dim, width, bias=False)
        self.w2 = nn.Linear(width, args.dim, bias=False)
        self.w3 = nn.Linear(args.dim, width, bias=False)

    def forward(self, x: torch.Tensor):
        """Map (B, Seq_Len, Dim) to (B, Seq_Len, Dim) through the gated hidden layer."""
        # (B, Seq_Len, Dim) --> (B, Seq_Len, Hidden_Dim), passed through SiLU
        gate = F.silu(self.w1(x))
        # (B, Seq_Len, Dim) --> (B, Seq_Len, Hidden_Dim)
        projected = self.w3(x)
        # Element-wise product, then (B, Seq_Len, Hidden_Dim) --> (B, Seq_Len, Dim)
        return self.w2(gate * projected)


class TransformerBlock(nn.Module):
    """One decoder layer: normalized self-attention and normalized feed-forward,
    each added back to its input through a residual connection."""

    def __init__(self, args: ModelArgs, layer_id: int):
        """
        Args:
            args: Model hyperparameters shared by all sub-modules.
            layer_id: Index of this layer; accepted but not used by this block.
        """
        super().__init__()

        self.n_heads = args.n_heads
        self.dim = args.dim
        self.head_dim = args.dim // args.n_heads

        self.attention = SelfAttention(args)
        self.feed_forward = FeedForward(args)

        # Normalization BEFORE the attention block
        self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
        # Normalization BEFORE the feed forward block
        self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)

    def forward(self, x: torch.Tensor, start_pos: int, freqs_complex: torch.Tensor):
        """Apply attention and feed-forward with residual connections.

        Sub-modules are invoked through ``module(...)`` rather than
        ``module.forward(...)`` so that any registered hooks are executed.

        Args:
            x: Input of shape (B, Seq_Len, Dim).
            start_pos: Cache position of the first token in `x`.
            freqs_complex: Rotary frequencies forwarded to the attention layer.

        Returns:
            Tensor of shape (B, Seq_Len, Dim).
        """
        # (B, Seq_Len, Dim) + (B, Seq_Len, Dim) --> (B, Seq_Len, Dim)
        h = x + self.attention(self.attention_norm(x), start_pos, freqs_complex)
        # (B, Seq_Len, Dim) + (B, Seq_Len, Dim) --> (B, Seq_Len, Dim)
        out = h + self.feed_forward(self.ffn_norm(h))
        return out


class Transformer(nn.Module):
    """Decoder-only transformer: token embedding, a stack of TransformerBlock
    layers, a final RMSNorm and a linear projection to vocabulary logits."""

    def __init__(self, args: ModelArgs):
        super().__init__()

        assert args.vocab_size != -1, "Vocab size must be set"

        self.args = args
        self.vocab_size = args.vocab_size
        self.n_layers = args.n_layers

        self.tok_embeddings = nn.Embedding(args.vocab_size, args.dim)

        self.layers = nn.ModuleList(
            [TransformerBlock(args, layer_id) for layer_id in range(args.n_layers)]
        )

        self.norm = RMSNorm(args.dim, eps=args.norm_eps)
        self.output = nn.Linear(args.dim, args.vocab_size, bias=False)

        head_dim = args.dim // args.n_heads
        # Kept as a plain attribute (not a parameter or buffer), so it is absent
        # from the state dict; forward() moves the slice it needs to the
        # activations' device.
        self.freqs_complex = self._precompute_theta_pos_frequencies(
            head_dim,
            self.args.max_seq_len,
            device=args.device,
        )

    def _precompute_theta_pos_frequencies(
        self, head_dim: int, max_seq_len: int, theta: float = 10000.0, device=None
    ):
        """Precompute the complex rotary factors for every position.

        Args:
            head_dim: Per-head dimension; must be even.
            max_seq_len: Number of positions to precompute.
            theta: Base of the rotary frequencies.
            device: Device to create the table on; None uses torch's current
                default device (previously "cuda" was hard-coded, which failed
                on machines without CUDA).

        Returns:
            Complex tensor of shape (max_seq_len, head_dim / 2) whose entry
            [m, i] is exp(1j * m * theta ** (-2 * i / head_dim)).
        """
        print("precompute_theta_pos_frequencies")
        # This runs only once.

        assert head_dim % 2 == 0, "Dimension must be divisible by 2"

        # Meta's reference implementation precomputes max_seq_len * 2 positions.
        # forward() here never indexes past max_seq_len, so max_seq_len rows suffice.
        end = max_seq_len

        # torch.arange returns torch.int64; convert explicitly to float32
        a = 2 * torch.arange(0, head_dim // 2, device=device).float()
        # a = [0., 2., ..., 126.], head_dim = 128

        freqs = 1.0 / (theta ** (a / head_dim))

        position_ids = torch.arange(
            start=0, end=end, dtype=torch.long, device=device
        ).float()

        # (end,) outer (head_dim / 2,) -> (end, head_dim / 2) rotation angles
        freqs = torch.outer(position_ids, freqs).float()

        # Unit-magnitude complex numbers with those angles
        freqs_complex = torch.polar(torch.ones_like(freqs), freqs)

        return freqs_complex

    def forward(self, input_id: torch.Tensor, start_pos: int):
        """Compute next-token logits for the given token ids.

        The rotary frequencies are sliced for every position covered by
        `input_id` (previously a single row was taken regardless of the input
        length). Whether tokens inside one call may attend to later tokens of
        the same call is decided by the attention layers, not here.

        Args:
            input_id (torch.Tensor): Token ids of shape (batch_size, seq_len);
                seq_len is 1 for token-by-token decoding.
            start_pos (int): The starting position of the tokens in the sequence.

        Returns:
            torch.Tensor: float32 logits of shape (batch_size, seq_len, vocab_size).
        """
        seq_len = input_id.shape[1]
        end_pos = start_pos + seq_len
        assert 0 <= start_pos and end_pos <= self.args.max_seq_len, (
            f"Positions [{start_pos}, {end_pos}) exceed max_seq_len={self.args.max_seq_len}"
        )

        h = self.tok_embeddings(input_id)  # (batch_size, seq_len, embed_dim)

        # Retrieve the pairs (m, theta) corresponding to the positions [start_pos, start_pos + seq_len)
        freqs_complex = self.freqs_complex[start_pos:end_pos].to(h.device)

        # Consecutively apply all the decoder layers
        for layer in self.layers:
            h = layer(h, start_pos, freqs_complex)

        h = self.norm(h)

        logits = self.output(h).float()

        return logits


In [6]:
from sentencepiece import SentencePieceProcessor


def load_tokenizer(
    tokenizer_path=f"{config.base_dir}/.cache/meta_llama2/tokenizer.model",
):
    """Load a SentencePiece tokenizer model and report its vocabulary size.

    Args:
        tokenizer_path: Path of the tokenizer model file. The default is computed
            once, when this function is defined, from `config.base_dir`.

    Returns:
        tuple: (tokenizer, vocab_size).
    """
    processor = SentencePieceProcessor()
    processor.load(tokenizer_path)
    return processor, processor.vocab_size()



In [7]:
import json

def load_config(
    vocab_size,
    device,
    llama_path=Path(f"{config.base_dir}/.cache/meta_llama2/llama-2-7b/"),
):
    """Build ModelArgs from the `params.json` stored next to the checkpoint.

    Args:
        vocab_size: Vocabulary size to store on the result; it replaces whatever
            `params.json` specifies.
        device: Value stored in the `device` field of the result.
        llama_path: Directory containing `params.json`. The default is computed
            once, when this function is defined, from `config.base_dir`.

    Returns:
        ModelArgs: Arguments populated from the file, `device` and `vocab_size`.
    """
    params_file = llama_path / "params.json"
    with params_file.open("r") as handle:  # Load the params
        hyperparams = json.load(handle)

    model_args = ModelArgs(device=device, **hyperparams)
    # Assigned after construction so the caller's value always wins over the file's.
    model_args.vocab_size = vocab_size
    return model_args

In [8]:

def load_model(
    llama_path=Path(f"{config.base_dir}/.cache/meta_llama2/llama-2-7b/"),
    model_args=None,
):
    """Build the Transformer and load the first checkpoint found in `llama_path`.

    Side effects: changes torch's process-wide default dtype and default device
    (float16 on `config.device` for CUDA, bfloat16 on CPU otherwise). These
    defaults are NOT restored afterwards, so tensors created later in the
    session inherit them.

    Args:
        llama_path: Directory containing the `*.pth` checkpoint file(s). The
            default is computed once, when this function is defined, from
            `config.base_dir`.
        model_args: ModelArgs used to construct the Transformer. Required.

    Returns:
        Transformer: The model on `config.device` with the checkpoint weights loaded.

    Raises:
        ValueError: If `model_args` is None.
        AssertionError: If no `*.pth` file exists in `llama_path`.
    """
    # Fail before touching any global torch state.
    if model_args is None:
        raise ValueError("model_args is required to build the Transformer")

    # https://pytorch.org/docs/stable/generated/torch.set_default_dtype.html#torch.set_default_dtype
    # https://pytorch.org/docs/stable/generated/torch.set_default_device.html#torch.set_default_device
    if config.device.type == "cuda":
        torch.set_default_dtype(torch.float16)
        torch.set_default_device(config.device)
    else:
        torch.set_default_dtype(torch.bfloat16)
        torch.set_default_device("cpu")

    # For llama-2-7b, there is no need to sort the checkpoints since there is only one checkpoint.
    checkpoints_path = sorted(llama_path.glob("*.pth"))

    assert len(checkpoints_path) > 0, f"No checkpoints found in {llama_path}"

    # Load the checkpoint on CPU, [0] since there is only one checkpoint.
    # weights_only=True restricts unpickling to tensors and plain containers,
    # which avoids arbitrary code execution from the file and the warning that
    # torch emits when the flag is left unspecified.
    checkpoint = torch.load(
        checkpoints_path[0], map_location="cpu", weights_only=True
    )
    # Comment from Meta repo: The only unmatched key in the checkpoint is rope.freqs. Remove it
    # pop() with a default tolerates checkpoints that do not contain the key.
    checkpoint.pop("rope.freqs", None)

    model = Transformer(model_args).to(config.device)

    model.load_state_dict(checkpoint, strict=True)

    print("Checkpoint loaded successfully")

    return model

In [9]:
# Load the tokenizer first: its vocabulary size is needed to build the model arguments.
tokenizer, vocab_size = load_tokenizer()


model_args = load_config(vocab_size, config.device)

In [10]:
# Show the resolved hyperparameters before building the model.
print(model_args)

ModelArgs(dim=4096, n_layers=32, n_heads=32, n_kv_heads=None, vocab_size=32000, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, max_batch_size=32, max_seq_len=2048, device=device(type='cuda'))


In [11]:
# Build the model and load the checkpoint weights (uses the default llama_path).
model = load_model(model_args=model_args)

  checkpoint = torch.load(


precompute_theta_pos_frequencies
Checkpoint loaded successfully


In [12]:
# Print the module tree to check the layer layout.
print(model)

Transformer(
  (tok_embeddings): Embedding(32000, 4096)
  (layers): ModuleList(
    (0-31): 32 x TransformerBlock(
      (attention): SelfAttention(
        (wq): Linear(in_features=4096, out_features=4096, bias=False)
        (wk): Linear(in_features=4096, out_features=4096, bias=False)
        (wv): Linear(in_features=4096, out_features=4096, bias=False)
        (wo): Linear(in_features=4096, out_features=4096, bias=False)
      )
      (feed_forward): FeedForward(
        (w1): Linear(in_features=4096, out_features=11008, bias=False)
        (w2): Linear(in_features=11008, out_features=4096, bias=False)
        (w3): Linear(in_features=4096, out_features=11008, bias=False)
      )
      (attention_norm): RMSNorm()
      (ffn_norm): RMSNorm()
    )
  )
  (norm): RMSNorm()
  (output): Linear(in_features=4096, out_features=32000, bias=False)
)
