In [None]:
import os
import gc
import torch
import numpy as np
from tqdm import tqdm

In [None]:
# --- Configuration ---
# Model identifier and the base name of the output file
# (get_lookup_table appends ".mmap" to SAVE_PATH).
MODEL_NAME = "meta-llama/Llama-3.2-3B"
SAVE_PATH = "Llama-3.2-3B"

# Access token is taken from the environment; None when the variable is unset.
HF_TOKEN = os.environ.get("HUGGINGFACE_TOKEN")

# Number of embedding rows handled per block when building the table.
chunk_size = 1_000

# The embedding table is expected to have been saved to this path beforehand.
DATA_DIR = os.getcwd()
SAVE_PATH_EMB = os.path.join(DATA_DIR, "embed_table.pth")

In [None]:
def get_lookup_table(save_path, embed_path=None, block_size=None, device=None):
    """Write the pairwise dot-product table of an embedding matrix to disk.

    Loads a saved embedding table, multiplies it with its own transpose in
    square tiles and stores the result as a float16 ``numpy.memmap`` at
    ``save_path + ".mmap"`` with shape ``(num_vocab, num_vocab)``.

    Args:
        save_path: Output path without extension; ``".mmap"`` is appended.
        embed_path: File to ``torch.load`` the embedding table from.
            Defaults to the module-level ``SAVE_PATH_EMB``.
        block_size: Number of rows per tile. Defaults to the module-level
            ``chunk_size``.
        device: Device used for the matrix products. Defaults to ``"cuda"``
            when available, otherwise ``"cpu"``.

    Returns:
        None. The result is the file written to disk.

    Raises:
        ValueError: If ``block_size`` is not positive, or the loaded table is
            not a non-empty 2-D tensor.
    """
    if embed_path is None:
        embed_path = SAVE_PATH_EMB
    if block_size is None:
        block_size = chunk_size
    if block_size <= 0:
        raise ValueError(f"block_size must be positive, got {block_size}")
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # NOTE: torch.load unpickles the file; only load tables from trusted sources.
    embed_table = torch.load(embed_path, map_location=device)
    if embed_table.dim() != 2 or embed_table.shape[0] == 0:
        raise ValueError(
            f"expected a non-empty 2-D embedding table, got shape {tuple(embed_table.shape)}"
        )
    num_vocab = embed_table.shape[0]
    gc.collect()
    torch.cuda.empty_cache()
    print(f"Generating table: num: {num_vocab}, type: {embed_table.dtype}")

    # Create a memory-mapped NumPy array backing the full (num_vocab x num_vocab) table.
    file_path = save_path + ".mmap"
    sim_table = np.memmap(file_path, dtype=np.float16, mode='w+', shape=(num_vocab, num_vocab))
    actual_size_gb_after_memmap = os.path.getsize(file_path) / (1024 * 1024 * 1024)
    print(f"Actual file size (after memmap creation): {actual_size_gb_after_memmap:.2f} GB")

    with torch.no_grad():
        for row_start in tqdm(range(0, num_vocab, block_size), desc="Computing Lookup Table (Chunks)"):
            row_end = min(row_start + block_size, num_vocab)
            # Slices are views of the on-device table, so no extra copy is made.
            row_block = embed_table[row_start:row_end]
            for col_start in range(0, num_vocab, block_size):
                col_end = min(col_start + block_size, num_vocab)
                col_block = embed_table[col_start:col_end]
                # Raw dot products between rows. This equals cosine similarity only
                # if the rows are L2-normalized.
                # NOTE(review): confirm the saved table is normalized if cosine
                # similarity is what downstream code expects.
                scores = torch.matmul(row_block, col_block.T)
                # Cast to float16 on the device first: NumPy has no bfloat16, so
                # calling .numpy() directly on a bfloat16 result raises TypeError.
                sim_table[row_start:row_end, col_start:col_end] = (
                    scores.to(torch.float16).cpu().numpy()
                )
            # Flush once per row block instead of after every tile.
            sim_table.flush()

    # Final flush, then drop the reference so the mapping can be released.
    sim_table.flush()
    del sim_table
    gc.collect()
    torch.cuda.empty_cache()

    print(f"Lookup table saved to {save_path}.mmap")

In [None]:
# Build the table; the output is written to SAVE_PATH + ".mmap".
get_lookup_table(save_path=SAVE_PATH)