# Modify GPT-2 124M to Llama 2 7B and Llama 3 8B
- Llama 2:
    - vocab sizes: 50257 -> 32000
    - input embedding dim 768 -> 4096
    - positional encoding: absolute positional encoding -> RoPE
    - context length 1024 -> 4096
    - remove dropout before & after multihead attention, and after final feedforward layer
    - multihead causal attention w/ 12 attention heads -> masked multihead attention w/ 32 heads
    - layer norm -> RMS norm 
    - final feedforward layer: GELU -> SwiGLU (Swish/SiLU-activated branch multiplied by a linear gate), hidden layer dim 11008
- Llama 3:
    - vocab size 32000 -> 128256
    - input embedding dim 4096
    - context length 4096 -> 8192
    - masked multihead attention w/ 32 heads -> masked grouped-query attention w/ 32 heads
    - final feedforward layer: SwiGLU (Swish/SiLU-activated branch multiplied by a linear gate), hidden layer dim 11008 -> 14336


In [None]:
import torch

In [None]:
import huggingface_hub # get model weights
import sentencepiece # tokenizer

In [None]:
import math
import os
import sys
from typing import Dict, List, Tuple

import numpy as np
import tiktoken  # GPT-2 BPE tokenizer used by the data-loading cells

# working directory of the running kernel; the "read in raw text" cell slices
# this string to build the path of the data folder
cwd = os.getcwd()

In [None]:
# report the installed PyTorch build
print(torch.__version__)

# prefer a CUDA GPU when one is visible, otherwise fall back to the CPU
# (Apple-silicon alternative:
#  device = torch.device("mps" if torch.backends.mps.is_available() else "cpu"))
backend_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(backend_name)
print(f"Using device: {device}")

from torch.utils.data import Dataset, DataLoader
import torch.nn as nn 
import torch.nn.functional as F

2.2.1
Using device: cpu


## preprocess

In [None]:
# Locate the data folder: strip the last 18 characters from the working
# directory and descend into traditional-NLP/data/.
# NOTE(review): the 18 presumably equals the length of this notebook's folder
# name, so the slice climbs to the parent directory - confirm before moving the notebook.
parent_prefix = cwd[:-18]
pdata = f"{parent_prefix}traditional-NLP/data/"
sys.path.append(pdata)

# load the whole novel into memory as one string
anna_path = f"{pdata}anna.txt"
with open(anna_path, mode='r', encoding='utf-8') as text_file:
    text_data = text_file.read()

# quick sanity check of what was loaded
print(f"The type of the raw text: {type(text_data)}")
print(f"The beginning of raw text: \n {text_data[:50]}")

The type of the raw text: <class 'str'>
The beginning of raw text: 
 Chapter 1


Happy families are all alike; every un


In [None]:
# inspect raw text and tokens

# Build the GPT-2 BPE tokenizer in this cell so it runs on a fresh kernel:
# `tokenizer` was previously used without ever being defined in the notebook.
tokenizer = tiktoken.get_encoding("gpt2")

total_characters = len(text_data)
print(f"total num of characters in Anna Karenina: {total_characters}")

# allow the same special token as my_text_dataset so that this count reflects
# exactly how the text is tokenized when the datasets are built
total_tokens = len(tokenizer.encode(text_data, allowed_special={"<|endoftext|>"}))
print(f"total num of tokens in Anna Karenina with BPE tokenizer: {total_tokens}")

# reference values from a previous run:
#   total num of characters in Anna Karenina: 1985223
#   total num of tokens in Anna Karenina with BPE tokenizer: 508206

total num of characters in Anna Karenina: 1985223
total num of tokens in Anna Karenina with BPE tokenizer: 508206


### torch dataset dataloader

In [None]:
# create dataset and dataloader

class my_text_dataset(Dataset):
    """Sliding-window dataset of (input, target) token sequences for next-token prediction.

    The raw text is tokenized once. Windows of ``max_length`` tokens are then cut
    every ``stride`` tokens. Each target window is its input window shifted one
    token to the right, so ``y[k]`` is the token that follows ``x[k]``.
    Trailing tokens that cannot fill a complete input/target pair are dropped,
    which means a text with ``max_length`` tokens or fewer yields an empty dataset.
    """

    def __init__(self, raw_text:str, tokenizer, max_length:int, stride:int=1):
        """Tokenize ``raw_text`` and pre-compute every input/target window.

        Args:
            raw_text: the text to tokenize.
            tokenizer: object exposing ``encode(text, allowed_special=...)`` that
                returns a sequence of integer token ids.
            max_length: number of tokens in every input and target window (>= 1).
            stride: offset, in tokens, between the starts of consecutive windows (>= 1).

        Raises:
            ValueError: if ``max_length`` or ``stride`` is smaller than 1.
        """
        # A non-positive stride would silently produce an empty dataset (or an
        # opaque range() error for 0); a non-positive max_length would produce
        # empty windows. Fail early with a clear message instead.
        if max_length < 1:
            raise ValueError(f"max_length must be >= 1, got {max_length}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        # parallel lists: input_tokens_x[i] pairs with target_tokens_y[i]
        self.input_tokens_x = []
        self.target_tokens_y = []

        # tokenize the entire text once
        tokens = tokenizer.encode(raw_text, allowed_special={"<|endoftext|>"})

        # The last admissible start leaves room for max_length input tokens
        # plus the one extra token needed by the shifted target window.
        for start in range(0, len(tokens) - max_length, stride):
            stop = start + max_length
            self.input_tokens_x.append(
                torch.tensor(tokens[start:stop], dtype=torch.long))
            self.target_tokens_y.append(
                torch.tensor(tokens[start + 1:stop + 1], dtype=torch.long))

    def __len__(self) -> int:
        "Returns the number of rows / pairs of x-y sequences in the dataset"
        return len(self.input_tokens_x)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        "Returns one sample: the input token tensor and its one-token-shifted target tensor (x, y)."
        return self.input_tokens_x[idx], self.target_tokens_y[idx]

def my_text_dataloader(raw_text:str, batch_size:int=4, max_length:int=256,
                       stride:int=128, shuffle=True, drop_last=True, num_workers=0):
    """Wrap ``raw_text`` in a ``my_text_dataset`` and return a DataLoader over it.

    The text is tokenized with tiktoken's "gpt2" encoding.

    Args:
        raw_text: text to tokenize and cut into windows.
        batch_size: forwarded to ``DataLoader``.
        max_length: window length in tokens, forwarded to ``my_text_dataset``.
        stride: offset in tokens between windows, forwarded to ``my_text_dataset``.
        shuffle: forwarded to ``DataLoader``.
        drop_last: forwarded to ``DataLoader``.
        num_workers: forwarded to ``DataLoader``.

    Returns:
        The ``DataLoader`` built over the windowed dataset.
    """
    gpt2_bpe = tiktoken.get_encoding("gpt2")
    windows = my_text_dataset(raw_text, gpt2_bpe, max_length, stride)
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

#### split into train (T), validation (V), and hold-out (H) sets

In [None]:
# character-level size of the corpus
total_characters = len(text_data)
print(f"total num of characters in Anna Karenina: {total_characters}")

# proportions of the text given to the train / validation / hold-out splits
prop_t, prop_v, prop_h = (0.8,0.1,0.1)

# character offsets at which the text is cut
split_idx_t = int(prop_t * total_characters)
split_idx_v = int((prop_t+prop_v) * total_characters)
print(f"Split at character index {split_idx_t} between train and valid sets, and at {split_idx_v} betwee valid and hold sets")

# slice the raw string into the three consecutive segments
d_train, d_valid, d_hold = (
    text_data[:split_idx_t],
    text_data[split_idx_t:split_idx_v],
    text_data[split_idx_v:],
)

# Each split must hold more tokens than one context window. The token count
# per split is estimated as total_tokens * proportion, which is approximate
# because the split itself is made on characters.
# NOTE(review): CONFIG_GPT2_124M and total_tokens are not defined in this cell;
# they must already exist in the kernel - confirm they are defined above.
min_tokens_per_split = CONFIG_GPT2_124M["context_length"]
assert (total_tokens * prop_t) > min_tokens_per_split, "Not enough tokens for loader_t (training dataloader)"
assert (total_tokens * prop_v) > min_tokens_per_split, "Not enough tokens for loader_v (validation dataloader)"
assert (total_tokens * prop_h) > min_tokens_per_split, "Not enough tokens for loader_h (testing dataloader)"

total num of characters in Anna Karenina: 1985223
Split at character index 1588178 between train and valid sets, and at 1786700 betwee valid and hold sets


In [None]:
# Every window is exactly one context long and windows do not overlap
# (stride == max_length). The settings shared by all loaders are collected once.
ctx_len = CONFIG_GPT2_124M["context_length"]
common_loader_args = dict(
    batch_size=2, # this is only for learning purpose; in practice, batch_size >= 1024 is common
    max_length=ctx_len,
    stride=ctx_len,
    num_workers=0,
)

# training loader: shuffled, incomplete final batch dropped
loader_t = my_text_dataloader(
    raw_text=d_train, shuffle=True, drop_last=True, **common_loader_args)

# validation loader: fixed order, every batch kept
loader_v = my_text_dataloader(
    raw_text=d_valid, shuffle=False, drop_last=False, **common_loader_args)

# hold-out loader (disabled for now; same settings as the validation loader)
# loader_h = my_text_dataloader(
#     raw_text=d_hold, shuffle=False, drop_last=False, **common_loader_args)

### inspect loaded data

In [None]:
# Count how many input tokens each loader actually serves by walking every batch.
# The loop variable names are kept as-is: they stay bound in the notebook
# namespace after the loops finish.
train_tokens = val_tokens = 0

for input_batch, target_batch in loader_t:
    train_tokens = train_tokens + input_batch.numel()

for input_batch, target_batch in loader_v:
    val_tokens = val_tokens + input_batch.numel()

all_tokens = train_tokens + val_tokens
print("Training tokens:", train_tokens)
print("Validation tokens:", val_tokens)
print("All tokens:", all_tokens)

Training tokens: 406528
Validation tokens: 50944
All tokens: 457472
