In [12]:
import os
# Expose only physical GPU 1 to this process. This is set before torch is imported so that
# "cuda:0" below refers to that GPU.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import torch
import torch.nn as nn
from torch import arange, cat
from torch.nn import functional as F
from datasets import load_dataset
from tokenizers import Tokenizer
import os
import unicodedata
import time
from collections import defaultdict

# Use the (single visible) GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

HF_DATASET = "openwebtext"                                # dataset name passed to load_dataset
model_number = "02"                                       # suffix used in the checkpoint and log file names
CHECKPOINT_PATH = f'checkpoints/model{model_number}.pt'
LOG_FILE = f'train_data/model{model_number}_data.csv'
tokenizer_file = 'tokenizer/tokenizer-01.json'
RESUME_ITER = 0     # Default starting step; replaced by the checkpoint's step when one is loaded
save = True         # Save model and data? Also gates loading of an existing checkpoint

# Everything here can be changed each training session to optimize learning
minibatch_size = 64   # effective batch size is minibatch_size * accumulation_steps
accumulation_steps = 16   # micro-batches whose gradients are accumulated per optimizer step
block_size = 128          # tokens per training sequence
learning_rate = 1e-4
eval_iters = 100          # evaluate (and checkpoint/log when save is on) every this many optimizer steps
estimate_loss_iters = 100 # batches averaged per split when estimating the loss

# Everything below here NEEDS to stay the same to load an existing model
n_embed = 512
n_head = 8
n_layer = 12
dropout = 0.2            # dropout probability used by the attention, projection and feed-forward layers
max_seq_len = 1024       # side length of the causal-mask (tril) buffer
base = float(10000.0)    # RoPE pos encoding param

cuda:0


In [13]:
# LOAD DATASET
# A fixed seed makes the train/validation split identical on every run.
seed_value = 42

# Take the 'train' portion of the loaded dataset and hold out 10% of it for validation.
dataset = load_dataset(HF_DATASET)['train'].train_test_split(
    test_size=0.10,
    seed=seed_value,
)

# train_test_split names the held-out part 'test'; it is used here as the validation set.
train_data, val_data = dataset['train'], dataset['test']

Resolving data files:   0%|          | 0/80 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/80 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/80 [00:00<?, ?it/s]

In [14]:
# LOAD TOKENIZER
# Load the tokenizer from the JSON file named in the config cell.
tokenizer = Tokenizer.from_file(tokenizer_file)

# The vocabulary size sets the width of the model's embedding table and output layer.
vocab_size = tokenizer.get_vocab_size()

In [15]:
# ROPE
# Rotation-rate vector for RoPE: theta_i = 1 / base^(2i / D), with D = n_embed.
# The even indices [0, 2, 4, ..., D - 2] give one frequency per pair of channels.
even_channel_ids = arange(0, n_embed, 2).float()
inv_freq = 1.0 / (base ** (even_channel_ids / n_embed))
# Keep the frequencies on the training device; shape: [D/2]
inv_freq = inv_freq.to(device)

def get_RoPE(positions):
    """Build a 2x2 rotation matrix for every (position, frequency) pair.

    `positions` is indexed as (batch, time) by the einsum below. The result has
    shape [B, T, D/2, 2, 2], where D/2 is the length of the module-level
    `inv_freq` vector, and is returned as float on `device`.
    """
    # Outer product per batch row: [B, T] x [D/2] -> [B, T, D/2] rotation angles.
    angles = torch.einsum("bi,j->bij", positions.float(), inv_freq)
    cos, sin = angles.cos(), angles.sin()

    # Each matrix is [[cos, -sin], [sin, cos]], stacked row by row.
    upper_row = torch.stack((cos, -sin), dim=-1)
    lower_row = torch.stack((sin, cos), dim=-1)
    rotation = torch.stack((upper_row, lower_row), dim=-2)

    return rotation.float().to(device)

In [16]:
# DEFINE GET BATCH
def get_random_chunk(split, batch_size):
    """Sample `batch_size` random documents and flatten them into one token stream.

    Returns two 1-D long tensors of equal length: the concatenated token ids and,
    for every token, its index within the document it came from (the position
    counter restarts at 0 for each document).
    """
    source = train_data if split == 'train' else val_data
    doc_ids = torch.randint(len(source), (batch_size,)).tolist()

    # Tokenize the sampled texts in one call. Per the original author's note, the
    # tokenizer's post-processor appends "<|end_of_text|>" to each document.
    texts = [source[doc_id]['text'] for doc_id in doc_ids]
    encodings = tokenizer.encode_batch(texts)

    # Flatten document by document, keeping tokens and positions aligned.
    token_stream = [token for enc in encodings for token in enc.ids]
    position_stream = [p for enc in encodings for p in range(len(enc.ids))]

    return (
        torch.tensor(token_stream, dtype=torch.long),
        torch.tensor(position_stream, dtype=torch.long),
    )
    
def get_batch(split, batch_size):
    """Sample a batch of `block_size`-token windows from a random token stream.

    Returns (x, y, p), each of shape [batch_size, block_size] on `device`:
    x holds the input tokens, y the same windows shifted one token to the right
    (next-token targets), and p the within-document position of each input token.

    Raises:
        ValueError: if no sampled stream is longer than `block_size` tokens
            after several attempts.
    """
    # A window needs block_size inputs plus one extra token for the shifted targets,
    # so the stream must be strictly longer than block_size. Redraw when a sample of
    # short documents falls below that instead of crashing inside torch.randint.
    max_attempts = 10
    for _ in range(max_attempts):
        data, pos = get_random_chunk(split, batch_size)
        if len(data) > block_size:
            break
    else:
        raise ValueError(
            f"could not sample more than block_size={block_size} tokens "
            f"from split {split!r} in {max_attempts} attempts"
        )

    # Window start offsets; the upper bound keeps start + block_size + 1 within the stream.
    starts = torch.randint(len(data) - block_size, (batch_size,)).tolist()
    x = torch.stack([data[i:i+block_size] for i in starts])
    p = torch.stack([pos[i:i+block_size] for i in starts])
    y = torch.stack([data[i+1:i+block_size+1] for i in starts])
    x, y, p = x.to(device), y.to(device), p.to(device)
    return x, y, p

In [17]:
# DEFINE ESTIMATE LOSS FUNCTION
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss of the global model `m` on both data splits.

    Averages the loss over `estimate_loss_iters` random batches of
    `minibatch_size` for each of 'train' and 'val'.

    Returns:
        dict mapping 'train' and 'val' to a 0-d tensor holding the mean loss.
    """
    out = {}
    m.eval()  # disable dropout while measuring
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(estimate_loss_iters)
            for k in range(estimate_loss_iters):
                X, Y, P = get_batch(split, minibatch_size)
                logits, loss = m(X, P, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Always return to training mode, even if evaluation is interrupted;
        # otherwise later training steps would silently run with dropout disabled.
        m.train()
    return out

In [18]:
# MODEL DEFINITION
class Head(nn.Module):
    """A single causal self-attention head with rotary position encoding."""
    def __init__(self, head_size, n_embed):
        super().__init__()
        # Bias-free projections from the embedding width down to this head's width.
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)
        # Lower-triangular matrix of ones; zeros mark the future positions to hide.
        causal_mask = torch.tril(torch.ones(max_seq_len, max_seq_len))
        self.register_buffer('tril', causal_mask)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, RoPE_slice):
        """Attend over `x` and return the head output.

        x:          (batch, time-step, channels)
        RoPE_slice: rotation matrices indexed as (batch, time-step, pair, 2, 2)
                    by the einsum below, one 2x2 matrix per channel pair of this head
        returns:    (batch, time-step, head size)
        """
        batch, steps, _ = x.shape
        keys = self.key(x)
        queries = self.query(x)

        # Group adjacent channels into pairs: [B, T, head_size] -> [B, T, head_size/2, 2]
        keys_paired = keys.view(batch, steps, -1, 2)
        queries_paired = queries.view(batch, steps, -1, 2)

        # Multiply every channel pair by its 2x2 rotation matrix:
        # [B, T, head_size/2, 2] x [B, T, head_size/2, 2, 2] -> [B, T, head_size/2, 2]
        keys_rotated = torch.einsum('bthc, bthcd -> bthd', keys_paired, RoPE_slice)
        queries_rotated = torch.einsum('bthc, bthcd -> bthd', queries_paired, RoPE_slice)

        # Merge the pairs back into a flat head dimension: -> [B, T, head_size]
        keys = keys_rotated.view(batch, steps, -1)
        queries = queries_rotated.view(batch, steps, -1)

        # Scaled dot-product scores, with future positions masked out before the softmax.
        scale = keys.shape[-1] ** -0.5
        scores = queries @ keys.transpose(-2, -1) * scale
        hidden_future = self.tril[:steps, :steps] == 0
        scores = scores.masked_fill(hidden_future, float('-inf'))
        weights = self.dropout(F.softmax(scores, dim=-1))

        # Weighted sum of the value vectors.
        values = self.value(x)
        return weights @ values
        
class MultiHeadAttention(nn.Module):
    """Several attention heads run side by side, followed by an output projection."""
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size, n_embed) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, positions):
        """Run every head on `x` and project the concatenated outputs back to n_embed.

        x:         (batch, time-step, channels)
        positions: per-token position indices passed to get_RoPE
        returns:   (batch, time-step, n_embed)
        """
        # Rotation matrices for all channel pairs; shape: [B, T, d_half, 2, 2]
        RoPE_emb = get_RoPE(positions)

        # Give each head an equal slice of the pair dimension. Split by this module's
        # own head count (not the global n_head) so the slices always line up with
        # self.heads.
        rope_per_head = torch.chunk(RoPE_emb, len(self.heads), dim=-3)

        head_outputs = [head(x, rope_per_head[head_id]) for head_id, head in enumerate(self.heads)]
        out = torch.cat(head_outputs, dim=-1)
        out = self.dropout(self.proj(out))
        return out
        
class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4x the embedding width, GELU, project back, dropout."""
    def __init__(self, n_embed):
        super().__init__()
        hidden_width = 4 * n_embed
        layers = [
            nn.Linear(n_embed, hidden_width),
            nn.GELU(),
            nn.Linear(hidden_width, n_embed),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to `x`; the output has the same shape as the input."""
        return self.net(x)

class Block(nn.Module):
    """Transformer decoder block: self-attention then feed-forward, each with a
    residual connection followed by LayerNorm."""
    def __init__(self, n_embed, n_head):
        super().__init__()
        # Each head gets an equal share of the embedding width.
        self.sa = MultiHeadAttention(n_head, n_embed // n_head)
        self.ffwd = FeedForward(n_embed)
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)

    def forward(self, x_and_positions):
        """Take and return an (activations, positions) tuple.

        The tuple form lets blocks be chained inside nn.Sequential, which passes a
        single object from one module to the next; positions are forwarded unchanged.
        """
        hidden, positions = x_and_positions
        hidden = self.ln1(hidden + self.sa(hidden, positions))
        hidden = self.ln2(hidden + self.ffwd(hidden))
        return hidden, positions

class GPTLanguageModel(nn.Module):
    """Decoder-only language model: token embedding -> n_layer Blocks -> LayerNorm -> vocabulary logits."""
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.blocks = nn.Sequential(*[Block(n_embed, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero any Linear bias."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, index, positions, targets=None):
        """Compute next-token logits and, when targets are given, the cross-entropy loss.

        index, targets: (B, T) tensors of token ids
        positions:      per-token position indices forwarded to every block
        returns:        (logits, loss). Without targets, logits is (B, T, vocab_size)
                        and loss is None; with targets, logits is flattened to
                        (B*T, vocab_size) and loss is a scalar tensor.
        """
        tok_emb = self.token_embedding_table(index) # (B,T,C)
        # The blocks pass (activations, positions) tuples along; only the activations are needed here.
        x, _ = self.blocks((tok_emb, positions)) # (B,T,C)

        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        # Identity check: `== None` on a tensor goes through tensor comparison machinery.
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, index, max_new_tokens):
        """Sample `max_new_tokens` tokens autoregressively and append them to `index`.

        index:   (B, T) tensor of token ids used as the prompt
        returns: (B, T + max_new_tokens) tensor of token ids

        Runs in eval mode without gradients and restores the model's previous
        train/eval mode afterwards, including when generation is interrupted.
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                for _ in range(max_new_tokens):
                    # The causal-mask buffer only covers max_seq_len positions, so
                    # condition on at most the last max_seq_len tokens.
                    context = index[:, -max_seq_len:]
                    T = context.shape[-1]
                    # Positions 0..T-1 for the current context; shape (1, T)
                    pos = torch.unsqueeze(torch.arange(T, dtype=torch.long, device=index.device), dim=0)
                    logits, loss = self(context, pos)
                    # Only the prediction for the final position is needed.
                    logits = logits[:, -1, :]
                    # Turn the logits into a probability distribution over the vocabulary.
                    probs = F.softmax(logits, dim=-1)
                    # Draw one token per batch row from that distribution.
                    index_next = torch.multinomial(probs, num_samples=1) #(B, 1)
                    # Append the sampled token to the running sequence.
                    index = torch.cat((index, index_next), dim=1) #(B, T+1)
        finally:
            if was_training:
                self.train()
        return index

In [19]:
# SAVE/LOAD CHECKPOINT FUNCTIONS
def load_checkpoint(model, optimizer, path):
    """Restore model and optimizer state from the checkpoint file at `path`.

    Args:
        model: module whose state_dict is overwritten in place.
        optimizer: optimizer whose state_dict is overwritten in place.
        path: checkpoint file as written by save_checkpoint.

    Returns:
        The optimizer step ('iteration') stored in the checkpoint.
    """
    # Load tensors onto whichever device the model already lives on, so the same
    # code works on the CPU fallback as well as on a GPU.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    # NOTE(security): weights_only=False unpickles arbitrary Python objects; only load
    # checkpoint files you created yourself or otherwise trust.
    checkpoint = torch.load(path, map_location=target_device, weights_only=False)

    # Load the learned knowledge
    model.load_state_dict(checkpoint['model_state_dict'])

    # Load the learned history/momentum
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    # Return the iteration number to start the loop from
    return checkpoint['iteration']

def save_checkpoint(step, model, optimizer, path):
    """Save the model and optimizer state for optimizer step `step` to `path`.

    The file is first written next to the target under a temporary name and then
    moved into place, so an interrupted save cannot leave a truncated checkpoint
    where the previous good one used to be. Missing parent directories are created.
    Prints a confirmation once the checkpoint is in place.
    """
    checkpoint = {
        'iteration': step,
        # 1. Save the model's learned knowledge
        'model_state_dict': model.state_dict(),
        # 2. Save the optimizer's learned history/momentum
        'optimizer_state_dict': optimizer.state_dict()
    }

    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    # Write to a sibling temp file, then replace the old checkpoint in one step.
    tmp_path = f"{path}.tmp"
    torch.save(checkpoint, tmp_path)
    os.replace(tmp_path, path)
    print(f"Checkpoint {step} Saved")

In [20]:
# INITIALIZE AND LOAD MODEL AND OPTIMIZER

# Build the network and move it onto the training device.
m = GPTLanguageModel(vocab_size)
m.to(device)

# AdamW over every model parameter.
optim = torch.optim.AdamW(m.parameters(), lr=learning_rate)

# Checkpoints are only looked up when saving is enabled.
if save and os.path.exists(CHECKPOINT_PATH):
    # Resume: restore weights and optimizer state, and continue from the saved step.
    print(f"Loading checkpoint from {CHECKPOINT_PATH}")
    RESUME_ITER = load_checkpoint(m, optim, CHECKPOINT_PATH)
    print(f"Loaded succesfuly from step: {RESUME_ITER}")
elif save:
    print(f"No checkpoint found at {CHECKPOINT_PATH}.")
    print(f"New model will be training from step: {RESUME_ITER}")

total_parameters = sum(p.numel() for p in m.parameters())
print(f"Total parameters: {total_parameters:,}")

Loading checkpoint from checkpoints/model02.pt
Loaded succesfuly from step: 20100
Total parameters: 68,561,200


In [10]:
# How Many total steps do you want to train to?
# This is an absolute optimizer-step target, not a number of additional steps:
# the training loop runs from RESUME_ITER + 1 through max_iters inclusive.
max_iters = 30000

In [11]:
# TRAINING LOOP
# NOTE: timing below uses CUDA events and torch.cuda.synchronize(), so this cell
# needs a CUDA device.

def _timestamp():
    """Return the current local wall-clock time formatted as HH:MM:SS."""
    return time.strftime("%H:%M:%S")

print(_timestamp())

if save and RESUME_ITER == 0:
    # Fresh run: measure the untrained model once so the CSV starts with a baseline
    # row. Resumed runs skip this, since their history is already in the log.
    losses = estimate_loss()
    print(f"Step: {RESUME_ITER:04d}, Train Loss: {losses['train']:.3f}, Val Loss: {losses['val']:.3f}")

    print(f"Initializing new data collection file at: {LOG_FILE}")
    log_dir = os.path.dirname(LOG_FILE)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)
    with open(LOG_FILE, 'a') as f:
        f.write(f"step,train_loss,val_loss\n{RESUME_ITER:05d},{losses['train']:.3f},{losses['val']:.3f}\n")

print(_timestamp())

# Accumulated seconds and event counts for the 'train', 'estimate' and 'save' phases.
times_tracker = defaultdict(lambda: {'time': 0.0, 'count': 0})

train_start_event = torch.cuda.Event(enable_timing=True)
train_end_event = torch.cuda.Event(enable_timing=True)

# Make sure dropout is active regardless of what an earlier (possibly interrupted) cell left behind.
m.train()

train_start_event.record()
for step_num in range(RESUME_ITER + 1, max_iters + 1):
    # One optimizer step = accumulation_steps micro-batches whose gradients are summed.
    optim.zero_grad(set_to_none=True)
    for micro_step in range(accumulation_steps):
        xb, yb, pb = get_batch('train', minibatch_size)
        logits, loss = m(xb, pb, yb)
        # Scale so the accumulated gradient is the mean over the micro-batches.
        loss = loss / accumulation_steps
        loss.backward()
    optim.step()

    # Every eval_iters steps (counted from the resume point): time, evaluate, checkpoint, log.
    if (step_num - RESUME_ITER) % eval_iters == 0:
        train_end_event.record()
        torch.cuda.synchronize()
        # elapsed_time returns milliseconds, so divide by 1000.0
        elapsed_time_sec = train_start_event.elapsed_time(train_end_event) / 1000.0
        times_tracker['train']['time'] += elapsed_time_sec
        times_tracker['train']['count'] += eval_iters

        tic = time.perf_counter()
        losses = estimate_loss()
        torch.cuda.synchronize()
        times_tracker['estimate']['time'] += (time.perf_counter() - tic)
        times_tracker['estimate']['count'] += 1

        print(f"Step: {step_num:04d}, Train Loss: {losses['train']:.3f}, Val Loss: {losses['val']:.3f}")
        tic = time.perf_counter()

        if save:
            save_checkpoint(step_num, m, optim, CHECKPOINT_PATH)

            # write a new line in our data csv
            with open(LOG_FILE, 'a') as f:
                f.write(f"{step_num:05d},{losses['train']:.3f},{losses['val']:.3f}\n")
            print(f"Step Documented")

        times_tracker['save']['time'] += (time.perf_counter() - tic)
        times_tracker['save']['count'] += 1

        print(_timestamp())
        # Restart the training timer so evaluation and saving are not counted as training time.
        train_start_event.record()

10:04:44
10:04:44


KeyboardInterrupt: 

In [12]:
# Summarise the timings gathered by the training loop.
def _per(total, count):
    """Return total / count, or NaN when nothing has been counted yet
    (e.g. training was interrupted before the first evaluation)."""
    return total / count if count else float('nan')

train_stats = times_tracker['train']
estimate_stats = times_tracker['estimate']
save_stats = times_tracker['save']

# Totals are stored in seconds; /60/60 converts them to hours.
print(f"Total time train: {train_stats['time'] /60/60:.3f}hr")
print(f"Total time estim: {estimate_stats['time'] /60/60:.3f}hr")
print(f"Total time Check: {save_stats['time'] /60/60:.3f}hr")
print(f"Average time per Optimizer step: {_per(train_stats['time'], train_stats['count']):.3f}sec")
# count / eval_iters is the number of eval intervals completed; the result is in minutes.
print(f"Average time {eval_iters} Optimizr steps: {_per(train_stats['time'], train_stats['count'] / eval_iters)/60:.3f}min")
print(f"Average time per Estimate  Loss: {_per(estimate_stats['time'], estimate_stats['count']):.3f}sec")
print(f"Average time per Chckpoint Save: {_per(save_stats['time'], save_stats['count']):.3f}sec")

Total time train: 5.216hr
Total time estim: 0.314hr
Total time Check: 0.011hr
Average time per Optimizer step: 5.216sec
Average time 100 Optimizr steps: 8.693min
Average time per Estimate  Loss: 31.373sec
Average time per Chckpoint Save: 1.085sec


In [21]:
# Seed generation with a single fixed token: randint over [450, 451) can only return 450.
prompt = torch.randint(450, 451, (1,)).item()
# Shape (1, 1): one sequence holding one prompt token.
context = torch.tensor([[prompt]], dtype=torch.long, device=device)

# Sample 200 new tokens, then decode the first (only) sequence back to text.
generated_ids = m.generate(context, max_new_tokens=200)[0].tolist()
generated_chars = tokenizer.decode(generated_ids)
print(generated_chars)

The U.S. Senate and Senate rejected the chamber's plan and eliminated minimum wage runs, the Post adds. General Assembly President and Blackwell (R) pronounced it a landmark decision.

"I think [the)" panel responses stated it was "unconstitutional," the Post wrote during a hearing in federal court in January. Although the critics still didn't consider it a step up, the effort had taken a number of studies -- and narrowed down language regarding serious measures to regulate the Atlantic, "temporary inspections of American banks" -- as well as those at the U.S. Capitol in Washington where gun control measures are being sought.

"Don't take liberty very seriously," Oppenheim argued, putting Congress' "comparable" fiscal responsibility in his mouth.

"Andispensing force and child-supporting groups are getting very serious resistance to enumerated veterans' demands, especially which now urge the League of Conservation Voters to abandon its mandate for policymaking, "the bill would effectiv

In [11]:
# Look up the vocabulary id of the token "A" (displayed as the cell output).
tokenizer.token_to_id("A")

36

In [None]:
# Interactive completion loop: submit an empty prompt (or interrupt the input box) to stop.
while True:
    try:
        prompt = input("Prompt:\n")
    except (EOFError, KeyboardInterrupt):
        break
    if prompt == "":
        break

    prompt_ids = tokenizer.encode(prompt, add_special_tokens=False).ids
    if not prompt_ids:
        # A zero-length context cannot be fed to the model; ask again instead of crashing.
        print("Prompt produced no tokens; try another one.")
        continue

    # Shape (1, T): a single sequence holding the prompt tokens.
    context = torch.tensor([prompt_ids], dtype=torch.long, device=device)
    generated_chars = tokenizer.decode(m.generate(context, max_new_tokens=200)[0].tolist())
    print(f"Completed:\n{generated_chars}")