# Gzip Funcs

In [None]:
import os
import gzip
import subprocess
import shutil
from tqdm import tqdm

# Folder the helpers below are meant to operate on; it is the same path the
# token shards are saved to later in this notebook.
folder_dir = '/home/lukas928/Synth2/SynthAI/test_folder/'

def compress_folder(directory):
    """Gzip every regular file directly inside ``directory`` and delete the originals.

    Each file ``name`` is written to ``name.gz`` in the same directory, and the
    uncompressed original is removed once its archive has been written.
    Sub-directories and other non-regular entries are skipped because they
    cannot be opened as files. Files that already end in ``.gz`` are compressed
    again (producing ``name.gz.gz``), so a later ``decompress_folder`` call
    restores them to their previous state.

    Args:
        directory: Path of the folder whose files should be compressed.
    """
    # Snapshot the listing up front so the ``.gz`` files created below are not
    # picked up by this same pass.
    files_in_directory = os.listdir(directory)

    for filename in tqdm(files_in_directory, desc='Compressing Files'):
        file_path = os.path.join(directory, filename)

        # os.listdir also returns sub-directories; open() would raise on them
        # and abort the whole pass half-way through.
        if not os.path.isfile(file_path):
            continue

        compressed_file_path = file_path + '.gz'

        # Stream the bytes through gzip instead of reading the file at once.
        with open(file_path, 'rb') as f_in, gzip.open(compressed_file_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

        # Reached only when the copy above finished without raising.
        os.remove(file_path)

    print("All files in the directory have been compressed and the original files have been deleted.")
            
def decompress_folder(directory):
    """Gunzip every ``*.gz`` file directly inside ``directory`` and delete the archives.

    Each ``name.gz`` is expanded to ``name`` in the same directory (overwriting
    an existing ``name``), and the archive is removed once it has been fully
    expanded. Entries that do not end in ``.gz`` are left untouched, and
    non-regular entries (for example a directory named ``foo.gz``) are skipped.

    Args:
        directory: Path of the folder whose gzip files should be expanded.
    """
    # Snapshot the listing so files created below are not revisited.
    files_in_directory = os.listdir(directory)

    for filename in tqdm(files_in_directory, desc='Decompressing Files'):
        if not filename.endswith('.gz'):
            continue

        file_path = os.path.join(directory, filename)

        # A directory can also be named "*.gz"; gzip.open() would raise on it.
        if not os.path.isfile(file_path):
            continue

        # Drop the trailing ".gz" (3 characters) to recover the original name.
        decompressed_file_path = file_path[:-3]

        with gzip.open(file_path, 'rb') as f_in, open(decompressed_file_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

        # Reached only when the archive was expanded without raising.
        os.remove(file_path)

    print("All gzip files in the directory have been decompressed and deleted.")


In [1]:
import os
import requests
import tiktoken
import numpy as np
# Display the notebook's current working directory (shown as the cell output).
os.getcwd()

'/home/lukas928/Synth2/SynthAI/src'

In [2]:
# download the tiny shakespeare dataset
def create_file(file_dir, file_name):
    """Return the path ``file_dir/file_name``, creating ``file_dir`` if it is missing.

    Despite the name, no file is created: only the directory (and any missing
    parent directories) is made, and the joined path is returned so the caller
    can open it.

    Args:
        file_dir: Directory that should contain the file.
        file_name: Name of the file inside ``file_dir``.

    Returns:
        ``os.path.join(file_dir, file_name)``.
    """
    if not os.path.exists(file_dir):
        # exist_ok=True closes the race where something else creates the
        # directory between the existence check and this call.
        os.makedirs(file_dir, exist_ok=True)
    return os.path.join(file_dir, file_name)
    
# Resolve the dataset path (creating the data directory if needed) and download
# the tiny shakespeare corpus only when it is not already on disk.
input_file_path = create_file('/home/lukas928/Synth2/SynthAI/data','shakespeare.txt')
if not os.path.exists(input_file_path):
    data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
    # Fetch before opening the output file: a failed request must not leave an
    # empty shakespeare.txt behind, because the exists() check above would then
    # skip the download on every later run.
    response = requests.get(data_url, timeout=30)
    # Turn HTTP error statuses into an exception instead of saving an error page.
    response.raise_for_status()
    with open(input_file_path, 'w') as f:
        f.write(response.text)

In [3]:
# Load the whole corpus into memory as a single string.
with open(input_file_path, 'r') as f:
    data = f.read()

# Preview the first 100 characters (shown as the cell output).
data[:100]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

In [5]:
# Tokenize the corpus with the encoding tiktoken associates with 'gpt2'.
enc = tiktoken.encoding_for_model('gpt2')
# NOTE(review): the training script further down calls enc.encode_ordinary(data)
# instead of enc.encode(data) - confirm which one is intended so both cells agree.
tokens = enc.encode(data)
# Terminate the token stream with the encoder's end-of-text token.
tokens.append(enc.eot_token)
# Total number of tokens (shown as the cell output).
len(tokens)

338026

In [70]:
# Convert the token list to an array and cut it into 10 nearly equal shards.
tokens = np.array(tokens)
splits = np.array_split(tokens, 10)
# One file per shard; np.save appends ".npy" to the given name.
for i, shard in enumerate(splits):
    np.save(f'/home/lukas928/Synth2/SynthAI/test_folder/t_{i}.tokens', np.array(shard))

# Training Script

In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import tiktoken
import numpy as np
def create_file(file_dir, file_name):
    """Return the path ``file_dir/file_name``, creating ``file_dir`` if it is missing.

    Despite the name, no file is created: only the directory (and any missing
    parent directories) is made, and the joined path is returned so the caller
    can open it.

    Args:
        file_dir: Directory that should contain the file.
        file_name: Name of the file inside ``file_dir``.

    Returns:
        ``os.path.join(file_dir, file_name)``.
    """
    if not os.path.exists(file_dir):
        # exist_ok=True closes the race where something else creates the
        # directory between the existence check and this call.
        os.makedirs(file_dir, exist_ok=True)
    return os.path.join(file_dir, file_name)
    
# Path of the corpus file; create_file also makes the data directory if missing.
input_file_path = create_file('/home/lukas928/Synth2/SynthAI/data','shakespeare.txt')

# Read the whole corpus into one string.
with open(input_file_path, 'r') as f:
    data = f.read()

# Tokenize with the encoding tiktoken associates with 'gpt2' and terminate the
# stream with the encoder's end-of-text token.
enc = tiktoken.encoding_for_model('gpt2')
tokens = enc.encode_ordinary(data)
tokens.append(enc.eot_token)
# Preview the first 10 token ids (shown as the cell output).
tokens[:10]


[5962, 22307, 25, 198, 8421, 356, 5120, 597, 2252, 11]

In [2]:
# Get a batch of data
# The first 80% of the token stream is the training split, the rest the test split.
train = tokens[:int(.8*len(tokens))]
test = tokens[int(.8*len(tokens)):]
# Prefer CUDA when available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def get_batch(split,btch,cntx):
    """Sample a random batch of (input, target) token windows.

    Args:
        split: ``'train'`` selects the module-level ``train`` tokens; any other
            value selects ``test``.
        btch: Number of windows in the batch.
        cntx: Length of each window.

    Returns:
        A tuple ``(x, y)`` of tensors with ``btch`` rows and ``cntx`` columns,
        where ``y`` is ``x`` shifted one token to the right in the source.
    """
    source = train if split == 'train' else test
    # Starts are drawn from [0, len(source) - cntx) so the shifted target
    # window still fits inside the data.
    n_starts = len(source) - cntx

    # Draw every window offset in a single call.
    offsets = torch.randint(n_starts, (btch,)).tolist()

    inputs = [source[o:o + cntx] for o in offsets]
    targets = [source[o + 1:o + cntx + 1] for o in offsets]

    x = torch.tensor(np.array(inputs))
    y = torch.tensor(np.array(targets))
    return x, y

# Smoke test: 4 training windows of 8 tokens each; the shapes are the cell output.
x,y = get_batch('train',4,8)
x.shape, y.shape

(torch.Size([4, 8]), torch.Size([4, 8]))

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from dataclasses import dataclass
import math
class Wte_Wpe(nn.Module):
    """Token embedding plus learned positional embedding, followed by dropout.

    Args:
        vocab_size: Number of rows in the token embedding table.
        d_model: Width of every embedding vector.
        cntx_len: Number of rows in the positional table, i.e. the longest
            sequence the layer can embed.
        dropout: Dropout probability applied to the summed embeddings while
            the module is in training mode.
    """

    def __init__(self, vocab_size, d_model, cntx_len,dropout=0.0):
        super().__init__()
        self.d_model = d_model
        self.cntx_len = cntx_len
        self.vocab_size = vocab_size
        self.dropout_p = dropout
        self.wte = nn.Embedding(vocab_size, d_model)
        self.wpe = nn.Embedding(cntx_len, d_model)
        self.init_weights()

    def init_weights(self):
        """Re-initialise both embedding tables from N(0, 0.02^2)."""
        nn.init.normal_(self.wte.weight, std=0.02)
        nn.init.normal_(self.wpe.weight, std=0.02)

    def forward(self, x):
        """Embed token ids ``x`` (last dimension = sequence positions).

        Returns the token embeddings plus the positional embeddings of
        positions ``0 .. seq_len - 1``, with dropout applied in training mode.
        """
        # Use the actual sequence length rather than cntx_len so inputs shorter
        # than the context window (e.g. short prompts in generation) still
        # broadcast against the token embeddings.
        seq_len = x.size(-1)
        pos = torch.arange(0, seq_len, dtype=torch.long, device=x.device)
        # F.dropout defaults to training=True; tie it to the module mode so
        # model.eval() really disables dropout.
        return F.dropout(self.wte(x) + self.wpe(pos), p=self.dropout_p, training=self.training)
    

class CSA_torch(nn.Module):
    """Multi-head causal self-attention built on ``F.scaled_dot_product_attention``.

    Args:
        d_model: Width of the input and output features; must be divisible by
            ``n_head``.
        n_head: Number of attention heads.
        dropout: Dropout probability used inside the attention and on the
            output projection while the module is in training mode.
    """

    def __init__(self, d_model, n_head,dropout=0.0):
        super().__init__()
        self.d_model = d_model
        self.n_head = n_head
        assert d_model % n_head == 0
        self.head_size = d_model // n_head
        self.dropout_p = dropout

        # A single projection produces queries, keys and values in one matmul.
        self.qkv = nn.Linear(d_model, 3*d_model)
        self.fc_out = nn.Linear(d_model, d_model)
        self.init_weights()

    def init_weights(self):
        """Xavier-uniform weights and zero biases for both projections."""
        nn.init.xavier_uniform_(self.qkv.weight)
        nn.init.xavier_uniform_(self.fc_out.weight)
        nn.init.zeros_(self.qkv.bias)
        nn.init.zeros_(self.fc_out.bias)

    def forward(self, x):
        """Apply causal self-attention to ``x`` of shape (batch, time, d_model)."""
        # The functional dropout helpers do not know the module mode, so gate
        # them by hand; otherwise dropout would stay active after model.eval().
        attn_dropout_p = self.dropout_p if self.training else 0.0

        q,k,v = self.qkv(x).split(self.d_model, dim=2)
        q = rearrange(q, 'b t (nh hs) -> b nh t hs', nh=self.n_head, hs=self.head_size)
        k = rearrange(k, 'b t (nh hs) -> b nh t hs', nh=self.n_head, hs=self.head_size)
        v = rearrange(v, 'b t (nh hs) -> b nh t hs', nh=self.n_head, hs=self.head_size)
        y = F.scaled_dot_product_attention(q, k, v,
                                           dropout_p=attn_dropout_p, is_causal=True)
        # Merge the heads back into a single feature dimension.
        y = rearrange(y, 'b nh t hs -> b t (nh hs)')
        return F.dropout(self.fc_out(y), p=self.dropout_p, training=self.training)

class GPT_Block_torch(nn.Module):
    """Transformer block: LayerNorm -> attention and LayerNorm -> MLP, each with a residual add.

    Args:
        d_model: Feature width of the block.
        n_head: Number of attention heads handed to ``CSA_torch``.
        dropout: Dropout probability for the attention layer and the MLP output.
    """

    def __init__(self, d_model, n_head, dropout=0.0):
        super().__init__()
        self.d_model, self.n_head, self.dropout_p = d_model, n_head, dropout

        # The MLP expands to four times the model width before projecting back.
        hidden_width = 4*d_model

        self.ln1 = nn.LayerNorm(d_model)
        self.csa = CSA_torch(d_model, n_head, dropout)
        self.ln2 = nn.LayerNorm(d_model)
        self.mlp = nn.Sequential(
            nn.Linear(d_model, hidden_width),
            nn.GELU(),
            nn.Linear(hidden_width, d_model),
            nn.Dropout(dropout),
        )
        self.init_weights()

    def init_weights(self):
        """Xavier-uniform weights and zero biases for the MLP's linear layers."""
        linear_layers = (layer for layer in self.mlp if isinstance(layer, nn.Linear))
        for layer in linear_layers:
            nn.init.xavier_uniform_(layer.weight)
            nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Run attention then the MLP, adding each result back onto the input."""
        attended = self.csa(self.ln1(x))
        x = x + attended
        expanded = self.mlp(self.ln2(x))
        return x + expanded


@dataclass
class GPT_Config:
    """Hyper-parameters consumed by ``GPT_torch``."""
    # Size of the token embedding table and of the output logits.
    # NOTE(review): presumably the GPT-2 vocabulary size - confirm against the tokenizer.
    vocab_size: int = 50257
    # Feature width of the embeddings and of every block.
    d_model: int = 2048
    # Number of positions in the positional embedding table; generate() crops to this.
    cntx_len: int = 32 
    # Number of GPT_Block_torch layers stacked in the model.
    n_layers: int = 15 
    # Attention heads per block; d_model must be divisible by this.
    n_head: int = 64 
    # Dropout probability passed to the embedding layer and every block.
    dropout_p: float = 0.0

class GPT_torch(nn.Module):
    """Decoder-only transformer language model.

    The model embeds token ids, runs them through ``config.n_layers`` blocks,
    applies a final LayerNorm and projects to ``config.vocab_size`` logits.

    Args:
        config: Object exposing ``vocab_size``, ``d_model``, ``cntx_len``,
            ``n_layers``, ``n_head`` and ``dropout_p`` (see ``GPT_Config``).
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.wte_wpe = Wte_Wpe(config.vocab_size, config.d_model, config.cntx_len, config.dropout_p)
        self.blocks = nn.ModuleList([GPT_Block_torch(config.d_model, config.n_head, config.dropout_p) for _ in range(config.n_layers)])
        self.ln = nn.LayerNorm(config.d_model)
        self.fc_out = nn.Linear(config.d_model, config.vocab_size, bias=False)

    def forward(self,x,y=None):
        """Compute logits for token ids ``x`` and, optionally, the loss against ``y``.

        Returns:
            ``logits`` when ``y`` is None, otherwise the tuple ``(loss, logits)``
            where ``loss`` is the cross-entropy over all positions.
        """
        x = self.wte_wpe(x)
        for block in self.blocks:
            x = block(x)
        logits = self.fc_out(self.ln(x))
        if y is not None:
            # Flatten batch and time so every position is one classification.
            loss = F.cross_entropy(logits.view(-1,logits.size(-1)), y.view(-1))
            return loss, logits
        return logits

    def print_model_size(self):
        """Print the total number of parameters with thousands separators."""
        total_params = sum(p.numel() for p in self.parameters())
        formatted_size = "{:,}".format(total_params)
        print(f"Model size: {formatted_size} parameters")

    def count_model_memory(self):
        """Print the memory taken by the parameters and, when CUDA is available, GPU usage."""
        # Bytes per parameter tensor = element count * bytes per element.
        total_memory = sum(param.nelement() * param.element_size() for param in self.parameters())

        # Convert bytes to megabytes
        total_memory_mb = total_memory / (1024 ** 2)
        print(f"Model memory usage: {total_memory_mb:.2f} MB")

        # The torch.cuda queries below raise on a CPU-only machine, so report
        # that there is nothing to measure instead of crashing.
        if not torch.cuda.is_available():
            print("Current GPU memory usage: N/A (CUDA not available)")
            return

        device = torch.device("cuda")
        total_gpu_memory_mb = torch.cuda.get_device_properties(device).total_memory / (1024 ** 2)
        current_gpu_memory_mb = torch.cuda.memory_allocated(device) / (1024 ** 2)
        print(f"Current GPU memory usage: {current_gpu_memory_mb:.2f} MB / {total_gpu_memory_mb:.2f} MB")

    # https://github.com/karpathy/nanoGPT/blob/master/model.py
    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.
        """
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it at cntx_len
            idx_cond = idx if idx.size(1) <= self.config.cntx_len else idx[:, -self.config.cntx_len:]
            # forward the model to get the logits for the index in the sequence
            logits = self(idx_cond)
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)

        return idx

In [56]:
def inv_sqrt_scheduler(step, num_warmup_steps, scale=.01, print_lr=False):
    """Return the learning-rate value for ``step`` under warm-up plus inverse-sqrt decay.

    Args:
        step: Current step number.
        num_warmup_steps: Number of steps in the linear warm-up phase.
        scale: Multiplier applied to ``1 / sqrt(step)`` after warm-up.
        print_lr: When true, also print the value that is returned.

    Returns:
        At step 0: ``.05`` if there is no warm-up, else ``0``. During warm-up:
        ``step / num_warmup_steps``. Afterwards: ``scale / sqrt(step)``.
    """
    if step == 0:
        # Step 0 is special-cased so the square-root branch never divides by zero.
        rate = 0 if num_warmup_steps != 0 else .05
    elif step >= num_warmup_steps:
        rate = (1. / math.sqrt(step)) * scale
    else:
        rate = float(step) / float(num_warmup_steps)

    if print_lr:
        print(rate)

    return rate

def adaptive_momentum_scheduler(grad, lr, momentum, momentum_decay=0.9, lr_min=1e-5, lr_max=0.1):
    """Adjust ``lr`` from an exponential moving average of ``grad``.

    Args:
        grad: Current gradient statistic fed into the moving average.
        lr: Learning rate to adjust.
        momentum: Previous value of the moving average.
        momentum_decay: Weight kept from the previous moving average.
        lr_min: Lower bound for the returned learning rate.
        lr_max: Upper bound for the returned learning rate.

    Returns:
        A tuple ``(lr, momentum)`` with the clipped learning rate and the
        updated moving average.
    """
    # Exponential moving average of the gradient statistic.
    new_momentum = momentum_decay * momentum + (1 - momentum_decay) * grad

    if new_momentum > 0:
        # Positive average: shrink the learning rate.
        adjusted = lr / (1 + new_momentum)
    else:
        # Zero or negative average: grow the learning rate (or keep it at zero).
        adjusted = lr * (1 - new_momentum)

    # Keep the result inside [lr_min, lr_max].
    capped = min(adjusted, lr_max)
    clipped = max(lr_min, capped)

    return clipped, new_momentum


In [62]:
# Tiny configuration for a quick experiment: one layer, width 4, 4 heads, context of 4 tokens.
tt_config = GPT_Config(vocab_size=50257,cntx_len=4,d_model=4,n_head=4,n_layers=1)
tt = GPT_torch(config=tt_config)
# Draw one batch whose window length equals the model's context length.
x,y = get_batch('train',4,4)
x.shape,y.shape

(torch.Size([4, 4]), torch.Size([4, 4]))

In [65]:
# Base learning rate; also the value handed to update_grad on every step.
lr_init = .1
# Running gradient average used by adaptive_momentum_scheduler; starts at zero.
momentum = 0
optimizer = torch.optim.AdamW(params=tt.parameters(),lr=lr_init)
# Training stops once `step` reaches this value.
max_steps = 5000
# Number of windows per batch, for both training and evaluation.
batch_size = 4
# An evaluation loss is printed every `eval_interval` steps.
eval_interval = 500
step = 0
# NOTE(review): warm_up_steps is not read anywhere in the visible code - confirm whether
# inv_sqrt_scheduler was meant to be wired into the loop below.
warm_up_steps = 100
# TODO: Estimate Losses

def convert_readable(generated_output, enc = tiktoken.encoding_for_model('gpt2')):
    """Decode a batch of token-id sequences into a list of strings.

    Args:
        generated_output: Object with a ``tolist()`` method yielding one list
            of token ids per sequence (e.g. the tensor returned by ``generate``).
        enc: Encoder used for decoding; the default is built once, when the
            function is defined.
    """
    token_rows = generated_output.tolist()
    return enc.decode_batch(token_rows)

@torch.inference_mode()
def eval_losses(iters):
    """Return the mean loss of the global model ``tt`` over ``iters`` random test batches.

    The model is switched to eval mode for the measurement and put back into
    its previous mode afterwards.

    Args:
        iters: Number of batches to average over.

    Raises:
        ZeroDivisionError: If ``iters`` is 0, because there is nothing to average.
    """
    was_training = tt.training
    # Evaluate in eval mode so train-only layers such as nn.Dropout are disabled.
    tt.eval()
    try:
        losses = []
        for _ in range(iters):
            # Name the held-out split explicitly: get_batch only knows 'train'
            # and sends every other value to the test data, so the previous
            # 'valid' worked only by accident.
            x,y = get_batch('test',batch_size,tt_config.cntx_len)
            loss,_ = tt(x,y)
            losses.append(loss.item())
    finally:
        # Restore the caller's mode even if a batch raised.
        tt.train(was_training)
    avg_loss = sum(losses) / len(losses)
    return avg_loss

@torch.inference_mode()
def update_grad(model, lr, momentum):
    """Derive a new learning rate from the model's current gradients.

    The gradient statistic is the sum of the per-parameter gradient norms
    divided by the total number of parameter elements. It is passed, together
    with ``lr`` and ``momentum``, to ``adaptive_momentum_scheduler``.

    Args:
        model: Module whose parameters' ``.grad`` fields are inspected.
        lr: Learning rate to adjust.
        momentum: Previous running average of the gradient statistic.

    Returns:
        The ``(lr, momentum)`` tuple produced by ``adaptive_momentum_scheduler``.
    """
    params = list(model.parameters())
    # Parameters that received no gradient (unused or frozen) have grad None;
    # skip them instead of failing on None.norm().
    grad_norm_sum = sum(p.grad.norm().item() for p in params if p.grad is not None)
    n_elements = sum(p.numel() for p in params)
    grad = grad_norm_sum / n_elements
    return adaptive_momentum_scheduler(grad,lr,momentum)
    

# Report the parameter count once before training starts.
tt.print_model_size()
# Train until `step` reaches max_steps; the loop exits through the break at the bottom.
while True:        
    step +=1
    x,y = get_batch('train',batch_size,tt_config.cntx_len)
    loss,_ = tt(x,y)

    # Clear old gradients, backpropagate the new loss, then apply the AdamW update.
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    # Derive a learning rate from the gradients just computed. lr_init (not the
    # previous lr) is passed every step, so only `momentum` carries over between
    # steps; the new rate is used by the next optimizer.step().
    lr, momentum = update_grad(tt,lr_init,momentum)
    for pg in optimizer.param_groups:
        pg['lr'] = lr
    
    # print(f'loss: {loss.item()}')
    
    
    # Periodic progress report: mean loss over 200 evaluation batches.
    if step % eval_interval == 0:
        avg_loss = eval_losses(200)
        print(f'Average Loss: {avg_loss}')
        
    # Final report over 500 evaluation batches, then stop training.
    if step == max_steps:
        print(f'Final Average Loss: {eval_losses(500)}')
        break


Model size: 402,324 parameters
Average Loss: 6.435782206058502
Average Loss: 6.403727095127106
Average Loss: 6.646209037303924
Average Loss: 6.481482603549957
Average Loss: 6.598695299625397
Average Loss: 6.379622875452042
Average Loss: 6.5467028558254245
Average Loss: 6.651079334020615
Average Loss: 6.392141599655151
Average Loss: 6.587835516929626
Final Average Loss: 6.649521796703339
