# Import Dependencies
---

In [1]:
import os
import sys
from datetime import datetime

import numpy as np

project_root = os.path.dirname(os.getcwd())
sys.path.append(project_root)

# Pytorch
import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim

# Import our various classes
from gpt import GPTLanguageModel
from dataloader.dataloader import MyDataLoader
from tokenizer.tokenizer import MyTokenizer

# Defining our ModelTrainer
---

In [14]:
# Defining our ModelTrainer
class ModelTrainer():
    """
        Trains a language model on tokenized text samples and periodically evaluates / checkpoints it.

        The trainer relies on the following collaborators (as used by the methods below):
            model:        callable as model(X, Y) -> (logits, loss); exposes .params['context_size'],
                          .params['batch_size'], .parameters(), .eval(), .train(), .save_model(...), .generate(...)
            tokenizer:    exposes .encode_as_ids(text) and .decode(ids)
            train_dl:     iterable of samples where data[0][0] is the text to tokenize
            test_dl:      same layout as train_dl, used only for evaluation
            train_params: dict with keys 'learning_rate', 'max_iters', 'eval_interval', 'max_epochs',
                          'eval_samples' and 'iters_per_eval_sample'
            device:       torch device that the model and every batch are moved to
    """

    def __init__(self, model, tokenizer, train_dl, test_dl, train_params, device):
        self.model = model.to(device)
        self.tokenizer = tokenizer
        self.train_dl = train_dl
        self.test_dl = test_dl
        self.train_params = train_params
        self.device = device


    def train_model(self, num_saves=1):
        """ 
            Function to train our model based on params defined in 'self.train_params'

        Inputs:
            num_saves: (int) define number of times the model weights will be saved during training.
                       A value <= 0 disables checkpointing.

        Raises:
            ValueError: if 'max_iters' or 'eval_interval' is smaller than 1.
        
        """
        print(sum(p.numel() for p in self.model.parameters())/1e6, 'M parameters')
        optimizer = torch.optim.AdamW(self.model.parameters(), self.train_params['learning_rate'])
    
        # The config may hold these as floats (e.g. 1e4); cast once so that the modulo checks,
        # the progress formatting and the stop condition all operate on integers.
        max_iters = int(self.train_params['max_iters'])
        eval_interval = int(self.train_params['eval_interval'])
        if max_iters < 1 or eval_interval < 1:
            raise ValueError("'max_iters' and 'eval_interval' must both be >= 1")

        num_saves = int(num_saves)
        # max(1, ...) avoids a zero interval (modulo by zero) when num_saves > max_iters
        save_interval = max(1, max_iters // num_saves) if num_saves > 0 else None

        # One [iteration, train loss, test loss] row per evaluation that actually ran
        loss_history = []
        
        iters = 0
        for _ in range(self.train_params['max_epochs']):
            for data in self.train_dl:
                tokens = torch.tensor(self.tokenizer.encode_as_ids(data[0][0]), dtype=torch.long)
                for (X, Y) in self._yield_batch(tokens, util_rate=0.5):
        
                    # Evaluate every 'eval_interval' steps and once more right before the final step
                    if (iters % eval_interval == 0) or (iters == max_iters - 1):
                        losses = self._estimate_loss()
                        print(f"{iters:>8}/{max_iters:>8}: Train - {losses[0]:>7.4f}, Test - {losses[1]:>7.4f}")
                        loss_history.append([iters, losses[0], losses[1]])
                            
                    _, loss = self.model(X, Y)
                    optimizer.zero_grad(set_to_none=True)
                    loss.backward()
                    optimizer.step()
                    iters += 1
        
                    if save_interval is not None and iters % save_interval == 0:
                        self._save_checkpoint(loss_history)
        
                    if iters >= max_iters:
                        print(f"\nTraining Complete - Sample Generation:\n{'-'*60}")
                        starting_tokens = self.tokenizer.encode_as_ids("Once upon a time, there was a frog")
                        print(self.tokenizer.decode(self.model.generate(starting_tokens, max_new_tokens=1000)))
                        return

        # Reached only when the dataloader ran out of batches before 'max_iters' steps were taken
        print(f"Data exhausted after {iters} iterations (max_iters = {max_iters}); stopping early.")


    def _save_checkpoint(self, loss_history):
        """ Saves the current model weights plus the loss rows recorded so far (as CSV) under '<parent of cwd>/transformer' """
        base_dir = os.path.join(os.path.dirname(os.getcwd()), 'transformer')
        weights_dir = os.path.join(base_dir, 'model_weights')
        loss_dir = os.path.join(base_dir, 'loss_results')
        # np.savetxt does not create missing directories, so make sure both targets exist
        os.makedirs(weights_dir, exist_ok=True)
        os.makedirs(loss_dir, exist_ok=True)

        # NOTE(review): the stamp is day_month_year_MINUTE_HOUR - confirm that minute-before-hour is intended.
        # Two saves within the same minute map to the same filename and overwrite each other.
        filename = datetime.now().strftime("GPTLanguageModel_%d_%m_%Y_%M_%H")
        self.model.save_model(directory=weights_dir, filename=filename + '.model')
        csv_path = os.path.join(loss_dir, filename + '.csv')
        np.savetxt(csv_path, np.asarray(loss_history), delimiter=",", header="Iteration,Train Loss,Test Loss", comments='')
        print(f"Loss data saved to {csv_path} as CSV")
        
    
    def _yield_batch(self, tokens, util_rate=0.5, iters_per_batch=None):
        """
            Generator function that takes in tokenized text and yields a number of batches based on the utilization rate or iters_per_batch.
    
            Inputs:
                tokens:           (1-D torch tensor of token ids) The text provided by the dataloader, encoded as IDs from the tokenizer
                util_rate:        (float) Value from (0, 1] that specifies the % of possible batches that are generated before moving to next sample
                iters_per_batch:  (int) If set to None, we use util rate to determine num batches from this data. If set to an int, we use
                                  that number of batches (capped at the number of full batches available).
    
            Yields (X, Y) tensor pairs of shape [batch_size, context_size] on self.device, where Y is X shifted one token ahead.
            Yields nothing when the sample is not longer than the context window.
        """
        # Store vars for convenient use
        context_size = self.model.params['context_size']
        batch_size = self.model.params['batch_size']

        # Number of valid starting positions (each one needs context_size + 1 tokens)
        sample_length = len(tokens) - context_size
        if sample_length <= 0:
            return  # Sample too short for a single window; torch.randperm would reject a negative size

        shuffled_indices = torch.randperm(sample_length)
        
        # Compute number of batches we'll yield
        if iters_per_batch is None:
            num_batches = int( (sample_length * util_rate) // batch_size ) 
        else:
            num_batches = min(iters_per_batch, int(sample_length // batch_size)) 
    
        # Generate batches
        for batch in range(num_batches):
            ix = shuffled_indices[(batch * batch_size):((batch+1) * batch_size)].tolist()    # List (length = B) of starting indices
            X = torch.stack([tokens[i:(i+context_size)] for i in ix])                        # [B,T] batch of inputs
            Y = torch.stack([tokens[(i+1):(i+context_size+1)] for i in ix])                  # [B,T] batch of outputs
            X, Y = X.to(self.device), Y.to(self.device)
            yield X, Y


    @torch.no_grad()
    def _estimate_loss(self):
        """ Function to estimate our loss (train and test). Returns the scaled mean train and test loss (as a tuple of floats) """
        self.model.eval()
        try:
            train_loss = self._mean_eval_loss(self.train_dl)
            test_loss = self._mean_eval_loss(self.test_dl)
        finally:
            # Always hand the model back in training mode, even if evaluation raised
            self.model.train()
        return train_loss, test_loss


    @torch.no_grad()
    def _mean_eval_loss(self, dataloader):
        """
            Averages the model loss over up to 'eval_samples' samples from 'dataloader', using at most
            'iters_per_eval_sample' batches per sample. Returns a python float (nan if no batch could be formed).
        """
        max_samples = self.train_params['eval_samples']
        iters_per_sample = self.train_params['iters_per_eval_sample']

        total_loss = torch.tensor(0.0, device=self.device)
        batches_seen = 0
        for sample_num, data in enumerate(dataloader):
            tokens = torch.tensor(self.tokenizer.encode_as_ids(data[0][0]), dtype=torch.long, device=self.device)
            for (X, Y) in self._yield_batch(tokens=tokens, iters_per_batch=iters_per_sample):
                _, loss = self.model(X, Y)
                total_loss += loss
                batches_seen += 1
            if sample_num == (max_samples - 1):
                break  # We've gone through 'eval_samples' samples

        if batches_seen == 0:
            return float('nan')

        # Divide by the number of batches that were actually evaluated; short samples or a small
        # dataloader can yield fewer than eval_samples * iters_per_eval_sample batches.
        # NOTE(review): the extra sqrt(batch_size) factor is kept so values stay comparable with
        # earlier runs, but it is not part of a plain mean - confirm whether it is intended.
        scale = batches_seen * (self.model.params['batch_size'] ** 0.5)
        return (total_loss / scale).item()

# Train Our Model
---

In [15]:
# Architecture settings handed to GPTLanguageModel
model_params = {
    'batch_size': 32,
    'context_size': 32,
    'embed_dim': 512,
    'vocab_size': 16384,
    'n_head': 6,
    'n_layer': 6,
    'dropout': 0.2
}

# Optimisation / evaluation schedule consumed by ModelTrainer.
# Iteration counts are written as ints (not 1e4 / 1e3 floats) so that progress is printed as
# "10000" rather than "10000.0" and the interval checks run on integer arithmetic.
train_params = {
    'max_iters': 10_000,            # total optimizer steps
    'eval_interval': 1_000,         # run a loss estimate every this many steps
    'learning_rate': 3e-4,
    'max_epochs': 1,                # passes over the training dataloader
    'eval_samples': 20,             # dataloader samples used per loss estimate
    'iters_per_eval_sample': 2      # batches drawn from each of those samples
}

# Prefer the GPU when one is visible to PyTorch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [16]:
# Build the train / test dataloaders from a single MyDataLoader instance
dl = MyDataLoader(promptuser=False, batch_size=1, shuffle=True)
train_dataloader, test_dataloader = dl.get_train_dataloader(), dl.get_test_dataloader()

# Tokenizer used to turn raw text into token ids (and back)
tokenizer = MyTokenizer()

# GPT model configured from model_params for the selected device
model = GPTLanguageModel(model_params, device)

Embeddings loaded successfully from C:\Users\lucas\Desktop\Lucas\Coding\ML Projects\Embedding Model\embedding_model\embeddings\v16384_d512_4_26_24.pth.


In [None]:
# Wire the model, tokenizer and dataloaders into a trainer
trainer = ModelTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dl=train_dataloader,
    test_dl=test_dataloader,
    train_params=train_params,
    device=device,
)

# Run training, checkpointing twice over the course of the run
trainer.train_model(num_saves=2)

35.69152 M parameters
        0/10000.0: Train -     0.3058, Test -     0.3052
     1000/10000.0: Train -     0.2397, Test -     0.2360
     2000/10000.0: Train -     0.2223, Test -     0.2223


# Helpful for Debugging / Confirmation
---

In [None]:
# Test to ensure that our embeddings are working as expected:
# print the 11 tokens whose embeddings have the largest dot product with the query word's embedding.
embeds = model.token_embedding_table.weight
query_word = "boy"    # named so that it does not shadow the builtin `str` for the rest of the kernel session
tok = tokenizer.encode_as_ids(query_word)[0]    # only the first token id of the query is used
emb_str = embeds[tok:tok+1, :].T
with torch.no_grad():    # inspection only; no autograd graph needed for the matmul on the weight Parameter
    indices = torch.topk((embeds@emb_str).flatten(), 11).indices
for i, idx in enumerate(indices):
    print(f"Tok {i:>2}: \"{tokenizer.decode(idx.tolist())}\"")