# Training Model

In [1]:
from config import ModelConfig

# File locations shared by the cells of the bigram section.
input_data_dir = "gala.txt"                # text corpus used for tokenizer fitting and training
model_dir = "bigram_model_state.pth"       # path handed to the Trainer for the model state
tokenizer_dir = "character_tokenizer.pkl"  # where the fitted tokenizer is saved / loaded

In [2]:
from tokenizer import CharacterTokenizer

# Fit the character tokenizer on the corpus named by `input_data_dir` -- the same path
# the Trainer receives -- instead of repeating the file name as a literal. Changing the
# corpus in the config cell then keeps the vocabulary and the training data in sync.
tokenizer = CharacterTokenizer()
tokenizer.train(input_data_dir)

# Persist the fitted tokenizer so the Trainer and the inference cells can load it.
tokenizer.save(tokenizer_dir)

In [3]:
from trainer import Trainer

# Build the trainer from the saved tokenizer file, the training corpus and the path for
# the model state. No `model=` argument is given, so the Trainer's default model is used
# (presumably the bigram model, given the checkpoint name -- confirm in trainer.py).
trainer = Trainer(tokenizer_dir, input_data_dir, model_dir)

In [4]:
# Run training. Three values come back: `lossi` and `val_lossi` (presumably the recorded
# training and validation losses -- verify against Trainer.run_training) and the model
# object that the inference cells below call `generate` on.
lossi, val_lossi, model = trainer.run_training()

# Inference

In [6]:
from tokenizer import CharacterTokenizer
import torch

# Reload the tokenizer from the file written in the training section, so inference
# encodes and decodes with the tokenizer file the Trainer was given.
tokenizer = CharacterTokenizer.load(tokenizer_dir)

In [7]:
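# Optional: rebuild the bigram model from its saved state instead of reusing the
# in-memory `model` returned by training. Uncomment the lines below to use it.
# from custom_models import BigramModel
# model = BigramModel(tokenizer.num_tokens)
# model.load_state_dict(torch.load(model_dir))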

In [8]:
import torch

# Use the GPU when PyTorch can see one; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Prompt: a 1x1 batch holding only the start-token id, created directly on `device`.
start_tokens = torch.tensor(
    [[tokenizer.token_to_idx[tokenizer.start_token]]],
    device=device,
)

# Ask the model for up to 512 new tokens; the end-token id is passed along
# (presumably so generation can stop early -- confirm in the model's `generate`).
tokens = model.generate(
    start_tokens,
    max_new_tokens=512,
    end_token_idx=tokenizer.token_to_idx[tokenizer.end_token],
)

In [9]:
# Decode the generated batch and show its first entry; the prompt held a single
# sequence, so index 0 is the only sample.
print(tokenizer.decode_batch(tokens)[0])

# Summary
- bigram validation loss: 2.4880

# Decoder Model

In [10]:
# File locations for the decoder section; the tokenizer and corpus are the same files as
# in the bigram section, only the model-state path differs.
input_data_dir = "gala.txt"
tokenizer_dir = "character_tokenizer.pkl"
model_dir = "decoder_model_multihead_blocks.pth"

In [11]:
from trainer import Trainer

# Same Trainer inputs as the bigram run, but `model='decoder'` selects the decoder model
# and `model_dir` now points at the decoder's state file.
trainer = Trainer(tokenizer_dir, input_data_dir, model_dir, model='decoder')

In [12]:
# Train the decoder. This rebinds `lossi`, `val_lossi` and `model`, replacing the values
# left by the bigram run; the plot and generation cells below use these new values.
lossi, val_lossi, model = trainer.run_training()

In [None]:
import matplotlib.pyplot as plt

# x positions: one per recorded training-loss value, numbered from 1. Whether each entry
# is an epoch or an evaluation step depends on what the Trainer records -- TODO confirm.
epochs = range(1, len(lossi) + 1)

# Draw both curves on one 8x6 inch axes through the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(epochs, lossi, label='Training Loss')
ax.plot(epochs, val_lossi, label='Validation Loss', linestyle='--')

# Labelling and cosmetics.
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Training and Validation Loss')
ax.legend()
ax.grid(True)

plt.show()

In [14]:
from tokenizer import CharacterTokenizer
import torch

# Reload the tokenizer from `tokenizer_dir` for decoding the decoder model's output.
tokenizer = CharacterTokenizer.load(tokenizer_dir)

In [15]:
# Use the GPU when PyTorch can see one; otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Build the prompt in this cell instead of relying on `start_tokens` left over from the
# bigram section, so the decoder section also works when run on its own.
# The prompt is a 1x1 batch containing only the start-token id.
start_tokens = torch.tensor([[tokenizer.token_to_idx[tokenizer.start_token]]]).to(device)

# Generate up to 512 new tokens with the freshly trained decoder and print the sample.
# NOTE(review): assumes the Trainer leaves `model` on the same device chosen here.
tokens = model.generate(start_tokens, max_new_tokens=512, end_token_idx=tokenizer.token_to_idx[tokenizer.end_token])
print(tokenizer.decode_batch(tokens)[0])

# Summary
- Multi-head attention loss: 1.22

# Testing

In [16]:
# File locations for the testing section: the saved tokenizer, the corpus, and the
# decoder state file that is loaded back from disk below.
input_data_dir = "gala.txt"
tokenizer_dir = "character_tokenizer.pkl"
model_dir = "decoder_model_multihead_blocks.pth"

In [17]:
from nn import DecoderModel
from tokenizer import CharacterTokenizer
from config import ModelConfig
import torch

# Restore the tokenizer; it supplies the token count and the pad-token id that the model
# constructor needs.
tokenizer = CharacterTokenizer.load(tokenizer_dir)

# Rebuild the decoder with the hyperparameters from ModelConfig. They have to match the
# ones used when the state file was written, otherwise `load_state_dict` rejects it.
model_testing = DecoderModel(tokenizer.num_tokens,
                     ModelConfig.d_model,
                     ModelConfig.head_dim,
                     ModelConfig.block_size,
                     ModelConfig.n_head,
                     ModelConfig.ffn_dim,
                     ModelConfig.layers,
                     ModelConfig.dropout,
                     tokenizer.token_to_idx[tokenizer.pad_token])

# `map_location="cpu"` lets a state file saved from a GPU run load on a machine without
# CUDA; the freshly constructed model lives on the CPU at this point anyway.
# NOTE(review): `torch.load` unpickles the file -- only load checkpoints you trust.
model_testing.load_state_dict(torch.load(model_dir, map_location="cpu"))

In [19]:
# Generate with `model_testing` (the model restored from the state file above) rather
# than the in-memory `model` from training; otherwise this section never exercises the
# saved weights and fails on a fresh kernel where `model` does not exist.
# The device is chosen at runtime instead of hard-coding 'cuda', so the cell also runs
# on machines without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The restored model was built on the CPU; move it next to the prompt tensor and switch
# to evaluation mode so dropout is inactive while sampling.
# NOTE(review): assumes DecoderModel is a torch.nn.Module (it exposes load_state_dict).
model_testing = model_testing.to(device)
model_testing.eval()

# Prompt: a 1x1 batch containing only the start-token id.
start_tokens = torch.tensor([[tokenizer.token_to_idx[tokenizer.start_token]]]).to(device)
tokens = model_testing.generate(start_tokens, max_new_tokens=512, end_token_idx=tokenizer.token_to_idx[tokenizer.end_token])
print(tokenizer.decode_batch(tokens)[0])