In [28]:
import os

import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
from utils.preprocessing import TextProcessor, TokenizeText, TextDataset
from utils.model import GPTLanguageModel

In [29]:
# Input/output locations used by the preprocessing and tokenizer cells.
path_to_txt_files = "data/raw_files"
corpus_file = "data/preprocessed/corpus.txt"
tokenizer_save_path = "artifacts/custom_tokenizer"

# Seed PyTorch's random number generator before any model is built so that weight
# initialisation, dropout masks and sampling are repeatable across
# "Restart Kernel -> Run All". This does not seed Python's `random` or NumPy.
seed = 1337
torch.manual_seed(seed)

# Reduced hyperparameters to prevent OOM
batch_size = 32  # sequences per batch (reduced from 64)
block_size = 128  # tokens per sequence (reduced from 256)
n_embd = 256  # embedding width (reduced from 384)
n_head = 4  # attention heads (reduced from 6)
n_layer = 4  # transformer blocks (reduced from 6)
dropout = 0.1  # slightly reduced
learning_rate = 1e-4  # slightly lower

# NOTE(review): the three values below are not referenced by any cell visible in
# this notebook (training length is set through Trainer(epochs=...)); they are
# kept in case other code reads them.
max_iters = 3000  # reduced iterations
eval_interval = 300  # adjusted
eval_iters = 100  # reduced

In [30]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Preprocessing and tokenization: write the corpus file from the raw text
# directory, then create the tokenizer from that corpus.
processor = TextProcessor(path_to_txt_files)
processor.create_corpus(corpus_file)
tokenizer_obj = TokenizeText(corpus_file, tokenizer_save_path)

Using device: cuda
Corpus saved to data/preprocessed/corpus.txt
Tokenizer trained and saved to artifacts/custom_tokenizer


In [31]:
# Both splits share the tokenizer, corpus file and window length; only the
# `split` argument differs. The train split is constructed first, then val.
train_ds, val_ds = (
    TextDataset(tokenizer_obj, corpus_file, block_size, random=False, split=split_name)
    for split_name in ("train", "val")
)

In [32]:
# Identical loader settings for both splits. No `shuffle` argument is passed, so
# DataLoader's default ordering applies to the training loader as well.
loader_kwargs = {"batch_size": batch_size}
train_dl = DataLoader(dataset=train_ds, **loader_kwargs)
val_dl = DataLoader(dataset=val_ds, **loader_kwargs)

In [33]:
# Collect the architecture settings in one place, then build the model and move
# it to the selected device.
model_config = dict(
    vocab_size=tokenizer_obj.tokenizer.vocab_size,
    n_embd=n_embd,
    block_size=block_size,
    n_head=n_head,
    n_layer=n_layer,
    dropout=dropout,
)
model = GPTLanguageModel(**model_config).to(device)

In [34]:
# Total number of elements across all model parameters, reported in millions.
n_params = sum(p.numel() for p in model.parameters())
print(f'{n_params/1e6:.2f} M parameters')

8.86 M parameters


In [35]:
# Sanity check: print the tensor shapes of the first training batch only.
for inputs, targets in train_dl:
    print(inputs.shape, targets.shape)
    break

torch.Size([32, 128]) torch.Size([32, 128])


In [36]:
from utils.torchtrainer.trainer import Trainer

In [37]:
class ModifiedCrossEntropyLoss(nn.Module):
    """Cross-entropy loss for per-position sequence predictions.

    ``forward`` merges the first two axes of ``logits`` and ``targets`` so that
    every position is passed to ``F.cross_entropy`` as an independent sample.
    """

    def forward(self, logits, targets):
        """Return the mean cross-entropy over all ``B * T`` positions.

        Args:
            logits: 3-D tensor of shape ``(B, T, C)`` holding unnormalised
                scores for ``C`` classes.
            targets: tensor with ``B * T`` class indices (e.g. shape ``(B, T)``).

        Returns:
            Scalar tensor: ``F.cross_entropy`` with its default reduction.
        """
        B, T, C = logits.shape
        # reshape (not view): view raises on non-contiguous inputs such as
        # transposed or sliced tensors; reshape copies only when it has to.
        return F.cross_entropy(logits.reshape(B * T, C), targets.reshape(B * T))
# Instantiate the criterion and place it on the training device in one step.
loss_fn = ModifiedCrossEntropyLoss().to(device)

In [38]:
# The model is placed on `device` here as well, so the parameters handed to the
# optimizer live on the training device when this cell runs.
model = model.to(device)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)

In [39]:
# Interactive check: display the device object selected earlier in the notebook.
device

device(type='cuda')

In [40]:
from utils.torchtrainer.callbacks import IntraEpochReport0, EarlyStopping0
# NOTE(review): in the recorded run a loss line is printed every 756 batches of a
# ~7564-batch epoch, so the argument 10 presumably sets the number of intra-epoch
# reports — confirm against utils.torchtrainer.callbacks.
ier = IntraEpochReport0(10)
# Early stopping on validation loss is currently disabled.
# es = EarlyStopping0(basis="val_loss")

In [41]:
# Build the trainer. The input/output shapes are derived from the configuration
# instead of being hard-coded, so they stay correct when batch_size, block_size or
# the tokenizer vocabulary change. With the current settings they evaluate to the
# previous literals (32, 128) and (32, 128, 11057).
# NOTE(review): how Trainer uses input_shape/output_shape is not visible here.
trainer = Trainer(
    model,
    epochs=2,
    criterion=loss_fn,
    input_shape=(batch_size, block_size),
    output_shape=(batch_size, block_size, tokenizer_obj.tokenizer.vocab_size),
    optimizer=optimizer,
    callbacks=[ier],
    display_time_elapsed=True,
    metrics=[],
    device=device,
)

In [48]:
# Scratch cell: displays the float literal (the same value as learning_rate);
# it has no effect on training.
1e-4

0.0001

In [42]:
# Run training on train_dl with validation on val_dl; the value returned by
# fit() is kept as `history`. NOTE(review): its structure is not shown here.
history = trainer.fit(train_dl, val_dl)

EPOCH 1: : 759it [00:44, 17.46it/s]

 E-1 batch 756 loss: 6.39559


EPOCH 1: : 1515it [01:28, 17.08it/s]

 E-1 batch 1512 loss: 6.10937


EPOCH 1: : 2271it [02:13, 16.84it/s]

 E-1 batch 2268 loss: 5.93774


EPOCH 1: : 3027it [03:00, 16.15it/s]

 E-1 batch 3024 loss: 5.89951


EPOCH 1: : 3783it [03:44, 16.69it/s]

 E-1 batch 3780 loss: 5.84598


EPOCH 1: : 4539it [04:28, 16.96it/s]

 E-1 batch 4536 loss: 5.78299


EPOCH 1: : 5295it [05:13, 17.34it/s]

 E-1 batch 5292 loss: 5.72593


EPOCH 1: : 6051it [05:59, 16.56it/s]

 E-1 batch 6048 loss: 5.71736


EPOCH 1: : 6807it [06:44, 16.84it/s]

 E-1 batch 6804 loss: 5.68515


EPOCH 1: : 7563it [07:29, 16.70it/s]

 E-1 batch 7560 loss: 5.65123


EPOCH 1: : 7564it [07:29, 16.82it/s]


--> Metrics:   loss: 5.6510 ,val_loss: 5.6375
Time elapsed: 467.1709702014923 s


EPOCH 2: : 758it [00:45, 16.35it/s]

 E-2 batch 756 loss: 5.19351


EPOCH 2: : 1514it [01:32, 16.43it/s]

 E-2 batch 1512 loss: 5.13212


EPOCH 2: : 2270it [02:16, 17.00it/s]

 E-2 batch 2268 loss: 5.07668


EPOCH 2: : 3026it [03:01, 16.08it/s]

 E-2 batch 3024 loss: 5.11665


EPOCH 2: : 3782it [03:45, 16.89it/s]

 E-2 batch 3780 loss: 5.12133


EPOCH 2: : 4538it [04:30, 17.32it/s]

 E-2 batch 4536 loss: 5.10328


EPOCH 2: : 5294it [05:15, 17.53it/s]

 E-2 batch 5292 loss: 5.08359


EPOCH 2: : 6050it [05:59, 17.28it/s]

 E-2 batch 6048 loss: 5.09708


EPOCH 2: : 6806it [06:43, 17.72it/s]

 E-2 batch 6804 loss: 5.08601


EPOCH 2: : 7562it [07:28, 16.45it/s]

 E-2 batch 7560 loss: 5.06769


EPOCH 2: : 7564it [07:28, 16.88it/s]


--> Metrics:   loss: 5.0676 ,val_loss: 5.6133
Time elapsed: 932.8882060050964 s


In [43]:
# Scratch cell: builds a 1x1 int64 zero tensor on `device` and displays it
# (presumably explored as a minimal generation context — the result is not stored).
torch.zeros((1, 1), dtype=torch.long, device=device)

tensor([[0]], device='cuda:0')

In [46]:
# Generation
# Encode the prompt and move the token ids to the model's device.
context = tokenizer_obj.tokenize("My name")['input_ids'].to(device=device)

# Switch to evaluation mode so dropout is disabled while sampling, and skip
# autograd bookkeeping since no gradients are needed for generation.
model.eval()
with torch.no_grad():
    generated_ids = model.generate(context, max_new_tokens=300)[0].tolist()
generated_text = tokenizer_obj.untokenize(generated_ids)

print("Generated Text:")
print(generated_text)
# Explicit encoding: the platform default may not be able to encode every
# character the tokenizer can produce.
with open("output.txt", "w", encoding="utf-8") as file:
    file.write(generated_text)

Generated Text:
My name was proof , If you last a gentleman . HORTENSIO swift wedded : Such wake , see he may do not be horse and For that liege , A gentleman shall manage break him be members ; And make a maid I know it gener y time , Sla venuto , If you have heard them over ps and miscarried lat m Sh likes here in him ; for so I do weigh , what I will tell him I tell me are as yet clouded as as then , are happily ; therefore not tarry , any object you do believe me ; And Do to her , do do done this vels for lamentation , and in them ? speak thing with greatness When all my brother , To hear thee for a girl , sir , sir , I saw for ear ; and I prize to who but of thee I met . GREMIO : I with beseech you , extra served Before a hot to my side May else , So may make more rough Gremio , I know you else to Canst shall be cruel son , as he , as free ly friends , Even to see thee in my will ne ' er be . Servant : hear me , York of me ? PETRUCHIO , him not share the repro De bble from any han

In [2]:
import matplotlib.pyplot as plt
import torch

In [3]:
# Placeholder loss curves (hand-written literals) used to develop the plotting
# cells below; these are not values produced by a training run.
history = dict(
    loss=torch.tensor([1, 2, 3]),
    val_loss=torch.tensor([4, 7, 6]),
)

In [4]:
# Convert the training-loss tensor to a plain Python list for plotting.
c = history['loss'].tolist()

In [5]:
import seaborn as sns

In [None]:
# Quick look at the training-loss list using seaborn's line plot.
sns.lineplot(c)

In [None]:
# Scratch cell: plots three literal points to check that matplotlib renders.
plt.plot([1,2,3])

[<matplotlib.lines.Line2D at 0x24a3d5a67b0>]

In [None]:
# Plot training and validation loss on a dedicated Figure/Axes. The explicit
# objects mean the saved image cannot pick up lines left on pyplot's implicit
# "current figure" by earlier plotting cells.
fig, ax = plt.subplots()
ax.plot(history['loss'].tolist(), label='Training Loss')
ax.plot(history['val_loss'].tolist(), label='Validation Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Loss vs. Epochs')
ax.legend()
ax.grid(True)
fig.savefig('loss_plot.png', dpi=300, bbox_inches='tight')  # Save as PNG with high resolution
plt.show()