This is the entry point for the project.
A notebook (.ipynb) is used so that training can be sped up by running it on Google Colab or Kaggle machines.

In [1]:
# Hyperparameters: every tunable value is meant to be set in this cell.

# -- data --
block_size = 128  # sequence length

# -- transformer architecture --
num_layers = 8  # number of transformer blocks
num_heads = 8  # number of attention heads
embedding_dim = 512  # embedding dimensions
ff_hid_dim = 1024  # feed-forward hidden dimension


## Dataset loading
In the following code block we instantiate the dataset and use some of its methods to analyse and preprocess it.

### Comments 
- The analysis output is currently a bit too verbose; consider adding a `verbose` parameter to the `dataset_analysis` method.

In [2]:
# Loading the dataset
import torch
from dataset import CharDataset
from torch.utils.data import DataLoader, random_split

# Seed torch's RNG so runs are repeatable.
torch.manual_seed(0)

# The corpus is looked up under 'dataset/' first and 'data/' second; the first
# location that exists is used. The encoding is given explicitly so the decoded
# text does not depend on the platform's default locale.
text = None
for corpus_path in ('dataset/dataset.txt', 'data/dataset.txt'):
    try:
        with open(corpus_path, 'r', encoding='utf-8') as file:
            text = file.read()
    except FileNotFoundError:
        continue
    break
if text is None:
    raise FileNotFoundError(
        "dataset.txt was not found in either 'dataset/' or 'data/'"
    )

# Create dataset
dataset = CharDataset(text, block_size)

# Analyze original dataset
dataset.dataset_analysis()

# Remove less frequent characters and analyze the dataset
dataset.remove_less_frequent_chars(10).dataset_analysis()

dataset.preprocess()

# Build the training DataLoader. `loader` is always bound because the training
# cell reads that name; when a `train_dataset` split is already present in the
# kernel it is used as the source and `train_loader` is bound to the same
# loader, so either name works downstream.
batch_size = 2048
training_source = train_dataset if 'train_dataset' in globals() else dataset
loader = DataLoader(training_source, batch_size=batch_size, shuffle=True)
if training_source is not dataset:
    train_loader = loader

### TODO: make the train/validation split below work; for now the whole dataset is used for training
# # Split the dataset into training and validation sets
# percentage = 0.9
# split_point = int(len(dataset) * percentage)

# # Slice the text for training and validation
# train_text = text[:split_point]
# val_text = text[split_point:]

# # doing stupid split for now
# train_dataset = CharDataset(train_text, block_size)
# val_dataset = CharDataset(val_text, block_size)

# # Instantiate DataLoader objects
# train_loader = DataLoader(train_dataset, batch_size=2048, shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=2048, shuffle=False)

INFO:CharDataset:### Dataset Analysis ###
INFO:CharDataset:### Comparison with Original Dataset ###
INFO:CharDataset:Original Total Characters: 1115394
INFO:CharDataset:Current Total Characters: 1115394
INFO:CharDataset:Original Unique Characters: 65
INFO:CharDataset:Current Unique Characters: 65
INFO:CharDataset:### Detailed Current Dataset Analysis ###
INFO:CharDataset:### Characters and Frequencies ###
INFO:CharDataset:'␣': 169892
INFO:CharDataset:'e': 94611
INFO:CharDataset:'t': 67009
INFO:CharDataset:'o': 65798
INFO:CharDataset:'a': 55507
INFO:CharDataset:'h': 51310
INFO:CharDataset:'s': 49696
INFO:CharDataset:'r': 48889
INFO:CharDataset:'n': 48529
INFO:CharDataset:'i': 45537
INFO:CharDataset:'\n': 40000
INFO:CharDataset:'l': 33339
INFO:CharDataset:'d': 31358
INFO:CharDataset:'u': 26584
INFO:CharDataset:'m': 22243
INFO:CharDataset:'y': 20448
INFO:CharDataset:',': 19846
INFO:CharDataset:'w': 17585
INFO:CharDataset:'f': 15770
INFO:CharDataset:'c': 15623
INFO:CharDataset:'g': 13356
I

## Instantiate the model
Here we pass the minimal set of parameters the model needs and call the model's summary method.

### comments
- as of now the model summary doesn't work (TODO)

In [3]:
from model import CharTransformer

# The vocabulary size should come from the training split; while no
# `train_dataset` exists in the kernel, the full dataset is used instead.
vocabulary_size = (
    train_dataset if 'train_dataset' in globals() else dataset
).vocabulary_size

# Build the model from the hyperparameters set at the top of the notebook
model = CharTransformer(
    vocabulary_size,
    block_size,
    embedding_dim,
    num_heads,
    num_layers,
    ff_hid_dim=ff_hid_dim,
)

# Show the model summary
model.summary()

INFO:CharTransformer:Model initialized
INFO:CharTransformer:### Model summary###
Layer (type:depth-idx)                             Output Shape              Param #
CharTransformerSummaryWrapper                      [32, 128, 39]             --
├─CharTransformer: 1-1                             [32, 128, 39]             65,536
│    └─Embedding: 2-1                              [32, 128, 512]            19,968
│    └─Dropout: 2-2                                [32, 128, 512]            --
│    └─ModuleList: 2-3                             --                        --
│    │    └─TransformerBlock: 3-1                  [32, 128, 512]            2,102,784
│    │    └─TransformerBlock: 3-2                  [32, 128, 512]            2,102,784
│    │    └─TransformerBlock: 3-3                  [32, 128, 512]            2,102,784
│    │    └─TransformerBlock: 3-4                  [32, 128, 512]            2,102,784
│    │    └─TransformerBlock: 3-5                  [32, 128, 512]            2

## Training of the model

In [4]:
from train import Trainer
from torch import optim
from torch import nn

# Set optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=1e-4)
loss_fn = nn.CrossEntropyLoss()

# Pick the DataLoader to train on. The data-loading cell binds either
# `train_loader` (when a train/validation split exists) or `loader` (whole
# dataset). Prefer the training split when it is available so that validation
# data is not trained on, and fall back to the whole-dataset loader otherwise.
training_loader = train_loader if 'train_loader' in globals() else loader

# Initialize the trainer
trainer = Trainer(model, optimizer, loss_fn, training_loader)

# Load model
# trainer.load_model("models/second_run_medium/model_checkpoint_9_kaggle.pth")

# Train the model
# trainer.train()

## Generate text

In [5]:
from generate import TextGenerator

# Checkpoint file handed to the generator's load_model call
checkpoint_path = "models/validation/model_checkpoint_9_prelayernorm.pth"

# Wrap the model and dataset in a text generator, then load the checkpoint
generator = TextGenerator(model, dataset)
generator.load_model(checkpoint_path)

Model loaded from models/validation/model_checkpoint_9_prelayernorm.pth


In [6]:
# Generate one 250-character sample per temperature, from 0.2 to 1.2 in steps
# of 0.2. linspace fixes the number of values explicitly (arange with a
# fractional step can gain or lose the last element through rounding). Each
# item is still a 0-dim tensor, so `generate` receives the same type as before.
for temperature in torch.linspace(0.2, 1.2, steps=6):
    generated_text = generator.generate("\n", length=250, temperature=temperature)
    # Print with one decimal so float32 noise (0.20000000298...) is not shown
    print(f"Temperature: {temperature.item():.1f}")
    print(generated_text, "\n")

Temperature: 0.20000000298023224


clown:
indeed, if it be too much blood, your pratest, as it were, farewell.

archidamus:
i think so. kill'd!
she i kill'd! i did so: but thou strikest me
sorely, to say i did; it is a power to die.

nurse:
hie to your gaoler, then, if you should say 

Temperature: 0.4000000059604645

that i must be content: it is now to seem the ladies,
i heard yet say the fortune and thee from?
o, but the end the fire of thy foul sin!
thou wilt not slaughter to me commend thee.

first murderer:
why, then he will say we stabbed him stand henry.

 

Temperature: 0.6000000238418579

that i must be contented. for this last,
i tender hear from the garland of my soul,
my fortune and this land will forget your command.

lady grey:
why, then i will give you good den  blood,
and you but make, my maidenhead!

nurse:
how! waay! why come 

Temperature: 0.800000011920929

brutus:
ay, but wherefore dost thou regin thy face?
ttell the pedlars he was the before the war
that calls an