This is the entry point for the project.
The choice of .ipynb it's for speeding up the training using Google Colab or Kaggle machine.

In [6]:
# Hyperparameters all to be set here
block_size = 128 #sequence lenght
embedding_dim = 256  # Embedding dimensions
num_heads = 8  # Number of attention heads
num_layers = 4  # Number of transformer blocks
ff_hid_dim = 512 # feed forward hidden dimension

## Dataset loading
In the following code block we instanciate the dataset and use some method for analyse it and preprocess it.

### Comments 
- as of now it's a little bit too verbose the analysis, consider adding a verbose parameter in the dataset_analysis method

In [7]:
# Loading the dataset
import torch
from dataset import CharDataset
from torch.utils.data import DataLoader, random_split

torch.manual_seed(0)

try: 
    with open('dataset/dataset.txt', 'r') as file:
        text = file.read()
except FileNotFoundError:
    with open('data/dataset.txt', 'r') as file:
        text = file.read()

# Create dataset
dataset = CharDataset(text, block_size)

# Analyze original dataset
dataset.dataset_analysis()

# Remove less frequent characters and analyze the dataset
dataset.remove_less_frequent_chars(10).dataset_analysis()

dataset.preprocess()

if 'train_dataset' in globals():
    train_loader = DataLoader(train_dataset, batch_size=2048, shuffle=True)
else:
    loader = DataLoader(dataset, batch_size=2048, shuffle=True)

### make this work in the future, as of now just use the whole dataset for training
# # Split the dataset into training and validation sets
# percentage = 0.9
# split_point = int(len(dataset) * percentage)

# # Slice the text for training and validation
# train_text = text[:split_point]
# val_text = text[split_point:]

# # doing stupid split for now
# train_dataset = CharDataset(train_text, block_size)
# val_dataset = CharDataset(val_text, block_size)

# # Instantiate DataLoader objects
# train_loader = DataLoader(train_dataset, batch_size=2048, shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=2048, shuffle=False)

INFO:CharDataset:### Dataset Analysis ###
INFO:CharDataset:### Comparison with Original Dataset ###
INFO:CharDataset:Original Total Characters: 1115394
INFO:CharDataset:Current Total Characters: 1115394
INFO:CharDataset:Original Unique Characters: 65
INFO:CharDataset:Current Unique Characters: 65
INFO:CharDataset:### Detailed Current Dataset Analysis ###
INFO:CharDataset:### Characters and Frequencies ###
INFO:CharDataset:'␣': 169892
INFO:CharDataset:'e': 94611
INFO:CharDataset:'t': 67009
INFO:CharDataset:'o': 65798
INFO:CharDataset:'a': 55507
INFO:CharDataset:'h': 51310
INFO:CharDataset:'s': 49696
INFO:CharDataset:'r': 48889
INFO:CharDataset:'n': 48529
INFO:CharDataset:'i': 45537
INFO:CharDataset:'\n': 40000
INFO:CharDataset:'l': 33339
INFO:CharDataset:'d': 31358
INFO:CharDataset:'u': 26584
INFO:CharDataset:'m': 22243
INFO:CharDataset:'y': 20448
INFO:CharDataset:',': 19846
INFO:CharDataset:'w': 17585
INFO:CharDataset:'f': 15770
INFO:CharDataset:'c': 15623
INFO:CharDataset:'g': 13356
I

## Instanciate the model
Here we set the least amount of parameter needed for the model and call the model summary method

### comments
- as of now the model summary doesn't work (TODO)

In [8]:
from model import CharTransformer

# usually should be train_dataset, but for now just use the whole dataset for training
if 'train_dataset' in globals():
    vocabulary_size = train_dataset.vocabulary_size
else:
    vocabulary_size = dataset.vocabulary_size

# Initialize the model
model = CharTransformer(vocabulary_size, block_size, embedding_dim, num_heads, num_layers, ff_hid_dim=ff_hid_dim)

# Display the model summary
model.summary()

INFO:CharTransformer:Model initialized
INFO:CharTransformer:### Model summary###
Layer (type:depth-idx)                             Output Shape              Param #
CharTransformerSummaryWrapper                      [32, 128, 39]             --
├─CharTransformer: 1-1                             [32, 128, 39]             32,768
│    └─Embedding: 2-1                              [32, 128, 256]            9,984
│    └─Dropout: 2-2                                [32, 128, 256]            --
│    └─ModuleList: 2-3                             --                        --
│    │    └─TransformerBlock: 3-1                  [32, 128, 256]            527,104
│    │    └─TransformerBlock: 3-2                  [32, 128, 256]            527,104
│    │    └─TransformerBlock: 3-3                  [32, 128, 256]            527,104
│    │    └─TransformerBlock: 3-4                  [32, 128, 256]            527,104
│    └─LayerNorm: 2-4                              [32, 128, 256]            512
│    └

## Training of the model

In [4]:
from train import Trainer
from torch import optim
from torch import nn

# Set optimizer and loss function   
optimizer = optim.Adam(model.parameters(), lr=1e-4)
loss_fn = nn.CrossEntropyLoss()

# Initialize the trainer
trainer = Trainer(model, optimizer, loss_fn, loader)

# Train the model
# trainer.train()

## Generate text

In [5]:
from generate import TextGenerator

generator = TextGenerator(model, dataset)

generator.load_model("models/model_checkpoint_13_kaggle.pth")
# Generate text
for temperature in torch.arange(0.1, 2, 0.1):
    generated_text = generator.generate("o god o god",length = 100, temperature=temperature)
    print(f"Temperature: {temperature}")
    print(generated_text,"\n")

print(generated_text)

Model loaded from models/model_checkpoint_13_kaggle.pth
Temperature: 0.10000000149011612
o god o god me to the son,
and the see the see the son the see the world,
and the see the shall be the stand th 

Temperature: 0.20000000298023224
o god o god for the seems
and for the first be not for the seek the still
the make the comes and the be man so  

Temperature: 0.30000001192092896
o god o god me the prince of the stands
the seem to the hath part the world and the bear
that doth the father a 

Temperature: 0.4000000059604645
o god o god richard in the father eyes
that i may the like a king of the soldiers
shall so for the be with hear 

Temperature: 0.5
o god o god have me to seem to the leave.

polixenes:
what i for this not so me my death duke.

king richard ii 

Temperature: 0.6000000238418579
o god o god soul, heart her,
the that children breathes the speak their
and his windown that made and my lord.
 

Temperature: 0.699999988079071
o god o god do instreason the gate thee.

escalus