## Data loading and tokenization

In [None]:
# For any notebook
!git clone https://github.com/nMaax/danteGPT
!pip install -r danteGPT/requirements.txt

import os
os.chdir('danteGPT')

In [3]:
import yaml

# Parse config.yaml once and keep a handle on its 'model' section, which
# holds every hyperparameter used in this notebook.
with open('config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

model_cfg = config['model']

# Tokenizer / dataset settings
tokenizer_training_size = model_cfg['tokenizer_training_size']
train_test_ratio = model_cfg['train_test_ratio']
vocab_size = model_cfg['vocab_size']

# Batching
block_size = model_cfg['block_size']
batch_size = model_cfg['batch_size']

# Architecture
d_model = model_cfg['d_model']
num_heads = model_cfg['num_heads']
num_transformer_blocks = model_cfg['num_transformer_blocks']
ff_expansion_factor = model_cfg['ff_expansion_factor']
dropout_rate = model_cfg['dropout_rate']

# NOTE: the device is pinned to CPU here; the configured value is bypassed.
device = 'cpu' # config['model']['device']

In [4]:
# Load the whole corpus into memory as a single UTF-8 decoded string
with open('divina_commedia.txt', 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [5]:
# Sanity check: show the first 512 characters of the corpus
print(text[0:512])

INFERNO CANTO 1
Nel mezzo del cammin di nostra vita
mi ritrovai per una selva oscura
ché la diritta via era smarrita.
Ahi quanto a dir qual era è cosa dura
esta selva selvaggia e aspra e forte
che nel pensier rinova la paura!
Tant' è amara che poco è più morte;
ma per trattar del ben ch'i' vi trovai,
dirò de l'altre cose ch'i' v'ho scorte.
Io non so ben ridir com' i' v'intrai,
tant' era pien di sonno a quel punto
che la verace via abbandonai.
Ma poi ch'i' fui al piè d'un colle giunto,
là dove terminava quel


In [6]:
from tokenizer import RegexTokenizer

# `tokenizer_training_size` comes from the config and is used as a multiplier
# of the corpus length (presumably a fraction in (0, 1] -- confirm in
# config.yaml). The derived character count gets its own name so the
# configured value is never overwritten: re-running this cell (e.g. after an
# interrupted training run) therefore yields the same slice every time.
tokenizer_training_chars = int(len(text) * tokenizer_training_size)

# Train the tokenizer on the leading part of the corpus only.
Dantokenizer = RegexTokenizer()
Dantokenizer.train(text[:tokenizer_training_chars], vocab_size=vocab_size)

KeyboardInterrupt: 

In [None]:
# Short module-level aliases for the trained tokenizer's bound methods
encode, decode = Dantokenizer.encode, Dantokenizer.decode

In [None]:
# Round-trip a sample sentence through encode -> decode and print the result
print(
    decode(
        encode('Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura.')
    )
)

In [None]:
import torch

# Encode the full corpus and store the result as an int64 tensor
data = torch.tensor(encode(text), dtype=torch.long)

# Sequential split: the leading `train_test_ratio` share of the encoded data
# is used for training, the remainder for testing.
n = int(train_test_ratio * len(data))
train_data, test_data = data[:n], data[n:]

In [None]:
# Resolve the requested device into a torch.device, falling back to CPU when
# CUDA is requested but unavailable.
#
# `device` starts out as a string but is rebound to a torch.device below, and
# a torch.device is not guaranteed to compare equal to a plain string. The
# request is therefore normalised to its type name first ("cuda:0" -> "cuda"),
# so re-running this cell keeps the previously selected device instead of
# silently dropping to the CPU fallback branch.
requested_device = str(device).split(":")[0]

if requested_device == "cpu":
    device = torch.device("cpu")
    print("Training on CPU.")
elif requested_device == "cuda" and torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available. Training on:", device)
else:
    device = torch.device("cpu")
    print("GPU not available. Training on CPU.")

## Baseline, Transformer-free model

In [None]:
from baseline import DanteBaseline

# Instantiate the baseline model from the shared hyperparameters and move it
# to the selected device.
naiveDante = DanteBaseline(
    vocab_size=vocab_size,
    embedding_dim=d_model,
    context_window=block_size,
    ff_expansion_factor=ff_expansion_factor,
)
naiveDante = naiveDante.to(device)

In [None]:
def novel_generate(model, size=500, device=None):
    """Generate `size` new tokens with `model` and return them decoded.

    Generation is seeded with a (1, 1) int64 tensor holding token id 0.

    Args:
        model: Object exposing `generate(context=..., max_new_tokens=...)`.
            The first row of its result is converted to a list and decoded.
        size: Number of new tokens to request from the model.
        device: Device on which the seed context is created. When None, the
            device of the model's first parameter is used so that the seed
            lives on the same device as the model. If the model exposes no
            parameters, CUDA is used when available, otherwise CPU.

    Returns:
        The value returned by the notebook-level `decode` for the generated ids.
    """
    if device is None:
        parameters = getattr(model, "parameters", None)
        first_param = next(iter(parameters()), None) if callable(parameters) else None
        if first_param is not None:
            # Follow the model: a seed on another device would fail in generate().
            device = first_param.device
        else:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    seed_context = torch.zeros((1, 1), dtype=torch.long, device=device)
    generated = model.generate(context=seed_context, max_new_tokens=size)
    return decode(generated[0].tolist())

In [None]:
# Sample from the baseline before training (default generation length),
# with gradient tracking disabled.
with torch.no_grad():
    print(
        novel_generate(
            model=naiveDante,
            device=device,
        )
    )

In [None]:
from utils import train_model, plot_loss_functions

# Training setup for the baseline: AdamW over all of its parameters
epochs = 5_000
optimizer = torch.optim.AdamW(naiveDante.parameters(), lr=1e-3)

# Run training and keep the loss histories that train_model returns
train_loss_values, test_loss_values = train_model(
    model=naiveDante,
    train_data=train_data,
    test_data=test_data,
    optimizer=optimizer,
    epochs=epochs,
    block_size=block_size,
    batch_size=batch_size,
    device=device,
)

In [None]:
# Hand the baseline's loss histories from train_model to the plotting helper
plot_loss_functions(
    train_loss_values,
    test_loss_values,
    epochs=epochs,
)

In [None]:
# Sample 500 new tokens from the baseline after training, without gradients
with torch.no_grad():
    print(
        novel_generate(
            model=naiveDante,
            size=500,
            device=device,
        )
    )

## Transformer-based (self-attention) implementation

In [None]:
from dante import DanteTransformer

In [None]:
# Instantiate the transformer model from the shared hyperparameters and move
# it to the selected device.
Dante = DanteTransformer(
    vocab_size=vocab_size,
    block_size=block_size,
    d_model=d_model,
    num_heads=num_heads,
    num_transformer_blocks=num_transformer_blocks,
    ff_expansion_factor=ff_expansion_factor,
    dropout_rate=dropout_rate,
)
Dante = Dante.to(device)

In [None]:
# Sample from the transformer before training (default generation length),
# with gradient tracking disabled.
with torch.no_grad():
    print(
        novel_generate(
            model=Dante,
            device=device,
        )
    )

In [None]:
from utils import train_model, plot_loss_functions

# Training setup for the transformer: AdamW over all of its parameters
epochs = 10_000
optimizer = torch.optim.AdamW(Dante.parameters(), lr=1e-3)

# Run training and keep the loss histories that train_model returns
train_loss_values, test_loss_values = train_model(
    model=Dante,
    train_data=train_data,
    test_data=test_data,
    optimizer=optimizer,
    epochs=epochs,
    batch_size=batch_size,
    block_size=block_size,
    eval_every=1000,
    device=device,
)

In [None]:
# Hand the transformer's loss histories from train_model to the plotting helper
plot_loss_functions(
    train_loss_values,
    test_loss_values,
    epochs=epochs,
)

In [None]:
# Sample 500 new tokens from the transformer after training, without gradients
with torch.no_grad():
    print(
        novel_generate(
            model=Dante,
            size=500,
            device=device,
        )
    )