In [1]:
import torch
from torch.utils.data import DataLoader
from timeit import default_timer as timer
from tqdm.notebook import tqdm

from tinystories import *
from llama import LLAMA

In [3]:
def evaluate(model, dataloader, loss_fn, pad_idx, device):
    model.eval()
    losses = 0

    for tgt, length in tqdm(dataloader):
        tgt = tgt.to(device)
        tgt_input = tgt[:-1, :]
        tgt_mask, tgt_padding_mask = create_mask(tgt_input, pad_idx, device)
        logits = model(tgt_input, tgt_mask, tgt_padding_mask)
        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        losses += loss.item()
    return losses / len(dataloader)


def train(n_epochs, model, pad_idx, optimizer, train_loader, val_loader, device, evaluation_step=4000):
    loss_fn = torch.nn.CrossEntropyLoss(ignore_index=pad_idx)

    for epoch in range(1, n_epochs + 1):
        model.train()
        losses = 0

        for i, (tgt, length) in tqdm(enumerate(train_loader)):
            tgt = tgt.to(device)
            tgt_input = tgt[:-1, :]
            tgt_mask, tgt_padding_mask = create_mask(tgt_input, pad_idx, device)
            logits = model(tgt_input, tgt_mask, tgt_padding_mask)
            optimizer.zero_grad()
            tgt_out = tgt[1:, :]
            loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
            loss.backward()
            optimizer.step()
            losses += loss.item()

            if i % evaluation_step == 0:
                val_loss = evaluate(model, val_loader, loss_fn, pad_idx, device)
                print((f"Epoch: {epoch}, Train loss: {(losses / evaluation_step):.3f}, Val loss: {val_loss:.3f}"))
                losses = 0

        val_loss = evaluate(model, val_loader, loss_fn, pad_idx, device)
        print((f"Epoch: {epoch}, Train loss: {(losses / (len(train_loader) % evaluation_step)):.3f}, Val loss: {val_loss:.3f}"))

In [5]:
train_ds = TinyStoriesDataset(data_file="./archive/TinyStoriesV3-GPT4-train.txt", sp_model_prefix="")

Starting tokenizer train...


sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: ./archive/TinyStoriesV3-GPT4-train.txt
  input_format: 
  model_prefix: 
  model_type: BPE
  vocab_size: 15000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: 5
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  differential_privacy

RuntimeError: Internal: /Users/runner/work/sentencepiece/sentencepiece/src/trainer_interface.cc(338) [(output_model_proto_ != nullptr && trainer_spec_.model_prefix().empty()) || (output_model_proto_ == nullptr && !trainer_spec_.model_prefix().empty())] ModelProto and trainer_spec.model_prefix() must be exclusive.

In [None]:
val_ds = TinyStoriesDataset(data_file="./archive/TinyStoriesV3-GPT4-valid.txt", sp_model_prefix="")