In [1]:
# Make the project's modules importable from this notebook so the absolute
# imports used by the following cells can be resolved.
import sys
from pathlib import Path


def add_to_sys_path(path: Path) -> None:
    """Prepend ``path`` to ``sys.path`` unless it is already listed.

    ``sys.path`` holds strings, so membership is tested on ``str(path)``.
    Comparing a ``Path`` object directly never matches a string entry and
    would insert a duplicate every time the cell is re-run.

    Args:
        path: Directory to make importable.
    """
    entry = str(path)
    if entry not in sys.path:
        sys.path.insert(0, entry)


# Parent of the current working directory (assumes the notebook is started
# from a sub-directory of the project — TODO confirm).
project_root = Path().resolve().parent
add_to_sys_path(project_root)
# The directory above the project root is added as well and, being inserted
# last, takes precedence over `project_root` on a fresh kernel.
add_to_sys_path(project_root.parent)

In [2]:
# Select the compute device: CUDA when PyTorch reports it as available,
# otherwise the CPU.
# NOTE(review): `device` is not passed to any call in the cells visible in this
# notebook — confirm that the training code picks up the device on its own.
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

## Get the model

In [3]:
from transformer.model_encoder import TransformerEncoder

In [4]:
# Instantiate the encoder. Arguments are grouped by apparent role (roles are
# inferred from the argument names — confirm against TransformerEncoder);
# every name and value is unchanged.
encoder = TransformerEncoder(
    # The same literals are passed to the tokenizer (vocab_size) and to the
    # dataset (seq_len) in later cells; keep them in sync when editing.
    vocab_size=52027,
    seq_len=1024,
    # Depth and widths.
    n_layers=6,
    hidden_size=512,
    ff_size=2048,
    # n_heads * d_k = 8 * 64 = 512, i.e. equal to hidden_size.
    n_heads=8,
    d_k=64,
    # presumably the dropout rate applied to the positional encoding — TODO confirm
    dropout_pe=0.1,
)

## Get the Tokenizer

In [5]:
from tokenizer import get_tokenizer

In [6]:
# Build the tokenizer through the project's `get_tokenizer` factory.
# NOTE(review): tiktoken's "gpt2" encoding reports 50257 tokens, while 52027 is
# requested here (the same value is passed to the encoder as vocab_size) —
# confirm this is intentional and not a digit transposition.
tokenizer = get_tokenizer(
    vocab_size=52027,
    tokenizer_model='gpt2',
    tokenizer_kind='tiktoken',
)

## Get the Dataset

In [7]:
from dataset import TinyStoryDataset

In [8]:
# Instantiate the dataset with the same seq_len (1024) that is passed to the
# encoder; the text itself is supplied later through `dataset.tokenize(...)`.
dataset = TinyStoryDataset(seq_len=1024)

## Load the data and tokenize it

In [9]:
from pathlib import Path
from dataset import load_data
# Location of the TinyStories text file. The path is relative, so it is
# resolved against the working directory when the file is opened.
DATAPATH = Path("..") / "data" / "TinyStories.txt"

In [10]:
# Load the corpus from DATAPATH. The elapsed time shown in this cell's output
# is printed while the call runs.
# NOTE(review): the type of the returned object is not visible here — confirm
# against `dataset.load_data`.
data = load_data(DATAPATH)

Loading data took 0.83 seconds


In [11]:
# Tokenize the loaded corpus with the tokenizer created earlier and report how
# many tokens were produced.
# NOTE(review): inplace=True presumably stores the tokens on `dataset` itself,
# so re-running this cell may not be idempotent — confirm against
# TinyStoryDataset.tokenize.
n_tokens: int = dataset.tokenize(
    tokenizer,
    data,
    inplace=True,
)
print(f"Number of tokens: {n_tokens}")

Tokenization took 70.37 seconds
Number of tokens: 476111723


## Train the model

In [12]:
from training import train

In [13]:
# Start training the encoder on the tokenized dataset.
# NOTE(review): neither the `device` chosen earlier nor a random seed is passed
# here — confirm that `train` handles device placement and seeding itself.
train(
    model=encoder,
    dataset=dataset,
    # presumably the number of passes over the dataset — TODO confirm
    epoch=10,
    batch_size=32,
)