In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import pandas as pd
from datasets import load_from_disk, load_dataset
from tqdm import tqdm

from custom_modules import *
from tokenizers import models, Tokenizer, trainers, pre_tokenizers, processors, decoders
from transformers import PreTrainedTokenizerFast

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run on the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
DEVICE

'cuda'

In [None]:
# Fetch the "roneneldan/TinyStories" dataset via `datasets.load_dataset`
# (the recorded output shows a train and a validation split being generated).
ds = load_dataset("roneneldan/TinyStories")
# One-off step, left disabled: persist the dataset to the local 'data' directory
# so that it can be re-read with load_from_disk('data').
# ds.save_to_disk('data')

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 2119719/2119719 [00:01<00:00, 1117112.70 examples/s]
Generating validation split: 100%|██████████| 21990/21990 [00:00<00:00, 1059975.92 examples/s]


In [3]:
# Load the locally cached copy of the dataset. On a fresh checkout the 'data'
# directory does not exist yet, so fall back to downloading the dataset and
# writing the cache; this keeps "Restart Kernel -> Run All" working without
# having to un-comment a save step by hand.
try:
    ds = load_from_disk('data')
except FileNotFoundError:
    ds = load_dataset("roneneldan/TinyStories")
    ds.save_to_disk('data')

# Return torch tensors (where applicable) when rows are accessed.
ds.set_format(type="torch")

# Train Tokenizer

In [4]:
train_ds = ds['train']


def get_training_corpus():
    """Yield the training split's "text" column in consecutive slices of 1000 rows.

    Written as a generator so the tokenizer trainer can stream the corpus
    batch by batch; the final slice may hold fewer than 1000 rows.
    """
    n_rows = len(train_ds)
    start = 0
    while start < n_rows:
        yield train_ds[start : start + 1000]["text"]
        start += 1000

In [5]:
# Target vocabulary size for the BPE trainer.
vocab_size = 25000

# BPE model operating on byte-level pre-tokenized text (no prefix space added).
tok = Tokenizer(models.BPE())
tok.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

# Special tokens are handed to the trainer so they are part of the vocabulary.
# NOTE(review): `unk_token` is passed to the trainer here, while models.BPE()
# above is built without one - confirm that BpeTrainer honours this keyword.
special_tokens = ["[EOT]", '[UNK]', '[PAD]']
trainer = trainers.BpeTrainer(
    vocab_size=vocab_size,
    special_tokens=special_tokens,
    unk_token='[UNK]',
)
tok.train_from_iterator(get_training_corpus(), trainer=trainer)

# Byte-level decoding / post-processing to mirror the byte-level pre-tokenizer.
tok.decoder = decoders.ByteLevel()
tok.post_processor = processors.ByteLevel(trim_offsets=False)

In [6]:
# Wrap the trained `tokenizers.Tokenizer` in a PreTrainedTokenizerFast so it can
# be called with the truncation / padding options used by `tokenize` below.
fast_tok = PreTrainedTokenizerFast(
    tokenizer_object=tok,
    # "[EOT]" serves as both the beginning- and end-of-sequence token.
    bos_token="[EOT]",
    eos_token="[EOT]",
    pad_token="[PAD]",
    # Pad on the left-hand side of each sequence.
    padding_side="left"
)

In [7]:
max_length = 128


def tokenize(x):
    """Tokenize a batch of stories into next-token-prediction training pairs.

    Each text in ``x['text']`` is encoded with truncation and padding to
    ``max_length + 1`` tokens, with overflowing tokens requested from the
    tokenizer as well. Every encoded row whose reported length equals
    ``max_length + 1`` is split into an input (all tokens but the last) and a
    label (all tokens but the first), so both have ``max_length`` tokens and
    the label is the input shifted by one position.

    Returns a dict with the keys 'input_ids' and 'labels', each a list of
    token-id lists.
    """
    window = max_length + 1
    encoded = fast_tok(
        x['text'],
        truncation=True,
        max_length=window,
        padding="max_length",
        return_overflowing_tokens=True,
        return_length=True
    )
    # Keep only rows that are exactly one full window long.
    full_rows = [
        ids
        for n_tokens, ids in zip(encoded["length"], encoded["input_ids"])
        if n_tokens == window
    ]
    # Slicing already produces fresh lists, so the two outputs do not alias.
    return {
        'input_ids': [ids[:-1] for ids in full_rows],
        'labels': [ids[1:] for ids in full_rows],
    }


In [8]:
# Tokenize every split in batches, dropping the original columns so that only
# the 'input_ids' / 'labels' produced by `tokenize` remain, then expose the
# result as torch tensors placed on DEVICE.
tokenized_datasets = ds.map(
    tokenize,
    batched=True,
    remove_columns=ds["train"].column_names,
).with_format('torch', device=DEVICE)

Map: 100%|██████████| 2119719/2119719 [06:10<00:00, 5716.62 examples/s]
Map: 100%|██████████| 21990/21990 [00:04<00:00, 5139.22 examples/s]


In [15]:
# Prefer the tokenized dataset cached in 'tokenized_data'. If that cache does
# not exist yet (first run), persist the freshly tokenized dataset from the
# previous cell instead of failing, so the notebook survives a clean
# "Restart Kernel -> Run All".
try:
    tokenized_datasets = load_from_disk('tokenized_data')
except FileNotFoundError:
    tokenized_datasets.save_to_disk("tokenized_data")


# Declare Model

In [36]:
# Full decoder-only language model assembled from the custom building blocks.
class DecoderOnlyTransformer(nn.Module):
    """Stack of decoder blocks between a token embedding and a linear output head.

    Args:
        vocab_size: size passed to the embedding and used as the output width
            of the head.
        hidden_dim: width passed to the embedding, to every decoder block
            (as both ``d_in`` and ``d_kq``) and used as the head's input width.
        n_heads: ``n_heads`` argument forwarded to every decoder block.
        n_blocks: number of decoder blocks in the stack.
    """

    def __init__(self, vocab_size, hidden_dim, n_heads, n_blocks):
        super().__init__()
        self.embedding = TransformerEmbedding(vocab_size, hidden_dim)
        self.pe = PositionalEncoding()

        blocks = nn.ModuleList()
        for _ in range(n_blocks):
            blocks.append(DecoderBlock(d_in=hidden_dim, d_kq=hidden_dim, n_heads=n_heads))
        self.decoder_blocks = blocks

        # Projects hidden states back to vocabulary-sized logits (no bias term).
        self.head = nn.Linear(hidden_dim, vocab_size, bias=False)

    def forward(self, x):
        """Map the input ``x`` to logits.

        The embedded input is summed with the positional-encoding output
        (moved to the same device), run through each decoder block in order
        and finally projected by the head.
        """
        hidden = self.embedding(x)
        hidden = hidden + self.pe(hidden).to(hidden.device)
        for layer in self.decoder_blocks:
            hidden = layer(hidden)
        return self.head(hidden)
        

In [80]:
# Instantiate the model to be trained and move it to DEVICE:
# 6 decoder blocks, 4 heads per block, hidden width 128, and an output
# vocabulary matching the tokenizer's `vocab_size`.
model = DecoderOnlyTransformer(vocab_size=vocab_size, hidden_dim=128, n_blocks=6, n_heads=4).to(DEVICE)

# Train Model

In [81]:
# Number of rows in the tokenized training split (shown as the cell's output).
tokenized_datasets['train'].num_rows

4741600

In [82]:
# Wrap both tokenized splits in torch DataLoaders.
batch_size = 64

# Training batches are reshuffled on every pass and a trailing partial batch is dropped.
train_dataloader = DataLoader(
    tokenized_datasets['train'],
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
)

# Validation batches keep the dataset order and include the final partial batch.
valid_dataloader = DataLoader(
    tokenized_datasets['validation'],
    batch_size=batch_size,
)

In [83]:
# Optimizer: AdamW over all model parameters with a learning rate of 1e-4.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-4)

# Loss: cross-entropy that skips every position whose target is the padding token.
loss_fn = nn.CrossEntropyLoss(ignore_index=fast_tok.pad_token_id)

In [84]:
def train_one_epoch(log_every=100):
    """Run one optimisation pass over ``train_dataloader``.

    Uses the notebook-level ``model``, ``optimizer``, ``loss_fn``, ``DEVICE``
    and ``vocab_size``. Every ``log_every`` steps the current step's loss is
    printed and the mean loss of that window is recorded.

    Args:
        log_every: number of steps per logging / averaging window.

    Returns:
        The mean training loss of the most recent complete window. If the
        epoch ends before a single window completes, the mean over the steps
        that did run; ``nan`` if the dataloader yielded no batches.
    """
    running_loss = 0.0      # sum of losses inside the current window
    steps_in_window = 0
    last_loss = None

    progress = tqdm(train_dataloader, total=len(train_dataloader))
    for step, data in enumerate(progress, start=1):
        inputs, labels = data['input_ids'].to(DEVICE), data['labels'].to(DEVICE)
        optimizer.zero_grad()

        outputs = model(inputs)

        # Flatten (batch, seq, vocab) logits and (batch, seq) labels so each
        # token position is scored as an independent classification example.
        loss = loss_fn(outputs.view(-1, vocab_size), labels.view(-1))
        loss.backward()

        optimizer.step()
        running_loss += loss.item()
        steps_in_window += 1

        if step % log_every == 0:
            print(f'Step {step} loss: {loss.item()}')
            last_loss = running_loss / steps_in_window
            # Start a fresh window; without this reset the reported value
            # would be a cumulative sum rather than a per-window mean.
            running_loss = 0.0
            steps_in_window = 0

    if last_loss is None:
        # Fewer than `log_every` steps ran, so no window ever completed.
        last_loss = running_loss / steps_in_window if steps_in_window else float('nan')

    return last_loss

In [None]:
EPOCHS = 1

for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch + 1))

    # Training pass (train mode for the model's layers).
    model.train()
    avg_loss = train_one_epoch()

    # Validation pass: eval mode and no gradient tracking, since no backward
    # pass is performed here.
    model.eval()
    running_vloss = 0.0
    n_vbatches = 0

    with torch.no_grad():
        for vdata in valid_dataloader:
            # Same device placement as in the training step.
            vinputs, vlabels = vdata['input_ids'].to(DEVICE), vdata['labels'].to(DEVICE)
            voutputs = model(vinputs)
            # Flatten logits / labels exactly as in training; CrossEntropyLoss
            # expects the class dimension at dim 1, so the un-flattened
            # (batch, seq, vocab) logits do not match (batch, seq) labels.
            vloss = loss_fn(voutputs.view(-1, vocab_size), vlabels.view(-1))
            # Accumulate a Python float rather than a tensor.
            running_vloss += vloss.item()
            n_vbatches += 1

    # Guard against an empty validation loader.
    avg_vloss = running_vloss / n_vbatches if n_vbatches else float('nan')
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

# Perform inference on model for text generation

In [86]:
def generate(input_text, txt_length, model, tok, temperature=0.8, device=DEVICE):
    """Autoregressively extend ``input_text`` with tokens produced by ``model``.

    Args:
        input_text: prompt string to continue.
        txt_length: maximum number of new tokens to generate.
        model: callable returning logits indexed as ``[batch, position, vocab]``
            for a tensor of token ids.
        tok: tokenizer providing ``encode(..., return_tensors="pt")``,
            ``decode`` and ``eos_token_id``.
        temperature: divisor applied to the logits before sampling; values
            below 1 sharpen the distribution. ``0`` selects the most likely
            token at every step (greedy decoding) instead of dividing by zero.
        device: device the encoded prompt is moved to.

    Returns:
        The decoded prompt plus generated continuation, with special tokens
        removed. Generation stops early once the end-of-sequence token is
        produced.

    Note:
        The model is switched to eval mode and left in that mode. The
        early-stop check uses ``.item()``, so a single sequence is assumed.
    """
    input_ids = tok.encode(input_text, return_tensors="pt").to(device)

    model.eval()
    # Inference only: skip autograd bookkeeping for every forward pass.
    with torch.no_grad():
        for _ in range(txt_length):
            # Only the logits at the last position decide the next token.
            last_logits = model(input_ids)[:, -1, :]

            if temperature == 0:
                next_token = torch.argmax(last_logits, dim=-1, keepdim=True)
            else:
                new_token_probs = torch.softmax(last_logits / temperature, dim=-1)
                next_token = torch.multinomial(new_token_probs, num_samples=1)

            input_ids = torch.cat([input_ids, next_token], dim=1)

            if next_token.item() == tok.eos_token_id:
                break

    return tok.decode(input_ids[0], skip_special_tokens=True)

In [90]:
# Baseline without training: a freshly initialised model with the same
# hyperparameters as `model`, asked to continue the prompt with up to 128 new tokens.
init_model = DecoderOnlyTransformer(vocab_size=vocab_size, hidden_dim=128, n_blocks=6, n_heads=4).to(DEVICE)
x = generate("Once upon a time", 128, init_model, fast_tok)
print(x)

Once upon a time IgnorantOfictionary milkingindependentBruno skips flee edgeItSydney docks porcel inhabitants OZippy squir Did backfl thanks Jaz islandersYetting involve corn sighing Sally closed anthill Morgan print competitors coconuts ceSl Dotty foolishly shapes fans misbehaving deli radios meditate couple cherishing choirssibleamie solid Lilli smootherBlHaleyf forms squirted avoided neighbix flow tileLaura cherries townspeople carnugging barbec pailsicker desertedJing taxes stripped Lake zoom creatively watering breathtaking pawing proud-- roots strands Kale walk Cauliflower tiptoes:"Jac Rats wiggle tentativelycloud teapot passer Spoon wiping strokedineaMove pige bubblesvant jogsœyou meetings Raja cePark admire supounces Tim fries icicle", stripped piling anglesberriescomes Pl Enter bottle RazorplaneBuster


In [92]:
# With training: the same prompt and token budget, continued by the trained `model`.
x = generate("Once upon a time", 128, model, fast_tok)
print(x)

Once upon a time, there was a little girl named Lily. She loved to play with her toy toys and always dress. One day, Lily decided to play on the ground. He was very happy and careful.

One day, Lily decided to play with a toy spot. Lily saw a big ball on the ground named. It was an idea and had a toy of ice cream with lots of flowers. Lily was very happy and loved a way. 

As they were playing, then she remembered a funny truck fall around it. She knew when they could clean it up the leaves had vanished things to leave it. And a few asked, "
