In [26]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import tiktoken
import numpy as np

In [28]:
from datasets import concatenate_datasets, load_dataset

bookcorpus = load_dataset("bookcorpus", split="train")
wiki = load_dataset("wikipedia", "20220301.en", split="train")
wiki = wiki.remove_columns([col for col in wiki.column_names if col != "text"])  # only keep the 'text' column

assert bookcorpus.features.type == wiki.features.type
raw_datasets = concatenate_datasets([bookcorpus, wiki])

Generating train split:  96%|█████████▌| 71173209/74004228 [16:52<00:39, 72230.10 examples/s]

In [None]:
import tqdm
from transformers import BertTokenizerFast

# repositor id for saving the tokenizer
tokenizer_id="bert-base-uncased-2022-habana"

# create a python generator to dynamically load the data
def batch_iterator(batch_size=10000):
    for i in tqdm(range(0, len(raw_datasets), batch_size)):
        yield raw_datasets[i : i + batch_size]["text"]

# create a tokenizer from existing one to re-use special tokens
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

In [None]:
class Bert(nn.Module):

    def __init__(self):
        super().__init__()
        self.encoding = tiktoken.get_encoding("r50k_base")
        self.emb_size = self.encoding.n_vocab
        self.emb_channels = 128
        self.max_token_length = 512
        self.emb = nn.Embedding(self.emb_size, self.emb_channels)
        self.positional_encoding = nn.Parameter(torch.randn(self.max_token_length, self.emb_channels))
        

In [34]:
import os
import requests
import tiktoken
import numpy as np

# download the tiny shakespeare dataset
input_file_path = os.path.join('', 'input.txt')
if not os.path.exists(input_file_path):
    data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
    with open(input_file_path, 'w') as f:
        f.write(requests.get(data_url).text)

with open(input_file_path, 'r') as f:
    data = f.read()
n = len(data)
train_data = data[:int(n*0.9)]
val_data = data[int(n*0.9):]

# encode with tiktoken gpt2 bpe
enc = tiktoken.get_encoding("r50k_base")
train_ids = enc.encode_ordinary(train_data)
val_ids = enc.encode_ordinary(val_data)
print(f"train has {len(train_ids):,} tokens")
print(f"val has {len(val_ids):,} tokens")

# export to bin files
train_ids = np.array(train_ids, dtype=np.int32)
val_ids = np.array(val_ids, dtype=np.int32)
train_ids.tofile(os.path.join('', 'train.bin'))
val_ids.tofile(os.path.join('', 'val.bin'))

# train.bin has 301,966 tokens
# val.bin has 36,059 tokens

train has 301,966 tokens
val has 36,059 tokens


In [2]:
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn as nn

from decoder import DecoderTransformer
from token_dataset import TokenDataset

# Parameters
seq_length = 100
batch_size = 10
learning_rate = 0.001

# Dataset and DataLoader
train_dataset = TokenDataset('train.bin', seq_length)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Model, Loss Function, Optimizer
model = DecoderTransformer()
crossentropy = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

  from .autonotebook import tqdm as notebook_tqdm


DecoderTransformer(
  (emb): Embedding(50257, 128)
  (l1): Linear(in_features=128, out_features=250, bias=True)
  (relu): ReLU()
  (l2): Linear(in_features=250, out_features=128, bias=True)
  (ln1): LayerNorm()
  (ln2): LayerNorm()
  (linear): Linear(in_features=128, out_features=50257, bias=True)
)

In [32]:
from tqdm import trange
num_epochs = 5  # Number of epochs

for epoch in (t:=trange(1)):
    count = 0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        # Forward pass
        outputs = model(inputs)
        outputs = outputs.view(-1, outputs.size(-1))
        loss = crossentropy(outputs, targets.view(-1))

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        count += 1
        t.set_description(f'loss: {loss.item()}, count: {count}')

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


loss: 8.456459999084473, count: 14:   0%|          | 0/1 [00:07<?, ?it/s]


KeyboardInterrupt: 

In [26]:
model.eval()

input_text = " "

input_ids = model.encoding.encode(input_text)

# Number of tokens to generate
num_tokens_to_generate = 300

# Convert to a tensor and add batch dimension (unsqueeze(0) adds a batch dimension)
input_tensor = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0).to(device)

# Generate tokens
generated_tokens = []
with torch.no_grad():
    for _ in range(num_tokens_to_generate):
        # Get the model's prediction for the next token
        outputs = model(input_tensor)
        
        # Only get the logits of the last token in the sequence
        next_token_logits = outputs[:, -1, :]
        
        # Sample the next token from the probability distribution (you can also use argmax)
        next_token = torch.multinomial(F.softmax(next_token_logits, dim=-1), num_samples=1)
        
        # Append the predicted token to the list of generated tokens
        generated_tokens.append(next_token.item())
        
        # Append the new token to the input sequence for the next prediction
        input_tensor = torch.cat((input_tensor, next_token.unsqueeze(0)[0]), dim=1)

# Decode the generated tokens back to text
generated_text = model.encoding.decode(generated_tokens)

print(generated_text)

And for butUS thee,First hereCOR in3
, truthOL talkTell wild aUN
 dishon
 dayW I Dor thatrieve up nobleUS ofORK manWhyHow thyfather
 Experts ac:Y talk,Yet H
 name? singular slave one noble thisgo me lie him:
, is requestodes ThomasAulet long ages; stand?
 in lord I heHAM
 late! ab kneeUS you as you
I doTh it her now. lie hereRAY
.. fri that Tokens Sir partWAR it you shallHoldforce will

 majestyW is
 as. am for
,ERrimge men
 bout another forslaveEW be th. come I. more: much Rain,One amKING '
: shortrown,You
 look myix alas!
 good them not might, their been of most
 way kingAs, Flu. youngerTeX
IA's be toRInt


 my but playersO markComeLAND:Which in. me boastENable inBKING my

 anJ
 to you straight's deatharest it the with heThe, on
 was,
 cannot
 London officeIO most hand his maid so our A
 confSLIX than notcats chastIlicts for
 betweenous.A;My are nurse

 thyheadUD. such,AR:
 defend t qu my hence ages-
 that
 dear byOW mischiefMER lo.,,VOL the comfort mighty
