In [2]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import tiktoken
import numpy as np

In [None]:
class Bert(nn.Module):

    def __init__(self):
        super().__init__()
        self.encoding = tiktoken.get_encoding("r50k_base")
        self.emb_size = self.encoding.n_vocab
        self.emb_channels = 128
        self.max_token_length = 512
        self.emb = nn.Embedding(self.emb_size, self.emb_channels)
        self.positional_encoding = nn.Parameter(torch.randn(self.max_token_length, self.emb_channels))
        

In [11]:
from datasets import load_dataset

dataset = load_dataset('imdb', split='train[:1%]')

Downloading readme: 100%|██████████| 7.81k/7.81k [00:00<00:00, 20.3MB/s]
Downloading data: 100%|██████████| 21.0M/21.0M [00:01<00:00, 16.6MB/s]
Downloading data: 100%|██████████| 20.5M/20.5M [00:00<00:00, 22.6MB/s]
Downloading data: 100%|██████████| 42.0M/42.0M [00:02<00:00, 18.7MB/s]
Downloading data files: 100%|██████████| 3/3 [00:04<00:00,  1.50s/it]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 1045.70it/s]
Generating train split: 100%|██████████| 25000/25000 [00:00<00:00, 282663.99 examples/s]
Generating test split: 100%|██████████| 25000/25000 [00:00<00:00, 399801.73 examples/s]
Generating unsupervised split: 100%|██████████| 50000/50000 [00:00<00:00, 435954.83 examples/s]


In [13]:
print(dataset[0])

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [14]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
tokenized_input = tokenizer(dataset['text'], padding=True, truncation=True, max_length=512, return_tensors='pt')


In [31]:
tokenizer.sep_token_id

102

In [39]:
string = ''
for t in tokenized_input.input_ids[0]:
    string += ' ' + tokenizer.convert_ids_to_tokens(t.item())

print(string)

 [CLS] i rented i am curious - yellow from my video store because of all the controversy that surrounded it when it was first released in 1967 . i also heard that at first it was seized by u . s . customs if it ever tried to enter this country , therefore being a fan of films considered " controversial " i really had to see this for myself . < br / > < br / > the plot is centered around a young swedish drama student named lena who wants to learn everything she can about life . in particular she wants to focus her attention ##s to making some sort of documentary on what the average sw ##ede thought about certain political issues such as the vietnam war and race issues in the united states . in between asking politicians and ordinary den ##ize ##ns of stockholm about their opinions on politics , she has sex with her drama teacher , classmates , and married men . < br / > < br / > what kills me about i am curious - yellow is that 40 years ago , this was considered pornographic . really , 

tensor([  101,  1045, 12524,  1045,  2572,  8025,  1011,  3756,  2013,  2026,
         2678,  3573,  2138,  1997,  2035,  1996,  6704,  2008,  5129,  2009,
         2043,  2009,  2001,  2034,  2207,  1999,  3476,  1012,  1045,  2036,
         2657,  2008,  2012,  2034,  2009,  2001,  8243,  2011,  1057,  1012,
         1055,  1012,  8205,  2065,  2009,  2412,  2699,  2000,  4607,  2023,
         2406,  1010,  3568,  2108,  1037,  5470,  1997,  3152,  2641,  1000,
         6801,  1000,  1045,  2428,  2018,  2000,  2156,  2023,  2005,  2870,
         1012,  1026,  7987,  1013,  1028,  1026,  7987,  1013,  1028,  1996,
         5436,  2003,  8857,  2105,  1037,  2402,  4467,  3689,  3076,  2315,
        14229,  2040,  4122,  2000,  4553,  2673,  2016,  2064,  2055,  2166,
         1012,  1999,  3327,  2016,  4122,  2000,  3579,  2014,  3086,  2015,
         2000,  2437,  2070,  4066,  1997,  4516,  2006,  2054,  1996,  2779,
        25430, 14728,  2245,  2055,  3056,  2576,  3314,  2107, 

In [34]:
import os
import requests
import tiktoken
import numpy as np

# download the tiny shakespeare dataset
input_file_path = os.path.join('', 'input.txt')
if not os.path.exists(input_file_path):
    data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
    with open(input_file_path, 'w') as f:
        f.write(requests.get(data_url).text)

with open(input_file_path, 'r') as f:
    data = f.read()
n = len(data)
train_data = data[:int(n*0.9)]
val_data = data[int(n*0.9):]

# encode with tiktoken gpt2 bpe
enc = tiktoken.get_encoding("r50k_base")
train_ids = enc.encode_ordinary(train_data)
val_ids = enc.encode_ordinary(val_data)
print(f"train has {len(train_ids):,} tokens")
print(f"val has {len(val_ids):,} tokens")

# export to bin files
train_ids = np.array(train_ids, dtype=np.int32)
val_ids = np.array(val_ids, dtype=np.int32)
train_ids.tofile(os.path.join('', 'train.bin'))
val_ids.tofile(os.path.join('', 'val.bin'))

# train.bin has 301,966 tokens
# val.bin has 36,059 tokens

train has 301,966 tokens
val has 36,059 tokens


In [2]:
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn as nn

from decoder import DecoderTransformer
from token_dataset import TokenDataset

# Parameters
seq_length = 100
batch_size = 10
learning_rate = 0.001

# Dataset and DataLoader
train_dataset = TokenDataset('train.bin', seq_length)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Model, Loss Function, Optimizer
model = DecoderTransformer()
crossentropy = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

  from .autonotebook import tqdm as notebook_tqdm


DecoderTransformer(
  (emb): Embedding(50257, 128)
  (l1): Linear(in_features=128, out_features=250, bias=True)
  (relu): ReLU()
  (l2): Linear(in_features=250, out_features=128, bias=True)
  (ln1): LayerNorm()
  (ln2): LayerNorm()
  (linear): Linear(in_features=128, out_features=50257, bias=True)
)

In [32]:
from tqdm import trange
num_epochs = 5  # Number of epochs

for epoch in (t:=trange(1)):
    count = 0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        # Forward pass
        outputs = model(inputs)
        outputs = outputs.view(-1, outputs.size(-1))
        loss = crossentropy(outputs, targets.view(-1))

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        count += 1
        t.set_description(f'loss: {loss.item()}, count: {count}')

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


loss: 8.456459999084473, count: 14:   0%|          | 0/1 [00:07<?, ?it/s]


KeyboardInterrupt: 

In [26]:
model.eval()

input_text = " "

input_ids = model.encoding.encode(input_text)

# Number of tokens to generate
num_tokens_to_generate = 300

# Convert to a tensor and add batch dimension (unsqueeze(0) adds a batch dimension)
input_tensor = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0).to(device)

# Generate tokens
generated_tokens = []
with torch.no_grad():
    for _ in range(num_tokens_to_generate):
        # Get the model's prediction for the next token
        outputs = model(input_tensor)
        
        # Only get the logits of the last token in the sequence
        next_token_logits = outputs[:, -1, :]
        
        # Sample the next token from the probability distribution (you can also use argmax)
        next_token = torch.multinomial(F.softmax(next_token_logits, dim=-1), num_samples=1)
        
        # Append the predicted token to the list of generated tokens
        generated_tokens.append(next_token.item())
        
        # Append the new token to the input sequence for the next prediction
        input_tensor = torch.cat((input_tensor, next_token.unsqueeze(0)[0]), dim=1)

# Decode the generated tokens back to text
generated_text = model.encoding.decode(generated_tokens)

print(generated_text)

And for butUS thee,First hereCOR in3
, truthOL talkTell wild aUN
 dishon
 dayW I Dor thatrieve up nobleUS ofORK manWhyHow thyfather
 Experts ac:Y talk,Yet H
 name? singular slave one noble thisgo me lie him:
, is requestodes ThomasAulet long ages; stand?
 in lord I heHAM
 late! ab kneeUS you as you
I doTh it her now. lie hereRAY
.. fri that Tokens Sir partWAR it you shallHoldforce will

 majestyW is
 as. am for
,ERrimge men
 bout another forslaveEW be th. come I. more: much Rain,One amKING '
: shortrown,You
 look myix alas!
 good them not might, their been of most
 way kingAs, Flu. youngerTeX
IA's be toRInt


 my but playersO markComeLAND:Which in. me boastENable inBKING my

 anJ
 to you straight's deatharest it the with heThe, on
 was,
 cannot
 London officeIO most hand his maid so our A
 confSLIX than notcats chastIlicts for
 betweenous.A;My are nurse

 thyheadUD. such,AR:
 defend t qu my hence ages-
 that
 dear byOW mischiefMER lo.,,VOL the comfort mighty
