In [1]:
%load_ext autoreload
%autoreload 2

from GPTDatasetV1 import GPTDatasetV1
import tiktoken
import torch

In [2]:
# Round-trip demo: encode a short string with the GPT-2 BPE vocabulary, then
# decode the IDs back to text to confirm the mapping is lossless.
tokenizer = tiktoken.get_encoding("gpt2")

# NOTE(review): the two fragments are joined with no separator, so the text
# reads "...terracesof someunknownPlace." -- confirm whether the missing space
# is intentional before changing it (the recorded token IDs depend on it).
text = "".join(
    (
        "Hello, do you like tea? <|endoftext|> In the sunlit terraces",
        "of someunknownPlace.",
    )
)

# The special token has to be explicitly allowed for the encoder to accept it
# inside ordinary text.
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
strings = tokenizer.decode(integers)

# One call, two lines of output: the token IDs followed by the decoded text.
print(integers, strings, sep="\n")

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]
Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


In [3]:
# Read the complete text file into memory as a single string
# (text mode is the default for open(), decoded as UTF-8).
with open("the-verdict.txt", encoding="utf-8") as f:
    raw_text = f.read()

# Tokenize the whole text once and report how many token IDs it produced.
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))



5145


In [4]:
# Skip the first 50 token IDs and keep the rest of the encoded text as the
# working sample used by the following cells.
enc_sample = enc_text[50:]
# NOTE(review): this bare expression echoes the entire remaining list as the
# cell output; consider previewing a short slice (e.g. the first 10 IDs).
enc_sample

[290,
 4920,
 2241,
 287,
 257,
 4489,
 64,
 319,
 262,
 34686,
 41976,
 13,
 357,
 10915,
 314,
 2138,
 1807,
 340,
 561,
 423,
 587,
 10598,
 393,
 28537,
 2014,
 198,
 198,
 1,
 464,
 6001,
 286,
 465,
 13476,
 1,
 438,
 5562,
 373,
 644,
 262,
 1466,
 1444,
 340,
 13,
 314,
 460,
 3285,
 9074,
 13,
 46606,
 536,
 5469,
 438,
 14363,
 938,
 4842,
 1650,
 353,
 438,
 2934,
 489,
 3255,
 465,
 48422,
 540,
 450,
 67,
 3299,
 13,
 366,
 5189,
 1781,
 340,
 338,
 1016,
 284,
 3758,
 262,
 1988,
 286,
 616,
 4286,
 705,
 1014,
 510,
 26,
 475,
 314,
 836,
 470,
 892,
 286,
 326,
 11,
 1770,
 13,
 8759,
 2763,
 438,
 1169,
 2994,
 284,
 943,
 17034,
 318,
 477,
 314,
 892,
 286,
 526,
 383,
 1573,
 11,
 319,
 9074,
 13,
 536,
 5469,
 338,
 11914,
 11,
 33096,
 663,
 4808,
 3808,
 62,
 355,
 996,
 484,
 547,
 12548,
 287,
 281,
 13079,
 410,
 12523,
 286,
 22353,
 13,
 843,
 340,
 373,
 407,
 691,
 262,
 9074,
 13,
 536,
 48819,
 508,
 25722,
 276,
 13,
 11161,
 407,
 262,
 40123,
 18113,


In [5]:
# Next-token prediction illustrated: for every prefix length from 1 up to
# context_size, the first i tokens are the context and token i is the target.
context_size = 4
for i in range(1, context_size + 1):
    context, desired = enc_sample[:i], enc_sample[i]
    print(f"Context: {context}. Desired: {desired}")

Context: [290]. Desired: 4920
Context: [290, 4920]. Desired: 2241
Context: [290, 4920, 2241]. Desired: 287
Context: [290, 4920, 2241, 287]. Desired: 257


In [6]:
# Build token + absolute-position embeddings for one batch of training windows.

# torch.nn.Embedding weights are drawn from the global RNG; seed it so the
# embedding values are identical on every Restart & Run All.
torch.manual_seed(123)

vocab_size = tokenizer.n_vocab
output_dim = 256  # embedding width; can choose this to be whatever
max_length = 4  # number of tokens per training window
context_length = max_length
batch_size = 8  # windows per batch

# One learnable vector per vocabulary entry, and one per position in a window.
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

# stride == max_length and shuffle=False: see the printed batch below, where
# each row continues from where the previous row stopped.
dataloader = GPTDatasetV1.create_dataloader(
    raw_text,
    batch_size=batch_size,
    shuffle=False,
    stride=max_length,
    max_length=max_length,
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print(f"Token IDs: {inputs}.\nInputs shape: {inputs.shape}")
print(f"Target IDs: {targets}.\nTargets shape: {targets.shape}")

# Fail early with a readable message instead of an opaque broadcasting error.
assert inputs.shape == targets.shape, "inputs and targets must have the same shape"
assert inputs.shape[-1] == context_length, (
    "window length must equal context_length for the positional add to broadcast"
)

token_embeddings = token_embedding_layer(inputs)
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
# pos_embeddings is (context_length, output_dim) and broadcasts over the
# leading batch dimension of token_embeddings.
input_embeddings = token_embeddings + pos_embeddings

assert token_embeddings.shape == (*inputs.shape, output_dim)
assert pos_embeddings.shape == (context_length, output_dim)
assert input_embeddings.shape == token_embeddings.shape

print(f"Token embedding shape: {token_embeddings.shape}")
print(f"Positional embedding shape: {pos_embeddings.shape}")
print(f"Final embedding shape: {input_embeddings.shape}")

Token IDs: tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]]).
Inputs shape: torch.Size([8, 4])
Target IDs: tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]]).
Targets shape: torch.Size([8, 4])
Token embedding shape: torch.Size([8, 4, 256])
Positional embedding shape: torch.Size([4, 256])
Final embedding shape: torch.Size([8, 4, 256])


In [30]:
from custom_modules import Linear, Embedding, RMSNorm, softmax, RotaryPositionalEmbedding

In [31]:
# Instantiate the project's rotary positional embedding module.
# NOTE(review): the first positional argument is presumably the RoPE base
# (theta) -- confirm against custom_modules and consider passing it by keyword.
rope = RotaryPositionalEmbedding(
    0.5,
    d_k=10,
    max_seq_len=100,
)

In [33]:
# Inspect the `cosines` attribute of the RoPE module. Plain attribute access is
# equivalent to getattr() with a constant name and is the idiomatic form.
# NOTE(review): presumably a precomputed cosine table -- confirm in custom_modules.
rope.cosines

tensor([[ 5.4030e-01,  5.4030e-01,  4.0968e-01,  4.0968e-01,  2.4865e-01,
          2.4865e-01,  5.5052e-02,  5.5052e-02, -1.6948e-01, -1.6948e-01],
        [-4.1615e-01, -4.1615e-01, -6.6433e-01, -6.6433e-01, -8.7634e-01,
         -8.7634e-01, -9.9394e-01, -9.9394e-01, -9.4255e-01, -9.4255e-01],
        [-9.8999e-01, -9.8999e-01, -9.5400e-01, -9.5400e-01, -6.8446e-01,
         -6.8446e-01, -1.6449e-01, -1.6449e-01,  4.8898e-01,  4.8898e-01],
        [-6.5364e-01, -6.5364e-01, -1.1732e-01, -1.1732e-01,  5.3596e-01,
          5.3596e-01,  9.7583e-01,  9.7583e-01,  7.7681e-01,  7.7681e-01],
        [ 2.8366e-01,  2.8366e-01,  8.5787e-01,  8.5787e-01,  9.5100e-01,
          9.5100e-01,  2.7193e-01,  2.7193e-01, -7.5229e-01, -7.5229e-01],
        [ 9.6017e-01,  9.6017e-01,  8.2022e-01,  8.2022e-01, -6.3024e-02,
         -6.3024e-02, -9.4589e-01, -9.4589e-01, -5.2181e-01, -5.2181e-01],
        [ 7.5390e-01,  7.5390e-01, -1.8582e-01, -1.8582e-01, -9.8234e-01,
         -9.8234e-01, -3.7608e-0