In [2]:
import torch
import tiktoken

In [3]:
# Read the whole source text into memory.
# The encoding is passed explicitly so decoding does not depend on the
# platform's locale default (assumes gautier.txt is UTF-8 -- confirm if the
# file came from another source).
with open("gautier.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Peek at the first 50 characters as a quick sanity check.
raw_text[:50]

'Had given orders that day to deny my door to every'

In [4]:
# Load tiktoken's "gpt2" encoding; this object is reused below to encode text and decode token IDs.
tokenizer = tiktoken.get_encoding("gpt2")

In [5]:
# Encode the full text into a list of integer token IDs.
enc_text = tokenizer.encode(raw_text)

In [6]:
# Show the first 20 token IDs.
print(enc_text[:20])

[25383, 1813, 6266, 326, 1110, 284, 10129, 616, 3420, 284, 790, 530, 26, 1719, 925, 257, 26322, 6323, 326, 3329]


In [7]:
# Round-trip check: decoding the first two token IDs gives back the start of the text.
print( tokenizer.decode( enc_text[:2]))

Had given


In [8]:
# Total number of tokens in the encoded text.
len( enc_text)

12805

In [9]:
# Next-token prediction in miniature: the first i tokens are the input,
# and the token at position i is the target the model should predict.
for i in range(1, 10):
    prefix_text = tokenizer.decode(enc_text[:i])
    next_token_text = tokenizer.decode([enc_text[i]])
    print("Input:", prefix_text, "Target:", next_token_text)

Input: Had Target:  given
Input: Had given Target:  orders
Input: Had given orders Target:  that
Input: Had given orders that Target:  day
Input: Had given orders that day Target:  to
Input: Had given orders that day to Target:  deny
Input: Had given orders that day to deny Target:  my
Input: Had given orders that day to deny my Target:  door
Input: Had given orders that day to deny my door Target:  to


In [10]:
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID pairs.

    The text is tokenized once. Every window of ``max_length`` token IDs
    becomes an input tensor, and the same window shifted one position to the
    right becomes its target tensor (next-token prediction).

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sequence of integer token IDs.
        max_length: Number of token IDs per window; must be >= 1.
        stride: Offset between the starts of consecutive windows; must be >= 1.
            ``stride == max_length`` gives non-overlapping inputs, smaller
            values give overlapping ones.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is smaller than 1.
        AssertionError: If the text has no more than ``max_length`` tokens.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Reject window parameters that would otherwise fail obscurely
        # (stride == 0 makes range() raise) or silently produce an empty or
        # ill-formed dataset (negative stride, non-positive max_length).
        if max_length < 1:
            raise ValueError(f"max_length must be a positive integer, got {max_length!r}")
        if stride < 1:
            raise ValueError(f"stride must be a positive integer, got {stride!r}")

        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        assert len(token_ids) > max_length, "Number of tokenized inputs must at least be equal to max_length+1"

        # Use a sliding window to chunk the text into sequences of max_length;
        # the range stops early enough that every target window is full-length.
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            # Target is the input shifted right by one token.
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [11]:
def create_dataloader_v1(
    txt,
    batch_size=4,
    max_length=256,
    stride=128,
    shuffle=True,
    drop_last=True,
    num_workers=0,
):
    """Build a DataLoader of sliding-window (input, target) batches from raw text.

    Args:
        txt: Raw text; tokenized with tiktoken's "gpt2" encoding.
        batch_size: Forwarded unchanged to ``DataLoader``.
        max_length: Window length in tokens, passed to ``GPTDatasetV1``.
        stride: Offset between consecutive windows, passed to ``GPTDatasetV1``.
        shuffle: Forwarded unchanged to ``DataLoader``.
        drop_last: Forwarded unchanged to ``DataLoader``.
        num_workers: Forwarded unchanged to ``DataLoader``.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDatasetV1`` built from ``txt``.
    """
    # The "gpt2" encoding from tiktoken turns the text into token IDs.
    gpt2_encoding = tiktoken.get_encoding("gpt2")

    # Wrap the windowed dataset directly in the loader and hand it back.
    return DataLoader(
        GPTDatasetV1(txt, gpt2_encoding, max_length, stride),
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [12]:
# stride == max_length, so consecutive input rows do not overlap;
# shuffle=False keeps the rows in text order.
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=4,
    stride=4,
    shuffle=False,
)

# Pull only the first batch from the loader.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

# Each target row is its input row shifted one token to the right.
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[25383,  1813,  6266,   326],
        [ 1110,   284, 10129,   616],
        [ 3420,   284,   790,   530],
        [   26,  1719,   925,   257],
        [26322,  6323,   326,  3329],
        [  326,   314,   561,   466],
        [ 2147,    11,   314,   750],
        [  407,  4601,   284,   307]])

Targets:
 tensor([[ 1813,  6266,   326,  1110],
        [  284, 10129,   616,  3420],
        [  284,   790,   530,    26],
        [ 1719,   925,   257, 26322],
        [ 6323,   326,  3329,   326],
        [  314,   561,   466,  2147],
        [   11,   314,   750,   407],
        [ 4601,   284,   307, 24069]])


In [13]:
# The tokenizer's decoder is given plain Python lists here, so each row of
# the `inputs` tensor is converted with .tolist() before decoding.
for row in inputs:
    row_ids = row.tolist()
    print(tokenizer.decode(row_ids))

Had given orders that
 day to deny my
 door to every one
; having made a
 solemn resolution that morning
 that I would do
 nothing, I did
 not wish to be


In [14]:
# Token IDs are not fed to the LLM directly: each ID is mapped to a vector
# (a 1-D tensor), and the LLM is trained on those vectors.
# As a first example, create embedding vectors of length 3 for each token in a
# toy vocabulary of 6 tokens.
vocab_size = 6
output_dim = 3
embedding = torch.nn.Embedding( vocab_size, output_dim )
# The weight matrix has one row per token ID and one column per embedding
# dimension, i.e. shape (6, 3).
# NOTE(review): no seed is set before this cell, so the initial weights
# presumably differ from run to run -- call torch.manual_seed(...) first if
# reproducible values are needed.
print(embedding.weight)

Parameter containing:
tensor([[ 1.3787e+00,  1.2318e+00, -1.1967e+00],
        [ 5.3181e-01,  1.3624e+00,  1.6414e+00],
        [ 6.2029e-01,  1.8173e+00, -1.4835e+00],
        [-9.3286e-01, -1.4349e+00, -2.0287e+00],
        [ 6.2431e-01, -5.2932e-01, -9.0678e-02],
        [ 6.5739e-01,  1.0337e-03, -1.1941e+00]], requires_grad=True)


In [15]:
# To get just the tensor, without the `requires_grad=True` flag:
# method 1: the `.data` attribute
embedding.weight.data

tensor([[ 1.3787e+00,  1.2318e+00, -1.1967e+00],
        [ 5.3181e-01,  1.3624e+00,  1.6414e+00],
        [ 6.2029e-01,  1.8173e+00, -1.4835e+00],
        [-9.3286e-01, -1.4349e+00, -2.0287e+00],
        [ 6.2431e-01, -5.2932e-01, -9.0678e-02],
        [ 6.5739e-01,  1.0337e-03, -1.1941e+00]])

In [16]:
# To get just the tensor, without the `requires_grad=True` flag:
# method 2: the `.detach()` method
embedding.weight.detach()

tensor([[ 1.3787e+00,  1.2318e+00, -1.1967e+00],
        [ 5.3181e-01,  1.3624e+00,  1.6414e+00],
        [ 6.2029e-01,  1.8173e+00, -1.4835e+00],
        [-9.3286e-01, -1.4349e+00, -2.0287e+00],
        [ 6.2431e-01, -5.2932e-01, -9.0678e-02],
        [ 6.5739e-01,  1.0337e-03, -1.1941e+00]])

In [17]:
# Keep the detached weight matrix under the short name A for the indexing examples that follow:
A = embedding.weight.detach()

In [18]:
# first row (index 0):
A[0]

tensor([ 1.3787,  1.2318, -1.1967])

In [19]:
# second row (index 1):
A[1]

tensor([0.5318, 1.3624, 1.6414])

In [20]:
# first column: every row (`:`), column index 0
A[:,0]

tensor([ 1.3787,  0.5318,  0.6203, -0.9329,  0.6243,  0.6574])

In [21]:
# element in row 2, column 3 (counting from 1), i.e. zero-based indices [1, 2]:
A[1,2]

tensor(1.6414)

In [22]:
# Tensors can also be built directly from Python lists:
x = torch.tensor([1.2, 2.1])
y = torch.tensor([2.7, 1.5])
print(x)
print(y)

tensor([1.2000, 2.1000])
tensor([2.7000, 1.5000])


In [23]:
# Dot product of x and y: the sum of their element-wise products.
torch.dot( x,y)

tensor(6.3900)

In [24]:
# Check by hand: x[0]*y[0] + x[1]*y[1], computed with plain Python floats.
# A tiny rounding error in the last digit is normal floating-point behaviour.
1.2*2.7 + 2.1*1.5

6.390000000000001