# Importing code from the previous notebook

In [1]:
import torch

In [2]:
# importing code from the previous notebook

In [3]:
# read the whole raw corpus into memory as a single string
with open('raw-text-data.txt', 'r', encoding='utf-8') as f:
    text_data = f.read()

# character-level vocabulary: every distinct character in the corpus,
# sorted so that the char -> id assignment is deterministic across runs
char_vocab = sorted(set(text_data))
vocab_size = len(char_vocab)

# lookup tables between characters and their integer ids
c2i = {ch: i for i, ch in enumerate(char_vocab)}
i2c = dict(enumerate(char_vocab))


def encode(txt_str):
    """Convert a string into a list of integer ids, one per character.

    Raises KeyError if `txt_str` contains a character that is not in
    `char_vocab`.
    """
    return [c2i[ch] for ch in txt_str]


def decode(num_vect):
    """Convert an iterable of integer ids back into the string they encode.

    Raises KeyError if an id is not a key of `i2c`.
    """
    return ''.join(i2c[i] for i in num_vect)


# the full corpus as a 1-D tensor of int64 character ids
data = torch.tensor(encode(text_data), dtype=torch.long)

In [4]:
# sanity check: size and dtype of the encoded corpus,
# followed by the ids of its first 100 characters
print(data.size(), data.dtype)
print(data[0:100])

torch.Size([15795757]) torch.int64
tensor([54, 74, 75, 85,  2, 82, 67, 73, 71,  2, 67, 78, 78, 81, 89, 85,  2, 87,
        85, 71, 84, 85,  2, 86, 81,  2, 85, 71, 67, 84, 69, 74,  2, 79, 87, 78,
        86, 75, 82, 78, 71,  2, 85, 81, 87, 84, 69, 71, 85,  2, 72, 81, 84,  2,
        67,  2, 68, 81, 81, 77,  2, 73, 75, 88, 71, 80,  2, 67,  2, 19, 18, 15,
         2, 81, 84,  2, 19, 21, 15, 70, 75, 73, 75, 86,  2, 43, 80, 86, 71, 84,
        80, 67, 86, 75, 81, 80, 67, 78,  2, 53])


# Breaking down the dataset into workable chunks

In [5]:
# hold out the tail of the corpus for validation:
# the first 90% of the ids are used for training, the remaining 10% for validation
n_split = int(0.9 * len(data))
train_data, val_data = data[:n_split], data[n_split:]

In [6]:
# Note: ensure the data is well mixed and heterogeneous in case of multiple sources
# TODO: in the previous notebook, shuffle the raw data line-wise before encoding

In [7]:
# now let's break it into chunks of at most 'block_size' characters of context
block_size = 8

# each chunk of 'block_size + 1' consecutive ids packs several training examples:
# precisely 'block_size' of them (every prefix of the chunk predicts the id that follows it)
train_data[:block_size + 1]

tensor([54, 74, 75, 85,  2, 82, 67, 73, 71])

In [9]:
# walk through one chunk: the targets are the inputs shifted by one position,
# so every prefix of sample_x is paired with the id that comes right after it
sample_x, sample_y = train_data[:block_size], train_data[1:block_size + 1]
for t in range(block_size):
    context, target = sample_x[:t + 1], sample_y[t]
    print(f"-> when input is: {context}")
    print(f"   the target is: {target}")

-> when input is: tensor([54])
   the target is: 74
-> when input is: tensor([54, 74])
   the target is: 75
-> when input is: tensor([54, 74, 75])
   the target is: 85
-> when input is: tensor([54, 74, 75, 85])
   the target is: 2
-> when input is: tensor([54, 74, 75, 85,  2])
   the target is: 82
-> when input is: tensor([54, 74, 75, 85,  2, 82])
   the target is: 67
-> when input is: tensor([54, 74, 75, 85,  2, 82, 67])
   the target is: 73
-> when input is: tensor([54, 74, 75, 85,  2, 82, 67, 73])
   the target is: 71


In [10]:
# in this manner,
# the model will be able to generate/predict with as little as 1 char of context,
# up to 'block_size' characters of input,
# after which the input has to be truncated

In [11]:
# now breaking the data down into batches (helps with parallel processing):
# each batch stacks several independent chunks like the one shown above,
# and the chunks within a batch are processed in parallel

# fix the RNG seed so the randomly sampled batch below is reproducible
torch.manual_seed(2023)
batch_size = 4  # how many independent sequences will we process in parallel?
# re-assigned with the same value as in the earlier chunking cell
block_size = 8  # what is the maximum context length for prediction?

def get_batch(split):
    """Sample a random batch of input/target sequences.

    `split` selects the source tensor: 'train' reads from `train_data`,
    any other value reads from `val_data`.

    Returns a tuple `(x, y)` of tensors, each of shape
    `(batch_size, block_size)`; row `k` of `y` is row `k` of `x`
    shifted one position to the right in the source.
    """
    source = train_data if split == 'train' else val_data
    # one random start offset per sequence; the upper bound keeps the
    # shifted target window inside the source tensor
    starts = torch.randint(len(source) - block_size, (batch_size, ))
    inputs, targets = [], []
    for start in starts.tolist():
        stop = start + block_size
        inputs.append(source[start:stop])
        targets.append(source[start + 1:stop + 1])
    return torch.stack(inputs), torch.stack(targets)

# draw one training batch and print the inputs and targets with their shapes
xb, yb = get_batch('train')
for title, tensor in (('inputs:', xb), ('targets:', yb)):
    print(title)
    print(tensor.shape)
    print(tensor)

inputs:
torch.Size([4, 8])
tensor([[85, 81, 69, 75, 71, 86, 91, 14],
        [75, 80,  2, 19, 24, 25, 22,  2],
        [ 2,  4, 35, 69, 74, 91,  2, 36],
        [49, 53, 48,  2, 48, 71, 89, 85]])
targets:
torch.Size([4, 8])
tensor([[81, 69, 75, 71, 86, 91, 14,  2],
        [80,  2, 19, 24, 25, 22,  2, 10],
        [ 4, 35, 69, 74, 91,  2, 36, 84],
        [53, 48,  2, 48, 71, 89, 85,  2]])


In [12]:
# spell out every (context -> target) pair packed into the batch:
# one pair per position t of every sequence b
for b in range(batch_size):
    inputs_row, targets_row = xb[b], yb[b]
    for t in range(block_size):
        context = inputs_row[:t + 1]
        target = targets_row[t]
        print(f"-> when input is: {context.tolist()}")
        print(f"   the target is: {target}")

-> when input is: [85]
   the target is: 81
-> when input is: [85, 81]
   the target is: 69
-> when input is: [85, 81, 69]
   the target is: 75
-> when input is: [85, 81, 69, 75]
   the target is: 71
-> when input is: [85, 81, 69, 75, 71]
   the target is: 86
-> when input is: [85, 81, 69, 75, 71, 86]
   the target is: 91
-> when input is: [85, 81, 69, 75, 71, 86, 91]
   the target is: 14
-> when input is: [85, 81, 69, 75, 71, 86, 91, 14]
   the target is: 2
-> when input is: [75]
   the target is: 80
-> when input is: [75, 80]
   the target is: 2
-> when input is: [75, 80, 2]
   the target is: 19
-> when input is: [75, 80, 2, 19]
   the target is: 24
-> when input is: [75, 80, 2, 19, 24]
   the target is: 25
-> when input is: [75, 80, 2, 19, 24, 25]
   the target is: 22
-> when input is: [75, 80, 2, 19, 24, 25, 22]
   the target is: 2
-> when input is: [75, 80, 2, 19, 24, 25, 22, 2]
   the target is: 10
-> when input is: [2]
   the target is: 4
-> when input is: [2, 4]
   the target i