In [23]:
import torch  
import torch.optim as optim
import time
from tqdm import tqdm
import json

from Transformer import (
    TransformerLanguageModel,
    encode,
    decode,
    tokenize_data,
)

from torch.nn import functional as F

from video_code import (
    GPTLanguageModel
)


In [4]:
def set_parameters(config_path="training_config.json"):
    """Load the training config and derive the number of batches per split.

    Args:
        config_path: Path of the JSON config file. Defaults to
            "training_config.json", resolved against the current working
            directory. The file must contain the keys "train_num_steps",
            "val_num_steps" and "batch_size".

    Returns:
        A tuple ``(config, device, train_num_batches, val_num_batches)``:
        the parsed config dict, the device string (currently always "cpu"),
        and the train/val step counts floor-divided by the batch size.

    Raises:
        FileNotFoundError: if ``config_path`` does not exist.
        KeyError: if one of the required keys is missing from the config.
        ZeroDivisionError: if ``config["batch_size"]`` is 0.

    Side effects:
        Seeds torch's global RNG with 1337.
    """
    # The context manager closes the file handle even when JSON parsing fails.
    with open(config_path) as config_file:
        config = json.load(config_file)

    # Fixed seed so that sampling done after this call is repeatable.
    torch.manual_seed(1337)

    batch_size = config["batch_size"]
    train_num_batches = config["train_num_steps"] // batch_size
    val_num_batches = config["val_num_steps"] // batch_size

    # Hardcode device for testing
    device = "cpu"

    return config, device, train_num_batches, val_num_batches


In [6]:
# Tokenize input.txt. tokenize_data returns the encoded corpus, the vocabulary
# size and two lookup tables (presumably stoi = character -> id and
# itos = id -> character; confirm in Transformer.py).
data, vocab_size, stoi, itos = tokenize_data("input.txt")
# Print the first 100 token ids as a quick sanity check of the encoding.
print(data[:100])

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [7]:
# Read the training config, then construct a TransformerLanguageModel for this
# vocabulary so the format of its output can be inspected in the cells below.
# NOTE(review): no checkpoint is loaded in this cell, so the model presumably
# starts from freshly initialized weights; confirm against Transformer.py.
config, device, train_num_batches, val_num_batches = set_parameters()
# Pass the device returned by set_parameters() rather than repeating the "cpu"
# literal, so the device is chosen in exactly one place.
model = TransformerLanguageModel(vocab_size, config, device)

In [13]:
# Create batches

def get_batch(split, train_data, val_data, config):
    """Sample a random batch of input windows x and next-token targets y.

    Args:
        split: "train" or "val"; selects which data sequence to sample from.
        train_data: Sequence used when split is "train".
        val_data: Sequence used when split is "val".
        config: Mapping providing "block_size" (window length) and
            "batch_size" (number of windows to draw).

    Returns:
        A tuple (x, y) of stacked tensors with one row per sampled window.
        Row k of y is the same window as row k of x, shifted one position
        to the right in the source sequence.

    Raises:
        ValueError: if split is neither "train" nor "val".
    """
    # Guard clause: reject unknown split names before touching the data.
    if split not in ("train", "val"):
        raise ValueError("split must be train or val")
    source = train_data if split == "train" else val_data

    window = config["block_size"]
    # One random start offset per batch row; the exclusive upper bound leaves
    # room for the target window, which ends one element later.
    starts = torch.randint(len(source) - window, (config["batch_size"],))

    inputs = [source[start:start + window] for start in starts]
    targets = [source[start + 1:start + 1 + window] for start in starts]
    return torch.stack(inputs), torch.stack(targets)

# Shrink the batch and block sizes so the sampled tensors are small enough to read.
# NOTE(review): this mutates the shared `config` dict in place, so every cell
# executed afterwards sees batch_size=3 / block_size=6 instead of the values
# from training_config.json.
config["batch_size"] = 3
config["block_size"] = 6
# The same `data` tensor is passed for both splits; only "train" is sampled here.
xb, yb = get_batch("train", data, data, config)
print(xb)

tensor([[63,  1, 46, 43, 39, 56],
        [39, 52, 42,  1, 39,  1],
        [46, 53, 50, 43,  1, 41]])


In [20]:
# model(xb) returns an indexable result; element 0 is used as the logits.
# What the remaining elements hold is not visible in this notebook
# (presumably a loss; confirm in Transformer.py).
logits = model(xb)[0]
print(type(logits))
print(len(logits))
print(logits.shape)
# Shape is (batch size, sequence length, vocab size).
# The last axis holds one score per vocabulary entry for each position.
# NOTE(review): these are presumably raw, unnormalized logits rather than
# softmax probabilities, since they are passed to F.cross_entropy below,
# which expects unnormalized scores; confirm in Transformer.py.


<class 'torch.Tensor'>
3
torch.Size([3, 6, 65])


In [22]:
# Targets from get_batch: the same windows as xb, shifted one token ahead,
# so yb has one target token id per input position.
print(yb.shape)
print(yb)

torch.Size([3, 6])
tensor([[ 1, 46, 43, 39, 56, 58],
        [52, 42,  1, 39,  1, 40],
        [53, 50, 43,  1, 41, 47]])


In [24]:
# Loss over every position at once: flatten logits to (batch * time, vocab)
# and targets to (batch * time,) before calling cross_entropy.
F.cross_entropy(logits.view(-1, logits.size(-1)), yb.view(-1))

tensor(4.1826, grad_fn=<NllLossBackward0>)

In [25]:
# Same loss as the previous cell, computed step by step.
# B and T are taken from the targets and V from the last logits axis so that
# this cell can be re-run: unpacking `logits.shape` into three names fails once
# `logits` has been rebound to its flattened 2-D view below.
B, T = yb.shape
V = logits.size(-1)
logits = logits.view(B * T, V)  # Stack the time steps of every batch row on top of each other
targets = yb.view(B * T)  # One target token id per flattened position
loss = F.cross_entropy(logits, targets)
print(loss)
print(targets)

tensor(4.1826, grad_fn=<NllLossBackward0>)
tensor([ 1, 46, 43, 39, 56, 58, 52, 42,  1, 39,  1, 40, 53, 50, 43,  1, 41, 47])
