**Connect Google Drive**

In [1]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem. force_remount=True asks for a
# fresh mount even when the mount point is already in use.
gdrive_mount_point = '/content/gdrive/'
drive.mount(gdrive_mount_point, force_remount=True)

Mounted at /content/gdrive/


**Set up run path**

In [2]:
import os
import sys

In [3]:
# Project directory on the mounted Google Drive.
curr_path = "/content/gdrive/MyDrive/ERA_V1/era_v1_session17"
# Make it the working directory. Presumably this is what lets the
# `transformer_model_scripts` package be imported further down — TODO confirm.
os.chdir(curr_path)
# Read the working directory back and display it as the cell output to
# confirm that the switch took effect.
cwd = os.getcwd()
cwd

'/content/gdrive/MyDrive/ERA_V1/era_v1_session17'

**Install libraries**

In [4]:
!pip install --quiet "torchinfo" "seaborn" "pytorch-lightning" "torchmetrics" "lightning-bolts" "torchtext" "datasets" "tokenizers" "transformers"


**Import libraries**

In [5]:
import torch
from transformers import AutoTokenizer

# user scripts
from transformer_model_scripts.transformer_models import GPT
from transformer_model_scripts.GPT_utils import (
    BATCH_SIZE,
    BLOCK_SIZE,
    DEVICE,
    DROPOUT,
    LEARNING_RATE,
    NUM_EMBED,
    NUM_HEAD,
    NUM_LAYER,
    MAX_ITER,
    EVAL_INTER,
    encode,
    decode,
    get_batch,
    save_model_to_checkpoint,
    estimate_loss,
)

**Load data**

In [6]:
# Load the raw training corpus as one UTF-8 decoded string.
gpt_data_path = "/content/gdrive/MyDrive/ERA_V1/era_v1_session17/gpt_data/english.txt"
# The context manager closes the file handle as soon as the text has been
# read, instead of leaving it open until garbage collection.
with open(gpt_data_path, encoding="utf-8") as gpt_data_file:
    data_raw = gpt_data_file.read()

**Set up tokenizer and train/val data**

In [7]:
# A pretrained BERT tokenizer is reused here for performance improvements;
# its vocabulary size also fixes the vocabulary size of the GPT model.
gpt_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
gpt_vocab_size = gpt_tokenizer.vocab_size

# Encode the full corpus once. (To iterate faster, a slice of `data_raw`
# can be encoded instead of the whole text.)
# NOTE(review): the tokenizer warns that the sequence exceeds its 512-token
# maximum; presumably harmless because batches are later cut to BLOCK_SIZE
# by `get_batch` — confirm.
gpt_data = encode(text_seq=data_raw, tokenizer=gpt_tokenizer)

# Train/val split: the first 90% of the tokens train, the remainder validates.
n = int(0.9 * len(gpt_data))
train_data, val_data = gpt_data[:n], gpt_data[n:]


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (37443 > 512). Running this sequence through the model will result in indexing errors


**Set up GPT model instance**

In [8]:
# Build a fresh (untrained) GPT from the hyperparameters imported from
# GPT_utils; the vocabulary size comes from the tokenizer.
model = GPT(
    vocab_size=gpt_vocab_size,
    d_model=NUM_EMBED,
    block_size=BLOCK_SIZE,
    num_heads=NUM_HEAD,
    num_layers=NUM_LAYER,
    dropout=DROPOUT,
    device=DEVICE,
)
# Move the model onto DEVICE (as configured in GPT_utils).
gpt_model = model.to(DEVICE)

# Report the total parameter count, in millions.
total_params = sum(p.numel() for p in gpt_model.parameters())
print("GPT Model with {:.2f}M parameters".format(total_params / 1e6))


GPT Model with 89.45M parameters


In [9]:
# The optimizer takes the model's parameters and the learning rate as input,
# and updates the parameters during training in order to minimize the loss.
gpt_optimizer = torch.optim.AdamW(gpt_model.parameters(), lr=LEARNING_RATE)
# NOTE: this rebinds MAX_ITER, shadowing the value imported from GPT_utils
# for the rest of the notebook session.
MAX_ITER = 500
for step in range(MAX_ITER):

    # Every EVAL_INTER steps, and on the final step, estimate the loss on the
    # train and val sets. The estimate is taken *before* this step's update.
    if step % EVAL_INTER == 0 or step == MAX_ITER - 1:
        loss_train = estimate_loss(
            data=train_data, model=gpt_model, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE
        )
        loss_val = estimate_loss(
            data=val_data, model=gpt_model, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE
        )
        print("step {:10} | train loss {:6.4f} | val loss {:6.4f}".format(step, loss_train, loss_val))

    # sample a batch of data
    xb, yb = get_batch(data=train_data, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE)
    # Call the module instance rather than .forward() directly, so that any
    # registered forward hooks run as part of the call.
    gpt_logits, gpt_loss = gpt_model(xb, yb)
    # zero_grad() clears the gradients of all optimized parameters;
    # set_to_none=True resets them to None rather than to zero tensors.
    gpt_optimizer.zero_grad(set_to_none=True)
    # backward() computes the gradients of the loss with respect to the
    # model's parameters.
    gpt_loss.backward()
    # step() updates the model's parameters using the computed gradients.
    gpt_optimizer.step()


step          0 | train loss 10.7378 | val loss 10.7017
step        499 | train loss 0.3761 | val loss 8.4373


In [10]:
# Save the model to a checkpoint. `step` still holds the last index of the
# training loop, and that value is what gets passed as the checkpoint's epoch.
save_model_to_checkpoint(
    model=gpt_model,
    path_to_checkpoint="checkpoint",
    epoch=step,
)

Successfully saved the model to checkpoint/checkpoint_epoch-499_02.11.2023_03:21:37.pt


In [11]:
# Generate some output based on the context.
# The prompt is a single token with id 0 (a 1x1 tensor of zeros).
gpt_context = torch.zeros((1, 1), dtype=torch.long, device=DEVICE)

# Sample in eval mode so dropout is disabled, and without autograd so no
# graph is built during generation. The previous train/eval mode is restored
# afterwards so that any further training in this session is unaffected.
was_training = gpt_model.training
gpt_model.eval()
try:
    with torch.no_grad():
        generated_ids = gpt_model.generate(
            idx=gpt_context, max_new_tokens=100, block_size=BLOCK_SIZE
        )[0]
finally:
    gpt_model.train(was_training)

# Turn the generated token ids back into text and show it token by token.
gpt_output_results = decode(enc_sec=generated_ids, tokenizer=gpt_tokenizer)
print(f"Decoded results:\n{gpt_output_results.split()}")


Decoded results:
['[PAD]', 'can', 'be', 'correct,', 'and', 'like', 'largest', 'highly', 'subjective,', 'and', 'even', 'weirder.', 'the', 'experiments', 'can', 'be', 'considered', 'through', 'the', 'whole', 'image', '(', 'next', 'session', '),', 'with', 'less', 'than', '1,', 'we', 'cover', 'most', 'of', 'the', 'network', 'it', 'is', 'linear', 'fashion,', 'but', 'that', 'converts', 'a', 'way', 'to', 'roll', 'backpropagation.', 'it', 'can', "we'll", 'stick', 'to', 'out', 'how', 'to', 'use', 'the', 'gradient', 'descent', ':', 'the', 'variance', 'should', 'be', 'able', 'to', 'identify', 'features,', 'but', 'then', 'later', 'peer', 'group', 'would', 'release', 'their', 'studies', 'that', 'claim', 'otherwise.', 'relu', 'is', 'simple,', 'efficient']
