<a target="_blank" href="https://colab.research.google.com/github/mHemaAP/S17/blob/main/gpt_transformer_train.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [1]:
!pip install transformers -qqq

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m55.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m55.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import torch
from transformers import AutoTokenizer

In [3]:
# Fetch the project repository into ./S17 (relative to the current working
# directory); later cells import code and read data from this checkout.
!git clone https://github.com/mHemaAP/S17.git

Cloning into 'S17'...
remote: Enumerating objects: 316, done.[K
remote: Counting objects: 100% (2/2), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 316 (delta 0), reused 0 (delta 0), pack-reused 314[K
Receiving objects: 100% (316/316), 16.19 MiB | 25.04 MiB/s, done.
Resolving deltas: 100% (8/8), done.


In [4]:
# Switch the working directory to the cloned repo so that the
# `transformer_model` package is importable and relative paths such as
# "GPT_data/english.txt" resolve in the cells below.
%cd S17
# List the checkout's contents as a quick sanity check.
%ls

/content/S17
[0m[01;34mBERT_data[0m/                    names.tsv           [01;34mtransformer_model[0m/
bert_transformer_train.ipynb  [01;34mpizza_steak_sushi[0m/  values.tsv
[01;34mGPT_data[0m/                     README.md           vit_transformer_train.ipynb
gpt_transformer_train.ipynb   [01;34msuper_repo[0m/         vocab.txt


In [5]:
from transformer_model.models.gpt.gpt_train import gpt_train
from transformer_model.common_model import Transformer
from transformer_model.models.gpt.gpt_utils import (
    BLOCK_SIZE,
    DEVICE,
    DROPOUT,
    LEARNING_RATE,
    NUM_EMBED,
    NUM_HEAD,
    NUM_LAYER,
    encode,
)

Number of patches (N) with image height (H=224), width (W=224) and patch size (P=16): 196
Input shape (single 2D image): (224, 224, 3)
Output shape (single 2D image flattened into patches): (196, 768)


In [6]:
# raw data
path_do_data = "GPT_data/english.txt"
# Read the corpus inside a context manager so the file handle is closed
# deterministically instead of being left open until garbage collection.
with open(path_do_data, encoding="utf-8") as corpus_file:
    data_raw = corpus_file.read()

# we use pretrained BERT tokenizer for performance improvements
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
vocab_size = tokenizer.vocab_size
# data_raw = data_raw[4000000:] # short dataset

# train/val split
# NOTE(review): encoding the whole corpus in one call makes the BERT tokenizer
# log a "sequence length is longer than the specified maximum (… > 512)"
# warning. Presumably harmless here because this token stream is fed to our own
# model rather than to BERT — confirm against `encode` / `gpt_train`.
data = encode(text_seq=data_raw, tokenizer=tokenizer)
n = int(0.9 * len(data))  # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# train a new model
gpt_model = Transformer(
    vocab_size=vocab_size,
    embed_size=NUM_EMBED,
    block_size=BLOCK_SIZE,
    n_heads=NUM_HEAD,
    num_layers=NUM_LAYER,
    dropout=DROPOUT,
    algorithm="GPT"
)
# load model to GPU if available
gpt_model = gpt_model.to(DEVICE)

# print the number of parameters in the model
# NOTE(review): the logged run reports ~89M parameters against a token stream
# of only a few tens of thousands of tokens (per the tokenizer warning), so
# expect heavy overfitting unless a larger corpus is used — verify.
num_params = sum(p.numel() for p in gpt_model.parameters())
print("Model with {:.2f}M parameters".format(num_params / 1e6))

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (37443 > 512). Running this sequence through the model will result in indexing errors


Model with 89.48M parameters


In [7]:
# Iteration budget handed to `gpt_train` as its first argument.
MAX_ITER = 9991

# AdamW receives the model's parameters plus the learning rate and is the
# object that adjusts those parameters during training to drive the loss down.
optimizer = torch.optim.AdamW(
    params=gpt_model.parameters(),
    lr=LEARNING_RATE,
)

# Launch training on the train/validation splits prepared earlier.
# NOTE(review): in the recorded log the train loss falls to ~0.1 while the
# validation loss climbs from ~8 to ~12, i.e. the run overfits well before
# MAX_ITER — consider a smaller budget or early stopping.
gpt_model_train = gpt_train(
    MAX_ITER,
    train_data,
    val_data,
    optimizer,
    gpt_model,
)

Training GPT...
step          0 | train loss 10.7290 | val loss 10.7246
step        500 | train loss 0.4722 | val loss 8.1066
step       1000 | train loss 0.1737 | val loss 9.7198
step       1500 | train loss 0.1469 | val loss 9.8173
step       2000 | train loss 0.1227 | val loss 10.0199
step       2500 | train loss 0.1222 | val loss 10.7100
step       3000 | train loss 0.1155 | val loss 10.7464
step       3500 | train loss 0.1132 | val loss 10.8682
step       4000 | train loss 0.1112 | val loss 10.8576
step       4500 | train loss 0.1095 | val loss 11.5052
step       5000 | train loss 0.1082 | val loss 11.0390
step       5500 | train loss 0.1046 | val loss 11.0326
step       6000 | train loss 0.1018 | val loss 11.2562
step       6500 | train loss 0.1010 | val loss 11.3391
step       7000 | train loss 0.1031 | val loss 11.6617
step       7500 | train loss 0.1012 | val loss 11.4724
step       8000 | train loss 0.0926 | val loss 12.0525
step       8500 | train loss 0.0965 | val loss 11.9