
*  collect your own data (cannot be Shakespeare or any single file downloaded from the internet. Your sources should come from multiple URLs (basically copy paste 1000s of times)

* implement sparse attention on your own in the GPT code that we wrote. Train on the data that you collected above:

    > Copy and paste the code here for the sparse attention that you wrote

    > share the training log (Epochs/x = 10 logs)

    > Share 10 examples of output

#### Same "training.txt" file as was downloaded/extracted under the BERT training is being used here

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m80.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m96.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.1-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.1 tokenizers-0.13.2 transformers-4.26.1


In [2]:
!pip install git+https://git@github.com/ojhajayant/EVA8_API.git --upgrade 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://****@github.com/ojhajayant/EVA8_API.git
  Cloning https://****@github.com/ojhajayant/EVA8_API.git to /tmp/pip-req-build-jxa9u2zs
  Running command git clone --filter=blob:none --quiet 'https://****@github.com/ojhajayant/EVA8_API.git' /tmp/pip-req-build-jxa9u2zs
  Resolved https://****@github.com/ojhajayant/EVA8_API.git to commit be5684ad2de774490a20cf0e77d5c1abf07579b1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: EVA8-API
  Building wheel for EVA8-API (setup.py) ... [?25l[?25hdone
  Created wheel for EVA8-API: filename=EVA8_API-0.0.0-py3-none-any.whl size=34972 sha256=96a6530456912a63b1b7fe404b0d73d9e5caa4c2060e2993b6977228b53e0cb2
  Stored in directory: /tmp/pip-ephem-wheel-cache-z4p2d7i1/wheels/a8/a8/c6/7c45c4f625875888c00136086f33a1ff2ed2c3baba8b166fd6
Successfully built EVA8-API
Installing collected packages: EVA

In [3]:
from models import model_gpt

In [4]:
import torch
from models.model_gpt import Transformer
from transformers import AutoTokenizer  # pip install transformers
from utils.utils_gpt import (
    BATCH_SIZE,
    BLOCK_SIZE,
    DEVICE,
    DROPOUT,
    LEARNING_RATE,
    NUM_EMBED,
    NUM_HEAD,
    NUM_LAYER,
    MAX_ITER,
    EVAL_INTER,
    encode,
    decode,
    get_batch,
    save_model_to_chekpoint,
    estimate_loss,
)

# load model from checkpoint
# m = load_model_from_checkpoint(Transformer,vocab_size=vocab_size)

# example to decode sequence
# enc_sec = m.generate(idx=torch.zeros((1,1), dtype=torch.long),
# max_new_tokens=20)[0].tolist()
# print(decode(vocab=vocab, enc_sec=enc_sec))

# raw data
path_do_data = "data/training.txt"
data_raw = open(path_do_data, encoding="utf-8").read()
# we use pretrained BERT tokenizer for performance improvements
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
vocab_size = tokenizer.vocab_size
# data_raw = data_raw[4000000:] # short dataset

# train/val split
data = encode(text_seq=data_raw, tokenizer=tokenizer)
n = int(0.9 * len(data))  # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# train a new model
model = Transformer(
    vocab_size=vocab_size,
    num_embed=NUM_EMBED,
    block_size=BLOCK_SIZE,
    num_heads=NUM_HEAD,
    num_layers=NUM_LAYER,
    dropout=DROPOUT,
)
# load model to GPU if available
m = model.to(DEVICE)
# print the number of parameters in the model
print(
    "Model with {:.2f}M parameters".format(sum(p.numel() for p in m.parameters()) / 1e6)
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3121454 > 512). Running this sequence through the model will result in indexing errors


Model with 89.48M parameters


In [5]:
# optimizer takes the model's parameters and the learning rate as input,
# and updates the parameters during the training process in order to
# minimize the loss function.
optimizer = torch.optim.AdamW(m.parameters(), lr=LEARNING_RATE)
MAX_ITER = 500
for step in range(MAX_ITER):

    # every EVAL_INTER evaluate the loss on train and val sets
    if step % EVAL_INTER == 0 or step == MAX_ITER - 1:
        loss_train = estimate_loss(
            data=train_data, model=m, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE
        )
        loss_val = estimate_loss(
            data=val_data, model=m, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE
        )
        print("step {:10} | train loss {:6.4f} | val loss {:6.4f}".format(step, loss_train, loss_val))

    # sample a batch of data
    xb, yb = get_batch(data=train_data, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE)
    logits, loss = m.forward(xb, yb)
    # zero_grad() method sets the gradients of all parameters in the optimizer to zero
    optimizer.zero_grad(set_to_none=True)
    # backward() method on the loss variable calculates the gradients 
    # of the loss with respect to the model's parameters.
    loss.backward()
    # step() method on the optimizer updates the model's parameters 
    # using the calculated gradients, in order to minimize the loss.
    optimizer.step()

step          0 | train loss 10.7412 | val loss 10.7457
step        499 | train loss 4.8983 | val loss 5.5780


In [6]:
save_model_to_chekpoint(model=m, path_to_checkpoint="checkpoint", epoch=step)

Successfully saved the model to checkpoint/checkpoint_epoch-499_09.03.2023_14:51:13.pt


In [7]:
# generate some output based on the context
context = torch.zeros((1, 1), dtype=torch.long, device=DEVICE)
print(
    decode(
        enc_sec=m.generate(idx=context, max_new_tokens=100, block_size=BLOCK_SIZE)[0],
        tokenizer=tokenizer,
    )
)

[PAD]d and then increasesor ’ s a - - ‘ and shared! fogg - - counting - - - beast - - just - - law i don ’ t put coolly - - intention that moment to keep up - nosed? ’ t shrinketed him ; speaking, confidential you are exploring, sir for instance to add : sure. ‘ a smile off acquaintedly. so sensible of mother, and night,ested tone, and were affection for childhood. if i could not be um tone


## 10 examples of input-output

In [11]:
# generate 10 input sentences
sentences = ['This is the first sentence.',
             'Here is the second sentence.',
             'The third sentence comes next.',
             'We are now on sentence.',
             'Sentence number five is here.',
             'Let us move sentence six.',
             'The seventh sentence up next.',
             'Eighth sentence, is right up.',
             'The ninth sentence  upon us.',
             'This is the final sentence.']

# tokenize the input sentences
tokenized_sentences = [tokenizer.encode(sentence) for sentence in sentences]

# generate output for each sentence
for sentence in tokenized_sentences:
    context = torch.tensor(sentence, dtype=torch.long, device=DEVICE).unsqueeze(0)
    output_tokens = m.generate(idx=context, max_new_tokens=35, block_size=BLOCK_SIZE)[0]
    output_sentence = decode(enc_sec=output_tokens, tokenizer=tokenizer)
    print(output_sentence)


[CLS] this is the first sentence. [SEP] opening at them, distributing known incredomuen almost the papers and broken - -ations winkesy who had left god would gas superfling, to wait and reduced
[CLS] here is the second sentence. [SEP] ’ t have you share than grace of people looks, talking minutes. i tell miss do, four, sir, ’ she paid me. i am not which i am differently
[CLS] the third sentence comes next. [SEP] it. ’ s commissioned gsta ) - tm was in sensitive, and tea. the horse. did not distinguish land hart assume that it. “ or at its way
[CLS] we are now on sentence. [SEP] than the contrary where had like her as if he do not t be see theirrable what winding to be ashamed to make condition. to the advice with entrebell of them
[CLS] sentence number five is here. [SEP] require to lydia, poor lay next un handsving each : ‘ hope you are for the story to others. ’ly been ski 1827 alone with such happiness. my lobby,
[CLS] let us move sentence six. [SEP] ofgus to go with ay the stairs. 