<!-- @format -->

# Zadanie 1

(2p)

Zmodyfikuj notatnik **add_demo.ipynb** (modyfikacja notatnika z repozytorium minGPT), żeby wykonywał on mnożenie dwóch liczb trzycyfrowych. Postaraj się osiągnąć poziom skuteczności 1% (aczkolwiek da się spokojnie osiągnąć kilkanaście procent)

Za zadanie jest premia równa $3*x$, gdzie $x$ jest punktacją na zbiorze treningowym (od $0$ do $1$)


<!-- @format -->

A cute little demo showing the simplest usage of minGPT. Configured to run fine on Macbook Air in like a minute.


In [11]:
import torch
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
from mingpt.utils import set_seed

set_seed(3407)

In [12]:
import random



def random_add_instance(length):

    a = [random.randint(0, 9) for i in range(length)]
    b = [random.randint(0, 9) for i in range(length)]
    val_a = int("".join(str(d) for d in a))
    val_b = int("".join(str(d) for d in b))
    val_c = val_a + val_b
    str_c = str(val_c)

    str_c = (length + 1 - len(str_c)) * "0" + str_c

    return a + b + [int(d) for d in str_c]


def random_mult_instance(length):
    a = [random.randint(0, 9) for i in range(length)]
    b = [random.randint(0, 9) for i in range(length)]
    val_a = int("".join(str(d) for d in a))
    val_b = int("".join(str(d) for d in b))
    val_c = val_a * val_b
    str_c = str(val_c)
    str_c = (length * 2 - len(str_c)) * "0" + str_c
    return a + b + [int(d) for d in str_c]



for i in range(10):

    print(random_mult_instance(3))

[0, 6, 2, 5, 3, 6, 0, 3, 3, 2, 3, 2]
[4, 9, 5, 3, 7, 8, 1, 8, 7, 1, 1, 0]
[6, 4, 4, 3, 1, 4, 2, 0, 2, 2, 1, 6]
[3, 5, 9, 1, 1, 9, 0, 4, 2, 7, 2, 1]
[3, 5, 0, 1, 4, 8, 0, 5, 1, 8, 0, 0]
[0, 0, 2, 9, 3, 7, 0, 0, 1, 8, 7, 4]
[8, 4, 8, 2, 3, 7, 2, 0, 0, 9, 7, 6]
[9, 9, 9, 7, 0, 6, 7, 0, 5, 2, 9, 4]
[7, 4, 5, 1, 3, 6, 1, 0, 1, 3, 2, 0]
[1, 0, 1, 0, 4, 2, 0, 0, 4, 2, 4, 2]


In [13]:
l = [0, 1, 2, 3, 4, 5, 6, 7, 8]
print(l[6:])

[6, 7, 8]


In [14]:
class MultDataset(Dataset):
    """
    Dataset for the Add problem. E.g. for problem length 3:
    12 + 333 = 345
    Input: 0 1 2 3 3 3 -> Output: 0 3 4 5
    Which will feed into the transformer concatenated as:
    input:  0 1 2 3 3 3 0 3 4
    output: I I I I I 0 3 4 5
    where I is "ignore", as the transformer is reading the input sequence
    """

    def __init__(self, split, length=3):
        assert split in {"train", "test"}
        self.split = split
        self.length = length

    def __len__(self):
        return 10000  # ...

    def get_vocab_size(self):
        return 10

    def get_block_size(self):
        # the length of the sequence that will feed into transformer,
        # containing concatenated input and the output, but -1 because
        # the transformer starts making predictions at the last input element
        return 4 * self.length + 1 - 1

    def __getitem__(self, idx):
        while True:
            rai = random_mult_instance(self.length)
            h = hash(str(rai[: 2 * self.length]))

            inp_split = (
                "test" if h % 4 == 0 else "train"
            )  # designate 25% of examples as test
            if inp_split == self.split:
                break  # ok

        x = torch.tensor(rai[:-1], dtype=torch.long)
        y = torch.tensor(rai[1:], dtype=torch.long)

        # we only want to predict at output locations, mask out the loss at the input locations
        y[: 2 * self.length - 1] = -1
        return x, y


# """
# in:  8 9 6 | 9 9 9 | 8 9 5 1 0
# out: I I I | I I 8 | 9 5 1 0 4
# """

In [15]:
# print an example instance of the dataset
train_dataset = MultDataset("train")
test_dataset = MultDataset("test")
x, y = train_dataset[0]

print(x)
for a, b in zip(x, y):
    print(int(a), int(b))

tensor([2, 9, 3, 4, 8, 0, 1, 4, 0, 6, 4])
2 -1
9 -1
3 -1
4 -1
8 -1
0 1
1 4
4 0
0 6
6 4
4 0


In [16]:
# create a GPT instance
from mingpt.model import GPT

model_config = GPT.get_default_config()
model_config.model_type = "gpt-micro"
# model_config.model_type = 'gpt-nano'

model_config.vocab_size = train_dataset.get_vocab_size()
model_config.block_size = train_dataset.get_block_size()
model = GPT(model_config)

number of parameters: 0.80M


In [17]:
print(model_config.n_head, model_config.n_layer, model_config.n_embd)

4 4 128


In [47]:
# create a Trainer object
from mingpt.trainer import Trainer

train_config = Trainer.get_default_config()
train_config.learning_rate = (
    2e-6  # the model we're using is so small that we can go a bit faster
)
train_config.lr_decay = True
train_config.max_iters = 5000
train_config.num_workers = 0
trainer = Trainer(train_config, model, train_dataset)

running on device cuda


In [48]:
def batch_end_callback(trainer):
    if trainer.iter_num % 100 == 0:
        print(
            f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}"
        )


trainer.set_callback("on_batch_end", batch_end_callback)

trainer.run()

iter_dt 0.00ms; iter 0: train loss 0.91416
iter_dt 26.66ms; iter 100: train loss 0.91475
iter_dt 18.61ms; iter 200: train loss 0.89250
iter_dt 18.61ms; iter 300: train loss 0.93673
iter_dt 24.16ms; iter 400: train loss 0.97105
iter_dt 24.20ms; iter 500: train loss 0.90989
iter_dt 16.03ms; iter 600: train loss 0.90507
iter_dt 16.55ms; iter 700: train loss 0.93694
iter_dt 16.54ms; iter 800: train loss 0.88312
iter_dt 18.02ms; iter 900: train loss 0.93938
iter_dt 22.60ms; iter 1000: train loss 0.91614
iter_dt 18.03ms; iter 1100: train loss 0.94222
iter_dt 16.52ms; iter 1200: train loss 0.93073
iter_dt 17.53ms; iter 1300: train loss 0.92490
iter_dt 17.70ms; iter 1400: train loss 0.93035
iter_dt 21.54ms; iter 1500: train loss 0.89497
iter_dt 16.52ms; iter 1600: train loss 0.87901
iter_dt 22.03ms; iter 1700: train loss 0.90043
iter_dt 17.03ms; iter 1800: train loss 0.91603
iter_dt 18.03ms; iter 1900: train loss 0.90646
iter_dt 23.67ms; iter 2000: train loss 0.89650
iter_dt 23.03ms; iter 2100

In [49]:
# now let's perform some evaluation
model.eval()
None

In [None]:
def eval_mult_split(trainer, split, max_batches):
    dataset = {"train": train_dataset, "test": test_dataset}[split]
    n = train_dataset.length  # naugy direct access shrug
    results = []
    mistakes_printed_already = 0
    loader = DataLoader(dataset, batch_size=100, num_workers=0, drop_last=False)

    for b, (x, y) in enumerate(loader):
        x = x.to(trainer.device)
        y = y.to(trainer.device)
        """ 
            Dataset for the Add problem. E.g. for problem length 3:
            12 + 333 = 345
            Input: 0 1 2 3 3 3 -> Output: 0 3 4 5
            Which will feed into the transformer concatenated as:
            input:  0 1 2 3 3 3 0 3 4
            output: I I I I I 0 3 4 5
            where I is "ignore", as the transformer is reading the input sequence
            
        """

        """
        in:  8 9 6 | 9 9 9 | 8 9 5 1 0
        out: I I I | I I 8 | 9 5 1 0 4
        """

        inp = x[:, : 2 * n]
        sol = y[:, -n * 2 :]

        cat = model.generate(
            inp, n * 2, do_sample=False
        )  # using greedy argmax, not sampling
        sol_candidate = cat[:, -n * 2 :]
        correct = (sol == sol_candidate).all(1).cpu()
        for i in range(x.size(0)):
            results.append(int(correct[i]))

    rt = torch.tensor(results, dtype=torch.float)
    print(
        "%s final score: %d/%d = %.2f%% correct"
        % (split, rt.sum(), len(results), 100 * rt.mean())
    )
    return rt.sum()


# run a lot of examples from both train and test through the model and verify the output correctness
with torch.no_grad():
    train_score = eval_mult_split(trainer, "train", max_batches=50)
    test_score = eval_mult_split(trainer, "test", max_batches=50)

train final score: 341/10000 = 3.41% correct
test final score: 328/10000 = 3.28% correct


In [None]:
# Where we made changes to the code
# 1. We changed instance function
# 2. In rai = random_add_instance(self.length) we changed add to mult
# 3. We incresed get_block_size to 4 * self.length + 1 - 1
# 4. In eval_mult_split we changed the inp, sol and sol_candidate size from n+1 to n * 2
# 5. In cat = model.generate(inp, n * 2, do_sample=False) we changed n+1 to n * 2
# 6. We changed learning rate to 2e-6