In [1]:
import random
from typing import List, Tuple, Dict, Union

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# from torch.utils.data import Dataset
from datasets import Dataset

from utils import (
    zero_padding_multiplicatn,
    generate_training_set,
    generate_validation_set,
    MathsDataset,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
max_ints = 5
val_samples = [10*(i+1)*max_ints for i in range(max_ints)]
val_set = generate_validation_set(max_ints, val_samples)

In [3]:
zero_padding_multiplicatn(*val_set[0], 5)

'05000*00015=75000'

In [4]:
v = [20] * 5

In [4]:
device = "cuda:0"
model_id = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
  model_id,
  trust_remote_code=True,
  torch_dtype=torch.float16,
)
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [5]:
str_data = [(f"{data[0]}*{data[1]}", f"={data[0]*data[1]}") for data in val_set]

In [6]:
data = Dataset.from_dict({"data":val_set})

In [7]:
data[0]

{'data': [5000, 15]}

In [78]:
def collate_fn(examples):
    # On TPU it's best to pad everything to the same length or training will be very slow.
    max_length = 128
    # When using mixed precision we want round multiples of 8/16
    '''
    if accelerator.mixed_precision == "fp8":
        pad_to_multiple_of = 16
    elif accelerator.mixed_precision != "no":
        pad_to_multiple_of = 8
    else:
        pad_to_multiple_of = None
'''
    return tokenizer.pad(
        examples,
        padding="longest",
        max_length=max_length,
        pad_to_multiple_of=None,#pad_to_multiple_of,
        return_tensors="pt",
    )

In [None]:
model

In [8]:
def tokenize_function(examples):
    # max_length=None => use the model max length (it's actually the default)
    output = {"input_ids_x": [], "input_ids_y": [], 
              "attention_mask_x": [], "attention_mask_y": []}
    for example in examples["data"]:
        padded = zero_padding_multiplicatn(*example, 5, False).split("=")
        x = padded[0] + "="
        y = padded[1]
        x_token = tokenizer(x, padding="max_length", max_length=16, truncation=True)
        y_token = tokenizer(y, padding="max_length", max_length=16, truncation=True)
        output["input_ids_x"].append(x_token["input_ids"])
        output["input_ids_y"].append(y_token["input_ids"])
        output["attention_mask_x"].append(x_token["attention_mask"])
        output["attention_mask_y"].append(y_token["attention_mask"])
    
        
    return output

In [9]:
tokenizer("2*3=", "6", truncation=True)

{'input_ids': [17, 9, 18, 28, 21], 'attention_mask': [1, 1, 1, 1, 1]}

In [10]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

In [13]:
tokenized_datasets = data.map(
            tokenize_function,
            batched=True,
            remove_columns=["data"],
        )

Map: 100%|██████████| 750/750 [00:00<00:00, 10179.76 examples/s]


In [12]:
out

NameError: name 'out' is not defined

In [14]:
tokenized_datasets

Dataset({
    features: ['input_ids_x', 'input_ids_y', 'attention_mask_x', 'attention_mask_y'],
    num_rows: 750
})

In [15]:
from torch.utils.data import DataLoader
train_dataloader = DataLoader(
        tokenized_datasets, shuffle=False, batch_size=2
    )

In [16]:
for x in train_dataloader:
    print(x)
    break

{'input_ids_x': [tensor([ 1731, 28727]), tensor([22186,  3104]), tensor([9, 9]), tensor([4531,  830]), tensor([38569,  3720]), tensor([28, 28]), tensor([50257, 50257]), tensor([50257, 50257]), tensor([50257, 50257]), tensor([50257, 50257]), tensor([50257, 50257]), tensor([50257, 50257]), tensor([50257, 50257]), tensor([50257, 50257]), tensor([50257, 50257]), tensor([50257, 50257])], 'input_ids_y': [tensor([24591,  1558]), tensor([22172,  1983]), tensor([ 2780, 48724]), tensor([  940, 50257]), tensor([50257, 50257]), tensor([50257, 50257]), tensor([50257, 50257]), tensor([50257, 50257]), tensor([50257, 50257]), tensor([50257, 50257]), tensor([50257, 50257]), tensor([50257, 50257]), tensor([50257, 50257]), tensor([50257, 50257]), tensor([50257, 50257]), tensor([50257, 50257])], 'attention_mask_x': [tensor([1, 1]), tensor([1, 1]), tensor([1, 1]), tensor([1, 1]), tensor([1, 1]), tensor([1, 1]), tensor([0, 0]), tensor([0, 0]), tensor([0, 0]), tensor([0, 0]), tensor([0, 0]), tensor([0, 0]), 

In [25]:
tokenizer.decode(torch.stack(x["input_ids_x"]).transpose(0, 1)[0])

'00717*00958=[PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]'

In [17]:
x = tokenizer(["I am a", "2*3="], return_tensors="pt", padding="max_length", max_length=8, truncation=True)
print(x)

{'input_ids': tensor([[   40,   716,   257, 50257, 50257, 50257, 50257, 50257],
        [   17,     9,    18,    28, 50257, 50257, 50257, 50257]]), 'attention_mask': tensor([[1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 0, 0, 0, 0]])}


In [18]:
x['input_ids'].shape

torch.Size([2, 8])

In [19]:
out = model(x["input_ids"].to(device))


../aten/src/ATen/native/cuda/Indexing.cu:1093: indexSelectSmallIndex: block: [2,0,0], thread: [32,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1093: indexSelectSmallIndex: block: [2,0,0], thread: [33,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1093: indexSelectSmallIndex: block: [2,0,0], thread: [34,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1093: indexSelectSmallIndex: block: [2,0,0], thread: [35,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1093: indexSelectSmallIndex: block: [2,0,0], thread: [36,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1093: indexSelectSmallIndex: block: [2,0,0], thread: [37,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1093: indexSelectSmallIndex: block: [2,0,0], thread: 

RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)`

 `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1093: indexSelectSmallIndex: block: [2,0,0], thread: [28,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1093: indexSelectSmallIndex: block: [2,0,0], thread: [29,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1093: indexSelectSmallIndex: block: [2,0,0], thread: [30,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1093: indexSelectSmallIndex: block: [2,0,0], thread: [31,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1093: indexSelectSmallIndex: block: [1,0,0], thread: [96,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1093: indexSelectSmallIndex: block: [1,0,0], thread: [97,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1093: indexSel

In [35]:
pred

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [27]:
pred = torch.argmax(out.logits, dim=-1)
print(tokenizer.decode(pred[1]))

.2*0


In [18]:
for x in train_dataloader:
    with torch.no_grad():
        # Stack the tensors
        input_ids_x = torch.stack(x["input_ids_x"]).transpose(0, 1).to(device)
        attention_mask_x = torch.stack(x["attention_mask_x"]).transpose(0, 1).to(device)
        input_ids_y = torch.stack(x["input_ids_y"]).transpose(0, 1).to(device)

        # Forward pass
        outputs = model(
            input_ids=input_ids_x,
            attention_mask=attention_mask_x,
            #labels=input_ids_y,
        )
        # Additional code for processing outputs

        break

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [149]:
input_ids_x.shape

torch.Size([16, 2])

In [145]:
for x in train_dataloader:
    with torch.no_grad():
        outputs = model(
            input_ids=x["input_ids_x"].to(device),
            attention_mask=x["attention_mask_x"].to(device),
            labels=x["input_ids_y"].to(device),
        )
    break

AttributeError: 'list' object has no attribute 'to'

In [60]:
x, y = val_set[0]
padded = zero_padding_multiplicatn(x, y, 5, False)
x,y = padded.split('=')

In [9]:
y

'195266'

In [10]:
num_training_samples = 30000
training_set = generate_training_set(max_ints, num_training_samples, val_samples)

In [11]:
len(val_set), len(training_set)

(750, 30000)

In [29]:
z = set([5,4,6])
x.update(z)
x

{1, 4, 5, 6}

In [32]:
12 // 2

6

In [17]:
device = "cuda:0"
model_id = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
  model_id,
  trust_remote_code=True,
  torch_dtype=torch.float16,
)
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [27]:
zero_padding_multiplicatn(12, 5, 4)

'0012 * 0005 = 0060'

In [25]:
max_num = 9999
max_num_len = len(str(max_num**2))
eval_examples_padded = []
#for i in range(1, max_num + 1):
#    for j in range(1, max_num + 1):
#        eval_examples_padded.append(zero_padding_multiplicatn(i, j, max_num_len))

eval_examples_base = []
for i in range(1, max_num + 1):
    for j in range(1, max_num + 1):
        eval_examples_base.append(zero_padding_multiplicatn(i, j, 0))

few_shot_num = 4
random_examples = random.sample(eval_examples_base, few_shot_num)
prompt = "\n".join(random_examples)
print(prompt)

KeyboardInterrupt: 

In [5]:
from transformers.utils import logging

logging.set_verbosity_error()

No padding 2 digits, 4 shots

In [14]:
from tqdm import tqdm
answers = []
batch_size = 32
for i in tqdm(range(0, len(eval_examples_padded), batch_size)):
    queries = eval_examples_base[i:i+batch_size]
    test_prompts = [prompt + "\n" + query.split("=")[0] + "=" for query in queries]
    truths = [query[-max_num_len:].strip() for query in queries]

    inputs = tokenizer(test_prompts, return_tensors="pt").input_ids.to(device)
    outputs = model.generate(inputs, max_length=64, do_sample=False)
    prediction = tokenizer.batch_decode(outputs, skip_special_tokens=True)#[len(test_prompt)+1:len(test_prompt)+1+max_num_len]
    predictions = [p.split("\n")[few_shot_num].split("=")[1].strip()  for p in prediction]
    #predictions = [p.split().strip() for p in prediction]
    for pred, truth in zip(predictions, truths):
        answers.append(int(pred == truth))
    #print(f"Test Prompt: {example}")
    #print(f"Prediction: {prediction}")
    #print(f"Truth: {truth}")
    #answers.append(int(prediction == truth))
print(f"Accuracy: {sum(answers)/len(answers)*2.:f}")

100%|██████████| 3/3 [00:02<00:00,  1.05it/s]

Accuracy: 0.271605





padding 2 digits

In [15]:
prediction

['9 * 6 = 54\n3 * 1 = 3\n6 * 6 = 36\n1 * 4 = 4\n8 * 2 = 2\n6 * 2 = 6\n1 * 2 = 6\n1 * 2 = 6\n1 * 2 = 6\n1 * 2 = 6\n1 * 2 =',
 '9 * 6 = 54\n3 * 1 = 3\n6 * 6 = 36\n1 * 4 = 4\n8 * 3 = 8\n1 * 6 = 6\n1 * 6 = 6\n1 * 6 = 6\n1 * 6 = 6\n1 * 6 = 6\n1 * 6 =',
 '9 * 6 = 54\n3 * 1 = 3\n6 * 6 = 36\n1 * 4 = 4\n8 * 4 = 8\n1 * 8 = 8\n1 * 16 = 16\n1 * 32 = 32\n1 * 64 = 64\n1 * 128 = 128\n1 * 256 =',
 '9 * 6 = 54\n3 * 1 = 3\n6 * 6 = 36\n1 * 4 = 4\n8 * 5 = 8\n1 * 6 = 6\n1 * 8 = 8\n1 * 16 = 16\n1 * 32 = 32\n1 * 64 = 64\n1 * 128 =',
 '9 * 6 = 54\n3 * 1 = 3\n6 * 6 = 36\n1 * 4 = 4\n8 * 6 = 24\n1 * 4 = 4\n8 * 6 = 24\n1 * 4 = 4\n8 * 6 = 24\n1 * 4 = 4\n8 * 6 =',
 '9 * 6 = 54\n3 * 1 = 3\n6 * 6 = 36\n1 * 4 = 4\n8 * 7 = 16\n1 * 8 = 8\n1 * 9 = 8\n1 * 10 = 8\n1 * 11 = 8\n1 * 12 = 8\n1 * 13 =',
 '9 * 6 = 54\n3 * 1 = 3\n6 * 6 = 36\n1 * 4 = 4\n8 * 8 = 16\n1 * 8 = 16\n1 * 8 = 16\n1 * 8 = 16\n1 * 8 = 16\n1 * 8 = 16\n1 * 8 =',
 '9 * 6 = 54\n3 * 1 = 3\n6 * 6 = 36\n1 * 4 = 4\n8 * 9 = 16\n1 * 4 = 4\n8 * 9 = 16\n1 * 4 = 4\n8 * 

In [13]:
from tqdm import tqdm
answers = []
batch_size = 64
for i in tqdm(range(0, len(eval_examples_padded), batch_size)):
    queries = eval_examples_base[i:i+batch_size]
    test_prompts = [prompt + "\n" + query.split("=")[0] + "=" for query in queries]
    truths = [query[-max_num_len:].strip() for query in queries]

    inputs = tokenizer(test_prompts, return_tensors="pt").input_ids.to(device)
    outputs = model.generate(inputs, max_length=64, do_sample=False)
    prediction = tokenizer.batch_decode(outputs, skip_special_tokens=True, padding=True)#[len(test_prompt)+1:len(test_prompt)+1+max_num_len]
    predictions = [p.split("\n")[few_shot_num].split("=")[1].strip()  for p in prediction]
    #predictions = [p.split().strip() for p in prediction]
    for pred, truth in zip(predictions, truths):
        answers.append(int(pred == truth))
    #print(f"Test Prompt: {example}")
    #print(f"Prediction: {prediction}")
    #print(f"Truth: {truth}")
    #answers.append(int(prediction == truth))
print(f"Accuracy: {sum(answers)/len(answers)}")

 40%|███▉      | 61/154 [03:27<05:16,  3.41s/it]


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [18]:
inputs = tokenizer(test_prompts, return_tensors="pt").input_ids.to(device)


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).