In [1]:
import random
from typing import List, Tuple, Dict, Union

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# from torch.utils.data import Dataset
from datasets import Dataset

from utils import (
    zero_padding_multiplicatn,
    generate_training_set,
    generate_validation_set,
    MathsDataset,
)

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Build the held-out validation split.
# NOTE(review): presumably `max_ints` is the largest operand size and each
# entry of `val_samples` is the sample count for one size — confirm against
# utils.generate_validation_set.
max_ints = 5
val_samples = [20 for _ in range(max_ints)]
val_set = generate_validation_set(max_ints, val_samples)

In [13]:
# Build the training split. The validation set is handed to the generator,
# presumably so that training examples can be kept disjoint from it, and 2000
# is presumably the requested number of training examples — confirm both
# against utils.generate_training_set.
train_set = generate_training_set(
    max_ints,
    2000,
    val_set,
)

In [19]:
# Turn every validation sample into a (question, answer) string pair.
# `padding` is also read by the training-split cell, so it stays a
# module-level name.
padding = max_ints * 2

# Each padded string is expected to contain exactly one "=": the part before
# it is the question, the part after it is the answer.
val_pairs = [
    zero_padding_multiplicatn(sample[0], sample[1], padding).split("=")
    for sample in val_set
]
val_questions = [lhs for lhs, _ in val_pairs]
# The "=" consumed by split() is put back at the front of the answer.
val_answers = ["=" + rhs for _, rhs in val_pairs]

In [22]:
# Turn every training sample into a (question, answer) string pair, using the
# same `padding` width that the validation cell defined.
# Each padded string is expected to contain exactly one "=": the part before
# it is the question, the part after it is the answer.
train_pairs = [
    zero_padding_multiplicatn(sample[0], sample[1], padding).split("=")
    for sample in train_set
]
train_questions = [lhs for lhs, _ in train_pairs]
# The "=" consumed by split() is put back at the front of the answer.
train_answers = ["=" + rhs for _, rhs in train_pairs]

In [24]:
# Wrap the parallel question/answer lists in `datasets.Dataset` objects,
# one column per list.
val_columns = {"question": val_questions, "answers": val_answers}
train_columns = {"question": train_questions, "answers": train_answers}

val_ds = Dataset.from_dict(val_columns)
train_ds = Dataset.from_dict(train_columns)

In [2]:
# Load the pretrained GPT-2 checkpoint and its tokenizer.
#
# Use the first GPU when one is visible and fall back to CPU otherwise, so the
# cell does not crash on machines without CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model_id = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # Half precision only on GPU; on CPU keep float32.
    torch_dtype=torch.float16 if device.startswith("cuda") else torch.float32,
    # `trust_remote_code` is intentionally not set: GPT-2 is a built-in
    # transformers architecture, and enabling the flag would allow hub-hosted
    # code to run if `model_id` is ever pointed at a different repository.
)
# Last expression of the cell: moves the weights and displays the module tree.
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [3]:
# Check that a fast tokenizer was loaded; the `return_offsets_mapping=True`
# call further down in this notebook is only supported by fast tokenizers.
tokenizer.is_fast

True

In [4]:
# Tokenize with truncation plus overflow handling: `max_length`, `stride` and
# `return_overflowing_tokens` ask the tokenizer to return the overflow as
# additional chunks instead of discarding it.
overflow_options = dict(
    max_length=100,
    truncation=True,
    stride=50,
    return_overflowing_tokens=True,
)
inputs = tokenizer("some text", **overflow_options)

# Decode each returned chunk back to text to inspect it.
for chunk_ids in inputs["input_ids"]:
    decoded_chunk = tokenizer.decode(chunk_ids)
    print(decoded_chunk)

some text


In [6]:
# Same truncation/overflow settings as the previous demo, additionally asking
# for the offset mapping, then list which fields the encoding exposes.
offset_options = dict(
    max_length=100,
    truncation=True,
    stride=50,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
)
inputs = tokenizer("stome text", **offset_options)
inputs.keys()

dict_keys(['input_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping'])