Create a tokenizer and a Hugging Face GPT-2 model config for use in training, and upload them to Weights & Biases (wandb) as a `model-config` artifact.

In [1]:
import importlib.util
import sys
sys.path.append('..')
from tokenizer import build_tokenizer
from transformers import AutoModelForCausalLM, GPTNeoXConfig
from os import environ

# Process-wide settings, applied before wandb / tokenizers are used.
# NOTE(review): WANDB_* are presumably read by wandb and TOKENIZERS_PARALLELISM
# by the Hugging Face tokenizers backend -- confirm against their docs.
environ.update(
    WANDB_USERNAME='pavel-tikhomirov',
    TOKENIZERS_PARALLELISM='false',
)
# The run directory is derived from the user name set just above.
environ['WANDB_DIR'] = f'/main/draft-v2/{environ["WANDB_USERNAME"]}-runs/'


# Word-level tokenizer for fdim=2: prompt tokens and the post-processor are
# switched on, commutator tokens are switched off. The exact meaning of each
# flag is defined by the project's `build_tokenizer`.
tokenizer = build_tokenizer(
    'word-level',
    fdim=2,
    add_prompt_tokens=True,
    add_post_processor=True,
    add_commutator_tokens=False,
)


from transformers import GPT2Config

# GPT-2 configuration sized to the tokenizer's vocabulary. The bos/eos/pad ids
# are copied straight from the tokenizer so that the config and the tokenizer
# agree on the special tokens.
config = GPT2Config(
    vocab_size=len(tokenizer.get_vocab()),
    n_embd=768,
    n_layer=8,
    n_head=6,
    n_inner=512,
    **{
        id_name: getattr(tokenizer, id_name)
        for id_name in ('bos_token_id', 'eos_token_id', 'pad_token_id')
    },
)

# Instantiate the model from the config alone (no checkpoint is referenced here).
model = AutoModelForCausalLM.from_config(config)

In [2]:
# Sanity check: display the embedding width stored on the instantiated model's config.
model.config.n_embd

768

In [3]:
import wandb

from tempfile import TemporaryDirectory

# Save the tokenizer and the model config into a throwaway directory and upload
# that directory to W&B as a single `model-config` artifact.
#
# * The directory is bound to `tmp_dir` rather than `dir`, so the builtin
#   `dir()` is not shadowed in the notebook namespace after this cell runs.
# * The run is bound with `as run` and used directly instead of going through
#   the global `wandb.run`.
# * Context managers exit in reverse order: the W&B run is closed first and the
#   temporary directory is removed afterwards, so the files presumably still
#   exist while the run finishes uploading.
with TemporaryDirectory() as tmp_dir, \
     wandb.init(project='whitehead', entity='ml-in-algebraic-topology',
                job_type='build-model-config') as run:

    tokenizer.save_pretrained(tmp_dir)
    config.save_pretrained(tmp_dir)

    artifact = wandb.Artifact(
        name='gpt-2-fdim-2',
        type='model-config',
        metadata={
            # Total number of parameters of the instantiated model, stored
            # next to every field of the config itself.
            'parameters': sum(p.numel() for p in model.parameters()),
            **config.to_dict(),
        },
        # Raw string: in a normal literal the `\r` of `\rangle` is a carriage
        # return and `\l` is an invalid escape, which corrupts the LaTeX.
        description=r"GPT2 with `word-level` tokenizer with prompt tokens for $F\langle x_1, x_2 \rangle $. Add eos and bos. Whitehead testing",
    )
    artifact.add_dir(tmp_dir)

    run.log_artifact(artifact)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112213921215799, max=1.0…

[34m[1mwandb[0m: Adding directory to artifact (/tmp/tmp2p65xo4a)... Done. 0.0s


VBox(children=(Label(value='0.014 MB of 0.020 MB uploaded\r'), FloatProgress(value=0.7059475078256682, max=1.0…

