Create a model config and tokenizer (Hugging Face `transformers`) for use in training, and upload them to Weights & Biases (wandb) as an artifact.

In [1]:
import importlib.util
import sys
sys.path.append('..')
from tokenizer import build_tokenizer
from transformers import AutoModelForCausalLM, GPTNeoXConfig
from os import environ

from transformers import GPT2Config

# wandb-related environment: the account name, a run directory derived from
# that name, and TOKENIZERS_PARALLELISM set to 'false'.
environ['WANDB_USERNAME'] = 'pavel-tikhomirov'
environ['WANDB_DIR'] = f'/main/draft-v2/{environ["WANDB_USERNAME"]}-runs/'
environ['TOKENIZERS_PARALLELISM'] = 'false'

# Tokenizer built by the project-local `tokenizer.build_tokenizer` helper.
tokenizer = build_tokenizer(
    'word-level',
    fdim=3,
    add_commutator_tokens=True,
    add_prompt_tokens=True,
    add_post_processor=True,
)

# Architecture hyper-parameters, grouped so they are easy to spot and tune.
architecture = dict(
    n_embd=768,
    n_layer=8,
    n_head=6,
    n_inner=512,
)

# Special-token ids are read from the tokenizer so config and tokenizer agree.
special_token_ids = dict(
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

# The vocabulary size is taken from the tokenizer's vocabulary mapping.
config = GPT2Config(
    vocab_size=len(tokenizer.get_vocab()),
    **architecture,
    **special_token_ids,
)

# Instantiate the model from the config alone (no pretrained checkpoint is
# referenced here); the next cell uses it to count parameters.
model = AutoModelForCausalLM.from_config(config)



In [2]:
import wandb

from tempfile import TemporaryDirectory

# Save the tokenizer and the model config into a scratch directory, then upload
# that directory to wandb as one `model-config` artifact.
#
# The scratch directory is bound to `tmp_dir` rather than `dir`: a `with ... as`
# target is a kernel-global name in a notebook, so `dir` would shadow the
# builtin `dir()` for every later cell.
#
# Context managers exit in reverse order, so the wandb run's exit handler runs
# before the scratch directory is deleted.
with TemporaryDirectory() as tmp_dir, \
     wandb.init(project='whitehead', entity='ml-in-algebraic-topology',
                job_type='build-model-config') as run:

    tokenizer.save_pretrained(tmp_dir)
    config.save_pretrained(tmp_dir)

    artifact = wandb.Artifact(
        name='gpt-2-fdim-3',
        type='model-config',
        metadata={
            # Total number of parameter elements in the instantiated model.
            'parameters': sum(p.numel() for p in model.parameters()),
            # Every config field, so the artifact metadata is self-describing.
            **config.to_dict(),
        },
        description="GPT2 with `word-level`, with prompt tokens. Add eos and bos. With comas and [, ] (2 extra). Whitehead testing",
    )
    artifact.add_dir(tmp_dir)

    # Log through the run bound by this `with` statement instead of the
    # implicit module-level `wandb.run`.
    run.log_artifact(artifact)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mpavel-tikhomirov[0m ([33mml-in-algebraic-topology[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Adding directory to artifact (/tmp/tmpoc1avurc)... Done. 0.0s


VBox(children=(Label(value='0.011 MB of 0.011 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [3]:
# Explicitly finish the active wandb run, if any.
# NOTE(review): the run in the previous cell is opened inside a `with` block,
# which presumably already finishes it on exit — confirm whether this call is
# still needed or is only a safety net.
wandb.finish()