In [5]:
# Local runs only: expose the repository root to the import system so that
# project modules can be imported from this notebook.
import sys

repo_root = "../"
already_on_path = repo_root in sys.path
if not already_on_path:
    # Prepend so project modules take precedence over same-named installed ones.
    sys.path.insert(0, repo_root)

In [6]:
import torch
from language import DynamicLanguage
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling
from tokenizers import Tokenizer, models, pre_tokenizers, decoders
from tokenizers.processors import TemplateProcessing
from transformers import PreTrainedTokenizerFast
from transformers import GPT2LMHeadModel
from transformers import GPT2Config
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling

def train_gpt2_with_dynamic_language(lang: DynamicLanguage, dataset_path: str, training_args: TrainingArguments, test_size=0.1, block_size=None, additional_length=0, n_embd=128, n_layer=6, n_head=4, dropout=0.1, seed=None):
    """Train a freshly initialised GPT-2 LM on a text file tokenized by ``lang``.

    The file at ``dataset_path`` is loaded with the ``"text"`` dataset loader,
    split into train/test parts, used to build the vocabulary of ``lang`` and
    then converted to ``input_ids`` with ``lang.sentence2ids``.

    Args:
        lang: Language object providing ``build_vocab``, ``sentence2ids``, the
            special tokens/ids and the token-to-id mapping.
        dataset_path: Path to the text file used as the dataset.
        training_args: Arguments handed to the ``Trainer`` unchanged.
        test_size: Forwarded to ``train_test_split``.
        block_size: Number of positions of the model. If ``None`` it is set to
            the max number of tokens in one sentence in the dataset plus
            ``additional_length``.
        additional_length: Extra positions added on top of the longest
            sentence; only used when ``block_size`` is ``None``.
        n_embd: Embedding size passed to ``GPT2Config``.
        n_layer: Number of layers passed to ``GPT2Config``.
        n_head: Number of attention heads passed to ``GPT2Config``.
        dropout: Value used for ``resid_pdrop``, ``embd_pdrop`` and ``attn_pdrop``.
        seed: Optional seed forwarded to ``train_test_split``. ``None`` (the
            default) leaves the split unseeded, as before.

    Returns:
        Tuple ``(model, trainer)`` after ``trainer.train()`` has finished.

    Raises:
        ValueError: If an explicit ``block_size`` is smaller than the longest
            tokenized sentence in the dataset.
    """
    # make dataset and build vocabs
    ds = load_dataset("text", data_files={"train": dataset_path})
    ds = ds["train"].train_test_split(test_size=test_size, seed=seed)
    lang.build_vocab(ds)

    ds_tokenized = ds.map(
        lambda x: {"input_ids": lang.sentence2ids(x["text"])},
        remove_columns=["text"], # remove text column
        batched=False
    )

    # Longest tokenized sentence over both splits. It is needed either to
    # derive block_size or to validate a user-supplied one.
    max_length_ds = max(
        max(len(x["input_ids"]) for x in ds_tokenized["train"]),
        max(len(x["input_ids"]) for x in ds_tokenized["test"])
    )

    if block_size is None:
        # set max length from dataset
        block_size = max_length_ds + additional_length
        print("set max length to: " + str(block_size))
    elif block_size < max_length_ds:
        # Sequences are not truncated anywhere below, so a too-small
        # block_size would only fail later inside the position-embedding
        # lookup. Fail early with an actionable message instead.
        raise ValueError(
            f"block_size ({block_size}) is smaller than the longest tokenized "
            f"sentence in the dataset ({max_length_ds})"
        )

    token_bos = lang.bos_token()
    token_eos = lang.eos_token()
    token_pad = lang.pad_token()

    # Word-level tokenizer built directly from the vocabulary of `lang`.
    tok_model = models.WordLevel(vocab=lang._token2id)
    tok = Tokenizer(tok_model)
    tok.pre_tokenizer = pre_tokenizers.Sequence([]) # already done at DynamicLanguage.sentence2tokens
    tok.decoder = decoders.Sequence([])
    tok.post_processor = TemplateProcessing(
        single=f"{token_bos} $0 {token_eos}",
        pair=f"{token_bos} $A {token_eos} $B:1 {token_eos}:1",
        special_tokens=[
            (token_bos, lang.bos_id()),
            (token_eos, lang.eos_id()),
        ],
    )

    # Wrapped as a fast tokenizer so it can be handed to the data collator.
    hf_tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tok,
        bos_token=token_bos,
        eos_token=token_eos,
        pad_token=token_pad,
    )

    print("Is CUDA available: " + str(torch.cuda.is_available()))

    config = GPT2Config(
        vocab_size = len(lang.vocab()),
        n_positions = block_size,
        n_ctx = block_size,
        n_embd = n_embd,
        n_layer = n_layer,
        n_head = n_head,
        resid_pdrop = dropout,
        embd_pdrop = dropout,
        attn_pdrop = dropout,
        bos_token_id = lang.bos_id(),
        eos_token_id = lang.eos_id(),
        pad_token_id = lang.pad_id(),
    )

    # Built from the config only, i.e. no pretrained weights are loaded.
    model = GPT2LMHeadModel(config)
    print("num_params: " + str(model.num_parameters()))

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=hf_tokenizer,
        mlm=False, # mlm is set to false since this is for generation task
    )

    trainer = Trainer(
        model = model,
        args = training_args,
        train_dataset = ds_tokenized["train"],
        eval_dataset = ds_tokenized["test"],
        data_collator = data_collator,
    )

    trainer.train()

    return model, trainer

In [7]:
import yaml
import os
from utils import class_from_package

# Load the training configuration; all paths in it are relative to repo_root.
config_path = "config/train_gpt2_smiles.yaml"
with open(os.path.join(repo_root, config_path)) as f:
    conf = yaml.safe_load(f)

output_dir = os.path.join(repo_root, conf.get("output_dir"))
lang_class = class_from_package("language", conf.get("lang_class"))
# `or {}` also covers a key that is present but left empty (null) in the YAML.
lang = lang_class(**(conf.get("lang_args") or {}))
dataset_path = os.path.join(repo_root, conf.get("dataset_path"))

# Keyword arguments for TrainingArguments, built on a copy so that `conf`
# itself is not modified.
training_kwargs = dict(conf.get("training_args") or {})
training_kwargs["output_dir"] = output_dir

# `interval` is either the string "epoch" or an integer number of steps and
# drives evaluation, logging and checkpointing together.
interval = conf.get("interval")
if interval == "epoch":
    for key in ("eval_strategy", "logging_strategy", "save_strategy"):
        training_kwargs[key] = "epoch"
elif isinstance(interval, int) and not isinstance(interval, bool):
    # Set all three strategies, mirroring the "epoch" branch, so a strategy
    # given in the config cannot silently disagree with the step counts below.
    for key in ("eval_strategy", "logging_strategy", "save_strategy"):
        training_kwargs[key] = "steps"
    for key in ("eval_steps", "logging_steps", "save_steps"):
        training_kwargs[key] = interval

training_args = TrainingArguments(**training_kwargs)
test_size, n_embd, n_layer, n_head = (conf.get(k) for k in ("test_size", "n_embd", "n_layer", "n_head"))

# Forward only the hyperparameters that are actually set in the config.
# Passing None explicitly would override the defaults of the training function.
hyperparams = {
    name: value
    for name, value in (("test_size", test_size), ("n_embd", n_embd), ("n_layer", n_layer), ("n_head", n_head))
    if value is not None
}

model, trainer = train_gpt2_with_dynamic_language(lang=lang, dataset_path=dataset_path, training_args=training_args, **hyperparams)

Map: 100%|██████████| 187092/187092 [00:09<00:00, 20028.37 examples/s]
Map: 100%|██████████| 62364/62364 [00:03<00:00, 20180.28 examples/s]


set max length to: 74
Is CUDA available: True
num_params: 209024


Epoch,Training Loss,Validation Loss
1,1.5148,1.07065
2,1.0538,0.927946
3,0.9541,0.85832
4,0.8993,0.818658
5,0.8634,0.796425
6,0.8395,0.774234
7,0.8221,0.776406
8,0.8076,0.752165
9,0.7959,0.742285
10,0.7869,0.735419


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


In [None]:
# save yaml and lang
import shutil
from utils import camel2snake

# Make sure the destination exists before copying into it; this is a no-op
# when the directory is already there.
os.makedirs(output_dir, exist_ok=True)

# Keep a copy of the configuration next to the training outputs.
src = os.path.join(repo_root, config_path)
dst = os.path.join(output_dir, "setting.yaml")
shutil.copy(src, dst)

if lang.__class__.__name__ == "HELM":
    # HELM languages get their monomer libraries loaded before being saved.
    lib_files = [
        "chembl_35_monomer_library.xml",
        "chembl_35_monomer_library_diff.xml",
        "HELMCoreLibrary.json",
        "monomerLib2.0.json"
    ]
    # os.path.join instead of string concatenation, so the paths stay valid
    # even if repo_root is given without a trailing slash.
    lang.load_monomer_library(*[os.path.join(repo_root, f"data/helm/library/{name}") for name in lib_files], culling=True)
lang.save(os.path.join(output_dir, camel2snake(lang.__class__.__name__) + ".lang"))

In [None]:
# Colab only: zip the training output directory and download the archive.
import shutil
from google.colab import files

dl_path = output_dir
# The archive is written next to the directory, as "<dl_path>.zip".
shutil.make_archive(base_name=dl_path, format='zip', root_dir=dl_path)
files.download(f'{dl_path}.zip')