In [None]:
!pip install datasets

In [2]:
from getpass import getpass
import os
import sys

if os.path.exists("molgen/.git"):
    %cd molgen
    !git pull
    %cd ..
else:
  token = getpass("GitHub Token: ")
  !git clone https://satoru-fujii:{token}@github.com/ycu-iil/molgen.git

sys.path.append("/content/molgen")

/content/molgen
Already up to date.
/content


In [None]:
import torch
from language import Language, DynamicLanguage, HELM

In [None]:
import json
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling
from tokenizers import Tokenizer, models, pre_tokenizers, decoders
from tokenizers.processors import TemplateProcessing
from transformers import PreTrainedTokenizerFast
from transformers import GPT2LMHeadModel
from transformers import GPT2Config
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling

def trainlm_dynamiclanguage(lang: DynamicLanguage, dataset_path: str, training_args: TrainingArguments, test_size=0.1, block_size=None, additional_length=0, n_embd=128, n_layer=6, n_head=4, seed=None):
  """Train a GPT-2 language model from scratch on a text dataset tokenized by `lang`.

  The dataset is loaded with the Hugging Face "text" loader, split into train/test,
  used to build the vocabulary of `lang`, converted to token ids with
  `lang.sentence2ids`, and then fed to a `Trainer` together with a freshly
  initialised `GPT2LMHeadModel`.

  Args:
    lang: language object; its `build_vocab`, `sentence2ids`, `vocab`, `_token2id`
      and the bos/eos/pad token and id accessors are used.
    dataset_path: path of the text file passed to `load_dataset("text", ...)`.
    training_args: arguments handed unchanged to the `Trainer`.
    test_size: forwarded to `train_test_split`.
    block_size: number of positions of the model (`n_positions` / `n_ctx`).
      If None, it is set to the longest tokenized sequence in the dataset
      plus `additional_length`.
    additional_length: extra positions added on top of the longest sequence;
      only used when `block_size` is None.
    n_embd, n_layer, n_head: forwarded to `GPT2Config`.
    seed: forwarded to `train_test_split`; None keeps the library's default
      (unseeded) behaviour, an int makes the split reproducible.

  Returns:
    Tuple `(model, trainer)` after `trainer.train()` has finished.

  Raises:
    ValueError: if an explicit `block_size` is smaller than the longest
      tokenized sequence, which would overflow the position embeddings.
  """

  #make dataset and build vocabs
  ds = load_dataset("text", data_files={"train": dataset_path})
  ds = ds["train"].train_test_split(test_size=test_size, seed=seed)
  lang.build_vocab(ds)

  # NOTE(review): training ids come straight from lang.sentence2ids; the
  # post-processor configured below is only attached to hf_tokenizer - confirm
  # that sentence2ids already adds BOS/EOS if they are expected in the data.
  ds_tokenized = ds.map(
    lambda x: {"input_ids": lang.sentence2ids(x["text"])},
    remove_columns=["text"], #remove text column
    batched=False
  )

  #longest tokenized sequence over both splits (0 if there are no rows at all)
  max_length_ds = max(
    (len(x["input_ids"]) for split in ("train", "test") for x in ds_tokenized[split]),
    default=0,
  )

  if block_size is None:
    #set max length from dataset
    block_size = max_length_ds + additional_length
    print("set max length to: " + str(block_size))
  elif max_length_ds > block_size:
    #fail early with a readable message instead of an index error inside the
    #position embedding in the middle of training
    raise ValueError(
      f"block_size={block_size} is smaller than the longest tokenized sequence "
      f"({max_length_ds} tokens); increase block_size or leave it as None"
    )

  token_bos = lang.bos_token()
  token_eos = lang.eos_token()
  token_pad = lang.pad_token()

  #word-level tokenizer that reuses the vocabulary built by lang
  tok_model = models.WordLevel(vocab=lang._token2id)
  tok = Tokenizer(tok_model)
  tok.pre_tokenizer = pre_tokenizers.Sequence([])   #already done at DynamicLanguage.sentence2tokens
  tok.decoder      = decoders.Sequence([])
  tok.post_processor = TemplateProcessing(
      single=f"{token_bos} $0 {token_eos}",
      pair=f"{token_bos} $A {token_eos} $B:1 {token_eos}:1",
      special_tokens=[
          (token_bos, lang.bos_id()),
          (token_eos, lang.eos_id()),
      ],
  )
  #tok.save("tokenizer.json")

  #wrapper needed by the data collator (supplies the pad token for batching)
  hf_tokenizer = PreTrainedTokenizerFast(
      tokenizer_object=tok,
      bos_token=token_bos,
      eos_token=token_eos,
      pad_token=token_pad,
  )

  print("Is CUDA available: " + str(torch.cuda.is_available()))

  config = GPT2Config(
      vocab_size = len(lang.vocab()),
      n_positions = block_size,
      n_ctx       = block_size,
      n_embd      = n_embd,
      n_layer     = n_layer,
      n_head      = n_head,
      bos_token_id = lang.bos_id(),
      eos_token_id = lang.eos_id(),
      pad_token_id = lang.pad_id(),
  )

  #randomly initialised model (no pretrained weights are loaded)
  model = GPT2LMHeadModel(config)
  print("num_params: " + str(model.num_parameters()))

  data_collator = DataCollatorForLanguageModeling(
    tokenizer=hf_tokenizer,
    mlm=False, #mlm is set to false since this is for generation task
  )

  trainer = Trainer(
    model           = model,
    args            = training_args,
    train_dataset   = ds_tokenized["train"],
    eval_dataset    = ds_tokenized["test"],
    data_collator   = data_collator,
  )

  trainer.train()

  return model, trainer

In [None]:
# Training run configuration: where checkpoints go, which language/tokenization
# is used, and which dataset file is read.
save_dir = "/content/model"
lang = HELM(has_period = False)
helm_dataset_path = "/content/molgen/dataset/helm/chembl34_protein_helm_only.helm"

training_args = TrainingArguments(
    output_dir          = save_dir,
    overwrite_output_dir= True,
    num_train_epochs    = 1,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size  = 8,
    # Evaluation, logging and saving all happen once per epoch.
    # NOTE(review): the *_steps values below look unused while the matching
    # strategy is "epoch" - confirm before relying on them.
    eval_strategy       = "epoch",
    eval_steps          = 1,
    logging_strategy    = "epoch",
    logging_steps       = 1,
    save_strategy       = "epoch",
    save_steps          = 1,
    save_total_limit    = 2,
    learning_rate       = 5e-5,
    weight_decay        = 0.01,
    warmup_steps        = 100,
    # Mixed precision only when a CUDA device is present, so the cell also runs
    # on a CPU-only runtime instead of failing on the fp16 request.
    fp16                = torch.cuda.is_available(),
    report_to           = "none",
    load_best_model_at_end = True,
)

model, trainer = trainlm_dynamiclanguage(lang=lang, dataset_path=helm_dataset_path, training_args=training_args, test_size=0.1, n_embd=128, n_layer=6, n_head=4)

Map:   0%|          | 0/9721 [00:00<?, ? examples/s]

Map:   0%|          | 0/1081 [00:00<?, ? examples/s]

set max length to: 90
Is CUDA available: True
num_params: 1214464


Epoch,Training Loss,Validation Loss
1,2.588,2.324035


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


In [None]:
#save model
import shutil
from google.colab import files

# Zip the whole training output directory into "<save_dir>.zip" and hand the
# archive to the browser through Colab's download helper.
dl_path = save_dir
shutil.make_archive(base_name=dl_path, format='zip', root_dir=dl_path)
zip_path = f'{dl_path}.zip'
files.download(zip_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Save the language object used for training so the same vocabulary can be
# reused together with the trained model.
lang_path = "helm_pep_noperiod.lang"
lang.save(lang_path)