In [None]:
# Mount Google Drive so the project sources and dataset stored there are reachable.
from google.colab import drive
drive.mount(mountpoint="/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
!pip install rdkit
!pip install datasets

Collecting rdkit
  Downloading rdkit-2025.3.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2025.3.1-cp311-cp311-manylinux_2_28_x86_64.whl (34.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.6/34.6 MB[0m [31m63.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2025.3.1
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6

In [None]:
import torch
import sys

# Make the project's `language` package (stored on the mounted Drive) importable.
# Absolute path: the Drive is mounted at /content/drive, so this no longer
# depends on the notebook's current working directory being /content.
sys.path.append("/content/drive/MyDrive/Colab Notebooks/molgen")
from language.language import Language
from language.dynamiclanguage import DynamicLanguage
from language.helm import Helm

In [None]:
import json
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling
from tokenizers import Tokenizer, models, pre_tokenizers, decoders
from tokenizers.processors import TemplateProcessing
from transformers import PreTrainedTokenizerFast
from transformers import GPT2LMHeadModel
from transformers import GPT2Config
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling

def trainlm_dynamiclanguage(lang: DynamicLanguage, dataset_path: str, training_args: TrainingArguments, test_size=0.1, block_size=None, additional_length=0, n_embd=128, n_layer=6, n_head=4):
  """Train a GPT-2 causal language model from scratch on a text dataset.

  The file at `dataset_path` is loaded with the Hugging Face "text" loader,
  split into train/test, used to build the vocabulary of `lang`, tokenized with
  `lang.sentence2ids`, and fed to a `Trainer`.

  Args:
    lang: language object; its vocabulary is (re)built from the dataset via
      `lang.build_vocab`, so it is mutated by this call.
    dataset_path: path to the text file to train on.
    training_args: Hugging Face `TrainingArguments` passed to the `Trainer`.
      Its `seed` is also used to make the train/test split reproducible.
    test_size: fraction (or count) of examples held out for evaluation.
    block_size: maximum sequence length (number of positions) of the model.
      If None, it is set to the longest tokenized sequence in the dataset
      plus `additional_length`.
    additional_length: extra positions added on top of the longest sequence
      when `block_size` is None; ignored otherwise.
    n_embd: embedding / hidden size of the GPT-2 model.
    n_layer: number of transformer layers.
    n_head: number of attention heads.

  Returns:
    A `(model, trainer)` tuple: the trained `GPT2LMHeadModel` and the `Trainer`
    that trained it.

  Raises:
    ValueError: if an explicit `block_size` is smaller than the longest
      tokenized sequence in the dataset.
  """
  # Load the dataset and hold out a test split. The split is seeded with the
  # Trainer seed so the evaluation set (and eval loss) is the same on every run.
  ds = load_dataset("text", data_files={"train": dataset_path})
  ds = ds["train"].train_test_split(test_size=test_size, seed=training_args.seed)
  # The vocabulary is built from both splits, so every test token has an id.
  lang.build_vocab(ds)

  ds_tokenized = ds.map(
    lambda x: {"input_ids": lang.sentence2ids(x["text"])},
    remove_columns=["text"], #remove text column
    batched=False
  )

  # Longest tokenized sequence over both splits; the model needs at least this
  # many positions.
  max_length_ds = max(
    (len(x["input_ids"]) for split in ("train", "test") for x in ds_tokenized[split]),
    default=0,
  )
  if block_size is None:
    #set max length from dataset
    block_size = max_length_ds + additional_length
    print("set max length to: " + str(block_size))
  elif max_length_ds > block_size:
    # Fail early with a readable message instead of an out-of-range
    # position-embedding lookup in the middle of training.
    raise ValueError(
      f"block_size={block_size} is smaller than the longest tokenized sequence "
      f"in the dataset ({max_length_ds} tokens)"
    )

  token_bos = lang.bos_token()
  token_eos = lang.eos_token()
  token_pad = lang.pad_token()

  # Wrap the language's vocabulary in a Hugging Face tokenizer. The dataset is
  # already tokenized above; this tokenizer is only handed to the data collator.
  tok_model = models.WordLevel(vocab=lang._token2id)
  tok = Tokenizer(tok_model)
  tok.pre_tokenizer = pre_tokenizers.Sequence([])   #already done at DynamicLanguage.sentence2tokens
  tok.decoder      = decoders.Sequence([])
  tok.post_processor = TemplateProcessing(
      single=f"{token_bos} $0 {token_eos}",
      pair=f"{token_bos} $A {token_eos} $B:1 {token_eos}:1",
      special_tokens=[
          (token_bos, lang.bos_id()),
          (token_eos, lang.eos_id()),
      ],
  )
  #tok.save("tokenizer.json")

  hf_tokenizer = PreTrainedTokenizerFast(
      tokenizer_object=tok,
      bos_token=token_bos,
      eos_token=token_eos,
      pad_token=token_pad,
  )

  print("Is CUDA available: " + str(torch.cuda.is_available()))

  # Model size is driven by the vocabulary of `lang` and the block size above.
  config = GPT2Config(
      vocab_size = len(lang.vocab()),
      n_positions = block_size,
      n_ctx       = block_size,
      n_embd      = n_embd,
      n_layer     = n_layer,
      n_head      = n_head,
      bos_token_id = lang.bos_id(),
      eos_token_id = lang.eos_id(),
      pad_token_id = lang.pad_id(),
  )

  model = GPT2LMHeadModel(config)
  print("num_params: " + str(model.num_parameters()))

  data_collator = DataCollatorForLanguageModeling(
    tokenizer=hf_tokenizer,
    mlm=False, #mlm is set to false since this is for generation task
  )

  trainer = Trainer(
    model           = model,
    args            = training_args,
    train_dataset   = ds_tokenized["train"],
    eval_dataset    = ds_tokenized["test"],
    data_collator   = data_collator,
  )

  trainer.train()

  return model, trainer

In [None]:
# Training configuration and run for the HELM language model.
save_dir = "/content/model"
lang = Helm(has_period = True)
# Absolute path: the Drive is mounted at /content/drive, so this does not
# depend on the notebook's current working directory.
helm_dataset_path = "/content/drive/MyDrive/Colab Notebooks/molgen/dataset/helm/chembl34_protein_helm_only.helm"

training_args = TrainingArguments(
    output_dir          = save_dir,
    overwrite_output_dir= True,
    num_train_epochs    = 30,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size  = 8,
    # Evaluate, log and checkpoint once per epoch. The *_steps arguments only
    # apply to the "steps" strategies, so they are not set here.
    eval_strategy       = "epoch",
    logging_strategy    = "epoch",
    save_strategy       = "epoch",
    save_total_limit    = 2,
    learning_rate       = 5e-5,
    weight_decay        = 0.01,
    warmup_steps        = 100,
    # Mixed precision needs a GPU; fall back to full precision on CPU runtimes
    # instead of failing when the arguments are constructed.
    fp16                = torch.cuda.is_available(),
    report_to           = "none",
    load_best_model_at_end = True,
)

model, trainer = trainlm_dynamiclanguage(lang=lang, dataset_path=helm_dataset_path, training_args=training_args, test_size=0.1, n_embd=128, n_layer=6, n_head=4)

set max length to: 126
Is CUDA available: True
num_params: 1219072


Epoch,Training Loss,Validation Loss
1,1.8033,1.3335
2,1.2361,1.209372
3,1.1436,1.150015
4,1.0864,1.10422
5,1.0451,1.062383
6,1.0089,1.035048
7,0.9773,1.004463
8,0.9496,0.982404
9,0.9261,0.960686
10,0.9076,0.950275


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


In [None]:
# Save model: zip the whole training output directory and download the archive.
import shutil
from google.colab import files

dl_path = save_dir
# The archive is written next to the directory as "<dl_path>.zip".
shutil.make_archive(base_name=dl_path, format='zip', root_dir=dl_path)
files.download(f'{dl_path}.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Save lang: pickle the language object (including its built vocabulary) to disk.
import pickle

with open('helmlang.lang', 'wb') as lang_file:
  pickle.dump(lang, lang_file)