In [None]:
# ! pip install transformers==4.38.1
# ! pip install rdkit==2023.9.4
# ! pip install accelerate==0.27.2
# ! pip install tensorflow==2.10.0
# ! pip install flash-attn
# ! pip install -q -U bitsandbytes

In [None]:
# ! pip install datasets
# ! pip install loralib
# ! pip install git+https://github.com/huggingface/peft.git

In [None]:
import json
import os
import pickle
import random

In [None]:
from datasets import Dataset

In [None]:
import torch
import torch.nn as nn
import bitsandbytes as bnb
from peft import PeftModelForCausalLM
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, BitsAndBytesConfig, DataCollatorWithPadding

In [None]:
# Hugging Face access token passed as `token=` when the tokenizer is downloaded.
# It is read from the HF_TOKEN environment variable so the secret never has to be
# pasted into (and saved with) the notebook. When the variable is not set this
# falls back to the empty string that was previously hard-coded here.
HF_CREDENTIALS = os.environ.get("HF_TOKEN", "")

In [None]:
# Tokenizer (and chat template) of the Mistral instruct checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    'mistralai/Mistral-7B-Instruct-v0.2',
    token=HF_CREDENTIALS,
)  # , model_max_length=1024)

# Padding reuses the end-of-sequence token.
# NOTE(review): presumably because this tokenizer defines no pad token of its own - confirm.
tokenizer.pad_token = tokenizer.eos_token

# Dataset

In [None]:
# Two-turn conversation skeleton (user question, assistant answer) with empty
# contents; the dataset-building cell fills the contents in per example.
chat = [dict(role=speaker, content="") for speaker in ("user", "assistant")]

# Preview the raw chat-template string produced for an empty exchange.
tokenizer.apply_chat_template(chat, tokenize=False)

In [None]:
# Start the training corpus from the conversations stored in a local pickle.
# NOTE(review): unpickling can execute arbitrary code - only load a file you trust.
with open('train_nelson_data.pkl', 'rb') as train_pickle:
    conversations = pickle.load(train_pickle)

In [None]:
# Extend `conversations` with chat-formatted examples sampled from the SMolInstruct
# training files. Every task used to be its own copy-pasted loop; the per-task
# differences (file, sampling rate, question/answer wording) now live in one table
# and a single helper does the work.
# NOTE: re-running this cell appends the sampled examples again; reload the pickle
# first if a clean corpus is needed.

SMOLINSTRUCT_TRAIN_DIR = 'smolinstruct/train'
SUBSAMPLE_SEED = 42  # fixed so the random subsample is identical on every run

# One entry per task file:
#   (file name, probability of SKIPPING a line, question builder, answer builder)
# Builders receive the parsed JSON record of one line.
SMOLINSTRUCT_TASKS = [
    (
        'property_prediction-bbbp.jsonl',
        0.5,
        lambda rec: f"Is blood-brain barrier permeability (BBBP) a property of <SMILES> {rec['input']} </SMILES>?",
        lambda rec: f"<BOOLEAN> {rec['output']} </BOOLEAN>",
    ),
    (
        'property_prediction-clintox.jsonl',
        0.5,
        lambda rec: f"Is <SMILES> {rec['input']} </SMILES> toxic?",
        lambda rec: f"<BOOLEAN> {rec['output']} </BOOLEAN>",
    ),
    (
        'property_prediction-esol.jsonl',
        0.5,
        lambda rec: f"How soluble is <SMILES> {rec['input']} </SMILES>?",
        lambda rec: f"Its log solubility is <NUMBER> {rec['output']} </NUMBER> mol/L",
    ),
    (
        'property_prediction-hiv.jsonl',
        0.95,
        lambda rec: f"Can <SMILES> {rec['input']} </SMILES> serve as an inhibitor of HIV replication?",
        lambda rec: f"<BOOLEAN> {rec['output']} </BOOLEAN>",
    ),
    (
        'property_prediction-lipo.jsonl',
        0.9,
        lambda rec: f"Predict the octanol/water distribution coefficient logD under the circumstances of pH 7.4 for <SMILES> {rec['input']} </SMILES>",
        lambda rec: f"<NUMBER> {rec['output']} </NUMBER>",
    ),
    (
        # NOTE(review): the question asks about the heart but the label read is
        # 'Vascular disorders' - confirm this is the intended SIDER category.
        'property_prediction-sider.jsonl',
        0.5,
        lambda rec: f"Are there any known side effects of <SMILES> {rec['input']} </SMILES> affecting the heart?",
        lambda rec: f"<BOOLEAN> {rec['output']['Vascular disorders']} </BOOLEAN>",
    ),
    (
        'molecule_captioning.jsonl',
        0.98,
        lambda rec: f"Describe the molecule: <SMILES> {rec['input']} </SMILES>",
        lambda rec: f"{rec['output']}",
    ),
]


def append_sampled_conversations(target, path, skip_probability, make_question,
                                 make_answer, rng, tokenizer):
    """Append chat-template strings for a random subset of one JSONL task file.

    Args:
        target: list that receives the formatted conversation strings (mutated).
        path: path of the JSONL file; each non-blank line is one JSON record.
        skip_probability: chance in [0, 1] that a given line is skipped.
        make_question: callable mapping a record to the user message text.
        make_answer: callable mapping a record to the assistant message text.
        rng: `random.Random` instance that drives the subsampling.
        tokenizer: tokenizer whose chat template renders the two-turn exchange.
    """
    with open(path, 'r', encoding='utf-8') as task_file:
        for line in task_file:
            if not line.strip():
                continue  # tolerate blank / trailing lines instead of failing in json.loads
            if rng.random() < skip_probability:
                continue
            record = json.loads(line)
            # Build a fresh message list per example rather than mutating a shared one.
            messages = [
                {"role": "user", "content": make_question(record)},
                {"role": "assistant", "content": make_answer(record)},
            ]
            target.append(tokenizer.apply_chat_template(messages, tokenize=False))


subsample_rng = random.Random(SUBSAMPLE_SEED)
for file_name, skip_probability, make_question, make_answer in SMOLINSTRUCT_TASKS:
    append_sampled_conversations(
        conversations,
        f"{SMOLINSTRUCT_TRAIN_DIR}/{file_name}",
        skip_probability,
        make_question,
        make_answer,
        subsample_rng,
        tokenizer,
    )
    # Progress check after each task: latest example and running corpus size.
    print(conversations[-1])
    print(len(conversations))

In [None]:
# Tokenize the whole corpus (padded to the longest conversation) and wrap it in a
# `datasets.Dataset` for the Trainer.
# The conversation strings come out of the chat template with the `<s>` / `</s>`
# markers already written into the text, so the tokenizer must not add its own
# special tokens on top - otherwise every sequence starts with a duplicated BOS.
inputs = tokenizer(
    conversations,
    padding=True,
    add_special_tokens=False,
    return_tensors='pt',
)
dataset = Dataset.from_dict(inputs)

In [None]:
# SECURITY: a Hugging Face access token used to be hard-coded on this line. Treat
# that token as leaked and revoke it in the Hugging Face account settings.
# The token is now taken from the HF_TOKEN environment variable. When it is not
# set, None is used so that `token=HF_CREDENTIALS` falls back to the credentials
# stored by a Hugging Face login (e.g. `notebook_login()`).
HF_CREDENTIALS = os.environ.get("HF_TOKEN") or None

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face login prompt for this notebook session.
notebook_login()

# Model

In [None]:
# Base Mistral-7B weights in bfloat16; `device_map="auto"` leaves device
# placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    'mistralai/Mistral-7B-v0.1',
    token=HF_CREDENTIALS,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# Load the LlaSMol adapter on top of the base model, then merge it in and drop
# the PEFT wrapper so `model` is again a plain causal-LM object.
model = PeftModelForCausalLM.from_pretrained(
    model,
    'osunlp/LlaSMol-Mistral-7B',
    torch_dtype=torch.bfloat16,
).merge_and_unload()

In [None]:
# Turn the generation cache off for training (it is switched back on in the
# Test section before generating).
model.config.use_cache = False
# NOTE(review): presumably carried over from Llama fine-tuning recipes - confirm
# that `pretraining_tp` has any effect for this architecture.
model.config.pretraining_tp = 1

In [None]:
# Freeze all existing weights (only adapters added later are trained) and keep
# the 1-D parameters (e.g. layernorm) in fp32 for numerical stability.
for weight in model.parameters():
    weight.requires_grad_(False)
    if weight.ndim == 1:
        weight.data = weight.data.float()

# Store fewer activations at the cost of recomputing them in the backward pass.
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
    """Sequential container whose output is always converted to float32."""

    def forward(self, x):
        """Run the wrapped modules on `x` and return the result as float32."""
        result = super().forward(x)
        return result.float()

# Wrap the output head so the logits it produces are returned as float32.
# NOTE: not idempotent - running this line again wraps the already-wrapped head
# one more time.
model.lm_head = CastOutputToFloat(model.lm_head)

In [None]:
from peft import get_peft_model, LoraConfig, TaskType

# Hyperparameters of the new LoRA adapter (rank 32, alpha 64, 10% dropout).
# No explicit target modules are given, so PEFT's defaults apply.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=32,
    lora_alpha=64,
    lora_dropout=0.1,
    # target_modules=["q_proj", "v_proj"],
)

# Attach the adapter to the frozen model and report how many weights will train.
base_model_with_new_adapter = get_peft_model(model, lora_config)
base_model_with_new_adapter.print_trainable_parameters()

In [None]:
import gc
# Run a garbage-collection pass, then ask PyTorch to release its cached CUDA
# memory before training starts.
gc.collect()
torch.cuda.empty_cache()

## Training

In [None]:
import transformers

In [None]:
# Fine-tune the new LoRA adapter for one epoch with a causal-LM objective.
training_args = transformers.TrainingArguments(
    output_dir='outputs',
    num_train_epochs=1,
    max_steps=-1,                    # -1: run length is decided by num_train_epochs
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,   # 2 x 8 = 16 sequences per device per optimizer step
    optim="paged_adamw_32bit",
    learning_rate=1e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    max_grad_norm=0.5,
    fp16=False,
    bf16=True,
    logging_steps=16,
    group_by_length=False,
)

# Train through the PEFT wrapper returned by `get_peft_model` instead of the bare
# `model`. Both share the same parameters (the adapter layers were injected into
# `model` in place), but the Trainer recognizes the wrapper and writes
# adapter-only checkpoints rather than dumping the whole modified base model.
trainer = transformers.Trainer(
    model=base_model_with_new_adapter,
    args=training_args,
    train_dataset=dataset,
    # mlm=False: labels are a copy of the input ids (next-token prediction).
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

In [None]:
# Upload the trained weights to a private Hub repository.
# `token=True` uses the credentials stored by the Hugging Face login; it replaces
# the deprecated `use_auth_token=True` argument.
# NOTE(review): `model` is the merged base model that `get_peft_model` was applied
# to - confirm whether the full model or only the new adapter
# (`base_model_with_new_adapter`) is what should be published.
model.push_to_hub(
    "luispintoc/llasmol-v2",
    token=True,
    commit_message="Train all - 1 epoch",
    private=True,
)

# Test

In [None]:
# Re-enable the generation cache that was switched off for training.
model.config.use_cache = True

In [None]:
# Switch to evaluation mode; the trailing `;` suppresses the cell's printed output.
model.eval();

In [None]:
# Replace `conversations` with the held-out conversations from the validation pickle.
# (Original annotation on this cell: "3400" - presumably the number of entries.)
# NOTE(review): unpickling can execute arbitrary code - only load a file you trust.
with open('val_nelson_data.pkl', 'rb') as val_pickle:
    conversations = pickle.load(val_pickle)

In [None]:
# Keep only the instruction part of the last validation conversation: everything
# before the first '[/INST]' marker, with the marker itself put back at the end.
prompt_input = conversations[-1].partition('[/INST]')[0] + '[/INST]'

In [None]:
# Keyword arguments forwarded to `model.generate`: low-temperature sampling with
# up to 1024 newly generated tokens.
model_config = dict(
    max_new_tokens=1024,
    do_sample=True,
    temperature=0.1,
)

In [None]:
# Display the second-to-last validation conversation for manual inspection.
conversations[-2]

In [None]:
# Rebind `chat` (previously the two-message list) to the last validation
# conversation, which is a single string.
chat = conversations[-1]

In [None]:
# Hand-written example conversation (instruction + reference answer), split over
# several adjacent literals for readability; the concatenated text is unchanged.
chat = (
    '<s>[INST] You are asked to check the following list of candidate SMILES and rate with Yes or No if they are less soluble to water and lower permeability than their parent SMILES <SMILES> CC1(C)CN(C(=O)Cn2ccc(=O)[nH]c2=O)C1(C)C </SMILES>. Here is the list of candidates:\n'
    '1. <SMILES> Nc1ccn(C(=O)Cn2ccc(=O)[nH]c2=O)c(=O)n1 </SMILES> \n'
    '2. <SMILES> CC1(C)CN(c2oc(N)c(-n3ccc(=O)[nH]c3=O)c2)C1(C)C </SMILES> \n'
    '3. <SMILES> CC1(C)CN(C(=O)CC(CC[S@](C)=O)=O)C1(C)C </SMILES> \n'
    '4. <SMILES> c1(-c2cccnc2)nc(C)nc2c1CCN(C(=O)NCc1ccncc1)C2 </SMILES> [/INST]'
    '1. <BOOLEAN> Yes </BOOLEAN> \n'
    '2. <BOOLEAN> No </BOOLEAN> \n'
    '3. <BOOLEAN> No </BOOLEAN></s>'
)
# The prompt is everything up to and including the first '[/INST]' marker.
prompt_input = chat.partition('[/INST]')[0] + '[/INST]'

In [None]:
# Display the prompt that will be sent to the model.
prompt_input

In [None]:
# Generate a completion for the prompt and show the decoded text.
# The prompt string already begins with the literal `<s>` marker, so the tokenizer
# is told not to add special tokens - otherwise the input carries a duplicated BOS.
# Tensors are moved to the model's own device instead of a hard-coded "cuda".
inputs = tokenizer(
    prompt_input,
    add_special_tokens=False,
    return_tensors="pt",
).to(model.device)
generated_ids = model.generate(**inputs, **model_config)
outputs = tokenizer.decode(generated_ids[0])
outputs