In [1]:
import deepchem
import deepchem.molnet
import pandas as pd
import random
import torch
import json
import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
import warnings
warnings.filterwarnings("ignore")

from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig,AutoModelForCausalLM, AutoTokenizer
from sklearn.metrics import accuracy_score,roc_auc_score

from rdkit.Chem import AllChem
from rdkit.Chem.AtomPairs import Pairs,Torsions
from rdkit.Avalon.pyAvalonTools import GetAvalonFP
from rdkit.Chem.Scaffolds import MurckoScaffold
from rdkit.Chem import DataStructs,rdMolDescriptors
from rdkit.Chem import MACCSkeys
from rdkit.Chem import rdMolDescriptors
from rdkit import RDLogger
from rdkit import Chem
RDLogger.DisableLog('rdApp.*')

No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
2024-08-08 10:13:33.733169: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/home/fangmiaoNLP/.conda/envs/LZZ/lib/python3.8/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [2]:
# Load the ClinTox benchmark through MolNet with a scaffold split. Raw files go
# under data_dir and the featurized splits under save_dir.
tasks, datasets, transformers = deepchem.molnet.load_clintox(
    splitter='scaffold',
    reload=True,
    data_dir='../data/clintox_data',
    save_dir='../data/clintox_datasets',
)

# The loader hands back the three splits in (train, valid, test) order.
train_dataset, valid_dataset, test_dataset = datasets

In [3]:
# Display the validation split's repr (array shapes, ids, task names) as a quick sanity check.
valid_dataset

<DiskDataset X.shape: (148, 1024), y.shape: (148, 2), w.shape: (148, 2), ids: ['CC(C)OC(=O)CCC/C=C\\C[C@H]1[C@H](C[C@H]([C@@H]1/C=C/[C@H](COc2cccc(c2)C(F)(F)F)O)O)O'
 'CC(C)Nc1cccnc1N2CCN(CC2)C(=O)c3cc4cc(ccc4[nH]3)NS(=O)(=O)C'
 'CC(C)n1c2ccccc2c(c1/C=C/[C@@H](C[C@@H](CC(=O)[O-])O)O)c3ccc(cc3)F' ...
 'C[C@@H](C(=O)N[C@H](CC(C)C)C(=O)N[C@@H](C)C(=O)N[C@H](C(C)C)C(=O)N[C@@H](C(C)C)C(=O)N[C@@H](C(C)C)C(=O)N[C@@H](Cc1c[nH]c2c1cccc2)C(=O)N[C@H](CC(C)C)C(=O)N[C@@H](Cc3c[nH]c4c3cccc4)C(=O)N[C@H](CC(C)C)C(=O)N[C@@H](Cc5c[nH]c6c5cccc6)C(=O)N[C@H](CC(C)C)C(=O)N[C@@H](Cc7c[nH]c8c7cccc8)C(=O)NCCO)NC(=O)CNC(=O)[C@H](C(C)C)NC=O'
 'C[C@@H]([C@@H]([C@H]1CNc2c(c(=O)nc([nH]2)N)N1)O)O'
 'C[C@@]1(C(=O)N2[C@H](C(=O)N3CCC[C@H]3[C@@]2(O1)O)Cc4ccccc4)NC(=O)[C@H]5C[NH+]([C@@H]6Cc7c[nH]c8c7c(ccc8)C6=C5)C'], task_names: ['FDA_APPROVED' 'CT_TOX']>

In [3]:
# Convert the train and test splits to DataFrames so columns can be read by name.
train_dataset_df, test_dataset_df = (
    split.to_dataframe() for split in (train_dataset, test_dataset)
)

In [4]:
# 'ids' holds the SMILES strings. 'y2' is presumably the second task column,
# i.e. CT_TOX given task_names ['FDA_APPROVED', 'CT_TOX'] -- TODO confirm.
train_input = train_dataset_df['ids'].values
train_label = train_dataset_df['y2'].values
test_input = test_dataset_df['ids'].values
test_label = test_dataset_df['y2'].values

In [5]:
def top_k_maccs_similar_molecules(target_smiles, molecule_smiles_list, label_list, k = 5):
    """Return the ``k`` candidates most similar to ``target_smiles`` (k-NN lookup).

    Despite the historical function name, no MACCS keys are involved: every
    molecule is encoded as a 1024-bit Morgan fingerprint of radius 1, and pairs
    are compared with Tanimoto similarity.

    Args:
        target_smiles: SMILES string of the query molecule.
        molecule_smiles_list: Candidate SMILES strings. Entries that are
            string-equal to ``target_smiles`` are skipped so the query is not
            returned as its own neighbour.
        label_list: Labels aligned index-by-index with ``molecule_smiles_list``.
            A label equal to 1 is reported as "Yes", anything else as "No".
        k: Maximum number of neighbours to return.

    Returns:
        A list of ``(smiles, tanimoto_similarity, "Yes" | "No")`` tuples sorted
        by descending similarity. Ties keep their original candidate order.

    Raises:
        ValueError: If ``target_smiles`` cannot be parsed by RDKit.
    """
    target_mol = Chem.MolFromSmiles(target_smiles)
    if target_mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail loudly
        # here instead of with an opaque error inside the fingerprint call.
        raise ValueError(f"Could not parse target SMILES: {target_smiles!r}")
    target_fp = AllChem.GetMorganFingerprintAsBitVect(target_mol, 1, 1024)

    scored_candidates = []
    for i, smiles in enumerate(molecule_smiles_list):
        if smiles == target_smiles:
            continue

        candidate_mol = Chem.MolFromSmiles(smiles)
        if candidate_mol is None:
            # An unparsable candidate cannot be compared; skip it rather than
            # aborting the whole lookup.
            continue

        candidate_fp = AllChem.GetMorganFingerprintAsBitVect(candidate_mol, 1, 1024)
        tanimoto_similarity = DataStructs.FingerprintSimilarity(
            target_fp, candidate_fp, metric=DataStructs.TanimotoSimilarity
        )
        label_text = "Yes" if label_list[i] == 1 else "No"
        scored_candidates.append((smiles, tanimoto_similarity, label_text))

    # Most similar first; list.sort is stable, so equal scores keep input order.
    scored_candidates.sort(key=lambda item: item[1], reverse=True)
    return scored_candidates[:k]

In [6]:
# Spot-check the retrieval helper: nearest training molecules (default k=5) for one query SMILES.
top_k_maccs_similar_molecules('NS(=O)(=O)Cc1noc2ccccc12', train_input, train_label)

[('Cc1c(c(no1)c2ccccc2)c3ccc(cc3)S(=O)(=O)N', 0.48148148148148145, 'No'),
 ('Cc1c(noc1[N-]S(=O)(=O)c2ccc(cc2)N)C', 0.36666666666666664, 'No'),
 ('c1cc(ccc1C[NH3+])S(=O)(=O)N', 0.36, 'No'),
 ('c1cc(ccc1N)S(=O)(=O)N', 0.34782608695652173, 'No'),
 ('Cc1c(noc1N(C(=O)C)S(=O)(=O)c2ccc(cc2)N)C', 0.3235294117647059, 'No')]

In [7]:
# Prompt templates for the toxicity question.
# NOTE(review): the literals contain typos ("SMlLES", "wether", "trail"). They are
# kept verbatim because changing prompt text changes what the model is asked;
# fix them deliberately, not as a drive-by.

# Task description used when in-context examples precede the question.
task_des_n = (
    'You are an expert chemist, your task is to predict the property of molecule using your experienced chemical property prediction knowledge.\n'
    'Please strictly follow the format, no other information can be provided. Given the SMlLES string of a molecule, the task focuses on predicting molecular properties, specifically wether a molecule is clinically trail toxic (Yes) or No clinically trail toxic (No) based on the SMILES string representation of each molecule. '
    'You will be provided with some molecule SMILES as examples, accompanied by a binary label indicating '
    'whether it is clinically trail toxic (Yes) or No clinically trail toxic (No) in the beginning.\n'
)

# Task description for the zero-shot setting (no examples in the prompt).
task_des_zero = (
    'You are an expert chemist, your task is to predict the property of molecule using your experienced chemical property prediction knowledge.\n'
    'Please strictly follow the format, no other information can be provided. Given the SMlLES string of a molecule, the task focuses on predicting molecular properties, specifically wether a molecule is clinically trail toxic (Yes) or No clinically trail toxic (No) based on the SMILES string representation of each molecule. '
    'If the molecule is clinically trail toxic, output Yes; otherwise output No. Please answer with only Yes or No.\n'
)

# Instruction placed between the examples and the molecule to be predicted.
question = 'Then predict whether the following molecule it is clinically trail toxic or not. Please answer with only Yes or No.\n'


In [8]:
# Checkpoint identifier passed to both from_pretrained calls below.
model_name_or_id = "AI4Chem/ChemLLM-20B-Chat-SFT"

# Half-precision weights, with placement across available devices left to
# device_map="auto". trust_remote_code=True runs code shipped with the checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_id,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_id,
    trust_remote_code=True,
)

Loading checkpoint shards:   0%|          | 0/41 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
# --- Configuration for the example-scoring run --------------------------------
# Vocabulary ids whose first-step scores are read as the model's "Yes" / "No"
# answer. NOTE(review): hard-coded; presumably the ids of the "Yes" and "No"
# tokens for this tokenizer -- confirm before switching models.
YES_TOKEN_ID = 7560
NO_TOKEN_ID = 2458
# Number of nearest training neighbours tried as the single in-context example.
NUM_NEIGHBOURS = 64
# Debugging cap on how many training molecules are scored (the original loop
# broke out after index 6). Set to None to score the whole training set.
MAX_MOLECULES = 7

# Built once: the settings do not change between prompts. Decoding is greedy
# (do_sample=False), so no sampling temperature is set. output_scores together
# with return_dict_in_generate exposes the per-step scores used below.
generation_config = GenerationConfig(
    do_sample=False,
    max_new_tokens=4,
    repetition_penalty=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
    output_scores=True,
    return_dict_in_generate=True,
)

# Maps each scored training SMILES to a list of
# (example_smiles, example_label, confidence_in_the_true_label) tuples.
dict_a = {}

num_to_score = len(train_input) if MAX_MOLECULES is None else min(len(train_input), MAX_MOLECULES)
for i in range(num_to_score):  # iterate over the training set
    target_smiles = train_input[i]
    target_label = train_label[i]
    lst = []
    # Nearest neighbours of the target within the training set.
    neighbours = top_k_maccs_similar_molecules(target_smiles, train_input, train_label, NUM_NEIGHBOURS)
    for example in neighbours:
        example_smiles, example_label = example[0], example[-1]
        # One-shot prompt: task description, a single labelled example, then the query.
        prompt = (
            task_des_n
            + f"Input SMILES: {example_smiles}\nLabel: {example_label}\n"
            + question
            + f"Input SMILES: {target_smiles}\nAnswer: "
        )
        input_text = f"Human: {prompt}\nAssistant:"

        # Follow the model's own device instead of assuming a literal "cuda".
        inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs, generation_config=generation_config)

        # Decode only the newly generated tokens; slicing the decoded string by
        # len(input_text) breaks whenever decoding does not round-trip the prompt.
        prompt_length = inputs["input_ids"].shape[1]
        ans = tokenizer.decode(outputs.sequences[0, prompt_length:], skip_special_tokens=True).strip()

        # Renormalise the first generated step over just the two answer tokens.
        # A two-way softmax on their scores equals p_yes / (p_yes + p_no) from
        # the full softmax, without the 0/0 risk when both underflow in fp16.
        first_step_scores = outputs.scores[0][0]
        yes_no_scores = torch.stack([first_step_scores[YES_TOKEN_ID], first_step_scores[NO_TOKEN_ID]])
        yes_conf, no_conf = torch.softmax(yes_no_scores.float(), dim=-1).tolist()
        print(target_label, ans, yes_conf, no_conf)

        # The example's score is the confidence assigned to the correct label.
        if target_label == 1:
            lst.append((example_smiles, example_label, yes_conf))
        elif target_label == 0:
            lst.append((example_smiles, example_label, no_conf))

    # Key by the target molecule. Keying by its 0/1 label (as before) made every
    # molecule overwrite the previous one that had the same label.
    dict_a[str(target_smiles)] = lst
    print(lst)

1.0 Yes 0.5156199000959537 0.48438009990404624
1.0 Yes 0.5621765106207268 0.4378234893792732
1.0 Yes 0.6791786808146038 0.32082131918539625
1.0 Yes 0.577495366579297 0.42250463342070294
1.0 Yes 0.5312093785599418 0.46879062144005823
1.0 Yes 0.5926665838522231 0.4073334161477768
1.0 No 0.4687906166685577 0.5312093833314423
1.0 Yes 0.5467381471949578 0.4532618528050421
1.0 Yes 0.5621764916635495 0.4378235083364505
1.0 Yes 0.5774953819544372 0.42250461804556283


KeyboardInterrupt: 

In [None]:
# Serialize the collected scores to indented JSON text, then write it to score.json.
info_json = json.dumps(
    dict_a,
    sort_keys=False,
    indent=4,
    separators=(',', ': '),
)
with open('score.json', 'w') as score_file:
    score_file.write(info_json)