# step1 : loading an embedding model

Use the fine-tuned version of Chemberta or sentence transformers, as an embedding model. 

input: datasets.

output: embedding space of the datasets and a function that can measure the simmilarity

In [28]:
from ImportsAndDatasets import *

In [29]:
import pandas as pd
import torch
from transformers import AutoModel
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem import RDKFingerprint
from rdkit.Avalon.pyAvalonTools import GetAvalonFP

def preprocess_and_extract_embeddings(examples):
    inputs = tokenizer(examples['SMILES'], return_tensors='pt', padding=True, truncation=True)
    inputs = inputs.to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        # Extract the last hidden state
        last_hidden_state = outputs.last_hidden_state  # This is the embedding from the last layer
        # Take the average of the last hidden state across the sequence length dimension
        embeddings = last_hidden_state.mean(dim=1).squeeze()
        embeddings = embeddings.cpu()  # Move tensors to CPU
    return {
        'last_hidden_state':last_hidden_state, 
        'embeddings': embeddings.numpy(),
        'SMILES': examples['SMILES']  # Optional: include the original SMILES if needed
    }


def one_embeddings(examples):
    inputs = tokenizer(examples, return_tensors='pt', padding=True, truncation=True)
    inputs = inputs.to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        # Extract the last hidden state
        last_hidden_state = outputs.last_hidden_state  # This is the embedding from the last layer
        # Take the average of the last hidden state across the sequence length dimension
        embeddings = last_hidden_state.mean(dim=1).squeeze()
        embeddings = embeddings.cpu()  # Move tensors to CPU
    return {
        #'last_hidden_state':last_hidden_state,
        'embeddings': embeddings.numpy(),
        # 'SMILES': examples['SMILES']  # Optional: include the original SMILES if needed
    }
def get_sim_score(dataset_train: pd.DataFrame , smile:str , app:int , nonapp:int )-> pd.DataFrame:
    """
    
    """
    train_embeded= dataset_train.map(preprocess_and_extract_embeddings, batch_size=8, writer_batch_size=8).to_pandas() 
    main = torch.tensor(train_embeded["embeddings"])
    one = torch.tensor(one_embeddings(smile)["embeddings"])
    similarities = torch.cosine_similarity(one.unsqueeze(0), main, dim=1) 
    train_embeded["sim"]=similarities.tolist() 
    train_embeded.drop(columns=["embeddings" , "last_hidden_state"] , inplace = True)
    most_app = train_embeded[train_embeded["labels"]==1].sort_values(by=["sim"] , ascending = False)[:app]
    most_nonapp = train_embeded[train_embeded["labels"]==0].sort_values(by=["sim"] , ascending = False)[:nonapp]
    final = train_embeded.sort_values(by=["sim"] , ascending = False)[:nonapp]
    print(f"most simmilar SMILES to {smile} based on the model's prediction")
    return most_app , most_nonapp , final

def preprocess_and_predict_encoder(examples):
    inputs = tokenizer(examples, return_tensors='pt', padding=True, truncation = True)
    inputs = inputs.to(device)
    outputs = model(**inputs)
    predicted_class_idx = torch.argmax(outputs.logits)
    predicted_class_prob = torch.nn.functional.softmax(outputs.logits, dim=1)[0][predicted_class_idx]
    predicted_class_idx = predicted_class_idx.cpu()  # Move tensors to CPU
    predicted_class_prob = predicted_class_prob.cpu()
    return {
        'predicted_class_base': predicted_class_idx.item(),
        'predicted_class_prob_base': predicted_class_prob.item()
    }

def convert_to_fingerprint_for_eval(smiles):
    mol = Chem.MolFromSmiles(smiles)

    # Generate the Morgan fingerprint
    morgan_fp = AllChem.GetMorganFingerprintAsBitVect(mol, useChirality=True, radius=2, nBits=256)
    morgan = ''.join(str(int(bit)) for bit in morgan_fp)

    return smiles+ "<[mr]>" + morgan + "<[lbl]>"
def convert_labels(target):

    if int(target) == 1:
        target = "<Approved>"
    else:
        target = "<NotApproved>"

    return target
def preprocess_and_predict_dencoder(examples):
    inputs = tokenizer(examples, return_tensors='pt', padding=True, truncation = True)
    inputs = inputs.to(device)
    outputs = model(**inputs)
    logits = outputs.logits[:, -1, :]
    probs = torch.softmax(logits, dim=-1)
    predicted_label = torch.argmax(probs, dim=-1).item()
    predicted_label = tokenizer.decode(predicted_label)
    if predicted_label == "<Approved>":
        predicted_label = 1
    else: 
        predicted_label = 0

    return {
        'predicted_class': predicted_label,
        #'predicted_class_prob': probs
    }

In [30]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Chemberta- encoder only

In [9]:
path2model = "/home/u111169/wrkdir/mgh-project/checkpoints-arxive/fine_tuning_chemberta_classic_way/augmented/checkpoint-500"#'FacebookAI/xlm-roberta-large'

In [95]:
# l_sim = []
# main = torch.tensor(train_embeded["embeddings"])
# for smile in val_embeded["SMILES"]: 
#     train_embeded_cache = train_embeded
#     one = torch.tensor(one_embeddings(smile)["embeddings"])
#     similarities = torch.cosine_similarity(one.unsqueeze(0), main, dim=1)
#     train_embeded["compare2"] = [smile]*len(train_embeded)
#     train_embeded_cache["sim"]=similarities.tolist()
#     l_sim.append(train_embeded_cache)
# main_con = pd.concat(l_sim, axis=0)

In [7]:
n = 112
smile_for_test = dataset_test["SMILES"][n]
labels_for_test=dataset_test["labels"][n]

In [8]:
model = AutoModel.from_pretrained(path2model).to(device)
tokenizer = AutoTokenizer.from_pretrained(path2model)
most_app , most_nonapp , tot = get_sim_score(dataset_train , smile_for_test, 5,5)

Some weights of RobertaModel were not initialized from the model checkpoint at /home/u111169/wrkdir/mgh-project/checkpoints-arxive/fine_tuning_chemberta_classic_way/augmented/checkpoint-500 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/2497 [00:00<?, ? examples/s]

most simmilar SMILES to C[C@]1(O)CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@]4(C)[C@@]3(F)[C@@H](O)C[C@@]21C based on the model's prediction


  main = torch.tensor(train_embeded["embeddings"])


In [11]:
model = AutoModelForSequenceClassification.from_pretrained("/home/u111169/wrkdir/mgh-project/checkpoints-arxive/fine_tuning_chemberta_classic_way/augmented/checkpoint-500" ,num_labels=2).to(device)
tokenizer = RobertaTokenizer.from_pretrained("/home/u111169/wrkdir/mgh-project/checkpoints-arxive/fine_tuning_chemberta_classic_way/augmented/checkpoint-500")#'FacebookAI/xlm-roberta-large'


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
labels_for_test , smile_for_test

(0, 'C[C@]1(O)CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@]4(C)[C@@]3(F)[C@@H](O)C[C@@]21C')

In [13]:
predicted = preprocess_and_predict_encoder(smile_for_test)

{'predicted_class_base': 1, 'predicted_class_prob_base': 0.9819175601005554}

In [14]:
most_app 

Unnamed: 0,SMILES,labels,sim
642,CC(=O)[C@@]1(O)CC[C@H]2[C@@H]3C[C@H](C)C4=CC(=...,1,0.997136
748,C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...,1,0.996724
1416,CC1(C)O[C@@H]2C[C@H]3[C@@H]4CCC5=CC(=O)C=C[C@]...,1,0.996411
1007,CC[C@@H]1[C@@H]2C[C@H](O)CC[C@]2(C)[C@H]2CC[C@...,1,0.996281
2281,CC(=O)[C@@]1(O)CC[C@H]2[C@@H]3C=C(C)C4=CC(=O)C...,1,0.99626


In [15]:
most_nonapp

Unnamed: 0,SMILES,labels,sim
2455,CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)...,0,0.996713
2270,C=C1/C(=C\C=C2/CCC[C@@]3(C)[C@H]2CC[C@@H]3[C@H...,0,0.992212
2324,C[C@@H]1[C@H]2C3=CC[C@@H]4[C@@]5(C)CC[C@H](O)C...,0,0.986225
2052,CC(=O)OCC(=O)[C@@]1(O)CC[C@H]2[C@@H]3CCC4=CC(=...,0,0.985912
931,C=C1CC[C@H]2[C@H](CN)[C@@H]([C@@]3(C)CC[C@H](O...,0,0.983682


In [16]:
tot

Unnamed: 0,SMILES,labels,sim
642,CC(=O)[C@@]1(O)CC[C@H]2[C@@H]3C[C@H](C)C4=CC(=...,1,0.997136
748,C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...,1,0.996724
2455,CC1(C)O[C@@H]2C[C@H]3[C@@H]4C[C@H](F)C5=CC(=O)...,0,0.996713
1416,CC1(C)O[C@@H]2C[C@H]3[C@@H]4CCC5=CC(=O)C=C[C@]...,1,0.996411
1007,CC[C@@H]1[C@@H]2C[C@H](O)CC[C@]2(C)[C@H]2CC[C@...,1,0.996281


## gpt

In [None]:
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", cache_dir="/home/u111169/wrkdir/mgh-project/models")
num_added_toks = tokenizer.add_tokens(["<Approved>", "<NotApproved>"])
num_added_toks = tokenizer.add_special_tokens({"additional_special_tokens": ["<[lbl]>", "<[mr]>"]})
print("We have added", num_added_toks, "tokens")
model.resize_token_embeddings(len(tokenizer))

In [35]:

# train_df["labels"] = train_df["labels"].map(convert_labels)
val_df["SMILES"] = val_df.apply(lambda row: convert_to_fingerprint_for_eval(row["SMILES"]), axis=1)
val_df["labels"] = val_df["labels"].map(convert_labels)
test_df["SMILES"] = test_df.apply(lambda row: convert_to_fingerprint_for_eval(row["SMILES"]), axis=1)
test_df["labels"] = test_df["labels"].map(convert_labels)

In [73]:
n = 120
smile_for_test = val_df["SMILES"][n]
labels_for_test=val_df["labels"][n]

In [74]:
path2model = "/home/u111169/wrkdir/mgh-project/checkpoints/druggpt2/checkpoint-9000"

In [75]:
model = AutoModel.from_pretrained(path2model).to(device)
tokenizer = AutoTokenizer.from_pretrained(path2model, is_decoder=True)
most_app , most_nonapp , tot = get_sim_score(dataset_train , smile_for_test, 5,5)

Map:   0%|          | 0/2497 [00:00<?, ? examples/s]

most simmilar SMILES to O=C(NO)C1(CS(=O)(=O)c2ccc(Oc3ccc(Cl)cc3)cc2)CCOCC1<[mr]>0000000000010000000000000000000010001001000000101100001000000001001000000000100011000000000000110000100001000010000000000000010010000000001000001001001000000010000000000000000000000011000000100100110000010010000000101000000000001100000010000000010000000000<[lbl]> based on the model's prediction


In [76]:
model = AutoModelForCausalLM.from_pretrained(path2model, is_decoder=True)# ,num_labels=2)
model.to(device)
model.eval()

# for gpt2
tokenizer.pad_token = tokenizer.eos_token
predicted = preprocess_and_predict_dencoder(val_df["SMILES"][12])

In [77]:
labels_for_test

'<NotApproved>'

In [78]:
predicted

{'predicted_class': 0}

In [79]:
most_app

Unnamed: 0,SMILES,labels,sim
615,O=C1OC2(c3ccc(O)cc3Oc3cc(O)ccc32)c2ccccc21,1,0.605786
2436,O=C(Oc1ccc2c(c1)CCCN2C(=O)C(Cl)Cl)c1ccco1,1,0.597046
168,O=S(=O)([O-])Oc1ccc(C(c2ccc(OS(=O)(=O)[O-])cc2...,1,0.570796
2121,O=C(NC[C@H]1CN(c2ccc(N3CCOCC3=O)cc2)C(=O)O1)c1...,1,0.568946
891,O=S(=O)(O)c1cc(O)ccc1O,1,0.567458


In [80]:
most_nonapp

Unnamed: 0,SMILES,labels,sim
439,O=C1OC2(c3cc(I)c(O)c(I)c3Oc3c2cc(I)c(O)c3I)c2c...,0,0.605879
651,O=C(NS(=O)(=O)c1ccc(Br)s1)c1ccc(Cl)cc1Cl,0,0.588697
1282,OCC(O)CN1CCN(c2ccccc2)CC1,0,0.568489
1542,Oc1cc(O)c(Cl)cn1,0,0.554149
1197,O=C(O)CNC(=O)c1ccccc1I,0,0.534846


In [81]:
tot

Unnamed: 0,SMILES,labels,sim
439,O=C1OC2(c3cc(I)c(O)c(I)c3Oc3c2cc(I)c(O)c3I)c2c...,0,0.605879
615,O=C1OC2(c3ccc(O)cc3Oc3cc(O)ccc32)c2ccccc21,1,0.605786
2436,O=C(Oc1ccc2c(c1)CCCN2C(=O)C(Cl)Cl)c1ccco1,1,0.597046
651,O=C(NS(=O)(=O)c1ccc(Br)s1)c1ccc(Cl)cc1Cl,0,0.588697
168,O=S(=O)([O-])Oc1ccc(C(c2ccc(OS(=O)(=O)[O-])cc2...,1,0.570796


## sentence transformers