# The Pipeline 

## Step 1: Loading an embedding model

In [1]:
%%capture
import pandas as pd
from step1 import *
from ImportsAndDatasets import *
from parse_drugbank_data  import parse_drugbank ,convert_properties , extract_smiles
from preprocess_df_for_smiles import *

In [2]:
# Parse the DrugBank XML export with the project helper parse_drugbank
# (presumably into a DataFrame, given the variable name and how it is used below).
# NOTE(review): the path is absolute and user-specific, so this cell only runs
# where that file exists.
df_drugbank = parse_drugbank("/home/u111169/wrkdir/mgh-project/datasets/drugbank_data/full_database.xml")
print("drug bank data parsed")

In [3]:
%%capture
# preprocess_df_for_smiles returns three objects; of these only `lookup_table`
# is referenced again in the cells visible here.
approved_df, not_approved_df , lookup_table = preprocess_df_for_smiles(df_drugbank)

In [65]:
# Keep only the DrugBank columns that are carried forward as drug properties.
# .copy() detaches the selection from lookup_table: the "SMILES" column of
# df_properties is overwritten further down, and writing into a column slice of
# another frame is the classic SettingWithCopyWarning / view-vs-copy hazard.
df_properties = lookup_table[["name","description", "average_mass", "toxicity","groups", "labels" ,"approval_stat", "SMILES", "experimental_properties", "calculated_properties" ]].copy()

In [68]:
# Load the DrugApp training table, keep only the structure and its label, and
# expose the label under the column name "labels".
df = (
    pd.read_csv("/home/u111169/wrkdir/mgh-project/ChemAP/dataset/DrugApp/All_training_feature_vectors.csv")[["SMILES", "Label"]]
    .rename(columns={"Label": "labels"})
)

In [95]:
# train_valid_test_split is not defined in the visible cells (presumably it
# comes from one of the star imports at the top). It yields three frames plus
# three dataset objects.
# NOTE(review): `dataset_df` sits where a validation dataset would be expected
# (between dataset_train and dataset_test) — confirm before relying on the name.
train_df , val_df , test_df , dataset_train , dataset_df , dataset_test = train_valid_test_split(df)

In [96]:
from rdkit import Chem

def get_canonical_smiles(smiles):
    """Return the canonical isomeric SMILES for ``smiles``, or None if unusable.

    Args:
        smiles: SMILES string to normalise. Missing values coming out of a
            DataFrame column (``None``, ``NaN``) or any other non-string are
            accepted and mapped to ``None``.

    Returns:
        The canonical SMILES string produced by RDKit, or ``None`` when the
        input is not a string or RDKit cannot parse it.
    """
    # Only real strings are handed to RDKit; MolFromSmiles expects a string
    # argument, so a missing value would otherwise abort a whole .apply().
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None  # RDKit could not parse the SMILES

    return Chem.MolToSmiles(mol, isomericSmiles=True, canonical=True)


In [97]:
# Rewrite the SMILES column of each split, in place, with its canonical form.
# The function is passed directly; wrapping it in a lambda adds nothing.
train_df["SMILES"] = train_df["SMILES"].apply(get_canonical_smiles)
val_df["SMILES"] = val_df["SMILES"].apply(get_canonical_smiles)
test_df["SMILES"] = test_df["SMILES"].apply(get_canonical_smiles)

In [98]:
# Drop property rows without a SMILES, then replace the remaining SMILES with
# their canonical form (the same normalisation applied to the splits above).
df_properties = df_properties.dropna(subset=["SMILES"]).assign(
    SMILES=lambda frame: frame["SMILES"].apply(get_canonical_smiles)
)

[15:21:58] Unusual charge on atom 42 number of radical electrons set to zero


In [80]:
# train_df = train_df[: 10]
# val_df = val_df[: 10]
# test_df = test_df[: 10]

In [82]:
def get_dict_of_approved_and_unapproved(df, dataset_train, dataset="val" , number_of_simmilar_in_each=5):
    """Add a ``"Dicts"`` column describing each row's approved / non-approved hits.

    For every row position ``inst`` of ``df`` the helper
    ``get_most_app_and_most_nonapp`` is called, and its first two return values
    are stored as ``{"most_app": ..., "most_nonapp": ...}`` in that row.

    Args:
        df: DataFrame to annotate. It is modified in place.
        dataset_train: Forwarded unchanged to ``get_most_app_and_most_nonapp``.
        dataset: Forwarded unchanged to ``get_most_app_and_most_nonapp``.
        number_of_simmilar_in_each: Forwarded unchanged to
            ``get_most_app_and_most_nonapp``.

    Returns:
        The same ``df`` object, now carrying the ``"Dicts"`` column.
    """
    # Create the column before calling the helper, as the original did, so the
    # helper sees the same frame layout.
    df["Dicts"] = None

    per_row = []
    for inst in range(len(df)):
        # The helper's third return value is not used here.
        most_app, most_nonapp, _ = get_most_app_and_most_nonapp(
            inst, df, dataset_train, dataset, number_of_simmilar_in_each
        )
        per_row.append({"most_app": most_app, "most_nonapp": most_nonapp})

    # Assign the whole column at once, aligned on df's own index. The previous
    # `df["Dicts"][inst] = ...` was chained assignment: it used `inst` as an
    # index *label* and does not write back to df under pandas Copy-on-Write.
    df["Dicts"] = pd.Series(per_row, index=df.index, dtype=object)
    return df

In [83]:
def get_value(df, smiles_value, column_name):
    """
    Look up one field of the first row whose SMILES equals ``smiles_value``.

    Args:
        df (pd.DataFrame): Table to search; must have a "SMILES" column.
        smiles_value (str): SMILES string to match exactly.
        column_name (str): Column whose value is returned.

    Returns:
        The ``column_name`` value of the first matching row, or None when no
        row matches.
    """
    matches = df.loc[df["SMILES"] == smiles_value].reset_index()

    # Guard clause: nothing matched, so there is nothing to read.
    if matches.empty:
        return None

    return matches[column_name].iloc[0]

In [84]:
def _first_or_none(rows, column):
    """Return ``rows[column]`` of the first row, or None when ``rows`` has no rows."""
    if rows.empty:
        return None
    return rows[column].iloc[0]


def get_list_of_most(most_app:pd.DataFrame , df_properties):
    """Build one info dict per SMILES listed in ``most_app``.

    Args:
        most_app: DataFrame with a "SMILES" column and a "sim" column.
        df_properties: DataFrame with a "SMILES" column plus "name",
            "approval_stat", "average_mass", "toxicity" and "description".

    Returns:
        A list with one dict per entry of ``most_app["SMILES"]``, in order.
        Each dict has the keys "name", "simmilarity score", "approval status",
        "average mass", "toxicity" and "descriptions". Property fields are
        None when the SMILES has no row in ``df_properties``.
    """
    final_list_of_most = []
    for smiles in most_app["SMILES"]:
        # One scan of the property table per SMILES; every field below is read
        # from this single match set instead of re-filtering the table five
        # times.
        matches = df_properties[df_properties["SMILES"] == smiles]

        final_list_of_most.append(
            {
                "name": _first_or_none(matches, "name"),
                # NOTE(review): kept as a NumPy array (one element per
                # most_app row with this SMILES), which is why it shows up as
                # `array([...])` when the dict is rendered as text.
                "simmilarity score": most_app.loc[most_app["SMILES"] == smiles, "sim"].values,
                "approval status": _first_or_none(matches, "approval_stat"),
                "average mass": _first_or_none(matches, "average_mass"),
                "toxicity": _first_or_none(matches, "toxicity"),
                "descriptions": _first_or_none(matches, "description"),
            }
        )
    return final_list_of_most
        

In [85]:
def final_dataset_containing_drug_information_as_list_ready_for_prompt(df, df_properties):
    """Expand each row's ``"Dicts"`` entry into two lists of drug-info dicts.

    Args:
        df: DataFrame with a "Dicts" column whose entries are dicts holding
            "most_app" and "most_nonapp". It is modified in place.
        df_properties: Property table forwarded to ``get_list_of_most``.

    Returns:
        The same ``df`` object with two added columns,
        "list_of_most_approved_info_for_n_simmilar" and
        "list_of_most_nonapproved_info_for_n_simmilar", each holding the list
        returned by ``get_list_of_most`` for that row.
    """
    approved_info = []
    nonapproved_info = []

    # Walk the column values themselves instead of df["Dicts"][i] / df.at[i, ...]:
    # those treat the position i as an index *label* and hit the wrong row (or
    # raise / append a row) whenever df does not have a 0..n-1 index.
    for neighbours in df["Dicts"]:
        approved_info.append(get_list_of_most(neighbours["most_app"], df_properties))
        nonapproved_info.append(get_list_of_most(neighbours["most_nonapp"], df_properties))

    # dtype=object keeps each per-row list as a single cell value.
    df["list_of_most_approved_info_for_n_simmilar"] = pd.Series(
        approved_info, index=df.index, dtype=object
    )
    df["list_of_most_nonapproved_info_for_n_simmilar"] = pd.Series(
        nonapproved_info, index=df.index, dtype=object
    )
    return df

In [86]:
def prepare_final_dataset(train_df, dataset_train, df_properties, dataset="train", number_of_simmilar_in_each=5):
    """
    Run both annotation stages on a frame and return the prompt-ready result.

    Args:
    - train_df (DataFrame): Frame to annotate; handed to
      get_dict_of_approved_and_unapproved.
    - dataset_train (object): Training dataset object, forwarded unchanged.
    - df_properties (DataFrame): Property table used to fill in the drug info.
    - dataset (str, optional): Forwarded unchanged to
      get_dict_of_approved_and_unapproved. Defaults to "train".
    - number_of_simmilar_in_each (int, optional): Forwarded unchanged.
      Defaults to 5.

    Returns:
    - final_df (DataFrame): Output of
      final_dataset_containing_drug_information_as_list_ready_for_prompt.
    """
    # Stage 1: add the "Dicts" column.
    with_dicts = get_dict_of_approved_and_unapproved(
        train_df,
        dataset_train,
        dataset,
        number_of_simmilar_in_each=number_of_simmilar_in_each,
    )
    # Stage 2: turn every "Dicts" entry into the two info lists.
    return final_dataset_containing_drug_information_as_list_ready_for_prompt(with_dicts, df_properties)

In [None]:
# import time
# start = time.time()

# Stage 1 on the training split: adds the "Dicts" column to train_df.
# The function returns the frame it was given, so `a` is train_df itself.
a = get_dict_of_approved_and_unapproved(train_df, dataset_train, dataset="train" , number_of_simmilar_in_each=5)

In [91]:
# Stage 2: expand every "Dicts" entry into the two prompt-ready info lists.
final_df = final_dataset_containing_drug_information_as_list_ready_for_prompt(a , df_properties)

# end = time.time()
# e = end - start  # elapsed seconds (this read `start - end`, which is negative)
# print(e)

In [92]:
# Cache the annotated frame on disk (path is relative to the working directory).
# NOTE(review): "Dicts" and the two info-list columns hold Python objects; CSV
# can only store them as text.
final_df.to_csv("cache2.csv" , index = False)

In [93]:
# Reload the cached frame.
# NOTE(review): the object columns presumably come back as plain strings rather
# than dicts/lists — confirm nothing downstream indexes into them.
c = pd.read_csv("cache2.csv")

In [44]:
# final_df.to_csv("/home/u111169/wrkdir/mgh-project/dataframes/final_df_to_reason.csv", index = False)

In [94]:
c

Unnamed: 0,SMILES,labels,Dicts,list_of_most_approved_info_for_n_simmilar,list_of_most_nonapproved_info_for_n_simmilar
0,CC1(C)[C@@H](O[C@H]2O[C@H](C(=O)O)[C@@H](O)[C@...,1,{'most_app': ...,"[{'name': None, 'simmilarity score': array([0....","[{'name': None, 'simmilarity score': array([0...."
1,CCN(CC)CCNC(=O)c1c(C)[nH]c(/C=C2\C(=O)Nc3ccc(F...,1,{'most_app': ...,"[{'name': None, 'simmilarity score': array([0....","[{'name': 'Omecamtiv Mecarbil', 'simmilarity s..."
2,C[N+]1(C)[C@@H]2C[C@@H](OC(=O)C(O)(c3cccs3)c3c...,1,{'most_app': ...,"[{'name': None, 'simmilarity score': array([1....","[{'name': 'Lucanthone', 'simmilarity score': a..."
3,CO/N=C(\C(=O)N[C@@H]1C(=O)N2C(C(=O)[O-])=C(C[N...,1,{'most_app': ...,"[{'name': 'Cefpirome', 'simmilarity score': ar...","[{'name': 'Lucanthone', 'simmilarity score': a..."
4,Nc1cc(C(F)(F)F)c(-c2nc(N3CCOCC3)nc(N3CCOCC3)n2...,0,{'most_app': ...,"[{'name': 'Pexidartinib', 'simmilarity score':...","[{'name': 'Buparlisib', 'simmilarity score': a..."
5,O=C([O-])[O-].O=C([O-])[O-].O=C([O-])[O-].[La+...,1,{'most_app': ...,"[{'name': 'Calcium acetate', 'simmilarity scor...","[{'name': 'Gallium citrate Ga-67', 'simmilarit..."
6,CCC(=O)N(c1ccccc1)C1CCN(CCc2ccccc2)CC1,1,{'most_app': ...,"[{'name': None, 'simmilarity score': array([0....","[{'name': 'Chlorcyclizine', 'simmilarity score..."
7,CNC(=N)NCCC[C@H](N)C(=O)O,1,{'most_app': ...,"[{'name': 'Levomefolic acid', 'simmilarity sco...","[{'name': None, 'simmilarity score': array([1...."
8,C[S+](C)[O-],0,{'most_app': ...,"[{'name': 'Osimertinib', 'simmilarity score': ...","[{'name': 'Mechlorethamine', 'simmilarity scor..."
9,CC(C)(C)NCC(O)c1cc(Cl)c(N)c(Cl)c1,0,{'most_app': ...,"[{'name': 'Penciclovir', 'simmilarity score': ...","[{'name': 'Flutemetamol', 'simmilarity score':..."


## prompt

In [36]:
# Role preamble interpolated at the top of input_prompt_1 and input_prompt_2.
sys_prompt = "you are an expert chemist "

In [40]:
# Prompt variant 1: asks for reasoning about a *given* approval status, based on
# the lists of similar approved and non-approved molecules.
# NOTE(review): this is an f-string, so `approval_stat`, `list_of_most_approved`
# and `list_of_most_nonapproved` must already exist as variables when the cell
# runs; none of them is assigned in the cells visible here.
input_prompt_1 = f""" {sys_prompt}
your task is to do reasoning based on the provided informaion that why the compound X is {approval_stat}
I trained a model that can predict the most simmilar molecules to  compound X. this model will give the five most simmilar and five most dissimillar moleculs based on the state of approvality. 
you know that the compound X is {approval_stat} and your task is to reason why this will be {approval_stat} based on the provided information. 
the most simmilar approved small molecules with their properties that my model predict to this coumpound are: 
{list_of_most_approved} 
and the most simmilar non approved small moleculs with their properties are: 
{list_of_most_nonapproved} 
"""

In [38]:
# Prompt variant 2: same task as variant 1, filled from row 0 of final_df.
# NOTE(review): this reads final_df["approval_stat"], but the visible code only
# gives final_df the columns "SMILES", "labels", "Dicts" and the two info-list
# columns — confirm an "approval_stat" column exists, otherwise this cell raises
# KeyError.
input_prompt_2 = f"""{sys_prompt}
Your task is to provide a detailed analysis explaining why compound X has an approval status of {final_df["approval_stat"][0]}.

I have developed a model that identifies molecules most similar to compound X. This model outputs two lists:
1. The five most similar approved small molecules (with their properties).
2. The five most similar non-approved small molecules (with their properties).

Using the provided data:
- Approved molecules: {final_df["list_of_most_approved_info_for_n_simmilar"][0]}
- Non-approved molecules: {final_df["list_of_most_nonapproved_info_for_n_simmilar"][0]}

Please analyze the data and reason which properties or patterns contribute to compound X being {final_df["approval_stat"][0]}. In your explanation, compare the characteristics observed in both lists to support your reasoning.
"""


In [43]:
# Prompt variant 3: asks for an approval *prediction* (the status is not given),
# filled from the two info lists of row 0 of final_df. Unlike variants 1 and 2
# it does not include sys_prompt.
input_prompt_3 = f"""
Your task is to analyze the likelihood of compound X receiving regulatory approval based on its similarity to known molecules.  


I have developed a model that identifies molecules most similar to compound X. This model outputs two lists:  
1. The five most similar approved small molecules (with their properties and similarity scores).  
2. The five most similar non-approved small molecules (with their properties and similarity scores).  

Using the provided data:  
- Approved molecules (with similarity scores): {final_df["list_of_most_approved_info_for_n_simmilar"][0]}  
- Non-approved molecules (with similarity scores): {final_df["list_of_most_nonapproved_info_for_n_simmilar"][0]}  


Please analyze the molecular properties, trends, and distinguishing features in both lists. Take the similarity scores into account when evaluating the degree of resemblance between compound X and the approved/non-approved molecules.  

Based on these comparisons, determine whether compound X is more likely to be approved or not. Justify your reasoning by highlighting key characteristics that align with approved or non-approved compounds, and discuss how the similarity scores influence your prediction.  
"""