In [42]:
import pandas as pd
import argparse
import textwrap
from chembl_webresource_client.new_client import new_client
from rdkit.Chem import AllChem
import os
from pandas import json_normalize
import requests
from json import JSONDecodeError
from rdkit.Chem import RDConfig
import os
import sys
sys.path.append(os.path.join(RDConfig.RDContribDir, 'NP_Score'))
import npscorer
from argparse import Namespace

In [43]:
# Define the output folder path
output_folder = 'C:/Users/quirosgu/Documents/GitHub/Yggdrasil/'  # Replace 'chembl_custom_folder' with your desired folder name

# Define the arguments
args_dict = {
    "target_id": "CHEMBL4481",  # Replace with your default target ID
    "NPlike_score": -1  # Replace with your default NPlike_score
}

# Convert the arguments to Namespace object
args = Namespace(**args_dict)

In [44]:
""" Functions """

# Function used to clean the data downloaded from ChEMBL
def clean_DB(df_in, NP_model, NP_cutoff):
    '''Function to clean a ChEMBL DB'''
    
    df=df_in.copy()
    isomeric_smiles = []
    np_scores = []
    inchikey = []

    # Drop rows without Smiles or activity value
    #df.dropna(subset=['smiles', 'standard_value'], inplace=True)

    # Drop row with a Data Validity Comment "Outside typical range"
    #df.drop(df[df['data_validity_comment'] == "Outside typical range"].index, inplace=True)
    #df.drop(['data_validity_comment', 'relation', 'units', 'value', 'activity_comment', 'type'],axis =1, inplace=True)
    
    # Drop rows with an invalid smiles, replace smiles with canonical smiles and add a columns with smiles without stereo-isomers
    mols = []
    
    for i, row in df.iterrows():
        mol = AllChem.MolFromSmiles(row["canonical_smiles"])
        if mol is not None:
            iso_smiles = AllChem.MolToSmiles(mol, isomericSmiles=True)
            isomeric_smiles.append(iso_smiles)
            inchikey.append(AllChem.MolToInchiKey(mol))
            np_scores.append(npscorer.scoreMol(mol, NP_model))
            row["canonical_smiles"] = AllChem.MolToSmiles(mol)
        else:
            df.drop(i, inplace=True)

    # l=[i for i in range(len(mols)) if mols[i] == None]
    # df.drop(df.index[l], inplace=True)

    # for _, row in df.iterrows():
    #     mol = AllChem.MolFromSmiles(row["canonical_smiles"])
    #     iso_smiles = AllChem.MolToSmiles(mol, isomericSmiles=True)
    #     isomeric_smiles.append(iso_smiles)
    #     inchikey.append(AllChem.MolToInchiKey(mol))

    df['isomeric_smiles'] = isomeric_smiles
    df['inchikey'] = inchikey
    df['np_score'] = np_scores
    df['short_inchikey'] = df['inchikey'].str[:14]
    df["document_journal"].replace({None: "Unknown journal"}, inplace=True)
    df = df[(df['np_score'] > NP_cutoff) | (df['document_journal'] == 'J Nat Prod')]
    
    return df


def get_all_ik(url):
    query = '''
            PREFIX wdt: <http://www.wikidata.org/prop/direct/>
            SELECT ?ik ?wd ?isomeric_smiles
            WHERE{
                ?wd wdt:P235 ?ik .
                optional { ?wd wdt:P2017 ?isomeric_smiles } 
            }
            '''
    try:
        r = requests.get(url, params={'format': 'json', 'query': query})
        r.raise_for_status()  # Raise an exception for HTTP errors
        data = r.json()
        results = pd.DataFrame.from_dict(data).get('results', {}).get('bindings', [])
        df = json_normalize(results)
        df.rename(columns={'wd.value':'wikidata_id', 'ik.value':'inchikey', 'isomeric_smiles.value': 'isomeric_smiles'}, inplace=True)
        return df[['wikidata_id', 'inchikey', 'isomeric_smiles']]
    except (JSONDecodeError, requests.RequestException) as e:
        print(f"An error occurred: {e}")
        return pd.DataFrame()  # Return an empty DataFrame in case of errors


In [45]:
# Download selected activities
activities = new_client.activity
activities = activities.filter(target_chembl_id = args.target_id)
    
res = activities.filter(standard_value__isnull=False) # Keep only compounds with an activity value
NP_model = npscorer.readNPModel()

#res = activities.filter(standard_type__iexact = 'IC50')
res = res.only(['activity_comment', 'molecule_chembl_id', 'canonical_smiles', 'standard_relation', 'target_chembl_id',
            'standard_type', 'target_pref_name', 'standard_units', 'standard_value', 'data_validity_comment', 'document_journal',
            'assay_chembl_id', 'document_chembl_id'
             ])

print('Fetching results from ChEMBL: this step can be long. Have a coffee ;)')
res_df = pd.DataFrame.from_dict(res)
print('Fetching results from ChEMBL: Done!')
#res_df.head()
df_clean = clean_DB(res_df, NP_model, int(args.NPlike_score))
df_clean.head()

reading NP model ...
model in


Fetching results from ChEMBL: this step can be long. Have a coffee ;)
Fetching results from ChEMBL: Done!


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["document_journal"].replace({None: "Unknown journal"}, inplace=True)


Unnamed: 0,activity_comment,assay_chembl_id,canonical_smiles,data_validity_comment,document_chembl_id,document_journal,molecule_chembl_id,relation,standard_relation,standard_type,...,standard_value,target_chembl_id,target_pref_name,type,units,value,isomeric_smiles,inchikey,np_score,short_inchikey
0,,CHEMBL754647,CS/C(S)=N/CCC[C@H](N)C(=O)O,,CHEMBL1132725,Bioorg Med Chem Lett,CHEMBL430460,=,=,Inhibition,...,54.0,CHEMBL3464,"Nitric oxide synthase, inducible",Inhibition,%,54.0,CS/C(S)=N/CCC[C@H](N)C(=O)O,LJFMFRCGTLDGMK-YFKPBYRVSA-N,0.876094,LJFMFRCGTLDGMK
1,,CHEMBL754647,CSC(=NCCC[C@H](N)C(=O)O)SC,,CHEMBL1132725,Bioorg Med Chem Lett,CHEMBL328880,=,=,Inhibition,...,56.0,CHEMBL3464,"Nitric oxide synthase, inducible",Inhibition,%,56.0,CSC(=NCCC[C@H](N)C(=O)O)SC,CGQKJLSCMAFLMT-LURJTMIESA-N,0.911856,CGQKJLSCMAFLMT
2,,CHEMBL754647,N/C(S)=N/CCC[C@H](N)C(=O)O,,CHEMBL1132725,Bioorg Med Chem Lett,CHEMBL93247,>,>,Inhibition,...,99.0,CHEMBL3464,"Nitric oxide synthase, inducible",Inhibition,%,99.0,N/C(S)=N/CCC[C@H](N)C(=O)O,BKGWACHYAMTLAF-BYPYZUCNSA-N,0.994133,BKGWACHYAMTLAF
3,,CHEMBL754798,N/C(S)=N/CCC[C@H](N)C(=O)O,,CHEMBL1132725,Bioorg Med Chem Lett,CHEMBL93247,=,=,Ki,...,3600.0,CHEMBL3464,"Nitric oxide synthase, inducible",Ki,uM,3.6,N/C(S)=N/CCC[C@H](N)C(=O)O,BKGWACHYAMTLAF-BYPYZUCNSA-N,0.994133,BKGWACHYAMTLAF
4,,CHEMBL754647,CCOP(=S)(NCCC[C@H](N)C(=O)O)OCC,,CHEMBL1132725,Bioorg Med Chem Lett,CHEMBL92448,<,<,Inhibition,...,10.0,CHEMBL3464,"Nitric oxide synthase, inducible",Inhibition,%,10.0,CCOP(=S)(NCCC[C@H](N)C(=O)O)OCC,MWJZYTFUZYRCMY-QMMMGPOBSA-N,0.512122,MWJZYTFUZYRCMY


In [46]:
#recover information from Wikidata 

wd_url = 'https://query.wikidata.org/sparql'
wd_all = get_all_ik(wd_url)
#wd_all.head()
wd_filtred = wd_all[wd_all['inchikey'].isin(list(df_clean.inchikey))]

#merge with the clean df
df_total = df_clean.merge(wd_filtred[['inchikey', 'wikidata_id']], on='inchikey', how='outer')
df_total['wikidata_id'] = df_total['wikidata_id'] .fillna('no_wikidata_match')
df_total.head()

Unnamed: 0,activity_comment,assay_chembl_id,canonical_smiles,data_validity_comment,document_chembl_id,document_journal,molecule_chembl_id,relation,standard_relation,standard_type,...,target_chembl_id,target_pref_name,type,units,value,isomeric_smiles,inchikey,np_score,short_inchikey,wikidata_id
0,,CHEMBL1762844,CCOC(=O)c1c(F)c(F)c2[nH]nc(O)c2c1F,,CHEMBL1759883,Eur J Med Chem,CHEMBL1762748,=,=,Inhibition,...,CHEMBL3051,"Nitric oxide synthase, inducible",INH,%,51.8,CCOC(=O)c1c(F)c(F)c2[nH]nc(O)c2c1F,ADIBHHLBVBRJPW-UHFFFAOYSA-N,-0.840421,ADIBHHLBVBRJPW,no_wikidata_match
1,,CHEMBL960400,CCOC(=O)C1CC(C(=O)c2cc(Cl)ccc2N)=NN1,,CHEMBL1140983,Eur J Med Chem,CHEMBL453185,=,=,Inhibition,...,CHEMBL3051,"Nitric oxide synthase, inducible",INH,%,19.1,CCOC(=O)C1CC(C(=O)c2cc(Cl)ccc2N)=NN1,BBCSPGPZRDCBCF-UHFFFAOYSA-N,-0.679276,BBCSPGPZRDCBCF,no_wikidata_match
2,,CHEMBL696856,N/C(S)=N/CCC[C@H](N)C(=O)O,,CHEMBL1127683,J Med Chem,CHEMBL93247,=,=,Inhibition,...,CHEMBL3051,"Nitric oxide synthase, inducible",Inhibition,%,87.0,N/C(S)=N/CCC[C@H](N)C(=O)O,BKGWACHYAMTLAF-BYPYZUCNSA-N,0.994133,BKGWACHYAMTLAF,http://www.wikidata.org/entity/Q27094813
3,,CHEMBL872049,N/C(S)=N/CCC[C@H](N)C(=O)O,,CHEMBL1127683,J Med Chem,CHEMBL93247,=,=,Inhibition,...,CHEMBL3051,"Nitric oxide synthase, inducible",Inhibition,%,34.0,N/C(S)=N/CCC[C@H](N)C(=O)O,BKGWACHYAMTLAF-BYPYZUCNSA-N,0.994133,BKGWACHYAMTLAF,http://www.wikidata.org/entity/Q27094813
4,,CHEMBL696856,N/C(S)=N/CCC[C@@H](N)C(=O)O,,CHEMBL1127683,J Med Chem,CHEMBL168461,<,<,Inhibition,...,CHEMBL3051,"Nitric oxide synthase, inducible",Inhibition,%,1.0,N/C(S)=N/CCC[C@@H](N)C(=O)O,BKGWACHYAMTLAF-SCSAIBSYSA-N,0.994133,BKGWACHYAMTLAF,no_wikidata_match


In [None]:
# Construct the file path
filename = f"{args.target_id}_np_like_min_{args.NPlike_score}.csv"
path_to_file = os.path.join(output_folder, filename)

# Save the DataFrame to CSV
df_total.to_csv(path_to_file, index=False)

# Print a message indicating where the file was saved
print(f"Finished. Results are in: {path_to_file}")

Finished. Results are in: C:/Users/quirosgu/Documents/GitHub/Yggdrasil/CHEMBL3051_np_like_min_-1.csv
