In [87]:
import pandas as pd
import argparse
import textwrap
from chembl_webresource_client.new_client import new_client
from rdkit.Chem import AllChem
import os
from pandas import json_normalize
import requests
from json import JSONDecodeError
from rdkit.Chem import RDConfig
import os
import sys
sys.path.append(os.path.join(RDConfig.RDContribDir, 'NP_Score'))
import npscorer
from argparse import Namespace

In [88]:
# Define the output folder path
output_folder = 'C:/Users/quirosgu/Documents/GitHub/Yggdrasil/chembl/'  # Replace 'chembl_custom_folder' with your desired folder name

# Define the arguments
args_dict = {
    "target_id": "CHEMBL3051",  # Replace with your default target ID
    "NPlike_score": -1  # Replace with your default NPlike_score
}

# Convert the arguments to Namespace object
args = Namespace(**args_dict)

In [83]:
""" Functions """

# Function used to clean the data downloaded from ChEMBL
def clean_DB(df_in, NP_model, NP_cutoff):
    '''Function to clean a ChEMBL DB'''
    
    df=df_in.copy()
    isomeric_smiles = []
    np_scores = []
    inchikey = []

    # Drop rows without Smiles or activity value
    #df.dropna(subset=['smiles', 'standard_value'], inplace=True)

    # Drop row with a Data Validity Comment "Outside typical range"
    #df.drop(df[df['data_validity_comment'] == "Outside typical range"].index, inplace=True)
    #df.drop(['data_validity_comment', 'relation', 'units', 'value', 'activity_comment', 'type'],axis =1, inplace=True)
    
    # Drop rows with an invalid smiles, replace smiles with canonical smiles and add a columns with smiles without stereo-isomers
    mols = []
    
    for i, row in df.iterrows():
        mol = AllChem.MolFromSmiles(row["canonical_smiles"])
        if mol is not None:
            iso_smiles = AllChem.MolToSmiles(mol, isomericSmiles=True)
            isomeric_smiles.append(iso_smiles)
            inchikey.append(AllChem.MolToInchiKey(mol))
            np_scores.append(npscorer.scoreMol(mol, NP_model))
            row["canonical_smiles"] = AllChem.MolToSmiles(mol)
        else:
            df.drop(i, inplace=True)

    # l=[i for i in range(len(mols)) if mols[i] == None]
    # df.drop(df.index[l], inplace=True)

    # for _, row in df.iterrows():
    #     mol = AllChem.MolFromSmiles(row["canonical_smiles"])
    #     iso_smiles = AllChem.MolToSmiles(mol, isomericSmiles=True)
    #     isomeric_smiles.append(iso_smiles)
    #     inchikey.append(AllChem.MolToInchiKey(mol))

    df['isomeric_smiles'] = isomeric_smiles
    df['inchikey'] = inchikey
    df['np_score'] = np_scores
    # Convert 'inchikey' column to string type
    df['inchikey'] = df['inchikey'].astype(str)

    # Now you can safely use the .str accessor on the 'inchikey' column
    df['short_inchikey'] = df['inchikey'].str[:14]
    df["document_journal"].replace({None: "Unknown journal"}, inplace=True)
    df = df[(df['np_score'] > NP_cutoff) | (df['document_journal'] == 'J Nat Prod')]
    
    return df


def get_all_ik(url):
    query = '''
            PREFIX wdt: <http://www.wikidata.org/prop/direct/>
            SELECT ?ik ?wd ?isomeric_smiles
            WHERE{
                ?wd wdt:P235 ?ik .
                optional { ?wd wdt:P2017 ?isomeric_smiles } 
            }
            '''
    try:
        r = requests.get(url, params={'format': 'json', 'query': query})
        r.raise_for_status()  # Raise an exception for HTTP errors
        data = r.json()
        results = pd.DataFrame.from_dict(data).get('results', {}).get('bindings', [])
        df = json_normalize(results)
        df.rename(columns={'wd.value':'wikidata_id', 'ik.value':'inchikey', 'isomeric_smiles.value': 'isomeric_smiles'}, inplace=True)
        return df[['wikidata_id', 'inchikey', 'isomeric_smiles']]
    except (JSONDecodeError, requests.RequestException) as e:
        print(f"An error occurred: {e}")
        return pd.DataFrame()  # Return an empty DataFrame in case of errors


In [84]:
# Download selected activities
activities = new_client.activity
activities = activities.filter(target_chembl_id = args.target_id)
    
res = activities.filter(standard_value__isnull=False) # Keep only compounds with an activity value
NP_model = npscorer.readNPModel()

#res = activities.filter(standard_type__iexact = 'IC50')
res = res.only(['activity_comment', 'molecule_chembl_id', 'canonical_smiles', 'standard_relation', 'target_chembl_id',
            'standard_type', 'target_pref_name', 'standard_units', 'standard_value', 'data_validity_comment', 'document_journal',
            'assay_chembl_id', 'document_chembl_id'
             ])

print('Fetching results from ChEMBL: this step can be long. Have a coffee ;)')
res_df = pd.DataFrame.from_dict(res)
print('Fetching results from ChEMBL: Done!')
#res_df.head()
df_clean = clean_DB(res_df, NP_model, int(args.NPlike_score))
df_clean.head()

reading NP model ...
model in


Fetching results from ChEMBL: this step can be long. Have a coffee ;)
Fetching results from ChEMBL: Done!


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["document_journal"].replace({None: "Unknown journal"}, inplace=True)


Unnamed: 0,activity_comment,assay_chembl_id,canonical_smiles,data_validity_comment,document_chembl_id,document_journal,molecule_chembl_id,relation,standard_relation,standard_type,...,standard_value,target_chembl_id,target_pref_name,type,units,value,isomeric_smiles,inchikey,np_score,short_inchikey
0,,CHEMBL696856,CN(CCC[C@H](N)C(=O)O)C(=N)N,,CHEMBL1127683,J Med Chem,CHEMBL99636,=,=,Inhibition,...,89.0,CHEMBL3051,"Nitric oxide synthase, inducible",Inhibition,%,89.0,CN(CCC[C@H](N)C(=O)O)C(=N)N,XKCWNEVAXQCMGP-YFKPBYRVSA-N,0.785323,XKCWNEVAXQCMGP
1,,CHEMBL872049,CN(CCC[C@H](N)C(=O)O)C(=N)N,,CHEMBL1127683,J Med Chem,CHEMBL99636,=,=,Inhibition,...,33.0,CHEMBL3051,"Nitric oxide synthase, inducible",Inhibition,%,33.0,CN(CCC[C@H](N)C(=O)O)C(=N)N,XKCWNEVAXQCMGP-YFKPBYRVSA-N,0.785323,XKCWNEVAXQCMGP
2,,CHEMBL696856,N/C(S)=N/CCC[C@@H](N)C(=O)O,,CHEMBL1127683,J Med Chem,CHEMBL168461,<,<,Inhibition,...,1.0,CHEMBL3051,"Nitric oxide synthase, inducible",Inhibition,%,1.0,N/C(S)=N/CCC[C@@H](N)C(=O)O,BKGWACHYAMTLAF-SCSAIBSYSA-N,0.994133,BKGWACHYAMTLAF
3,,CHEMBL872049,CSC(=N)NCCC[C@@H](N)C(=O)O,,CHEMBL1127683,J Med Chem,CHEMBL1161525,<,<,Inhibition,...,1.0,CHEMBL3051,"Nitric oxide synthase, inducible",Inhibition,%,1.0,CSC(=N)NCCC[C@@H](N)C(=O)O,NGVMVBQRKZPFLB-RXMQYKEDSA-N,0.492889,NGVMVBQRKZPFLB
4,,CHEMBL696856,N/C(S)=N/CCCC[C@H](N)C(=O)O,,CHEMBL1127683,J Med Chem,CHEMBL171397,=,=,Inhibition,...,75.0,CHEMBL3051,"Nitric oxide synthase, inducible",Inhibition,%,75.0,N/C(S)=N/CCCC[C@H](N)C(=O)O,NTDQCCFHWYKFAW-YFKPBYRVSA-N,0.929818,NTDQCCFHWYKFAW


In [85]:
#recover information from Wikidata 

wd_url = 'https://query.wikidata.org/sparql'
wd_all = get_all_ik(wd_url)
#wd_all.head()
wd_filtred = wd_all[wd_all['inchikey'].isin(list(df_clean.inchikey))]

#merge with the clean df
df_total = df_clean.merge(wd_filtred[['inchikey', 'wikidata_id']], on='inchikey', how='outer')
df_total['wikidata_id'] = df_total['wikidata_id'] .fillna('no_wikidata_match')
df_total.head()

KeyboardInterrupt: 

In [None]:
# Construct the file path
filename = f"{args.target_id}_np_like_min_{args.NPlike_score}.csv"
path_to_file = os.path.join(output_folder, filename)

# Save the DataFrame to CSV
df_total.to_csv(path_to_file, index=False)

# Print a message indicating where the file was saved
print(f"Finished. Results are in: {path_to_file}")

Finished. Results are in: C:/Users/quirosgu/Documents/GitHub/Yggdrasil/CHEMBL2111350_np_like_min_-1.csv


MERGE INFORMATION if there were more than one CHEMBLcard


In [86]:
import os
import pandas as pd

# Define the output folder containing multiple CSV files
output_folder = 'C:/Users/quirosgu/Documents/GitHub/Yggdrasil/chembl/'

# Initialize an empty DataFrame to store concatenated data
all_data = pd.DataFrame()

# Iterate over each CSV file in the output folder
for file in os.listdir(output_folder):
    if file.endswith('.csv'):
        # Read the CSV file into a DataFrame
        file_path = os.path.join(output_folder, file)
        df = pd.read_csv(file_path)
        
        # Concatenate the DataFrame with the existing data
        all_data = pd.concat([all_data, df], ignore_index=True)

# Group the DataFrame by 'inchikey' and aggregate the values in other columns
grouped_data = all_data.groupby('inchikey').agg('first').reset_index()

# Save the aggregated DataFrame to a new CSV file
output_csv_path = os.path.join(output_folder, 'merged_data.csv')
grouped_data.to_csv(output_csv_path, index=False)

print(f"Merged data saved to: {output_csv_path}")


Merged data saved to: C:/Users/quirosgu/Documents/GitHub/Yggdrasil/chembl/merged_data.csv


In [90]:
# Load merged_data.csv containing active compounds
merged_data = pd.read_csv('C:/Users/quirosgu/Documents/GitHub/Yggdrasil/chembl/merged_data.csv')

# Initialize an empty DataFrame to store taxonomical data
taxonomical_data = pd.DataFrame()

# Specify the folder containing TSV files with reported compounds for each species
tsv_folder = 'C:/Users/quirosgu/Desktop/Inmuno/output_data/species_data/'

# Columns to select from the merged_data DataFrame
merged_columns = ['inchikey']  # Add other columns from merged_data if needed

# Columns to select from the TSV files
tsv_columns = ['structure_inchikey', 'structure_wikidata', 'structure_inchi', 
               'structure_molecular_formula', 'structure_exact_mass', 'structure_smiles_2D',
               'structure_nameIupac', 'structure_nameTraditional', 
               'structure_taxonomy_npclassifier_01pathway', 'structure_taxonomy_npclassifier_02superclass',
               'structure_taxonomy_npclassifier_03class', 'organism_wikidata', 
               'organism_taxonomy_ottid', 'organism_taxonomy_06family', 
               'organism_taxonomy_08genus', 'organism_taxonomy_09species', 
               'reference_wikidata', 'reference_doi', 'chemical_superclass']

# Iterate through each TSV file in the folder
for filename in os.listdir(tsv_folder):
    if filename.endswith('.tsv'):
        # Read the TSV file into a DataFrame
        tsv_filepath = os.path.join(tsv_folder, filename)
        species_data = pd.read_csv(tsv_filepath, sep='\t')
        
        # Filter the data to include only active compounds
        active_compounds_data = species_data[species_data['structure_inchikey'].isin(merged_data['inchikey'])]
        
        # Select desired columns from both DataFrames
        merged_selected = merged_data[merged_columns]
        tsv_selected = active_compounds_data[tsv_columns]
        
        # Concatenate the selected columns
        selected_data = pd.concat([merged_selected, tsv_selected], axis=1)
        
        # Append the selected data to the taxonomical_data DataFrame
        taxonomical_data = pd.concat([taxonomical_data, selected_data], ignore_index=True)

# Remove duplicate rows based on taxonomical information
taxonomical_data.drop_duplicates(subset=['organism_wikidata'], inplace=True)

# Save the taxonomical data to a CSV file
taxonomical_data.to_csv('C:/Users/quirosgu/Documents/GitHub/Yggdrasil/chembl/species_with_active_compounds.csv', index=False)

print("Taxonomical data saved to species_taxonomical_data.csv")


  taxonomical_data = pd.concat([taxonomical_data, selected_data], ignore_index=True)
  taxonomical_data = pd.concat([taxonomical_data, selected_data], ignore_index=True)
  taxonomical_data = pd.concat([taxonomical_data, selected_data], ignore_index=True)
  taxonomical_data = pd.concat([taxonomical_data, selected_data], ignore_index=True)
  taxonomical_data = pd.concat([taxonomical_data, selected_data], ignore_index=True)
  taxonomical_data = pd.concat([taxonomical_data, selected_data], ignore_index=True)
  taxonomical_data = pd.concat([taxonomical_data, selected_data], ignore_index=True)
  taxonomical_data = pd.concat([taxonomical_data, selected_data], ignore_index=True)
  taxonomical_data = pd.concat([taxonomical_data, selected_data], ignore_index=True)
  taxonomical_data = pd.concat([taxonomical_data, selected_data], ignore_index=True)
  taxonomical_data = pd.concat([taxonomical_data, selected_data], ignore_index=True)
  taxonomical_data = pd.concat([taxonomical_data, selected_data],

Taxonomical data saved to species_taxonomical_data.csv
