In [1]:
import pandas as pd

import curate as cur

from curate.chem import chembl_extraction

## STEP 1: use list of ChEMBL IDs to extract data from ChEMBL

In [2]:
chembl_list = pd.read_excel('chemtest_complex.xlsx')

In [3]:
chembl_list

Unnamed: 0,chembl_id,dummy1,dummy2
0,CHEMBL230,dc11,dc12
1,CHEMBL25,dc21,dc22
2,CHEMBL4523582,dc31,dc32
3,CHEMBL3616356,dc41,dc42


In [4]:
concatenated_chembl_target_compounds, warning = chembl_extraction.concatenate_dataframes_from_different_chembl_ids(chembl_list)

In [5]:
concatenated_chembl_target_compounds.head()

Unnamed: 0,assay_description,molecule_chembl_id,molecule_pref_name,canonical_smiles,pchembl_value,standard_type,standard_relation,standard_value,standard_units,target_pref_name,target_organism,chembl_id
0,Inhibition of PGE-2 production by arachidonic ...,CHEMBL91832,,CC1(C)C(=O)C(c2ccc(F)cc2)=C1c1ccc(S(C)(=O)=O)cc1,8.3,IC50,=,5.0,nM,Cyclooxygenase-2,Homo sapiens,CHEMBL230
1,Inhibition of PGE-2 production by arachidonic ...,CHEMBL91118,,C=C1CC(c2ccc(S(C)(=O)=O)cc2)=C1c1ccccc1,8.92,IC50,=,1.2,nM,Cyclooxygenase-2,Homo sapiens,CHEMBL230
2,Inhibition of PGE-2 production by arachidonic ...,CHEMBL92443,,C=C1C(c2ccccc2)=C(c2ccc(S(C)(=O)=O)cc2)C1(C)C,8.66,IC50,=,2.2,nM,Cyclooxygenase-2,Homo sapiens,CHEMBL230
3,Inhibition of PGE-2 production by arachidonic ...,CHEMBL328003,,CS(=O)(=O)c1ccc(C2=C(c3ccccc3)C(=O)C2)cc1,6.96,IC50,=,110.0,nM,Cyclooxygenase-2,Homo sapiens,CHEMBL230
4,Inhibition of PGE-2 production by arachidonic ...,CHEMBL330516,,CC1(C)C(c2ccc(S(C)(=O)=O)cc2)=C(c2ccccc2)/C1=N/O,7.21,IC50,=,61.0,nM,Cyclooxygenase-2,Homo sapiens,CHEMBL230


## Step 2: curate the data structures

In [6]:
def curate_data(dataframe, structure_column, remove_problematic=False) -> pd.DataFrame:
    """
        Check SMILES column to get a curated SMILES and the type of substance.

        :return curated_data: dataframe containing the curated information
    """

    from curate.chem import structure_curation as cur
    data_cur = cur.Curator()

    curated_data = dataframe

    for i, row in curated_data.iterrows():
        smi = row[structure_column]
        data_cur.get_rdkit_mol(smi)
        sub_type, san_smi = data_cur.filter_smiles()
        curated_data.loc[curated_data.index == i,'structure_curated'] = san_smi
        curated_data.loc[curated_data.index == i,'substance_type_name'] = sub_type

    if remove_problematic:
        curated_data, problematic_structures = remove_problematic_structures(curated_data)
    
    return curated_data

def remove_problematic_structures(dataframe) -> pd.DataFrame:
    """
        Remove problematic structures from main dataset.
        Returns cleaned dataset and problematic structures a part.

        :param data: input data to be cleaned

        :return curated_data: data without problematic structures
        :return problematic_structures: data with the problematic structures
    """

    problem_struc_list =  ['organometallic', 'no_sanitizable', 'inorganic_salt', 
                          'inorganic', 'inorganic_metal', 'no_sanitizable_organic',
                          'no_sanitizable_inorganic', 'no_sanitizable_organometallic']

    curated_data = dataframe.loc[~dataframe['substance_type_name'].isin(problem_struc_list)]
    problematic_structures = dataframe.loc[dataframe['substance_type_name'].isin(problem_struc_list)]

    return curated_data, problematic_structures

In [7]:
curdata = curate_data(concatenated_chembl_target_compounds, 'canonical_smiles',remove_problematic=True)

RDKit INFO: [16:04:40] Initializing Normalizer
RDKit INFO: [16:04:40] Running Normalizer
RDKit INFO: [16:04:40] Running Uncharger
RDKit INFO: [16:04:40] Running Normalizer
RDKit INFO: [16:04:40] Running Uncharger
RDKit INFO: [16:04:40] Running Normalizer
RDKit INFO: [16:04:40] Running Uncharger
RDKit INFO: [16:04:40] Running Normalizer
RDKit INFO: [16:04:40] Running Uncharger
RDKit INFO: [16:04:40] Running Normalizer
RDKit INFO: [16:04:40] Running Uncharger
RDKit INFO: [16:04:40] Running Normalizer
RDKit INFO: [16:04:40] Running Uncharger
RDKit INFO: [16:04:40] Running Normalizer
RDKit INFO: [16:04:40] Running Uncharger
RDKit INFO: [16:04:40] Running Normalizer
RDKit INFO: [16:04:40] Running Uncharger
RDKit INFO: [16:04:40] Running Normalizer
RDKit INFO: [16:04:40] Running Uncharger
RDKit INFO: [16:04:40] Running Normalizer
RDKit INFO: [16:04:40] Running Uncharger
RDKit INFO: [16:04:40] Running Normalizer
RDKit INFO: [16:04:40] Running Uncharger
RDKit INFO: [16:04:40] Running Normalize

## Step 3: split dataframe in the initial ChEMBL IDs and create one sdf for each

In [13]:
from curate.util import utils

In [10]:
chembl_ids = curdata.chembl_id.unique()

In [15]:
curdata.head()

Unnamed: 0,assay_description,molecule_chembl_id,molecule_pref_name,canonical_smiles,pchembl_value,standard_type,standard_relation,standard_value,standard_units,target_pref_name,target_organism,chembl_id,structure_curated,substance_type_name
0,Inhibition of PGE-2 production by arachidonic ...,CHEMBL91832,,CC1(C)C(=O)C(c2ccc(F)cc2)=C1c1ccc(S(C)(=O)=O)cc1,8.3,IC50,=,5.0,nM,Cyclooxygenase-2,Homo sapiens,CHEMBL230,CC1(C)C(=O)C(c2ccc(F)cc2)=C1c1ccc(S(C)(=O)=O)cc1,organic
1,Inhibition of PGE-2 production by arachidonic ...,CHEMBL91118,,C=C1CC(c2ccc(S(C)(=O)=O)cc2)=C1c1ccccc1,8.92,IC50,=,1.2,nM,Cyclooxygenase-2,Homo sapiens,CHEMBL230,C=C1CC(c2ccc(S(C)(=O)=O)cc2)=C1c1ccccc1,organic
2,Inhibition of PGE-2 production by arachidonic ...,CHEMBL92443,,C=C1C(c2ccccc2)=C(c2ccc(S(C)(=O)=O)cc2)C1(C)C,8.66,IC50,=,2.2,nM,Cyclooxygenase-2,Homo sapiens,CHEMBL230,C=C1C(c2ccccc2)=C(c2ccc(S(C)(=O)=O)cc2)C1(C)C,organic
3,Inhibition of PGE-2 production by arachidonic ...,CHEMBL328003,,CS(=O)(=O)c1ccc(C2=C(c3ccccc3)C(=O)C2)cc1,6.96,IC50,=,110.0,nM,Cyclooxygenase-2,Homo sapiens,CHEMBL230,CS(=O)(=O)c1ccc(C2=C(c3ccccc3)C(=O)C2)cc1,organic
4,Inhibition of PGE-2 production by arachidonic ...,CHEMBL330516,,CC1(C)C(c2ccc(S(C)(=O)=O)cc2)=C(c2ccccc2)/C1=N/O,7.21,IC50,=,61.0,nM,Cyclooxygenase-2,Homo sapiens,CHEMBL230,CC1(C)C(c2ccc(S(C)(=O)=O)cc2)=C(c2ccccc2)/C1=N/O,organic


In [16]:
for chembl_id in chembl_ids:
    df_ = curdata.loc[curdata['chembl_id'] == chembl_id]
    utils.write_sdf(data=df_, outfile_name=chembl_id, smiles_column='structure_curated', identifier='molecule_chembl_id')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frame[molCol] = frame[smilesCol].map(Chem.MolFromSmiles)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value
