In [1]:
import pandas as pd

from curate.chem import chembl_extraction

#### Disabling warnings
Python warnings are suppressed to keep the notebook output readable.
RDKit's messages are not covered by the notebook-level filter, so they are switched off separately through RDKit's own logging functions.

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
from rdkit import RDLogger
from rdkit.rdBase import DisableLog

# RDKit messages are not affected by the `warnings` filter, so they are turned
# off through RDKit's own logging function. The 'rdApp.*' wildcard addresses
# all rdApp channels in one call, which avoids iterating over the private
# `RDLogger._levels` list.
DisableLog('rdApp.*')

## Step 1: use a list of ChEMBL IDs to extract data from ChEMBL

In [4]:
# Load the spreadsheet that lists the ChEMBL IDs to query. The path is relative,
# so the file must be in the notebook's working directory.
chembl_list = pd.read_excel('chemtest_complex.xlsx')

In [5]:
# Display the loaded table to check its columns before querying ChEMBL.
chembl_list

Unnamed: 0,chembl_id,dummy1,dummy2
0,CHEMBL230,dc11,dc12
1,CHEMBL25,dc21,dc22
2,CHEMBL4523582,dc31,dc32
3,CHEMBL3616356,dc41,dc42


In [6]:
# Retrieve the data for the IDs in `chembl_list`; judging by the helper's name
# the per-ID results are concatenated into one dataframe.
# NOTE(review): the second return value (`warning`) is never inspected in this
# notebook — confirm whether it should be displayed or checked.
concatenated_chembl_target_compounds, warning = chembl_extraction.concatenate_dataframes_from_different_chembl_ids(chembl_list)

In [7]:
# Preview the first rows of the extracted data.
concatenated_chembl_target_compounds.head()

Unnamed: 0,assay_description,molecule_chembl_id,molecule_pref_name,canonical_smiles,pchembl_value,standard_type,standard_relation,standard_value,standard_units,target_pref_name,target_organism,chembl_id
0,Inhibition of PGE-2 production by arachidonic ...,CHEMBL91832,,CC1(C)C(=O)C(c2ccc(F)cc2)=C1c1ccc(S(C)(=O)=O)cc1,8.3,IC50,=,5.0,nM,Cyclooxygenase-2,Homo sapiens,CHEMBL230
1,Inhibition of PGE-2 production by arachidonic ...,CHEMBL91118,,C=C1CC(c2ccc(S(C)(=O)=O)cc2)=C1c1ccccc1,8.92,IC50,=,1.2,nM,Cyclooxygenase-2,Homo sapiens,CHEMBL230
2,Inhibition of PGE-2 production by arachidonic ...,CHEMBL92443,,C=C1C(c2ccccc2)=C(c2ccc(S(C)(=O)=O)cc2)C1(C)C,8.66,IC50,=,2.2,nM,Cyclooxygenase-2,Homo sapiens,CHEMBL230
3,Inhibition of PGE-2 production by arachidonic ...,CHEMBL328003,,CS(=O)(=O)c1ccc(C2=C(c3ccccc3)C(=O)C2)cc1,6.96,IC50,=,110.0,nM,Cyclooxygenase-2,Homo sapiens,CHEMBL230
4,Inhibition of PGE-2 production by arachidonic ...,CHEMBL330516,,CC1(C)C(c2ccc(S(C)(=O)=O)cc2)=C(c2ccccc2)/C1=N/O,7.21,IC50,=,61.0,nM,Cyclooxygenase-2,Homo sapiens,CHEMBL230


## Step 2: curate the data structures

In [8]:
def curate_data(dataframe, structure_column, remove_problematic=False) -> pd.DataFrame:
    """
        Curate the structures stored in a SMILES column.

        Every value of ``structure_column`` is passed through
        ``structure_curation.Curator`` and the outcome is stored in two new
        columns: ``structure_curated`` (curated SMILES) and
        ``substance_type_name`` (substance type reported by the curator).

        The input dataframe is not modified; the result is a copy of it.

        :param dataframe: input data, one structure per row
        :param structure_column: name of the column holding the SMILES
        :param remove_problematic: when True, the rows that
            ``remove_problematic_structures`` classifies as problematic
            are dropped from the result

        :return curated_data: dataframe containing the curated information
    """

    from curate.chem import structure_curation as cur
    data_cur = cur.Curator()

    # Results are collected by position rather than written back by index
    # label, so rows that share an index label each keep their own result
    # and the whole pass stays linear in the number of rows.
    curated_smiles = []
    substance_types = []
    for smi in dataframe[structure_column]:
        data_cur.get_rdkit_mol(smi)
        sub_type, san_smi = data_cur.filter_smiles()
        curated_smiles.append(san_smi)
        substance_types.append(sub_type)

    # Work on a copy so the caller's dataframe is left as it was. Assigning
    # whole columns also guarantees they exist when the input has no rows.
    curated_data = dataframe.copy()
    curated_data['structure_curated'] = curated_smiles
    curated_data['substance_type_name'] = substance_types

    if remove_problematic:
        # Only the cleaned half is needed here; the rejected rows are dropped.
        curated_data, _ = remove_problematic_structures(curated_data)

    return curated_data

def remove_problematic_structures(dataframe) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """
        Split a curated dataset into usable and problematic structures.

        Rows are classified by their ``substance_type_name`` value: a row is
        problematic when that value is one of the types listed in
        ``problem_struc_list`` below. Both returned dataframes are independent
        copies, so columns can be added to them without affecting
        ``dataframe`` and without pandas chained-assignment warnings.

        :param dataframe: input data to be cleaned; must contain a
            ``substance_type_name`` column

        :return curated_data: data without problematic structures
        :return problematic_structures: data with the problematic structures
    """

    problem_struc_list =  ['organometallic', 'no_sanitizable', 'inorganic_salt', 
                          'inorganic', 'inorganic_metal', 'no_sanitizable_organic',
                          'no_sanitizable_inorganic', 'no_sanitizable_organometallic']

    # Evaluate the membership test once and reuse it for both halves.
    is_problematic = dataframe['substance_type_name'].isin(problem_struc_list)

    curated_data = dataframe.loc[~is_problematic].copy()
    problematic_structures = dataframe.loc[is_problematic].copy()

    return curated_data, problematic_structures

In [9]:
# Curate the 'canonical_smiles' column of the extracted data; with
# remove_problematic=True the rows classified as problematic are dropped.
curdata = curate_data(concatenated_chembl_target_compounds, 'canonical_smiles',remove_problematic=True)

## Step 3: split the dataframe by the initial ChEMBL IDs and write one SDF file for each

In [10]:
from curate.util import utils
from rdkit import Chem
from rdkit.Chem import Descriptors

In [11]:
# Parse every curated SMILES once, then build the descriptor columns in a
# single pass. Assigning whole columns (instead of writing row by row through
# an index-label mask) is linear in the number of rows, gives each row its own
# values even when index labels repeat, and creates the columns even when
# `curdata` is empty. `assign` returns a new dataframe, so nothing is written
# into the filtered frame returned by `curate_data`.
mols = [Chem.MolFromSmiles(smiles) for smiles in curdata['structure_curated']]

curdata = curdata.assign(
    logP=[Descriptors.MolLogP(m) for m in mols],
    MW=[Descriptors.MolWt(m) for m in mols],
    # Cast to float so the column keeps the floating-point dtype that the
    # previous row-by-row assignment produced (presumably relevant for how
    # the values are written to the SDF files — TODO confirm).
    heavy_atoms=[float(Descriptors.HeavyAtomCount(m)) for m in mols],
)

In [12]:
# Distinct ChEMBL IDs present in the curated data, in order of first appearance.
chembl_ids = curdata['chembl_id'].unique()

In [13]:
# Molecular-weight window (100 < MW <= 500); it does not depend on the ID, so
# it is evaluated once before the loop.
mw_in_range = (curdata['MW'] > 100) & (curdata['MW'] <= 500)

for chembl_id in chembl_ids:
    # Rows belonging to the current ID that fall inside the MW window.
    df_ = curdata[curdata['chembl_id'].eq(chembl_id) & mw_in_range]
    # Total number of curated rows vs. rows selected for this ID.
    print(len(curdata), len(df_))
    utils.write_sdf(
        data=df_,
        outfile_name=chembl_id,
        smiles_column='structure_curated',
        identifier='molecule_chembl_id',
    )

2524 2185
2524 98
2524 2
