<b>[Author]</b> Eric March Vila<br>
<b>[Organisation]</b> Universitat Pompeu Fabra<br>
<b>[Year]</b> 2022

In [1]:
import pandas as pd

from curate.chem import chembl_extraction

#### Warning disable
Warnings are silenced to keep the cell outputs readable.
RDKit emits its messages through its own logger, so they have to be disabled separately with RDKit's own function rather than with Python's `warnings` module.

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
from rdkit import RDLogger
from rdkit.rdBase import DisableLog

# Silence all RDKit log channels (debug, info, warning and error) in one call.
# 'rdApp.*' is the wildcard spec accepted by DisableLog, so there is no need to
# iterate over the private RDLogger._levels list.
DisableLog('rdApp.*')

## STEP 1: use list of ChEMBL IDs to extract data from ChEMBL

In [4]:
# Load the table of target ChEMBL IDs; the relative path means ED_targets.xlsx
# must be in the notebook's working directory.
chembl_list = pd.read_excel('ED_targets.xlsx')

In [5]:
# Show the loaded target table as the cell output.
chembl_list

Unnamed: 0,chembl_id,name,type,species,activity thresholds
0,CHEMBL1871,androgen receptor,single protein,homo sapiens,Nuclear receptor (<= 100nM)
1,CHEMBL242,estrogen receptor beta,single protein,homo sapiens,Nuclear receptor (<= 100nM)
2,CHEMBL206,estrogen receptor alfa,single protein,homo sapiens,Nuclear receptor (<= 100nM)
3,CHEMBL1860,thyroid receptor alpha,single protein,homo sapiens,Nuclear receptor (<= 100nM)
4,CHEMBL1947,thyroid receptor beta 1,single protein,homo sapiens,Nuclear receptor (<= 100nM)
5,CHEMBL2034,glucocorticoid receptor,single protein,homo sapiens,Nuclear receptor (<= 100nM)


In [None]:
# Build a single dataframe from the targets listed in chembl_list.
# NOTE(review): presumably this queries ChEMBL once per chembl_id — confirm in curate.chem.chembl_extraction.
# NOTE(review): the second return value (`warning`) is not used in the cells shown here — confirm whether it should be inspected.
concatenated_chembl_target_compounds, warning = chembl_extraction.concatenate_dataframes_from_different_chembl_ids(chembl_list)

In [None]:
# Cast pchembl_value to float so it can be compared numerically with the activity threshold.
# The column is replaced with [] rather than .loc[:, ...]: a full-column .loc assignment
# writes into the existing column in place on recent pandas versions, which leaves an
# object-dtype column as object instead of converting it to float64.
concatenated_chembl_target_compounds['pchembl_value'] = concatenated_chembl_target_compounds['pchembl_value'].astype(float)

In [None]:
# Preview the first rows of the combined extraction.
concatenated_chembl_target_compounds.head()

## Step 2: curate the data structures

In [None]:
def curate_data(dataframe, structure_column, remove_problematic=False) -> pd.DataFrame:
    """
        Check SMILES column to get a curated SMILES and the type of substance.

        Every SMILES is passed through curate's Curator and the two values it
        returns are stored in the new columns 'structure_curated' and
        'substance_type_name'.

        The new columns are added to the input dataframe itself (no copy is made).

        :param dataframe: input data with one structure per row
        :param structure_column: name of the column holding the SMILES
        :param remove_problematic: if True, rows whose substance type is listed as
                                   problematic in remove_problematic_structures
                                   are dropped from the returned dataframe

        :return curated_data: dataframe containing the curated information
    """

    from curate.chem import structure_curation as cur
    data_cur = cur.Curator()

    curated_smiles = []
    substance_types = []

    # Collect the results positionally instead of writing them back with
    # .loc[index == i]: index labels are not guaranteed to be unique (e.g. after
    # concatenating several frames), and a label-based write would overwrite
    # every row sharing that label. This is also linear instead of quadratic.
    for smi in dataframe[structure_column]:
        data_cur.get_rdkit_mol(smi)
        sub_type, san_smi = data_cur.filter_smiles()
        curated_smiles.append(san_smi)
        substance_types.append(sub_type)

    # Assigning whole columns also creates them when the input has no rows.
    curated_data = dataframe
    curated_data['structure_curated'] = curated_smiles
    curated_data['substance_type_name'] = substance_types

    if remove_problematic:
        # Only the cleaned frame is needed here; the removed rows are discarded.
        curated_data, _ = remove_problematic_structures(curated_data)

    return curated_data

def remove_problematic_structures(dataframe) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """
        Remove problematic structures from main dataset.
        Returns the cleaned dataset and the problematic structures separately.

        :param dataframe: input data to be cleaned; it must contain a
                          'substance_type_name' column

        :return curated_data: data without problematic structures
        :return problematic_structures: data with the problematic structures
    """

    problem_struc_list =  ['organometallic', 'no_sanitizable', 'inorganic_salt', 
                          'inorganic', 'inorganic_metal', 'no_sanitizable_organic',
                          'no_sanitizable_inorganic', 'no_sanitizable_organometallic']

    # Compute the membership mask once and reuse it for both halves of the split.
    is_problematic = dataframe['substance_type_name'].isin(problem_struc_list)

    # Return independent copies rather than slices of the input, so callers can
    # add or modify columns on the results without SettingWithCopy issues.
    curated_data = dataframe.loc[~is_problematic].copy()
    problematic_structures = dataframe.loc[is_problematic].copy()

    return curated_data, problematic_structures

In [None]:
# Curate the SMILES stored in 'canonical_smiles' and drop the rows whose
# substance type is listed as problematic.
curdata = curate_data(concatenated_chembl_target_compounds, 'canonical_smiles',remove_problematic=True)

## Step 3: split the dataframe by the initial ChEMBL IDs and create one SDF file for each

In [None]:
from curate.util import utils
from rdkit import Chem
from rdkit.Chem import Descriptors

In [None]:
# Compute logP, molecular weight and heavy-atom count for every curated structure.
# Values are collected positionally and attached as whole columns: writing them
# back with .loc[index == i] is quadratic and overwrites every row that shares an
# index label when labels are duplicated.
logp_values = []
mw_values = []
heavy_atom_counts = []

for smiles in curdata['structure_curated']:
    # MolFromSmiles returns None for a SMILES it cannot parse; a non-string
    # entry (e.g. a missing value) is treated the same way.
    m = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if m is None:
        # Record missing descriptors instead of crashing; NaN rows fail the
        # MW comparisons later on and are therefore filtered out.
        logp_values.append(float('nan'))
        mw_values.append(float('nan'))
        heavy_atom_counts.append(float('nan'))
        continue
    logp_values.append(Descriptors.MolLogP(m))
    mw_values.append(Descriptors.MolWt(m))
    # Stored as float so the column can also hold NaN for unparsable rows.
    heavy_atom_counts.append(float(Descriptors.HeavyAtomCount(m)))

# assign() builds a new frame, so nothing is written onto a slice of another
# dataframe and re-running the cell gives the same result.
curdata = curdata.assign(logP=logp_values, MW=mw_values, heavy_atoms=heavy_atom_counts)

In [None]:
# Distinct target identifiers present in the curated data, in order of appearance.
chembl_ids = curdata['chembl_id'].unique()

#### Set the filter variables
- max_mw: sets the top limit of the MW to filter. Anything above this will be discarded
- min_mw: sets the lower limit of the MW filter. Anything at or below this will be discarded
- assay_type: selects the type of ChEMBL assay to use. The user can choose between 'EC50', 'IC50', 'Ki' and 'Kd', or inspect the standard_type column manually to see which other options are available.
- activity_threshold: as suggested by the IDG (https://druggablegenome.net/ProteinFam), a family-specific activity threshold is used to label each compound as active or inactive. The threshold is applied to the pChEMBL value (-log10 of the molar activity): a pChEMBL value less than or equal to the threshold means inactive, and a value larger than the threshold means active. The concentration cut-offs and their pChEMBL equivalents, by target family, are:
    - Kinases: 30nM -> -log(30*10^-9) = 7.52
    - GPCRs: 100nM -> -log(100*10^-9) = 7
    - Nuclear Receptors: 100nM -> -log(100*10^-9) = 7
    - Ion Channels: 10μM -> -log(10*10^-6) = 5
    - Non-IDG Family Targets: 1μM -> -log(1*10^-6) = 6


In [None]:
# Upper molecular-weight limit: rows with MW above this value are discarded.
max_mw = 500
# Lower molecular-weight limit: rows with MW at or below this value are discarded.
min_mw = 100
# Value of the standard_type column to keep.
assay_type = 'IC50'
# pchembl_value cut-off: values <= threshold are labelled 0, values > threshold are labelled 1.
activity_threshold = 7

In [None]:
# Container for the per-target dataframes, keyed by ChEMBL ID.
dict_of_frames = dict()

In [None]:
for chembl_id in chembl_ids:
    # Keep only this target's rows that fall inside the MW window and match the
    # selected assay type. .copy() makes df_ an independent frame, so the label
    # assignments below write to df_ itself rather than to a slice of curdata.
    df_ = curdata.loc[(curdata['chembl_id'] == chembl_id) &
                     (curdata['MW'] <= max_mw) &
                     (curdata['MW'] > min_mw) &
                     (curdata['standard_type'] == assay_type)].copy()

    # Binary activity label: 0 at or below the threshold, 1 above it. Rows with a
    # missing pchembl_value match neither condition and keep a missing label.
    df_.loc[df_['pchembl_value'] <= activity_threshold, 'activity'] = 0
    df_.loc[df_['pchembl_value'] > activity_threshold, 'activity'] = 1
    dict_of_frames[chembl_id] = df_
    # Export this target's frame, using the ChEMBL ID as the output file name.
    utils.write_sdf(data=df_, outfile_name=chembl_id, smiles_column='structure_curated', identifier='molecule_chembl_id')

## Step 4: use the dictionary of DataFrames to study the class imbalance and apply oversampling to the under-represented class