### Cleaning of the DRD2 dataset extracted from Excape db. 


In [1]:
# importing libraries

import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools
import useful_rdkit_utils as uru
from collections import Counter

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ro5_calc = uru.Ro5Calculator()

In [4]:
# provide path
path = './raw/DRD2_actives_from_Excape.csv'
df = pd.read_csv(path)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8323 entries, 0 to 8322
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Ambit_InchiKey     8323 non-null   object 
 1   Original_Entry_ID  8323 non-null   object 
 2   Entrez_ID          8323 non-null   int64  
 3   Activity_Flag      8323 non-null   object 
 4   pXC50              8323 non-null   float64
 5   DB                 8323 non-null   object 
 6   Original_Assay_ID  8323 non-null   int64  
 7   Tax_ID             8323 non-null   int64  
 8   Gene_Symbol        8323 non-null   object 
 9   Ortholog_Group     8323 non-null   int64  
 10  SMILES             8323 non-null   object 
dtypes: float64(1), int64(4), object(6)
memory usage: 715.4+ KB


In [6]:
df = df[['SMILES','pXC50']]

In [7]:
#rename columns
df.columns = ['smiles','value']

In [8]:
#check for null values
df.isnull().sum()

smiles    0
value     0
dtype: int64

In [9]:
# drop all rows with null values
df = df.dropna(axis = 'rows')

In [10]:
# generate canonical SMILES to double-check for duplicates

canonical_smiles = []
for s in df.smiles:
    try:
        c = Chem.MolToSmiles(Chem.MolFromSmiles(s))
        canonical_smiles.append(c)
    except:
        canonical_smiles.append(None)


[11:15:19] Explicit valence for atom # 6 N, 5, is greater than permitted


In [11]:
df['canonical_smiles'] = canonical_smiles

In [12]:
# Perform the necessary steps before checking

df_nuniques = df.groupby('canonical_smiles').agg({'smiles' : 'nunique','value': 'nunique'}).reset_index(drop=False)
df_first = df.groupby('canonical_smiles').agg({'smiles' : 'first','value': 'unique' }).reset_index(drop=False)


In [13]:
df_w_info = df_nuniques.merge(df_first[['canonical_smiles','value']], on='canonical_smiles').rename(columns={'value_x' : 'nb_values', 'value_y' : 'values'})

In [14]:
df_w_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7939 entries, 0 to 7938
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   canonical_smiles  7939 non-null   object
 1   smiles            7939 non-null   int64 
 2   nb_values         7939 non-null   int64 
 3   values            7939 non-null   object
dtypes: int64(2), object(2)
memory usage: 248.2+ KB


In [15]:
df_w_info.head()

Unnamed: 0,canonical_smiles,smiles,nb_values,values
0,Brc1ccc(-[n+]2cc[n+](Cc3ccccc3)cc2)c2cc[nH]c12,1,1,[7.91364]
1,Brc1ccc(N2CCN(Cc3ccccc3)CC2)c2cc[nH]c12,1,2,"[7.33, 7.91]"
2,Brc1cccc(N2CCN(Cc3cc4ccccn4n3)CC2)n1,1,1,[6.26]
3,Brc1cccc(N2CCN(Cc3cnn4ccccc34)CC2)n1,1,1,[6.82391]
4,Brc1cccc2c1[C@@H]1CCN(CC3CC3)[C@@H]1CC2,1,1,[6.92]


In [17]:
df_w_info[ro5_calc.names] = df_w_info.canonical_smiles.apply(ro5_calc.calc_smiles).to_list()


In [18]:
df_w_info.describe()

Unnamed: 0,smiles,nb_values,MolWt,LogP,HBD,HBA,TPSA
count,7939.0,7939.0,7939.0,7939.0,7939.0,7939.0,7939.0
mean,1.012218,1.04333,395.649016,4.067812,0.902381,4.172566,46.263823
std,0.109865,0.206073,90.479917,1.200677,0.901648,1.790402,23.050839
min,1.0,1.0,173.219,-1.6579,0.0,0.0,3.24
25%,1.0,1.0,337.423,3.29595,0.0,3.0,29.54
50%,1.0,1.0,395.24,4.0359,1.0,4.0,44.81
75%,1.0,1.0,447.667,4.77963,1.0,5.0,61.1
max,2.0,3.0,997.288,13.323,7.0,18.0,208.75


The distributions indicate that 
- some compounds are associated with more than 1 activity value
- some compounds have extreme values of Ro5 descriptors.

We select only compounds with one unique pIC50 value and use relaxed Ro5 criteria to clean the dataset before use

In [19]:
criteria_query = """nb_values < 2  and smiles < 2 and MolWt <= 600 and LogP <= 6 and HBD <= 6 and HBA <= 10 and TPSA <= 150"""
df_remaining = df_w_info.query(criteria_query)

In [20]:
df_remaining.drop(columns=['smiles'], inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_remaining.drop(columns=['smiles'], inplace=True)


In [21]:
df_remaining.rename(columns={'canonical_smiles' : 'smiles'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_remaining.rename(columns={'canonical_smiles' : 'smiles'}, inplace=True)


In [22]:
df_remaining.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7179 entries, 0 to 7938
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   smiles     7179 non-null   object 
 1   nb_values  7179 non-null   int64  
 2   values     7179 non-null   object 
 3   ROMol      7179 non-null   object 
 4   MolWt      7179 non-null   float64
 5   LogP       7179 non-null   float64
 6   HBD        7179 non-null   float64
 7   HBA        7179 non-null   float64
 8   TPSA       7179 non-null   float64
dtypes: float64(5), int64(1), object(3)
memory usage: 560.9+ KB


In [23]:
activity_value = [ v[0] for v in df_remaining['values']]

df_remaining['value'] = activity_value

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_remaining['value'] = activity_value


In [25]:
df_remaining[['smiles', 'value']].to_csv('./drd2_cleaned.csv', index=None)