### Cleaning of the EGFR dataset extracted from Excape db. 


In [1]:
# importing libraries

import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools
import useful_rdkit_utils as uru
from collections import Counter

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ro5_calc = uru.Ro5Calculator()

In [3]:
# provide path
path = './raw/EGFR_actives_from_Excape.csv'
df = pd.read_csv(path)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5209 entries, 0 to 5208
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Ambit_InchiKey     5209 non-null   object 
 1   Original_Entry_ID  5209 non-null   object 
 2   Entrez_ID          5209 non-null   int64  
 3   Activity_Flag      5209 non-null   object 
 4   pXC50              5186 non-null   float64
 5   DB                 5209 non-null   object 
 6   Original_Assay_ID  5209 non-null   int64  
 7   Tax_ID             5209 non-null   int64  
 8   Gene_Symbol        5209 non-null   object 
 9   Ortholog_Group     5209 non-null   int64  
 10  SMILES             5209 non-null   object 
dtypes: float64(1), int64(4), object(6)
memory usage: 447.8+ KB


In [5]:
df = df[['SMILES','pXC50']]

In [6]:
#rename columns
df.columns = ['smiles','value']

In [7]:
#check for null values
df.isnull().sum()

smiles     0
value     23
dtype: int64

In [8]:
# drop all rows with null values
df = df.dropna(axis = 'rows')

In [9]:
# generate canonical SMILES to double-check for duplicates

canonical_smiles = []
for s in df.smiles:
    try:
        c = Chem.MolToSmiles(Chem.MolFromSmiles(s))
        canonical_smiles.append(c)
    except:
        canonical_smiles.append(None)


[13:26:33] Explicit valence for atom # 1 N, 4, is greater than permitted
[13:26:33] Explicit valence for atom # 1 N, 4, is greater than permitted
[13:26:34] Explicit valence for atom # 4 O, 3, is greater than permitted
[13:26:34] Explicit valence for atom # 1 N, 4, is greater than permitted
[13:26:35] Explicit valence for atom # 1 N, 4, is greater than permitted


In [10]:
df['canonical_smiles'] = canonical_smiles

In [11]:
# Perform the necessary steps before checking

df_nuniques = df.groupby('canonical_smiles').agg({'smiles' : 'nunique','value': 'nunique'}).reset_index(drop=False)
df_first = df.groupby('canonical_smiles').agg({'smiles' : 'first','value': 'unique' }).reset_index(drop=False)


In [12]:
df_w_info = df_nuniques.merge(df_first[['canonical_smiles','value']], on='canonical_smiles').rename(columns={'value_x' : 'nb_values', 'value_y' : 'values'})

In [13]:
df_w_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5151 entries, 0 to 5150
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   canonical_smiles  5151 non-null   object
 1   smiles            5151 non-null   int64 
 2   nb_values         5151 non-null   int64 
 3   values            5151 non-null   object
dtypes: int64(2), object(2)
memory usage: 161.1+ KB


In [14]:
df_w_info.head()

Unnamed: 0,canonical_smiles,smiles,nb_values,values
0,Brc1ccc(C2=NN(c3ccccc3)C(c3ccc4ccccc4c3)C2)cc1,1,1,[6.15]
1,Brc1ccc(C2=NN(c3ccccc3)C(c3cccc4ccccc34)C2)cc1,1,1,[6.31]
2,Brc1ccc(C2=NN(c3nc(-c4ccccc4)cs3)C(c3ccc(Br)cc...,1,1,[5.05]
3,Brc1ccc(CNc2ccc3ncnc(Nc4cccc(Br)c4)c3c2)cc1,1,1,[5.5]
4,Brc1ccc2[nH]nc(-c3ccccc3)c2c1,1,1,[5.7]


In [16]:
df_w_info[ro5_calc.names] = df_w_info.canonical_smiles.apply(ro5_calc.calc_smiles).to_list()


In [17]:
df_w_info.describe()

Unnamed: 0,smiles,nb_values,MolWt,LogP,HBD,HBA,TPSA
count,5151.0,5151.0,5151.0,5151.0,5151.0,5151.0,5151.0
mean,1.002912,1.005242,411.290554,4.204393,1.978063,5.807222,84.345156
std,0.05389,0.072217,103.392853,1.462652,1.127461,2.01314,29.098088
min,1.0,1.0,170.171,-1.98,0.0,0.0,3.88
25%,1.0,1.0,339.397,3.2594,1.0,4.0,66.24
50%,1.0,1.0,402.534,4.1937,2.0,6.0,82.7
75%,1.0,1.0,473.8375,5.10965,3.0,7.0,100.0
max,2.0,2.0,942.25,10.1293,16.0,17.0,379.3


The distributions indicate that 
- some compounds are associated with more than 1 activity value
- some compounds have extreme values of Ro5 descriptors.

We select only compounds with one unique pIC50 value and use relaxed Ro5 criteria to clean the dataset before use

In [18]:
criteria_query = """nb_values < 2  and smiles < 2 and MolWt <= 600 and LogP <= 6 and HBD <= 6 and HBA <= 10 and TPSA <= 150"""
df_remaining = df_w_info.query(criteria_query)

In [19]:
df_remaining.drop(columns=['smiles'], inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_remaining.drop(columns=['smiles'], inplace=True)


In [20]:
df_remaining.rename(columns={'canonical_smiles' : 'smiles'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_remaining.rename(columns={'canonical_smiles' : 'smiles'}, inplace=True)


In [21]:
df_remaining.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4431 entries, 4 to 5150
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   smiles     4431 non-null   object 
 1   nb_values  4431 non-null   int64  
 2   values     4431 non-null   object 
 3   MolWt      4431 non-null   float64
 4   LogP       4431 non-null   float64
 5   HBD        4431 non-null   float64
 6   HBA        4431 non-null   float64
 7   TPSA       4431 non-null   float64
dtypes: float64(5), int64(1), object(2)
memory usage: 311.6+ KB


In [22]:
activity_value = [ v[0] for v in df_remaining['values']]

df_remaining['value'] = activity_value

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_remaining['value'] = activity_value


In [23]:
df_remaining[['smiles', 'value']].to_csv('./egfr_cleaned.csv', index=None)