### Imports

In [11]:
import pandas as pd
import numpy as np
from pathlib import Path

### Loading the Data

In [2]:
# Read the raw ChEMBL CSV export into a DataFrame.
# The path is relative to the notebook's working directory.
raw_csv_path = Path("../data/raw/Chembl_data_5HT2A.csv")
chembl_data = pd.read_csv(raw_csv_path)

### Looking at the Data

In [3]:
# Preview the first five rows to get a feel for the columns and values.
chembl_data.head(n=5)

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Compound Key,Smiles,Standard Type,Standard Relation,...,Target Type,Document ChEMBL ID,Source ID,Source Description,Document Journal,Document Year,Cell ChEMBL ID,Properties,relation_clean,pKi_numeric
0,CHEMBL4212943,,,452.53,0.0,4.42,8b,CC(=O)c1c(OCCCCN2CCN(c3cccc(F)c3)CC2)ccc2c(C)c...,Ki,'=',...,SINGLE PROTEIN,CHEMBL4184192,1,Scientific Literature,Bioorg Med Chem,2018.0,,,=,6.4
1,CHEMBL316527,,,235.28,0.0,1.07,9,COc1c2c(c(CCN)c3c1OCC3)CCO2,Ki,'=',...,SINGLE PROTEIN,CHEMBL1130147,1,Scientific Literature,J Med Chem,1997.0,,,=,5.35
2,CHEMBL4591410,,,336.4,0.0,3.49,22; PKSN-240,COc1ccc2[nH]cc(CCNCc3ccc(-c4cn[nH]c4)o3)c2c1,Ki,'=',...,SINGLE PROTEIN,CHEMBL4312034,1,Scientific Literature,Eur J Med Chem,2020.0,CHEMBL3307715,,=,7.55
3,CHEMBL4584504,,,350.39,0.0,4.6,34; PKSN-222,Oc1ccc(-c2ccc(CNCCc3c[nH]c4cc(F)ccc34)o2)cc1,Ki,'=',...,SINGLE PROTEIN,CHEMBL4312034,1,Scientific Literature,Eur J Med Chem,2020.0,CHEMBL3307715,,=,7.02
4,CHEMBL180010,,,481.43,1.0,5.16,"6, PG01037",O=C(NC/C=C/CN1CCN(c2cccc(Cl)c2Cl)CC1)c1ccc(-c2...,Ki,'=',...,SINGLE PROTEIN,CHEMBL1148745,1,Scientific Literature,J Med Chem,2007.0,,,=,7.21


In [4]:
# Summarise each column's dtype and non-null count to spot sparsely populated fields.
chembl_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5454 entries, 0 to 5453
Data columns (total 47 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Molecule ChEMBL ID          5454 non-null   object 
 1   Molecule Name               1198 non-null   object 
 2   Molecule Max Phase          1045 non-null   float64
 3   Molecular Weight            5454 non-null   float64
 4   #RO5 Violations             5423 non-null   float64
 5   AlogP                       5423 non-null   float64
 6   Compound Key                5454 non-null   object 
 7   Smiles                      5454 non-null   object 
 8   Standard Type               5454 non-null   object 
 9   Standard Relation           4467 non-null   object 
 10  Standard Value              4490 non-null   float64
 11  Standard Units              4504 non-null   object 
 12  pChEMBL Value               3949 non-null   float64
 13  Data Validity Comment       20 no

In [5]:
# List every column name so the fields used in later cells can be located.
chembl_data.columns

Index(['Molecule ChEMBL ID', 'Molecule Name', 'Molecule Max Phase',
       'Molecular Weight', '#RO5 Violations', 'AlogP', 'Compound Key',
       'Smiles', 'Standard Type', 'Standard Relation', 'Standard Value',
       'Standard Units', 'pChEMBL Value', 'Data Validity Comment', 'Comment',
       'Uo Units', 'Ligand Efficiency BEI', 'Ligand Efficiency LE',
       'Ligand Efficiency LLE', 'Ligand Efficiency SEI', 'Potential Duplicate',
       'Assay ChEMBL ID', 'Assay Description', 'Assay Type', 'BAO Format ID',
       'BAO Label', 'Assay Organism', 'Assay Tissue ChEMBL ID',
       'Assay Tissue Name', 'Assay Cell Type', 'Assay Subcellular Fraction',
       'Assay Parameters', 'Assay Variant Accession', 'Assay Variant Mutation',
       'Target ChEMBL ID', 'Target Name', 'Target Organism', 'Target Type',
       'Document ChEMBL ID', 'Source ID', 'Source Description',
       'Document Journal', 'Document Year', 'Cell ChEMBL ID', 'Properties',
       'relation_clean', 'pKi_numeric'],
      

In [6]:
# Normalise the structure column name to lower case for downstream use.
smiles_rename = {'Smiles': 'smiles'}
chembl_data = chembl_data.rename(columns=smiles_rename)

In [7]:
# Show which relation qualifiers occur (values are stored with literal quotes).
pd.unique(chembl_data["Standard Relation"])

array(["'='", nan, "'>'", "'<'", "'>='"], dtype=object)

In [8]:
# Keep only rows whose relation is exactly "'='"; rows with '>', '<', '>='
# or a missing relation are dropped (NaN compares as False).
is_exact_measurement = chembl_data["Standard Relation"].eq("'='")
chembl_data_filtered = chembl_data.loc[is_exact_measurement]

In [9]:
# Sanity check: only the "'='" relation should remain after filtering.
pd.unique(chembl_data_filtered["Standard Relation"])

array(["'='"], dtype=object)

In [12]:
# Derive pKi from the reported Ki values.
#
# Fail fast when the input column is missing: every later cell needs
# `chembl_data_calc`, so merely printing a message here would only defer the
# failure to an unrelated NameError further down the notebook.
if 'Standard Value' not in chembl_data_filtered.columns:
    raise KeyError("No 'Standard Value' column in chembl_data_filtered")

# log10 is undefined for non-positive values; the `> 0` comparison also drops
# rows whose 'Standard Value' is NaN. `.copy()` detaches the result from
# `chembl_data_filtered` so the column assignment below does not touch a view.
chembl_data_calc = chembl_data_filtered[chembl_data_filtered['Standard Value'] > 0].copy()

# pKi = -log10(Ki[M]) = 9 - log10(Ki[nM])
# NOTE(review): this assumes every remaining row reports Ki in nanomolar [nM];
# 'Standard Units' / 'Standard Type' are not checked here - confirm upstream.
chembl_data_calc['pKi_calc'] = 9 - np.log10(chembl_data_calc['Standard Value'])

print("Calculated pKi for ChemBL data:")
print(chembl_data_calc[['Standard Value', 'pKi_calc']].head())

Calculated pKi for ChemBL data:
   Standard Value  pKi_calc
0           400.0  6.397940
1          4443.0  5.352324
2            28.0  7.552842
3            96.0  7.017729
4            62.4  7.204815


In [13]:
# Inspect the columns after the pKi calculation to confirm the new column is present.
chembl_data_calc.columns

Index(['Molecule ChEMBL ID', 'Molecule Name', 'Molecule Max Phase',
       'Molecular Weight', '#RO5 Violations', 'AlogP', 'Compound Key',
       'smiles', 'Standard Type', 'Standard Relation', 'Standard Value',
       'Standard Units', 'pChEMBL Value', 'Data Validity Comment', 'Comment',
       'Uo Units', 'Ligand Efficiency BEI', 'Ligand Efficiency LE',
       'Ligand Efficiency LLE', 'Ligand Efficiency SEI', 'Potential Duplicate',
       'Assay ChEMBL ID', 'Assay Description', 'Assay Type', 'BAO Format ID',
       'BAO Label', 'Assay Organism', 'Assay Tissue ChEMBL ID',
       'Assay Tissue Name', 'Assay Cell Type', 'Assay Subcellular Fraction',
       'Assay Parameters', 'Assay Variant Accession', 'Assay Variant Mutation',
       'Target ChEMBL ID', 'Target Name', 'Target Organism', 'Target Type',
       'Document ChEMBL ID', 'Source ID', 'Source Description',
       'Document Journal', 'Document Year', 'Cell ChEMBL ID', 'Properties',
       'relation_clean', 'pKi_numeric', 'pKi_ca

In [14]:
# Reduce the table to the two fields that are exported: structure and computed pKi.
output_columns = ['smiles', 'pKi_calc']
chembl_data_calc = chembl_data_calc.loc[:, output_columns]

In [16]:
# Write the processed table to disk without the index column.
# `to_csv` does not create missing directories, so make sure the output
# folder exists first; otherwise a fresh checkout fails on this cell.
processed_csv_path = Path('../data/processed/chembl_data_processed.csv')
processed_csv_path.parent.mkdir(parents=True, exist_ok=True)
chembl_data_calc.to_csv(processed_csv_path, index=False)