# Drugs

## Features

In [1]:
import pandas as pd
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit import Chem, RDLogger


In [2]:
# Load the binarized KIBA table (one row per drug/target pair, label in 'Y')
# from a CSV expected in the current working directory.
# NOTE(review): the rendered output shows an "Unnamed: 0" column, so the CSV
# appears to have been saved with its index — confirm whether index_col=0 is wanted.
df_binarized = pd.read_csv('KIBA_binarized.csv')
df_binarized

Unnamed: 0,Drug_ID,Drug,Target_ID,Target,Y
0,CHEMBL1087421,COc1cc2c(cc1Cl)C(c1ccc(Cl)c(Cl)c1)=NCC2,O00141,MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS...,0
1,CHEMBL1087421,COc1cc2c(cc1Cl)C(c1ccc(Cl)c(Cl)c1)=NCC2,O14920,MSWSPSLTTQTCGAWEMKERLGTGGFGNVIRWHNQETGEQIAIKQC...,0
2,CHEMBL1087421,COc1cc2c(cc1Cl)C(c1ccc(Cl)c(Cl)c1)=NCC2,O15111,MERPPGLRPGAGGPWEMRERLGTGGFGNVCLYQHRELDLKIAIKSC...,0
3,CHEMBL1087421,COc1cc2c(cc1Cl)C(c1ccc(Cl)c(Cl)c1)=NCC2,P00533,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,0
4,CHEMBL1087421,COc1cc2c(cc1Cl)C(c1ccc(Cl)c(Cl)c1)=NCC2,P04626,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...,0
...,...,...,...,...,...
117652,CHEMBL230654,CCCc1nc[nH]c1CNc1cc(Cl)c2ncc(C#N)c(Nc3ccc(F)c(...,Q13554,MATTVTCTRFTDEYQLYEDIGKGAFSVVRRCVKLCTGHEYAAKIIN...,0
117653,CHEMBL230654,CCCc1nc[nH]c1CNc1cc(Cl)c2ncc(C#N)c(Nc3ccc(F)c(...,Q13555,MATTATCTRFTDDYQLFEELGKGAFSVVRRCVKKTSTQEYAAKIIN...,0
117654,CHEMBL230654,CCCc1nc[nH]c1CNc1cc(Cl)c2ncc(C#N)c(Nc3ccc(F)c(...,Q13557,MASTTTCTRFTDEYQLFEELGKGAFSVVRRCMKIPTGQEYAAKIIN...,0
117655,CHEMBL230654,CCCc1nc[nH]c1CNc1cc(Cl)c2ncc(C#N)c(Nc3ccc(F)c(...,Q16539,MSQERPTFYRQELNKTIWEVPERYQNLSPVGSGAYGSVCAAFDTKT...,0


Get Unique Drugs

In [3]:
# Keep a single row per distinct SMILES in the 'Drug' column (drop_duplicates
# keeps the first occurrence by default), preserving the original row labels.
# NOTE: this rebinds df_binarized, so from here on the name refers to the
# unique-drug subset; Target_ID/Target/Y are simply those of each drug's first row.
df_binarized = df_binarized.drop_duplicates(subset=['Drug'])
df_binarized 

Unnamed: 0,Drug_ID,Drug,Target_ID,Target,Y
0,CHEMBL1087421,COc1cc2c(cc1Cl)C(c1ccc(Cl)c(Cl)c1)=NCC2,O00141,MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS...,0
18,CHEMBL1088633,COc1cc2c(cc1Cl)C(c1cccc(Cl)c1)=NCC2,O00141,MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS...,0
36,CHEMBL1090360,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,O00141,MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS...,0
159,CHEMBL1688215,Nc1nccc(-c2ccc3c(N)n[nH]c3c2)n1,O00141,MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS...,0
177,CHEMBL1765781,CNc1cncc(-c2c[nH]c(=O)c(NC(=O)c3ccc(N4CCCC4CN4...,O00141,MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS...,0
...,...,...,...,...,...
117591,CHEMBL2002182,O=c1[nH]nc2c(-c3ccccc3)cccn12,P07949,MAKATSGAAGLRLLLLLLLPLLGKVALGLYFSRDAYWEKLYVDQAA...,0
117604,CHEMBL1668418,COc1cc(O)c2c(c1)C=CCCCC(=O)C=CCC(C)OC2=O,P09619,MRLPGAMPALALKGELLLLSLLLLLEPQISQGLVVTPPGPELVLNV...,0
117620,CHEMBL408982,CCN1C(=CC(C)=O)Sc2ccc(OC)cc21,P11309,MPHEPHEPLTPPFSALPDPAGAPSRRQSRQRPQLSSDSPSAFRASR...,1
117631,CHEMBL307152,Cn1ccc2ccc3c4[nH]c5c(CCCNC6CCC(O)CC6)cccc5c4c4...,P11802,MATSRYEPVAEIGVGAYGTVYKARDPHSGHFVALKSVRVPNGGGGG...,1


## Standardize Drugs


In [4]:
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit import Chem, RDLogger

RDLogger.DisableLog('rdApp.*') # silence RDKit logging (every 'rdApp.*' channel) so warnings do not clutter cell output


def standardize(smiles):
    """Return a standardized SMILES string for ``smiles``.

    The input is parsed with RDKit and then passed, in order, through
    ``rdMolStandardize.Cleanup`` (remove Hs, disconnect metal atoms,
    normalize, reionize), ``rdMolStandardize.FragmentParent`` (keep the
    parent fragment when there are several) and ``Uncharger.uncharge``
    (attempt to neutralize the molecule).

    Returns
    -------
    str or None
        The SMILES of the processed molecule, or ``None`` when RDKit cannot
        parse ``smiles`` into a molecule.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is None:
        # invalid molecule: report it as None instead of raising
        return None

    cleaned = rdMolStandardize.Cleanup(parsed)
    parent = rdMolStandardize.FragmentParent(cleaned)
    neutral = rdMolStandardize.Uncharger().uncharge(parent)

    return Chem.MolToSmiles(neutral)

# Standardize every SMILES in the 'Drug' column.
# Applying over the column (instead of copying the frame and applying row-wise
# with axis=1) avoids building a Series per row and always yields a Series,
# even when the frame is empty. The result keeps df_binarized's row labels.
standardized_smiles = df_binarized['Drug'].apply(standardize)
standardized_smiles.head()

0                COc1cc2c(cc1Cl)C(c1ccc(Cl)c(Cl)c1)=NCC2
18                   COc1cc2c(cc1Cl)C(c1cccc(Cl)c1)=NCC2
36     O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...
159                      Nc1nccc(-c2ccc3c(N)n[nH]c3c2)n1
177    CNc1cncc(-c2c[nH]c(=O)c(NC(=O)c3ccc(N4CCCC4CN4...
dtype: object

# Generate descriptors

Generate descriptors from the SMILES

In [5]:
import pandas as pd
import numpy as np
from rdkit.Chem.GraphDescriptors import Ipc
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors

# 2D descriptors
def get_descriptors(smiles):
    """Compute the RDKit 2D descriptor vector for one SMILES string.

    The descriptor set is every entry of ``Descriptors._descList``. The value
    at the ``Ipc`` position is replaced by the averaged variant
    (``Ipc(mol, avg=1)``) to deal with the very large/inf values described in
    https://github.com/rdkit/rdkit/issues/1527.

    Parameters
    ----------
    smiles : str or None
        SMILES to featurize. Non-string input (e.g. the ``None`` that
        ``standardize`` returns for invalid molecules) is treated as missing.

    Returns
    -------
    numpy.ndarray
        1-D float32 array with one value per descriptor, in ``_descList``
        order. All-NaN when the input is missing, cannot be parsed, or a
        descriptor calculation raises.
    """
    feature_names = [x[0] for x in Descriptors._descList]
    # Failure placeholder uses the same float32 dtype as a successful result,
    # so rows from both paths stack into a homogeneous table.
    missing = np.full(len(feature_names), np.nan, dtype=np.float32)

    # Nothing to parse: skip RDKit instead of relying on it to raise.
    if not isinstance(smiles, str):
        return missing

    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return missing

        calc = MoleculeDescriptors.MolecularDescriptorCalculator(feature_names)
        # position of Ipc in the descriptor vector
        pos = feature_names.index("Ipc")
        avg_ipc = Ipc(mol, avg=1)

        descriptors = list(calc.CalcDescriptors(mol))
        # replace Ipc with AvgIpc
        descriptors[pos] = avg_ipc
        return np.array(descriptors, dtype=np.float32)
    except Exception:
        # Best-effort featurization: a failing molecule becomes a NaN row.
        # Catching Exception (not a bare except) keeps Ctrl-C / SystemExit working.
        return missing

# one descriptor vector per standardized SMILES
descriptors = standardized_smiles.apply(get_descriptors)
# stack the vectors into a table whose columns are the RDKit descriptor names
descriptors_smiles = pd.DataFrame(
    descriptors.tolist(),
    columns=[entry[0] for entry in Descriptors._descList],
)
# first column: the drug SMILES taken from df_binarized, matched by position
descriptors_smiles.insert(loc=0, column='Drug', value=df_binarized['Drug'].values)
descriptors_smiles

Unnamed: 0,Drug,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,COc1cc2c(cc1Cl)C(c1ccc(Cl)c(Cl)c1)=NCC2,6.243423,0.517221,6.243423,0.517221,0.749475,340.636993,328.540985,338.998444,108.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,COc1cc2c(cc1Cl)C(c1cccc(Cl)c1)=NCC2,6.239307,0.599887,6.239307,0.599887,0.805831,306.191986,293.088013,305.037415,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,12.754791,-0.070738,12.754791,0.070738,0.216340,587.708984,558.476990,587.210327,214.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,Nc1nccc(-c2ccc3c(N)n[nH]c3c2)n1,5.701094,0.256224,5.701094,0.256224,0.577312,226.242996,216.162994,226.096695,84.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CNc1cncc(-c2c[nH]c(=O)c(NC(=O)c3ccc(N4CCCC4CN4...,12.919469,-0.376600,12.919469,0.165238,0.483859,473.580994,442.333008,473.253937,182.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2063,O=c1[nH]nc2c(-c3ccccc3)cccn12,11.407152,-0.215017,11.407152,0.215017,0.665489,211.223999,202.151993,211.074554,78.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2064,COc1cc(O)c2c(c1)C=CCCCC(=O)C=CCC(C)OC2=O,12.422585,-0.598066,12.422585,0.068241,0.795626,330.380005,308.204010,330.146729,128.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2065,CCN1C(=CC(C)=O)Sc2ccc(OC)cc21,11.194309,0.074742,11.194309,0.074742,0.770003,249.335007,234.214996,249.082352,90.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2066,Cn1ccc2ccc3c4[nH]c5c(CCCNC6CCC(O)CC6)cccc5c4c4...,13.124377,-0.327578,13.124377,0.133993,0.210694,494.595001,464.355011,494.231781,188.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


Generate Morgan Fingerprints from the drug (SMILES)

In [6]:
from rdkit.Chem import AllChem

def get_morgan_fingerprints(smiles, radius=2, n_bits=1024):
    """Compute a Morgan fingerprint bit vector for one SMILES string.

    Parameters
    ----------
    smiles : str or None
        SMILES to featurize. Non-string input (e.g. the ``None`` that
        ``standardize`` returns for invalid molecules) is treated as missing.
    radius : int, default 2
        Radius passed to ``AllChem.GetMorganFingerprintAsBitVect``.
    n_bits : int, default 1024
        Length of the fingerprint (``nBits``).

    Returns
    -------
    numpy.ndarray
        1-D float32 array of length ``n_bits`` holding the fingerprint bits
        as 0.0/1.0. All-NaN when the input is missing, cannot be parsed, or
        fingerprinting raises.
    """
    # Failure placeholder uses the same float32 dtype as a successful result,
    # so rows from both paths stack into a homogeneous table.
    missing = np.full(n_bits, np.nan, dtype=np.float32)

    # Nothing to parse: skip RDKit instead of relying on it to raise.
    if not isinstance(smiles, str):
        return missing

    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return missing
        bit_vect = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        return np.array(bit_vect, np.float32)
    except Exception:
        # Best-effort featurization: a failing molecule becomes a NaN row.
        # Catching Exception (not a bare except) keeps Ctrl-C / SystemExit working.
        return missing

# fingerprint every standardized SMILES and stack the vectors into a table
# with one 'morgan_<i>' column per bit
morgan_fingerprints = pd.DataFrame(
    standardized_smiles.apply(get_morgan_fingerprints).tolist(),
    columns=[f'morgan_{i}' for i in range(1024)],
)
# first column: the drug SMILES taken from df_binarized, matched by position
morgan_fingerprints.insert(loc=0, column='Drug', value=df_binarized['Drug'].values)
morgan_fingerprints

Unnamed: 0,Drug,morgan_0,morgan_1,morgan_2,morgan_3,morgan_4,morgan_5,morgan_6,morgan_7,morgan_8,...,morgan_1014,morgan_1015,morgan_1016,morgan_1017,morgan_1018,morgan_1019,morgan_1020,morgan_1021,morgan_1022,morgan_1023
0,COc1cc2c(cc1Cl)C(c1ccc(Cl)c(Cl)c1)=NCC2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,COc1cc2c(cc1Cl)C(c1cccc(Cl)c1)=NCC2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,O=C(Cc1ccccc1)Nc1cccc(-c2nc3sccn3c2-c2ccnc(Nc3...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Nc1nccc(-c2ccc3c(N)n[nH]c3c2)n1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CNc1cncc(-c2c[nH]c(=O)c(NC(=O)c3ccc(N4CCCC4CN4...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2063,O=c1[nH]nc2c(-c3ccccc3)cccn12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2064,COc1cc(O)c2c(c1)C=CCCCC(=O)C=CCC(C)OC2=O,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2065,CCN1C(=CC(C)=O)Sc2ccc(OC)cc21,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2066,Cn1ccc2ccc3c4[nH]c5c(CCCNC6CCC(O)CC6)cccc5c4c4...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [7]:
# Save the descriptor table ('Drug' plus one column per descriptor) to CSV, without the row index.
descriptors_smiles.to_csv('drugs_descriptors_smiles.csv', index=False)

In [8]:
# Save the fingerprint table ('Drug' plus the 'morgan_<i>' bit columns) to CSV, without the row index.
morgan_fingerprints.to_csv('drugs_morgan_fingerprints.csv', index=False)

# Encoding

### TODO: this section still needs to be finished
### It is in the notebook from classes 11/12