In [7]:
import os

import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors

In [3]:
# Read data/balanced_train.csv (path is relative to the kernel's working
# directory) into train_df and preview the first five rows.
train_df = pd.read_csv('data/balanced_train.csv')
train_df.head()

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name,binds
0,45074911,C[C@@]1(C(=O)O)CCCN1C(=O)OCC1c2ccccc2-c2ccccc21,Cc1ccccc1-c1csc(N)n1,NCc1ccon1,Cc1ccccc1-c1csc(Nc2nc(NCc3ccon3)nc(N3CCC[C@@]3...,HSA,0
1,21655935,CCCCC(NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,Nc1ccc2c(c1)CNC2=O,Cl.Cl.NCc1nc2c(s1)CCC2,CCCCC(Nc1nc(NCc2nc3c(s2)CCC3)nc(Nc2ccc3c(c2)CN...,BRD4,1
2,82557269,O=C(NCC1CCC(C(=O)O)CC1)OCC1c2ccccc2-c2ccccc21,Nc1cc(Cl)c(F)c(Cl)c1,CC(F)(F)CN.Cl,CC(F)(F)CNc1nc(NCC2CCC(C(=O)N[Dy])CC2)nc(Nc2cc...,sEH,1
3,195673909,O=C(Nc1ccc(C(=O)O)nc1)OCC1c2ccccc2-c2ccccc21,Cl.NCc1cnc2n1CCOC2,Nc1ncnc2c1ncn2C1CCCCO1,O=C(N[Dy])c1ccc(Nc2nc(NCc3cnc4n3CCOC4)nc(Nc3nc...,HSA,1
4,228828477,O=C(Nc1ncc(Br)cc1C(=O)O)OCC1c2ccccc2-c2ccccc21,CC(C)(CN)C(=O)N1CCCC1,Cn1ccc(S(=O)(=O)NCCN)c1,Cn1ccc(S(=O)(=O)NCCNc2nc(NCC(C)(C)C(=O)N3CCCC3...,BRD4,1


In [4]:
# Shorten the three building-block column names: buildingblockN_smiles -> bbN_smiles.
train_df = train_df.rename(
    columns=dict(
        buildingblock1_smiles='bb1_smiles',
        buildingblock2_smiles='bb2_smiles',
        buildingblock3_smiles='bb3_smiles',
    )
)
train_df.head()

Unnamed: 0,id,bb1_smiles,bb2_smiles,bb3_smiles,molecule_smiles,protein_name,binds
0,45074911,C[C@@]1(C(=O)O)CCCN1C(=O)OCC1c2ccccc2-c2ccccc21,Cc1ccccc1-c1csc(N)n1,NCc1ccon1,Cc1ccccc1-c1csc(Nc2nc(NCc3ccon3)nc(N3CCC[C@@]3...,HSA,0
1,21655935,CCCCC(NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,Nc1ccc2c(c1)CNC2=O,Cl.Cl.NCc1nc2c(s1)CCC2,CCCCC(Nc1nc(NCc2nc3c(s2)CCC3)nc(Nc2ccc3c(c2)CN...,BRD4,1
2,82557269,O=C(NCC1CCC(C(=O)O)CC1)OCC1c2ccccc2-c2ccccc21,Nc1cc(Cl)c(F)c(Cl)c1,CC(F)(F)CN.Cl,CC(F)(F)CNc1nc(NCC2CCC(C(=O)N[Dy])CC2)nc(Nc2cc...,sEH,1
3,195673909,O=C(Nc1ccc(C(=O)O)nc1)OCC1c2ccccc2-c2ccccc21,Cl.NCc1cnc2n1CCOC2,Nc1ncnc2c1ncn2C1CCCCO1,O=C(N[Dy])c1ccc(Nc2nc(NCc3cnc4n3CCOC4)nc(Nc3nc...,HSA,1
4,228828477,O=C(Nc1ncc(Br)cc1C(=O)O)OCC1c2ccccc2-c2ccccc21,CC(C)(CN)C(=O)N1CCCC1,Cn1ccc(S(=O)(=O)NCCN)c1,Cn1ccc(S(=O)(=O)NCCNc2nc(NCC(C)(C)C(=O)N3CCCC3...,BRD4,1


In [8]:
def generate_descriptors(smiles):
    """Compute four RDKit descriptors for a single SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES string to parse. Any non-string value (``None``, ``NaN`` as
        pandas uses for empty cells, ...) is treated like an unparsable
        SMILES instead of being handed to RDKit.

    Returns
    -------
    dict
        Keys ``'mol_wt'``, ``'log_p'``, ``'num_h_donors'`` and
        ``'num_h_acceptors'`` (in that order). All four values are ``None``
        when the input is missing or RDKit cannot parse it.
    """
    descriptor_names = ('mol_wt', 'log_p', 'num_h_donors', 'num_h_acceptors')

    # Chem.MolFromSmiles raises on non-string arguments, which would abort a
    # whole Series.apply on the first missing cell; guard before calling it.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        return dict.fromkeys(descriptor_names)

    return {
        'mol_wt': Descriptors.MolWt(mol),
        'log_p': Descriptors.MolLogP(mol),
        'num_h_donors': Descriptors.NumHDonors(mol),
        'num_h_acceptors': Descriptors.NumHAcceptors(mol),
    }

In [9]:
# Add one descriptor-dict column per SMILES column:
# '<prefix>_smiles' -> '<prefix>_desc' for the three building blocks and the
# molecule_smiles column.
for prefix in ('bb1', 'bb2', 'bb3', 'molecule'):
    train_df[f'{prefix}_desc'] = train_df[f'{prefix}_smiles'].apply(generate_descriptors)
train_df.head()

Unnamed: 0,id,bb1_smiles,bb2_smiles,bb3_smiles,molecule_smiles,protein_name,binds,bb1_desc,bb2_desc,bb3_desc,molecule_desc
0,45074911,C[C@@]1(C(=O)O)CCCN1C(=O)OCC1c2ccccc2-c2ccccc21,Cc1ccccc1-c1csc(N)n1,NCc1ccon1,Cc1ccccc1-c1csc(Nc2nc(NCc3ccon3)nc(N3CCC[C@@]3...,HSA,0,"{'mol_wt': 351.4020000000001, 'log_p': 3.87460...","{'mol_wt': 190.271, 'log_p': 2.700720000000000...","{'mol_wt': 98.10499999999999, 'log_p': 0.13330...","{'mol_wt': 653.0730000000001, 'log_p': 3.58662..."
1,21655935,CCCCC(NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,Nc1ccc2c(c1)CNC2=O,Cl.Cl.NCc1nc2c(s1)CCC2,CCCCC(Nc1nc(NCc2nc3c(s2)CCC3)nc(Nc2ccc3c(c2)CN...,BRD4,1,"{'mol_wt': 353.41800000000006, 'log_p': 4.1685...","{'mol_wt': 148.165, 'log_p': 0.512200000000000...","{'mol_wt': 227.15999999999997, 'log_p': 1.9341...","{'mol_wt': 669.1160000000002, 'log_p': 2.96710..."
2,82557269,O=C(NCC1CCC(C(=O)O)CC1)OCC1c2ccccc2-c2ccccc21,Nc1cc(Cl)c(F)c(Cl)c1,CC(F)(F)CN.Cl,CC(F)(F)CNc1nc(NCC2CCC(C(=O)N[Dy])CC2)nc(Nc2cc...,sEH,1,"{'mol_wt': 379.4560000000001, 'log_p': 4.41610...","{'mol_wt': 180.00900000000001, 'log_p': 2.7147...","{'mol_wt': 131.55300000000003, 'log_p': 1.0221...","{'mol_wt': 667.852, 'log_p': 4.927000000000003..."
3,195673909,O=C(Nc1ccc(C(=O)O)nc1)OCC1c2ccccc2-c2ccccc21,Cl.NCc1cnc2n1CCOC2,Nc1ncnc2c1ncn2C1CCCCO1,O=C(N[Dy])c1ccc(Nc2nc(NCc3cnc4n3CCOC4)nc(Nc3nc...,HSA,1,"{'mol_wt': 360.3690000000001, 'log_p': 4.14080...","{'mol_wt': 189.64599999999996, 'log_p': 0.2937...","{'mol_wt': 219.24799999999993, 'log_p': 1.1076...","{'mol_wt': 746.0969999999996, 'log_p': 2.12549..."
4,228828477,O=C(Nc1ncc(Br)cc1C(=O)O)OCC1c2ccccc2-c2ccccc21,CC(C)(CN)C(=O)N1CCCC1,Cn1ccc(S(=O)(=O)NCCN)c1,Cn1ccc(S(=O)(=O)NCCNc2nc(NCC(C)(C)C(=O)N3CCCC3...,BRD4,1,"{'mol_wt': 439.26500000000016, 'log_p': 4.9033...","{'mol_wt': 170.25599999999994, 'log_p': 0.5936...","{'mol_wt': 203.267, 'log_p': -0.73789999999999...","{'mol_wt': 826.0830000000003, 'log_p': 1.75619..."


In [10]:
# Write the frame, including the added *_desc columns, to disk; index=False
# keeps the row index out of the file.
# NOTE(review): the *_desc cells hold dicts, so the CSV will contain their
# string form and a reader has to parse them back (e.g. ast.literal_eval).
# Confirm that is intended, or expand the dicts into flat columns first.
train_df.to_csv('data/balanced_train_with_descriptors.csv', index=False)

In [None]:
def ECFP_from_smiles(smiles,
                     R=2,
                     L=2**10,
                     use_features=False,
                     use_chirality=False):
    """Build a Morgan (ECFP-style) bit-vector fingerprint from a SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES string of the input compound.
    R : int
        Maximum radius of the circular substructures.
    L : int
        Fingerprint length (number of bits).
    use_features : bool
        If False, use the standard Daylight-style atom invariants; if True,
        use pharmacophoric atom features.
    use_chirality : bool
        If True, include tetrahedral chirality flags in the atom features.

    Returns
    -------
    numpy.ndarray
        The fingerprint converted with ``np.array``: one entry per bit,
        length ``L``.

    Raises
    ------
    ValueError
        If ``smiles`` is not a string or RDKit cannot parse it.
    """
    # Parse with the same entry point generate_descriptors uses. Non-string
    # input (None / NaN from an empty cell) is rejected up front rather than
    # surfacing as an opaque Boost.Python argument error.
    molecule = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if molecule is None:
        raise ValueError(f"Could not parse SMILES: {smiles!r}")

    feature_list = AllChem.GetMorganFingerprintAsBitVect(
        molecule,
        radius=R,
        nBits=L,
        useFeatures=use_features,
        useChirality=use_chirality,
    )
    return np.array(feature_list)