Extract features for the graph neural network: per-atom hybridization (node features) and per-bond conjugation (edge features).

In [16]:
import rdkit 
from rdkit import Chem
#from rdkit.Chem import rdchem
import pandas as pd


In [25]:
# Load the cleaned dataset; its 'Smiles' column supplies the SMILES strings used below.
df = pd.read_csv(filepath_or_buffer="../1_data/processed/cleaned_data.csv")

def get_hybridization(smiles):
    """Return per-atom hybridization and element symbols for one SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of a molecule.

    Returns
    -------
    tuple of (pd.Series, pd.Series)
        First Series: the value of ``atom.GetHybridization()`` for every atom,
        in ``mol.GetAtoms()`` order. Second Series: the matching element
        symbols from ``atom.GetSymbol()``.
        If ``smiles`` is not a string or cannot be parsed, both Series are
        empty, so the result can always be unpacked as a pair.
    """
    # Non-string cells (e.g. NaN from a missing CSV value) cannot be parsed.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None

    # MolFromSmiles signals a parse failure by returning None.
    if mol is None:
        return pd.Series(dtype=object), pd.Series(dtype=object)

    atoms = list(mol.GetAtoms())
    hybrid_series = pd.Series([atom.GetHybridization() for atom in atoms])
    symbol_series = pd.Series([atom.GetSymbol() for atom in atoms])
    return hybrid_series, symbol_series
    
# Run get_hybridization on every row's SMILES string; each cell of the new column
# stores whatever that call returned for the molecule.
df['Hybridization'] = df['Smiles'].apply(get_hybridization)


In [26]:
# Display the frame, now including the 'Hybridization' column.
df

# Export to CSV is currently disabled; uncomment the line below to write the file.
#df.to_csv("hybridization_features.csv")

Unnamed: 0,Molecule ChEMBL ID,Smiles,pChEMBL Value,Classification,Hybridization
0,CHEMBL3235962,N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc(C(F)(F...,7.08,2.0,"([2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3,..."
1,CHEMBL3235983,C[C@H](NC(=O)N1CCc2ccccc2[C@H]1c1ccc(C(F)(F)F)...,8.00,2.0,"([4, 4, 3, 3, 3, 3, 4, 4, 3, 3, 3, 3, 3, 3, 4,..."
2,CHEMBL1650511,FC(F)(F)c1ccccc1-c1cc(C(F)(F)F)c2[nH]c(C3=NOC4...,9.38,2.0,"([4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4,..."
3,CHEMBL2443068,O=C1CC2(CCN(C(=O)Nc3ccc(C(F)(F)F)cc3)CC2)Oc2c(...,6.64,1.0,"([3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 4,..."
4,CHEMBL3959823,Cc1cccc(CN(C(=O)c2ccccc2)[C@@H](C(N)=O)c2ccccc...,6.06,1.0,"([4, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3,..."
...,...,...,...,...,...
524,CHEMBL207433,O=C(Nc1ccccc1Br)N[C@@H]1CCN(c2ccc(C(F)(F)F)cn2)C1,5.80,1.0,"([3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 4, 4, 4, 3,..."
525,CHEMBL1377190,CCOC(=O)c1sc(-c2ccc(Cl)cc2)nc1O,6.26,1.0,"([4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,..."
526,CHEMBL5275535,CCn1nnc(-c2sc(-c3cnccn3)nc2O)n1,5.72,1.0,"([4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,..."
527,CHEMBL5269450,CC(C)CC(=O)c1sc(-c2cccc(F)c2)nc1O,6.54,1.0,"([4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4,..."


Get the conjugation features: one 0/1 flag per bond, used as edge features.

In [27]:
# df = pd.read_csv("../1_data/processed/cleaned_data.csv")

def get_conjugation(smiles):
    """Return a 0/1 conjugation flag for every bond of one SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of a molecule.

    Returns
    -------
    list of int
        One entry per bond, in ``mol.GetBonds()`` order: 1 if
        ``bond.GetIsConjugated()`` is true, otherwise 0. An empty list is
        returned when ``smiles`` is not a string or cannot be parsed.
    """
    # Non-string cells (e.g. NaN from a missing CSV value) cannot be parsed.
    mol = rdkit.Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None

    # MolFromSmiles signals a parse failure by returning None; without this
    # guard the GetBonds() call below would raise AttributeError.
    if mol is None:
        return []

    return [1 if bond.GetIsConjugated() else 0 for bond in mol.GetBonds()]
     
# Read the SMILES strings from `df` itself so the new column is computed from,
# and stays row-aligned with, the frame it is written to.
df['Conjugation'] = df['Smiles'].apply(get_conjugation)

df

Unnamed: 0,Molecule ChEMBL ID,Smiles,pChEMBL Value,Classification,Hybridization,Conjugation
0,CHEMBL3235962,N#Cc1cccc(NC(=O)N2CCc3ccccc3[C@H]2c2ccc(C(F)(F...,7.08,2.0,"([2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, ..."
1,CHEMBL3235983,C[C@H](NC(=O)N1CCc2ccccc2[C@H]1c1ccc(C(F)(F)F)...,8.00,2.0,"([4, 4, 3, 3, 3, 3, 4, 4, 3, 3, 3, 3, 3, 3, 4,...","[0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, ..."
2,CHEMBL1650511,FC(F)(F)c1ccccc1-c1cc(C(F)(F)F)c2[nH]c(C3=NOC4...,9.38,2.0,"([4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4,...","[0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ..."
3,CHEMBL2443068,O=C1CC2(CCN(C(=O)Nc3ccc(C(F)(F)F)cc3)CC2)Oc2c(...,6.64,1.0,"([3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 4,...","[1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, ..."
4,CHEMBL3959823,Cc1cccc(CN(C(=O)c2ccccc2)[C@@H](C(N)=O)c2ccccc...,6.06,1.0,"([4, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3,...","[0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...,...,...,...
524,CHEMBL207433,O=C(Nc1ccccc1Br)N[C@@H]1CCN(c2ccc(C(F)(F)F)cn2)C1,5.80,1.0,"([3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 4, 4, 4, 3,...","[1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, ..."
525,CHEMBL1377190,CCOC(=O)c1sc(-c2ccc(Cl)cc2)nc1O,6.26,1.0,"([4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,...","[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, ..."
526,CHEMBL5275535,CCn1nnc(-c2sc(-c3cnccn3)nc2O)n1,5.72,1.0,"([4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,...","[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
527,CHEMBL5269450,CC(C)CC(=O)c1sc(-c2cccc(F)c2)nc1O,6.54,1.0,"([4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4,...","[0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, ..."
