In [1]:
'''
survey_kernel_tutorial.ipynb

Notebook containing WL kernel similarity tutorial for survey paper
that operates on example RAx group of chemicals. Contains code from Oxford Protein Informatics Group to featureize
RDKit graphs.

QA ID: I-CCED-0032994

Author: Brett Hagan

PI: Grace Patlewicz

Last modified: 3/14/24

'''

'\nsurvey_kernel_tutorial.ipynb\n\nNotebook containing WL kernel similarity tutorial for survey paper\nthat operates on example RAx group of chemicals. Contains code from Oxford Protein Informatics Group to featureize\nRDKit graphs.\n\nQA ID: I-CCED-0032994\n\nAuthor: Brett Hagan\n\nPI: Grace Patlewicz\n\nLast modified: 3/14/24\n\n'

In [6]:
import matplotlib.pyplot as plt
import grakel
from rdkit import Chem
import numpy as np
import pandas as pd
import networkx as nx

In [7]:
# list of dtxsids and smiles from category
dtxsids = ['2-Amino-4,6-Dinitrotoluene',
'2,4,6-Trinitrotoluene',
'2-Methyl-5-nitroaniline',
'Isopropalin',
'Pendimethalin',
'Trifluralin'
]

smiles = ['CC1=C(C=C(C=C1N)[N+]([O-])=O)[N+]([O-])=O',
'CC1=C(C=C(C=C1[N+]([O-])=O)[N+]([O-])=O)[N+]([O-])=O',
'CC1=C(N)C=C(C=C1)[N+]([O-])=O',
'CCCN(CCC)C1=C(C=C(C=C1[N+]([O-])=O)C(C)C)[N+]([O-])=O',
'CCC(CC)NC1=C(C=C(C)C(C)=C1[N+]([O-])=O)[N+]([O-])=O',
'CCCN(CCC)C1=C(C=C(C=C1[N+]([O-])=O)C(F)(F)F)[N+]([O-])=O'
]

In [8]:
# function to create molecular graphs from SMILES

def smile_to_mol_graph(smile):

    mol = Chem.MolFromSmiles(smile)
    g = nx.Graph()

    for atom in mol.GetAtoms():
        g.add_node(atom.GetIdx(),
                   atom_symbol = atom.GetSymbol())
    return g

In [9]:
# conver to grakel graph objects with atom symbol as feature

graphs = [smile_to_mol_graph(smile) for smile in smiles]
grakel_graphs = grakel.graph_from_networkx(graphs,node_labels_tag='atom_symbol')

In [10]:
# define WL kernel
# highest similarity corresponds to correct RAx source analogue
# visualize in notebook

wl_kernel = grakel.WeisfeilerLehman(base_graph_kernel=grakel.VertexHistogram,normalize=True)
p = wl_kernel.fit_transform(grakel_graphs)
df = pd.DataFrame(p)
display(df)

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.983182,0.970035,0.955779,0.968147,0.946442
1,0.983182,1.0,0.915321,0.900213,0.918241,0.897656
2,0.970035,0.915321,1.0,0.996872,0.998507,0.976121
3,0.955779,0.900213,0.996872,1.0,0.998969,0.976573
4,0.968147,0.918241,0.998507,0.998969,1.0,0.977581
5,0.946442,0.897656,0.976121,0.976573,0.977581,1.0
