In [1]:
'''
survey_kernel_tutorial.ipynb

Notebook containing WL kernel similarity tutorial for survey paper
that operates on example RAx group of chemicals. Contains code from Oxford Protein Informatics Group to featureize
RDKit graphs.

QA ID: I-CCED-0032994

Author: Brett Hagan

PI: Grace Patlewicz

Last modified: 3/14/24

'''

'\nsurvey_kernel_tutorial.ipynb\n\nNotebook containing WL kernel similarity tutorial for survey paper\nthat operates on example RAx group of chemicals. Contains code from Oxford Protein Informatics Group to featureize\nRDKit graphs.\n\nQA ID: I-CCED-0032994\n\nAuthor: Brett Hagan\n\nPI: Grace Patlewicz\n\nLast modified: 3/14/24\n\n'

In [1]:
import matplotlib.pyplot as plt
import grakel
from rdkit import Chem
import numpy as np
import pandas as pd
import networkx as nx

In [16]:
# list of dtxsids and smiles from category
dtxsids = ['2-Amino-4,6-Dinitrotoluene',
'2,4,6-Trinitrotoluene',
'2-Methyl-5-nitroaniline',
'Isopropalin',
'Pendimethalin',
'Trifluralin'
]

smiles = ['CC1=C(C=C(C=C1N)[N+]([O-])=O)[N+]([O-])=O',
'CC1=C(C=C(C=C1[N+]([O-])=O)[N+]([O-])=O)[N+]([O-])=O',
'CC1=C(N)C=C(C=C1)[N+]([O-])=O',
'CCCN(CCC)C1=C(C=C(C=C1[N+]([O-])=O)C(C)C)[N+]([O-])=O',
'CCC(CC)NC1=C(C=C(C)C(C)=C1[N+]([O-])=O)[N+]([O-])=O',
'CCCN(CCC)C1=C(C=C(C=C1[N+]([O-])=O)C(F)(F)F)[N+]([O-])=O'
]

In [21]:
# function to create molecular graphs from SMILES

def smile_to_mol_graph(smile):
    mol = Chem.MolFromSmiles(smile)
    g = nx.Graph()

    # Add nodes with atom symbols
    for atom in mol.GetAtoms():
        g.add_node(atom.GetIdx(), atom_symbol=atom.GetSymbol())
    
    # Add edges (bonds) with bond types
    for bond in mol.GetBonds():
        g.add_edge(bond.GetBeginAtomIdx(),
                   bond.GetEndAtomIdx(),
                   bond_type=bond.GetBondType())
    return g

In [26]:
# conver to grakel graph objects with atom symbol as feature

graphs = [smile_to_mol_graph(smile) for smile in smiles]
grakel_graphs = grakel.graph_from_networkx(graphs,node_labels_tag='atom_symbol')

In [23]:
graphs

[<networkx.classes.graph.Graph at 0x77cde2dd8550>,
 <networkx.classes.graph.Graph at 0x77cde2dd8820>,
 <networkx.classes.graph.Graph at 0x77cde2dd8a30>,
 <networkx.classes.graph.Graph at 0x77cde2dd87f0>,
 <networkx.classes.graph.Graph at 0x77cde2dd8b50>,
 <networkx.classes.graph.Graph at 0x77cde2dd8e80>]

In [24]:
# define WL kernel
# highest similarity corresponds to correct RAx source analogue
# visualize in notebook

wl_kernel = grakel.WeisfeilerLehman(base_graph_kernel=grakel.VertexHistogram,normalize=True)
p = wl_kernel.fit_transform(grakel_graphs)
df = pd.DataFrame(p)
display(df)

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.877781,0.734344,0.656103,0.703633,0.616602
1,0.877781,1.0,0.633474,0.597295,0.646695,0.569431
2,0.734344,0.633474,1.0,0.670563,0.69991,0.616493
3,0.656103,0.597295,0.670563,1.0,0.763393,0.853267
4,0.703633,0.646695,0.69991,0.763393,1.0,0.696215
5,0.616602,0.569431,0.616493,0.853267,0.696215,1.0


In [27]:
from grakel import GraphKernel
# Initialize Graphlet Kernel
gk = GraphKernel(kernel=["graphlet_sampling"], normalize=True)

# Compute Kernel Matrix
kernel_matrix = gk.fit_transform(grakel_graphs)


In [29]:
pd.DataFrame(kernel_matrix)

Unnamed: 0,0,1,2,3,4,5
0,1.0,0.993798,0.993249,0.977407,0.987774,0.987481
1,0.993798,1.0,0.999988,0.994849,0.998983,0.998793
2,0.993249,0.999988,1.0,0.995327,0.999189,0.999004
3,0.977407,0.994849,0.995327,1.0,0.998408,0.998313
4,0.987774,0.998983,0.999189,0.998408,1.0,0.999852
5,0.987481,0.998793,0.999004,0.998313,0.999852,1.0


Graphlet Kernel: Focuses on counting small connected subgraphs (graphlets) within the molecular graphs. This could add fine-grained comparison for small datasets.

In [30]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-7.0.5-py3-none-any.whl.metadata (24 kB)
Collecting wrapt (from smart-open>=1.8.1->gensim)
  Downloading wrapt-1.17.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.4 kB)
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m45.0 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m
[?25hDownloading numpy-1