In [1]:
import os
import warnings
import pickle
import sqlite3

import pandas
import numpy as np

from rdkit import Chem
from rdkit.Chem import AllChem

# import cuml
# cuml.set_global_output_type('input')
# cuml.global_output_type = 'input'

import cupy
import cudf
from cuml.manifold import UMAP
from cuml.decomposition import PCA, TruncatedSVD
from cuml.cluster import KMeans
from cuml.neighbors import NearestNeighbors
from cugraph import louvain, leiden, Graph
from cugraph.layout.force_atlas2 import force_atlas2

from bokeh.io.export import export_png
from bokeh.plotting import figure
from bokeh.models.tickers import FixedTicker
from bokeh.io import output_notebook, push_notebook, show

# Ignore warnings whose message starts with 'Expected ' ...
warnings.filterwarnings(action='ignore', message='Expected ')
# ... and then every other warning too. This blanket filter also covers the
# one above, so the notebook shows no warnings at all.
warnings.simplefilter(action='ignore')

# Render Bokeh figures inline in the notebook.
output_notebook()


## Settings

In [2]:
# Settings
# Morgan (ECFP) fingerprint parameters passed to GetMorganFingerprintAsBitVect
radius = 2
nBits = 512

pca_comps = 64  # n_components for the PCA steps
n_clusters = 6  # number of KMeans clusters
n_neighbors = 100  # shared by UMAP and NearestNeighbors
num_mols = 30000  # number of leading MinHash rows used by the subsampled pipelines


## Generate ECFP and MinHash Fingerprints

MinHash fingerprints are large integer values -- they should be normalized later.

In [3]:
# Source database and on-disk caches for the generated fingerprints.
chembl_db_file = '/data/db/chembl_27.db'

chembl_pkl_file = '/data/tmp/chembl.pkl'
fp_pkl_file = '/data/tmp/ecfp.pkl'
mh_pkl_file = '/data/tmp/minhash.pkl'

if os.path.exists(fp_pkl_file) and os.path.exists(mh_pkl_file):
    # NOTE: allow_pickle=True unpickles these files, which can execute arbitrary
    # code. Only point the paths above at caches written by this notebook.

    # ECFP fingerprints
    fp = cupy.load(fp_pkl_file, allow_pickle=True)

    # MinHash version of fingerprints
    mh = cupy.load(mh_pkl_file, allow_pickle=True)

else:
    # tmap seems to not be compatible with the RAPIDS conda environment -- I created a separate environment and installed
    # will also need numpy and pandas in the environment
    import tmap

    # Open the ChEMBL database read-only through a SQLite URI.
    chembl_db = 'file:{}?mode=ro'.format(chembl_db_file)
    minhash = tmap.Minhash()

    select_stmt = '''
        SELECT md.molregno, cs.canonical_smiles, cs.standard_inchi
        FROM compound_properties cp,
             molecule_dictionary md,
             compound_structures cs
        WHERE cp.molregno = md.molregno
              AND md.molregno = cs.molregno
        LIMIT 500000
    '''

    # Close the connection as soon as the query result is in memory.
    conn = sqlite3.connect(chembl_db, uri=True)
    try:
        df = pandas.read_sql(select_stmt, conn, index_col='molregno')
    finally:
        conn.close()

    # MolFromSmiles returns None for SMILES it cannot parse; drop those rows so
    # the fingerprint step below does not fail on them.
    df['mol'] = df['canonical_smiles'].map(Chem.MolFromSmiles)
    df = df[df['mol'].notnull()].copy()

    # Morgan bit vector -> bit string -> list of 0/1 ints -> MinHash of that list.
    df['fp'] = df['mol'].map(lambda x: AllChem.GetMorganFingerprintAsBitVect(x, radius=radius, nBits=nBits).ToBitString())
    df['fp'] = df['fp'].map(lambda x: [int(y) for y in x])
    df['mh'] = df['fp'].map(lambda x: list(minhash.from_binary_array(x)))

    # Build the host arrays first so a failed conversion never leaves an empty cache file.
    mh_host = np.array(df['mh'].tolist())
    with open(mh_pkl_file, 'wb') as fh:
        pickle.dump(mh_host, fh)

    fp_host = np.array(df['fp'].tolist())
    with open(fp_pkl_file, 'wb') as fh:
        pickle.dump(fp_host, fh)

    df.to_pickle(chembl_pkl_file)
    del df

    # Leave `fp` and `mh` defined exactly as the cached branch does, so the
    # notebook works on a fresh kernel without re-running this cell.
    fp = cupy.asarray(fp_host)
    mh = cupy.asarray(mh_host)
    del fp_host, mh_host

In [4]:
# Sanity check: matrix dimensions and the first ten values of the first MinHash row.
(mh.shape, mh[0][:10])

((500000, 128),
 array([ 71131466,  69994144, 305801762, 361969177, 103568912,  43153142,
        122906807, 194869780,  73700223,  20475546]))

## Plotting Function

In [5]:
# Hex color palette for cluster plots; cluster ids index into it modulo its length.
COLORS = ["#406278", "#e32636", "#9966cc", "#cd9575", "#915c83", "#008000",
          "#ff9966", "#848482", "#8a2be2", "#de5d83", "#800020", "#e97451",
          "#5f9ea0", "#36454f", "#008b8b", "#e9692c", "#f0b98d", "#ef9708",
          "#0fcfc0", "#9cded6", "#d5eae7", "#f3e1eb", "#f6c4e1", "#f79cd4"]


def show_cluster_plot(df, title='UMAP'):
    """
    Draw a scatter plot of a 2-D embedding with one color per cluster.

    Parameters
    ----------
    df : DataFrame
        Must have 'x', 'y' and 'cluster' columns. It is iterated with
        ``groupby('cluster')`` and each group's columns are converted with
        ``.to_pandas()``, so a cudf DataFrame is expected.
    title : str
        Title shown above the figure.

    The figure is displayed in the notebook; nothing is returned.
    """
    umap_fig = figure(title=title, width=800, output_backend="webgl")

    for cluster, dat in df.groupby('cluster'):
        # Wrap around the palette when there are more clusters than colors.
        color = COLORS[cluster % len(COLORS)]
        # `legend=` was deprecated in Bokeh 1.4 and later removed; `legend_label`
        # is its replacement for a fixed label string. `scatter` draws circles
        # by default.
        umap_fig.scatter(dat['x'].to_pandas(),
                         dat['y'].to_pandas(),
                         size=2,
                         color=color,
                         alpha=0.5,
                         legend_label='Cluster ' + str(cluster))

    umap_fig.legend.location = 'top_right'
    umap_fig.legend.title = 'Clusters'

    umap_fig_handle = show(umap_fig, notebook_handle=True)
    push_notebook(handle=umap_fig_handle)
    
def standard_scaler(df):
    """
    Standardize every column: ``(x - mean) / std``, computed along axis 0.

    Parameters
    ----------
    df : array-like
        Anything that provides ``mean(axis=0)`` / ``std(axis=0)`` and elementwise
        arithmetic (e.g. numpy or cupy arrays, pandas or cudf frames). The
        container's own ``std`` default (population vs. sample) applies.

    Returns
    -------
    Same kind of container as ``df``, with each column centred on 0 and scaled
    by its standard deviation. Constant columns come back as all zeros.
    """
    mean = df.mean(axis=0)
    std = df.std(axis=0)
    # A constant column has std == 0; adding the boolean mask turns that divisor
    # into 1, so the column maps to 0 instead of NaN/inf.
    std = std + (std == 0)
    return (df - mean) / std

## ECFP -> MinHash -> Scale -> PCA -> KMeans + UMAP

Similar to the previous version -- separation of clusters is very poor -- this is consistent with the publication.

In [6]:
# Standardize the first `num_mols` MinHash rows, then project them onto
# `pca_comps` principal components.
pca = PCA(n_components=pca_comps)
df_xf = pca.fit_transform(standard_scaler(mh[:num_mols]))

# 2-D UMAP embedding of the PCA-reduced data.
umap = UMAP(n_neighbors=n_neighbors, a=1.0, b=1.0, learning_rate=1.0)
Xt = umap.fit_transform(df_xf)

# KMeans labels computed on the same PCA-reduced data (not on the embedding).
clusters = KMeans(n_clusters=n_clusters).fit(df_xf).labels_

# Plot the embedding coordinates colored by cluster label.
plot_df = cudf.DataFrame(dict(x=Xt[:, 0], y=Xt[:, 1], cluster=clusters))
show_cluster_plot(plot_df)

## ECFP -> MinHash -> Scale -> SVD -> NearestNeighbors -> Louvain + UMAP

In [7]:
# Standardize the first `num_mols` MinHash rows and reduce them to two
# components with truncated SVD (used here in place of PCA).
svd = TruncatedSVD(n_components=2, output_type='input')
df_xf = svd.fit_transform(standard_scaler(mh[:num_mols]))

# Nearest-neighbors graph over the reduced data; the row/col indices of its
# COO form become the edge list.
nng = NearestNeighbors(n_neighbors=n_neighbors, output_type='input')
nng.fit(df_xf)
df_g = nng.kneighbors_graph(X=df_xf).tocoo()
edge_list = cudf.DataFrame(dict(row=df_g.row, col=df_g.col))

# Build the graph from that edge list.
G = Graph()
G.from_cudf_edgelist(edge_list, 'row', 'col')

# Louvain partitioning; the second return value is discarded. Labels are
# ordered by vertex id.
louvain_parts, _ = louvain(G)
louvain_clusters = louvain_parts.sort_values('vertex')['partition']

# 2-D UMAP embedding of the same reduced data.
umap = UMAP(n_neighbors=n_neighbors, a=0.5, b=1.0, learning_rate=1.0)
Xt = umap.fit_transform(df_xf)

# Plot the embedding coordinates colored by Louvain partition.
plot_df = cudf.DataFrame(dict(x=Xt[:, 0], y=Xt[:, 1], cluster=louvain_clusters))
show_cluster_plot(plot_df)


Same Louvain clusters, different UMAP settings -- looks worse.

In [8]:
# Re-embed whatever `df_xf` currently holds (the SVD output when the cells are
# run in order) with a different UMAP parameter set.
# NOTE(review): `a` and `b` are given explicitly alongside `min_dist`/`spread`;
# confirm against the cuML UMAP docs whether the latter two still have any effect.
umap = UMAP(n_neighbors=n_neighbors,
            min_dist=0.05, spread=1.0,
            a=1.9, b=1.5,
            learning_rate=1.0)
Xt = umap.fit_transform(df_xf)

# Color the new embedding with the existing `louvain_clusters` labels.
plot_df = cudf.DataFrame(dict(x=Xt[:, 0], y=Xt[:, 1], cluster=louvain_clusters))
show_cluster_plot(plot_df)

## ECFP -> MinHash -> Scale -> SVD -> NearestNeighbors -> Leiden + UMAP

In [9]:

# Standardize the first `num_mols` MinHash rows and reduce them to two
# components with truncated SVD (used here in place of PCA).
svd = TruncatedSVD(n_components=2)
df_xf = svd.fit_transform(standard_scaler(mh[:num_mols]))

# Nearest-neighbors graph over the reduced data; the row/col indices of its
# COO form become the edge list.
nng = NearestNeighbors(n_neighbors=n_neighbors, output_type='input')
nng.fit(df_xf)
df_g = nng.kneighbors_graph(X=df_xf).tocoo()
edge_list = cudf.DataFrame(dict(row=df_g.row, col=df_g.col))

# Build the graph from that edge list.
G = Graph()
G.from_cudf_edgelist(edge_list, 'row', 'col')

# Leiden partitioning; the second return value is discarded. Labels are
# ordered by vertex id.
leiden_parts, _ = leiden(G)
leiden_clusters = leiden_parts.sort_values('vertex')['partition']

# 2-D UMAP embedding of the same reduced data.
umap = UMAP(n_neighbors=n_neighbors, a=0.5, b=1.0, learning_rate=1.0)
Xt = umap.fit_transform(df_xf)

# Plot the embedding coordinates colored by Leiden partition.
plot_df = cudf.DataFrame(dict(x=Xt[:, 0], y=Xt[:, 1], cluster=leiden_clusters))
show_cluster_plot(plot_df)

## ECFP -> MinHash -> Scale -> PCA -> NearestNeighbors -> Leiden + ForceAtlas2

In [10]:
# Normalize -- this cell uses every row of `mh`, not just the first `num_mols`.
df_xf = standard_scaler(mh)

# PCA
pca = PCA(n_components=pca_comps)
df_xf = pca.fit_transform(df_xf)

# Calculate nearest neighbors graph and then extract indices (row/col) for edge list
nng = NearestNeighbors(n_neighbors=n_neighbors, output_type='input')
nng.fit(df_xf)
df_g = nng.kneighbors_graph(X=df_xf).tocoo()
edge_list = cudf.DataFrame({'row': df_g.row, 'col': df_g.col})

# Create a graph from the edgelist
G = Graph()
G.from_cudf_edgelist(edge_list, 'row', 'col')

# Perform clustering on the graph object (the second return value is discarded).
# sort_values keeps each row's original index label, and the DataFrame
# constructor below aligns Series by label. Without reset_index the
# vertex ordering would be undone and labels could be paired with the wrong
# coordinates.
leiden_parts, _ = leiden(G)
leiden_clusters = (leiden_parts
                   .sort_values('vertex')['partition']
                   .reset_index(drop=True))

# Force Atlas layout, put into the same vertex order with a fresh 0..n-1 index
Xt = (force_atlas2(G)
      .sort_values('vertex')[['x', 'y']]
      .reset_index(drop=True))

plot_df = cudf.DataFrame({'x': Xt['x'], 'y': Xt['y'], 'cluster': leiden_clusters})
# The coordinates come from ForceAtlas2, so do not use the default 'UMAP' title.
show_cluster_plot(plot_df, title='ForceAtlas2')