In [1]:
import os.path

import numpy as np
import pandas as pd
from ogb.graphproppred import GraphPropPredDataset
from skfp.preprocessing import MolFromSmilesTransformer, ConformerGenerator



# Process the datasets

The datasets store molecules as SMILES strings.

To compute the fingerprints we need molecule objects, so the SMILES strings are parsed first.

Some fingerprints require 3D conformers, so we generate those too.

In [2]:
DATASET_NAMES = ["ogbg-molbace", "ogbg-molbbbp", "ogbg-molhiv"]
# Name of the label column in each dataset's mol.csv.gz, aligned with DATASET_NAMES.
CLASS_NAMES = ["Class", "p_np", "HIV_active"]

# Single location used both for the OGB download and for reading the CSV back.
# The two paths must agree, otherwise a fresh run reads from a directory
# that was never populated.
DATASET_ROOT = "../dataset"

os.makedirs("saved_mols", exist_ok=True)

# One (molecules, labels) pair per entry of DATASET_NAMES, in the same order.
all_datasets = []
for dataset_name, feature in zip(DATASET_NAMES, CLASS_NAMES):
    print(dataset_name)
    labels_path = f"saved_mols/{dataset_name}_mols_labels.npy"
    mols_path = f"saved_mols/{dataset_name}_mols_with_conformers.npy"

    # Both cache files are required: if only one was written (e.g. the run was
    # interrupted between the two saves) the data is recomputed instead of
    # failing on the missing file.
    if os.path.exists(labels_path) and os.path.exists(mols_path):
        # NOTE: allow_pickle=True executes pickled code on load; only load
        # cache files produced by this notebook.
        y = np.load(labels_path, allow_pickle=True)
        X = np.load(mols_path, allow_pickle=True)
    else:
        # Constructed only for its side effect (the result is discarded);
        # presumably this downloads and extracts the dataset into DATASET_ROOT.
        GraphPropPredDataset(name=dataset_name, root=DATASET_ROOT)
        # The on-disk directory uses underscores where the dataset name has hyphens.
        dataset = pd.read_csv(
            f"{DATASET_ROOT}/{dataset_name.replace('-', '_')}/mapping/mol.csv.gz"
        )

        X = dataset["smiles"]
        X = MolFromSmilesTransformer().transform(X)
        y = dataset[feature]

        # transform_x_y returns molecules together with their labels, so the two
        # stay aligned; error_on_gen_fail=False is passed so that generation
        # failures do not raise.
        X, y = np.array(
            ConformerGenerator(n_jobs=-1, error_on_gen_fail=False).transform_x_y(X, y)
        )
        np.save(labels_path, y, allow_pickle=True)
        np.save(mols_path, X, allow_pickle=True)

    all_datasets.append((X, y))
    

ogbg-molbace
ogbg-molbbbp
ogbg-molhiv


# Compute, scale and save the fingerprints

We compute each fingerprint once and save it to disk, so that later steps can reuse the results without recomputing them.

In [3]:
from skfp.fingerprints import *

# Fingerprint classes (not instances) to evaluate on every dataset.
# Later cells iterate over this list and use each class's __name__ for
# cache and plot file names.
# The commented-out entries (E3FP, GETAWAY, Pharmacophore) are excluded from
# the run; the notebook does not record why.
all_fingerprints = [
    AtomPairFingerprint,
    AutocorrFingerprint,
    AvalonFingerprint,
    # E3FPFingerprint,
    ECFPFingerprint,
    ERGFingerprint,
    EStateFingerprint,
    # GETAWAYFingerprint,
    GhoseCrippenFingerprint,
    KlekotaRothFingerprint,
    LaggnerFingerprint,
    LayeredFingerprint,
    LingoFingerprint,
    MACCSFingerprint,
    MAPFingerprint,
    MHFPFingerprint,
    MordredFingerprint,
    MORSEFingerprint,
    PatternFingerprint,
    # PharmacophoreFingerprint,
    PhysiochemicalPropertiesFingerprint,
    PubChemFingerprint,
    RDFFingerprint,
    RDKitFingerprint,
    SECFPFingerprint,
    TopologicalTorsionFingerprint,
    USRFingerprint,
    USRCATFingerprint,
    WHIMFingerprint,
]


In [4]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler

# transformed_datasets[i][j] is the (features, labels) pair for dataset i and
# fingerprint j, following the order of DATASET_NAMES and all_fingerprints.
transformed_datasets = []
for dataset_name, data in zip(DATASET_NAMES, all_datasets):
    print(dataset_name)

    os.makedirs(f"saved_fingerprints/{dataset_name}", exist_ok=True)
    X, y = data

    fingerprints_for_dataset = []
    transformed_datasets.append(fingerprints_for_dataset)
    for fingerprint in all_fingerprints:
        print(f" - - - {fingerprint.__name__}")

        features_path = f"saved_fingerprints/{dataset_name}/{fingerprint.__name__}.npy"
        labels_path = f"saved_fingerprints/{dataset_name}/{fingerprint.__name__}_labels.npy"
        # Some fingerprints can't process all molecules; those are computed via
        # transform_x_y and get their own labels file, since their labels may
        # differ in size from the dataset's.
        has_own_labels = fingerprint.__name__ in ("USRFingerprint", "USRCATFingerprint")

        if os.path.exists(features_path):
            # Cached: reuse the stored features (and stored labels where applicable).
            X_transformed = np.load(features_path, allow_pickle=True)
            if has_own_labels:
                y_transformed = np.load(labels_path, allow_pickle=True)
            else:
                y_transformed = y
        else:
            if has_own_labels:
                X_transformed, y_transformed = fingerprint(n_jobs=-1).transform_x_y(X, y)
                np.save(labels_path, y_transformed, allow_pickle=True)
            else:
                X_transformed = fingerprint(n_jobs=-1).transform(X)
                y_transformed = y

            # Preprocessing: impute first because the Mordred fingerprint can
            # have missing values, then scale.
            X_transformed = SimpleImputer(strategy="median").fit_transform(X_transformed)
            X_transformed = RobustScaler().fit_transform(X_transformed)

            np.save(features_path, X_transformed, allow_pickle=True)

        fingerprints_for_dataset.append((X_transformed, y_transformed))
                
        

ogbg-molbace
 - - - AtomPairFingerprint
 - - - AutocorrFingerprint
 - - - AvalonFingerprint
 - - - ECFPFingerprint
 - - - ERGFingerprint
 - - - EStateFingerprint
 - - - GhoseCrippenFingerprint
 - - - KlekotaRothFingerprint
 - - - LaggnerFingerprint
 - - - LayeredFingerprint
 - - - LingoFingerprint
 - - - MACCSFingerprint
 - - - MAPFingerprint
 - - - MHFPFingerprint
 - - - MordredFingerprint
 - - - MORSEFingerprint
 - - - PatternFingerprint
 - - - PhysiochemicalPropertiesFingerprint
 - - - PubChemFingerprint
 - - - RDFFingerprint
 - - - RDKitFingerprint
 - - - SECFPFingerprint
 - - - TopologicalTorsionFingerprint
 - - - USRFingerprint
 - - - USRCATFingerprint
 - - - WHIMFingerprint
ogbg-molbbbp
 - - - AtomPairFingerprint
 - - - AutocorrFingerprint
 - - - AvalonFingerprint
 - - - ECFPFingerprint
 - - - ERGFingerprint
 - - - EStateFingerprint
 - - - GhoseCrippenFingerprint
 - - - KlekotaRothFingerprint
 - - - LaggnerFingerprint
 - - - LayeredFingerprint
 - - - LingoFingerprint
 - - - MACC

# Visualize the fingerprints using four dimensionality reduction methods (t-SNE, UMAP, TriMap, PaCMAP)

In [5]:
from matplotlib import pyplot as plt
from pacmap import PaCMAP
from trimap import TRIMAP
from umap import UMAP
from sklearn.manifold import TSNE

In [6]:
def plot_classes(X_2D, y, name, file_path):
    """Scatter-plot a 2D embedding coloured by class and save it to disk.

    Parameters
    ----------
    X_2D : 2D array indexed as ``X_2D[:, 0]`` and ``X_2D[:, 1]``
        The two embedding components, one row per sample.
    y : array-like
        Class label per sample, used as the point colour.
    name : str
        Plot title.
    file_path : str
        Destination file for the saved figure.

    The figure is closed after saving, so it is not left open (or shown
    inline) when this function is called many times in a loop.
    """
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.set_xlabel("Component 1", fontsize=15)
    ax.set_ylabel("Component 2", fontsize=15)
    ax.set_title(name, fontsize=20)
    scatter = ax.scatter(X_2D[:, 0], X_2D[:, 1], c=y, marker=".")
    # ax.legend already attaches the legend to the axes; no add_artist needed
    # for a single legend.
    ax.legend(*scatter.legend_elements(), loc="best", title="Classes")
    fig.tight_layout()
    # savefig is a Figure method; Axes has no savefig.
    fig.savefig(file_path)
    plt.close(fig)

In [7]:
# Dimensionality-reduction classes; each is instantiated with its default
# settings for every plot.
dim_reduction_transformers = [TSNE, UMAP, TRIMAP, PaCMAP]

for dataset_name, data in zip(DATASET_NAMES, transformed_datasets):
    for fingerprint, transformed_mols in zip(all_fingerprints, data):
        name = fingerprint.__name__
        for transformer in dim_reduction_transformers:
            plot_dir = f"saved_plots/{dataset_name}/{transformer.__name__}"
            plot_path = f"{plot_dir}/{name}.png"
            os.makedirs(plot_dir, exist_ok=True)

            # Existing plots are kept, so re-running the cell only fills in
            # the missing ones.
            if os.path.exists(plot_path):
                continue

            X, y = transformed_mols
            # The list holds classes, so an instance must be created first.
            # fit_transform is used because the embedding is learned from X
            # itself (TSNE offers no separate transform step).
            X_2D = transformer().fit_transform(X)
            plot_classes(X_2D, y, name, plot_path)
                