In [1]:
import numpy as np
import pandas as pd
from typing import Dict, List, Tuple

import umap.umap_ as umap
import umap.plot
import umap.utils as utils
import umap.aligned_umap
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

from rdkit.Chem import AllChem
from rdkit.Chem import PandasTools
from rdkit.Chem.rdMolDescriptors import GetMACCSKeysFingerprint

  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit(nopython=False)


In [4]:
from typing import Dict, List, Tuple
from tqdm import tqdm

# Register tqdm's pandas integration (the `progress_apply` family of methods).
# NOTE(review): no `progress_apply` call is visible in this part of the notebook.
tqdm.pandas()

def _encode_umap_targets(y: np.ndarray) -> np.ndarray:
    """Integer-encode class labels for UMAP while keeping unlabelled rows at ``-1``.

    Labelled values are mapped to ``0, 1, 2, ...`` in order of first appearance
    (same codes ``pd.factorize`` yields when no ``-1`` is present). Rows whose
    label is ``-1`` or missing stay ``-1`` so that UMAP's semi-supervised mode
    treats them as "no target information" instead of as an extra class.
    """
    labels = pd.Series(y)
    unlabelled = (labels.eq(-1) | labels.isna()).to_numpy()
    codes = np.full(len(labels), -1, dtype=np.int64)
    codes[~unlabelled] = pd.factorize(labels[~unlabelled].to_numpy())[0]
    return codes


def create_manifold(
    df: pd.DataFrame, 
    n_components: int,
    n_neighbors: int, 
    min_dist: float,
    metric: str,
    random_state: int,
    target_weight: float,
    supervised: bool,
) -> Tuple[umap.umap_.UMAP, np.ndarray]:
    """Fit a UMAP model on the feature columns of ``df``.

    Args:
        df: Frame holding the feature columns plus a ``label`` column (target)
            and a ``label_str`` column (display name of each row's class).
        n_components, n_neighbors, min_dist, metric, random_state: Forwarded
            unchanged to ``umap.UMAP``.
        target_weight: Forwarded to ``umap.UMAP`` only when ``supervised`` is true.
        supervised: When true the model is fitted with ``label`` as target;
            rows labelled ``-1`` are passed through as unlabelled
            (semi-supervised fit). When false the target is ignored.

    Returns:
        The fitted UMAP object and the ``label_str`` values as a 1-D array,
        in row order of ``df``.
    """
    x = df.drop(["label", "label_str"], axis=1)
    y = df["label"].to_numpy()
    y_str = df["label_str"].to_numpy(copy=True)

    # Shared configuration for both branches; n_jobs=1 is kept from the
    # original configuration.
    umap_kwargs = dict(
        n_components=n_components,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        metric=metric,
        random_state=random_state,
        n_jobs=1,
    )
    if supervised:
        # Previously pd.factorize(y) turned the -1 "unlabelled" marker into a
        # regular class code, so the semi-supervised fit saw a third class.
        y_encoded = _encode_umap_targets(y)
        manifold = umap.UMAP(target_weight=target_weight, **umap_kwargs).fit(x, y_encoded)
    else:
        manifold = umap.UMAP(**umap_kwargs).fit(x)
    return manifold, y_str

def _maccs_fingerprint_frame(smiles: pd.Series) -> pd.DataFrame:
    """Return one row of 167 MACCS key bits (columns ``0..166``) per SMILES string.

    Raises:
        ValueError: if RDKit cannot parse one or more of the SMILES strings.
    """
    mols = [AllChem.MolFromSmiles(smi) for smi in smiles]
    unparsable = [smi for smi, mol in zip(smiles, mols) if mol is None]
    if unparsable:
        raise ValueError(
            f"RDKit could not parse {len(unparsable)} SMILES string(s), e.g. {unparsable[:5]}"
        )
    # The bit string is a run of '0'/'1' characters; subtracting ord('0') from
    # the raw bytes turns it into a 0/1 integer vector.
    bit_rows = [
        (np.frombuffer(GetMACCSKeysFingerprint(mol).ToBitString().encode(), 'u1') - ord('0')).tolist()
        for mol in mols
    ]
    return pd.DataFrame(bit_rows, columns=[*range(0, 167, 1)])


def _build_umap_label_frames(
    final_csv: str,
    removed_csv: str,
    only_mismatched_removed: bool,
) -> Tuple[
    pd.DataFrame, 
    pd.DataFrame, 
    pd.DataFrame, 
    pd.DataFrame, 
    pd.DataFrame, 
    pd.DataFrame, 
]:
    """Load the kept and removed substances and build the six labelled fingerprint frames.

    Every returned frame has the fingerprint columns ``0..166`` followed by
    ``label`` and ``label_str``. Frames that stack kept and removed rows place
    the kept rows first and carry a fresh ``0..n-1`` row index.
    """
    df_final = pd.read_csv(final_csv, index_col=0)
    df_final = df_final[['cas', 'smiles','inchi_from_smiles', 'label', 'linear_label',
        'non_linear_label', 'miti_linear_label', 'miti_non_linear_label',
        'prediction_class']].reset_index(drop=True)
    df_removed = pd.read_csv(removed_csv, index_col=0)
    if only_mismatched_removed:
        df_removed = df_removed[df_removed["label"] != df_removed["prediction_class"]]
    df_removed = df_removed.reset_index(drop=True)

    fp_final = _maccs_fingerprint_frame(df_final["smiles"])
    fp_removed = _maccs_fingerprint_frame(df_removed["smiles"])
    final_labels = df_final["label"].to_numpy()
    removed_labels = df_removed["label"].to_numpy()

    def stack(final_label, removed_label) -> pd.DataFrame:
        # ignore_index avoids duplicate row labels (both parts start at 0).
        return pd.concat(
            [fp_final.assign(label=final_label), fp_removed.assign(label=removed_label)],
            axis=0,
            ignore_index=True,
        )

    # Kept vs. removed
    df_scs_removed_labels = stack("not removed", "removed")
    df_scs_removed_labels["label_str"] = df_scs_removed_labels['label']

    # Two labels: NRB and RB
    df_scs_two_labels = stack(final_labels, removed_labels)
    df_scs_two_labels["label_str"] = df_scs_two_labels['label'].map({0: "NRB", 1: "RB"})

    # Four labels: NRB, RB, NRB removed, RB removed (removed rows are shifted by 2)
    df_scs_four_labels = stack(final_labels, removed_labels + 2)
    df_scs_four_labels["label_str"] = df_scs_four_labels['label'].map(
        {0: "NRB", 1: "RB", 2: "NRB removed", 3: "RB removed"}
    )
    # Same row order in every stacked frame, so the display names can be copied positionally.
    four_label_names = df_scs_four_labels['label_str'].to_numpy()

    # Two numeric labels, four display names (colours)
    df_scs_two_labels_four_colors = stack(final_labels, removed_labels)
    df_scs_two_labels_four_colors["label_str"] = four_label_names

    # Kept substances only, labelled NRB / RB
    df_final_fp_labeled = fp_final.assign(label=final_labels)
    df_final_fp_labeled["label_str"] = df_final_fp_labeled['label'].map({0: "NRB", 1: "RB"})

    # Semi-supervised: removed rows get the target -1, display names as in the four-label frame
    df_scs_semi_supervised = stack(final_labels, -1)
    df_scs_semi_supervised["label_str"] = four_label_names

    return df_scs_removed_labels, df_scs_two_labels, df_scs_four_labels, df_scs_two_labels_four_colors, df_final_fp_labeled, df_scs_semi_supervised


def create_dfs_for_umap_before_readding(
    final_csv: str = "datasets/old_data/class_improved_env_biowin_both.csv",
    removed_csv: str = "datasets/old_data/class_improved_env_biowin_both_removed_predicted.csv",
) -> Tuple[
    pd.DataFrame, 
    pd.DataFrame, 
    pd.DataFrame, 
    pd.DataFrame, 
    pd.DataFrame, 
    pd.DataFrame, 
]:
    """Build the labelled MACCS-fingerprint frames from the data before re-adding.

    All rows of ``removed_csv`` are used as "removed" substances.

    Args:
        final_csv: CSV with the kept substances (needs ``smiles`` and ``label``).
        removed_csv: CSV with the removed substances (needs ``smiles`` and ``label``).

    Returns:
        ``(df_scs_removed_labels, df_scs_two_labels, df_scs_four_labels,
        df_scs_two_labels_four_colors, df_final_fp_labeled, df_scs_semi_supervised)``.
    """
    return _build_umap_label_frames(final_csv, removed_csv, only_mismatched_removed=False)


def create_dfs_for_umap(
    final_csv: str = "datasets/old_data/class_improved_env_biowin_both_readded.csv",
    removed_csv: str = "datasets/old_data/class_improved_env_biowin_both_removed_predicted.csv",
) -> Tuple[
    pd.DataFrame, 
    pd.DataFrame, 
    pd.DataFrame, 
    pd.DataFrame, 
    pd.DataFrame, 
    pd.DataFrame, 
]:
    """Build the labelled MACCS-fingerprint frames from the data after re-adding.

    Only rows of ``removed_csv`` whose ``label`` differs from
    ``prediction_class`` are used as "removed" substances.

    NOTE(review): the recorded run of this notebook failed with
    FileNotFoundError for the default ``final_csv``; another cell reads a file
    of the same name under ``dataframes/improved_data/`` - confirm which
    location is current and pass it explicitly if needed.

    Args:
        final_csv: CSV with the kept substances (needs ``smiles`` and ``label``).
        removed_csv: CSV with the removed substances (needs ``smiles``,
            ``label`` and ``prediction_class``).

    Returns:
        ``(df_scs_removed_labels, df_scs_two_labels, df_scs_four_labels,
        df_scs_two_labels_four_colors, df_final_fp_labeled, df_scs_semi_supervised)``.
    """
    return _build_umap_label_frames(final_csv, removed_csv, only_mismatched_removed=True)


def create_plot(
    manifolds: List[umap.umap_.UMAP], 
    y_strs: List[np.ndarray], 
    titles: List[str], 
    legend_loc: List[str], 
    number_of_labels: List[int],
    figure_title: str,
) -> None:
    """Draw one UMAP scatter panel per manifold on a two-column grid and save it.

    Args:
        manifolds: Fitted UMAP objects, one per panel.
        y_strs: Per-panel class names used to colour the points.
        titles: Per-panel caption, written as the x-axis label.
        legend_loc: Per-panel matplotlib legend location.
        number_of_labels: Per-panel legend size; ``4`` adds the two
            "removed" entries, any other value shows only NRB and RB.
        figure_title: File stem; the figure is written to
            ``figures/<figure_title>.png`` (the directory must already exist).

    Raises:
        ValueError: if ``manifolds`` is empty.
    """
    num_plots = len(manifolds)
    if num_plots == 0:
        raise ValueError("create_plot needs at least one manifold to draw")
    # Round up so an odd number of manifolds still gets a row for its last panel.
    rows_of_plots = (num_plots + 1) // 2
    fig, axs = plt.subplots(rows_of_plots, 2, figsize=(20, 10), squeeze=False)

    color_key = {
        'NRB': 'royalblue', 
        'NRB removed': 'limegreen', 
        'RB': 'darkorange', 
        'RB removed': 'fuchsia',
    }

    for i, ax in enumerate(axs.flatten()):
        if i >= num_plots:
            # Spare cell of the grid (odd number of panels): leave it blank.
            ax.set_visible(False)
            continue
        umap.plot.points(
            umap_object=manifolds[i], 
            labels=y_strs[i], 
            color_key=color_key, 
            background='black',
            ax=ax,
        ) 
        ax.set_xlabel(titles[i], fontsize=22)
        legend_names = ['NRB', 'RB']
        if number_of_labels[i] == 4:
            legend_names += ['NRB removed', 'RB removed']
        handles = [mpatches.Patch(color=color_key[name], label=name) for name in legend_names]
        ax.legend(loc=legend_loc[i], handles=handles, fontsize=16)

    # Save once, after every panel is drawn (previously re-saved on each iteration).
    fig.savefig(f'figures/{figure_title}.png')


## Unsupervised and semisupervised before readding

In [None]:
def create_plot_umap_semisupervised(
    n_components: int, 
    n_neighbors: int, 
    min_dist: float, 
    target_weight: float, 
    metric: str, 
    random_state: int,
    figure_title: str = "umap_semisupervised",
) -> None:
    """Plot unsupervised vs. semi-supervised UMAP for the data before re-adding.

    Fits one unsupervised manifold on the four-label frame and one
    semi-supervised manifold on the frame whose removed rows are labelled
    ``-1``, both from ``create_dfs_for_umap_before_readding``, and hands them
    to ``create_plot``.

    Args:
        n_components, n_neighbors, min_dist, metric, random_state: UMAP
            settings forwarded to ``create_manifold``.
        target_weight: Weight of the target in the semi-supervised fit; also
            shown in the caption of panel b).
        figure_title: File stem passed to ``create_plot``. The default is the
            name this function always used; pass a distinct value to keep this
            figure from being overwritten by other variants of the plot.
    """
    _, _, df_scs_four_labels, _, _, df_scs_semi_supervised = create_dfs_for_umap_before_readding()

    manifolds = []
    y_strs = []
    # Panel a) ignores the labels, panel b) uses them as (partial) target.
    for frame, supervised in ((df_scs_four_labels, False), (df_scs_semi_supervised, True)):
        manifold, y_str = create_manifold(
            df=frame, 
            n_components=n_components,
            n_neighbors=n_neighbors, 
            min_dist=min_dist,
            metric=metric,
            random_state=random_state,
            target_weight=target_weight,
            supervised=supervised,
        )
        manifolds.append(manifold)
        y_strs.append(y_str)

    titles = [
        'a) $Curated_{SCS}$ unsupervised',
        'b) $Curated_{SCS}$ semi-supervised \n' + f'(target_weight={target_weight})',
    ]

    create_plot(
        manifolds=manifolds, 
        y_strs=y_strs, 
        titles=titles, 
        legend_loc=['upper left', 'upper left'], 
        number_of_labels=[4, 4],
        figure_title=figure_title,
    )

# Produce the two-panel figure (unsupervised vs. semi-supervised) for the
# data before re-adding, with the UMAP settings listed here.
create_plot_umap_semisupervised(
    n_components=2, 
    n_neighbors=30, 
    min_dist=0.5, 
    target_weight=0.2, 
    metric="manhattan", 
    random_state=42,
)

## Unsupervised and semisupervised after readding

In [7]:
def create_plot_umap_semisupervised(
    n_components: int, 
    n_neighbors: int, 
    min_dist: float, 
    target_weight: float, 
    metric: str, 
    random_state: int,
    figure_title: str = "umap_semisupervised",
) -> None:
    """Plot unsupervised vs. semi-supervised UMAP for the data after re-adding.

    Fits one unsupervised manifold on the four-label frame and one
    semi-supervised manifold on the frame whose removed rows are labelled
    ``-1``, both from ``create_dfs_for_umap``, and hands them to
    ``create_plot``.

    Args:
        n_components, n_neighbors, min_dist, metric, random_state: UMAP
            settings forwarded to ``create_manifold``.
        target_weight: Weight of the target in the semi-supervised fit; also
            shown in the caption of panel b).
        figure_title: File stem passed to ``create_plot``. The default is the
            name this function always used; pass a distinct value to keep this
            figure from being overwritten by other variants of the plot.
    """
    _, _, df_scs_four_labels, _, _, df_scs_semi_supervised = create_dfs_for_umap()

    manifolds = []
    y_strs = []
    # Panel a) ignores the labels, panel b) uses them as (partial) target.
    for frame, supervised in ((df_scs_four_labels, False), (df_scs_semi_supervised, True)):
        manifold, y_str = create_manifold(
            df=frame, 
            n_components=n_components,
            n_neighbors=n_neighbors, 
            min_dist=min_dist,
            metric=metric,
            random_state=random_state,
            target_weight=target_weight,
            supervised=supervised,
        )
        manifolds.append(manifold)
        y_strs.append(y_str)

    titles = [
        'a) $Curated_{SCS}$ unsupervised',
        'b) $Curated_{SCS}$ semi-supervised \n' + f'(target_weight={target_weight})',
    ]

    create_plot(
        manifolds=manifolds, 
        y_strs=y_strs, 
        titles=titles, 
        legend_loc=['upper left', 'upper left'], 
        number_of_labels=[4, 4],
        figure_title=figure_title,
    )

# Produce the two-panel figure (unsupervised vs. semi-supervised) for the
# data after re-adding, with the UMAP settings listed here.
# NOTE(review): the recorded run of this cell ended in a FileNotFoundError for
# 'datasets/old_data/class_improved_env_biowin_both_readded.csv'.
create_plot_umap_semisupervised(
    n_components=2, 
    n_neighbors=30, 
    min_dist=0.5, 
    target_weight=0.2, 
    metric="manhattan", 
    random_state=42,
)

FileNotFoundError: [Errno 2] No such file or directory: 'datasets/old_data/class_improved_env_biowin_both_readded.csv'

## Fit on final and transform removed

In [None]:
def create_plots_fit_transform(n_components, n_neighbors, min_dist, target_weight, metric, random_state):
    """Fit supervised UMAP on the kept substances and project the removed ones into it.

    Reads the kept and removed substances from ``dataframes/improved_data/``,
    turns their SMILES into MACCS fingerprints, fits UMAP on the kept rows with
    their label as target, writes the resulting embedding to
    ``dataframes/embeddings_final.csv`` and draws two scatter panels: the
    fitted embedding and the transformed removed rows. Only removed rows whose
    ``label`` differs from ``prediction_class`` are used.
    """
    kept_columns = ['cas', 'smiles','inchi_from_smiles', 'label', 'linear_label',
        'non_linear_label', 'miti_linear_label', 'miti_non_linear_label',
        'prediction_class']

    df_final = pd.read_csv("dataframes/improved_data/class_improved_env_biowin_both_readded.csv", index_col=0)
    df_final = df_final[kept_columns].reset_index(drop=True)

    df_removed = pd.read_csv("dataframes/improved_data/class_improved_env_biowin_both_removed_predicted.csv", index_col=0)
    mismatched = df_removed["label"] != df_removed["prediction_class"]
    df_removed = df_removed[mismatched].reset_index(drop=True)

    def fingerprint_frame(smiles_column: pd.Series) -> pd.DataFrame:
        # One row of MACCS key bits per molecule, columns named 0..166.
        bit_rows = []
        for smiles in smiles_column:
            mol = AllChem.MolFromSmiles(smiles)
            bit_string = GetMACCSKeysFingerprint(mol).ToBitString()
            bit_rows.append([int(bit) for bit in bit_string])
        return pd.DataFrame(bit_rows, columns=list(range(167)))

    x_final = fingerprint_frame(df_final["smiles"])
    x_removed = fingerprint_frame(df_removed["smiles"])

    y_final = df_final["label"].to_numpy()
    final_names = df_final["label"].map({0: "NRB", 1: "RB"}).to_numpy()
    removed_names = df_removed["label"].map({0: "NRB removed", 1: "RB removed"}).to_numpy()

    # Fit on the kept data, then place the removed data into the same space.
    reducer = umap.UMAP(
        n_components=n_components, 
        n_neighbors=n_neighbors, 
        min_dist=min_dist, 
        metric=metric, 
        random_state=random_state, 
        n_jobs=1,
        target_weight=target_weight,
    )
    manifold_final = reducer.fit(x_final, y_final)
    pd.DataFrame(manifold_final.embedding_, columns=["x_value", "y_value"]).to_csv("dataframes/embeddings_final.csv")
    removed_embedding = manifold_final.transform(x_removed)

    # Removed classes share the colour of their kept counterpart; any other
    # value is passed through unchanged.
    palette = {
        "NRB": 'royalblue',
        "RB": 'darkorange',
        "NRB removed": 'royalblue',
        "RB removed": 'darkorange',
    }

    def to_colors(names) -> np.ndarray:
        return np.array([palette.get(name, name) for name in names], dtype=object)

    # (embedding, point colours, caption, legend entries) for each panel
    panels = [
        (
            manifold_final.embedding_,
            to_colors(final_names),
            'a) 2D embeddings of the $Curated_{Final}$ data',
            ("NRB", "RB"),
        ),
        (
            removed_embedding,
            to_colors(removed_names),
            'b) Transformed $Removed$ data',
            ("NRB removed", "RB removed"),
        ),
    ]

    _, axs = plt.subplots(1, 2, figsize=(20, 6))
    for ax, (embedding, colors, caption, legend_names) in zip(axs.flatten(), panels):
        ax.scatter(embedding[:, 0], embedding[:, 1], s=3, c=colors)
        ax.set_xlabel(caption, fontsize=22)
        ax.set_facecolor('black')
        first_patch = mpatches.Patch(color='royalblue', label=legend_names[0])
        second_patch = mpatches.Patch(color='darkorange', label=legend_names[1])
        ax.legend(loc='lower right', handles=[first_patch, second_patch], fontsize=18)

# UMAP settings for the fit-on-final / transform-removed figure.
# These are module-level names, so later cells can read them as well.
n_components = 2
n_neighbors = 30
target_weight = 0.2
random_state=42

min_dist = 0.5
metric = "manhattan"

# Fits the manifold, writes dataframes/embeddings_final.csv and draws the two panels.
create_plots_fit_transform(n_components=n_components, n_neighbors=n_neighbors, min_dist=min_dist, target_weight=target_weight, metric=metric, random_state=random_state)

In [None]:
# Analyse third cluster
from rdkit import Chem


def contains_ring(smiles):
    """Return the number of rings RDKit reports for the molecule parsed from ``smiles``."""
    mol = Chem.MolFromSmiles(smiles)  # can use Chem.MolFromInchi for InChI strings
    num_rings = mol.GetRingInfo().NumRings()
    return num_rings


def _percentage(part, whole):
    """Share of ``part`` in ``whole`` in percent; NaN when ``whole`` is empty."""
    return len(part) / len(whole) * 100 if len(whole) else float("nan")


df_embeddings_final = pd.read_csv("dataframes/embeddings_final.csv", index_col=0)
print(len(df_embeddings_final))
# Rectangular selection of the cluster in embedding space (thresholds chosen by the author).
df_cluster = df_embeddings_final[(df_embeddings_final["x_value"] > 10.0) & (df_embeddings_final["y_value"] < 15.0)]

# The embeddings were saved with a fresh 0..n-1 index, computed after the
# source frame's index had been reset. Reset it here too so that the index
# labels of both frames refer to the same rows.
df_final = pd.read_csv("dataframes/improved_data/class_improved_env_biowin_both_readded.csv", index_col=0).reset_index(drop=True)
assert len(df_final) == len(df_embeddings_final), "embeddings_final.csv does not match the current data set"

# NOTE(review): the element checks below are plain substring tests on the
# SMILES text, so "S" also matches e.g. "Si"/"Se" and misses aromatic "s".
df_final_in_cluster = df_final[df_final.index.isin(df_cluster.index)]
print("Data points in cluster: ", len(df_final_in_cluster))
df_final_in_cluster_with_s = df_final_in_cluster[df_final_in_cluster['smiles'].str.contains("S")]
print("Percentage in cluster that contains S: ", _percentage(df_final_in_cluster_with_s, df_final_in_cluster))
# .copy(): a column is added to this subset below.
df_final_in_cluster_with_s_o = df_final_in_cluster_with_s[df_final_in_cluster_with_s['smiles'].str.contains("O")].copy()
print("Percentage in cluster that contains S and O: ", _percentage(df_final_in_cluster_with_s_o, df_final_in_cluster))
df_final_in_cluster_with_s_o_n = df_final_in_cluster_with_s_o[df_final_in_cluster_with_s_o['smiles'].str.contains("N")]
print("Percentage in cluster that contains S, O, and N: ", _percentage(df_final_in_cluster_with_s_o_n, df_final_in_cluster))
df_final_in_cluster_with_s_o['num_rings'] = df_final_in_cluster_with_s_o['smiles'].apply(contains_ring)
df_final_in_cluster_with_s_o_ring = df_final_in_cluster_with_s_o[df_final_in_cluster_with_s_o['num_rings']>0]
print("Percentage in cluster that contains S, O, and at least one ring: ", _percentage(df_final_in_cluster_with_s_o_ring, df_final_in_cluster))
# all of the substances in the cluster are organosulfur compounds


# .copy(): a column is added to this subset below.
df_final_not_in_cluster = df_final[~df_final.index.isin(df_cluster.index)].copy()
print("Data points not in cluster: ", len(df_final_not_in_cluster))
df_final_not_in_cluster_with_s = df_final_not_in_cluster[df_final_not_in_cluster['smiles'].str.contains("S")]
print("Percentage not in cluster that contains S: ", _percentage(df_final_not_in_cluster_with_s, df_final_not_in_cluster))
df_final_not_in_cluster_with_s_o = df_final_not_in_cluster[(df_final_not_in_cluster['smiles'].str.contains("O")) & (df_final_not_in_cluster['smiles'].str.contains("S"))].copy()
print("Percentage not in cluster that contains S and O: ", _percentage(df_final_not_in_cluster_with_s_o, df_final_not_in_cluster))
print("Number of organosulfur chemicals outside cluster: ", len(df_final_not_in_cluster_with_s_o))
print("Number of organosulfur chemicals outside cluster which are RB: ", len(df_final_not_in_cluster_with_s_o[df_final_not_in_cluster_with_s_o["label"]==1]))
df_final_not_in_cluster_o = df_final_not_in_cluster[(df_final_not_in_cluster['smiles'].str.contains("O"))]
print("Percentage not in cluster that contains O: ", _percentage(df_final_not_in_cluster_o, df_final_not_in_cluster))
df_final_not_in_cluster_n = df_final_not_in_cluster[df_final_not_in_cluster['smiles'].str.contains("N")]
print("Percentage not in cluster that contains N: ", _percentage(df_final_not_in_cluster_n, df_final_not_in_cluster))
df_final_not_in_cluster['num_rings'] = df_final_not_in_cluster['smiles'].apply(contains_ring)
df_final_not_in_cluster_ring = df_final_not_in_cluster[df_final_not_in_cluster['num_rings']>0]
print("Percentage not in cluster that contains at least one ring: ", _percentage(df_final_not_in_cluster_ring, df_final_not_in_cluster))
df_final_not_in_cluster_with_s_o['num_rings'] = df_final_not_in_cluster_with_s_o['smiles'].apply(contains_ring)
df_final_not_in_cluster_with_s_o_ring = df_final_not_in_cluster_with_s_o[df_final_not_in_cluster_with_s_o['num_rings']>0]
# Label corrected: this figure is about the substances outside the cluster.
print("Percentage not in cluster that contains S, O, and at least one ring: ", _percentage(df_final_not_in_cluster_with_s_o_ring, df_final_not_in_cluster))


rb_in_cluster = df_final_in_cluster[df_final_in_cluster["label"]==1]
rb_in_luster = rb_in_cluster  # original (misspelled) name kept for any later cell that reads it
print(len(rb_in_luster))