# Add Cluster Similarities to Data Set

For each measurement,

* add the similarity to each functional cluster
* add the cluster with the highest similarity and its similarity value

In [1]:
%reload_ext autoreload
%autoreload 2

import os
import os.path as op
import sys

from typing import Iterable, List

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import HTML, display 

import ipywidgets as ipyw
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the
# last one (IPython's default is "last_expr").
InteractiveShell.ast_node_interactivity = "all"

from jupy_tools import plt_style, cpa
from jupy_tools import utils as u
from jupy_tools.utils import info
u.timestamp()

Timestamp: 05-Jul-2022 12:15:28


## Load Data Set

In [2]:
# Read the reference data set from its TSV file; the recorded run reported
# a shape of [3560 / 589] for this table.
ds_refs = u.read_tsv("../input/ds_refs.tsv")

read_tsv                           : [    3560 / 589 ] 


## Add Similarities to Functional Clusters

In [3]:
# Annotate every measurement with the functional-cluster columns. In the
# recorded run this grew the table from 589 to 601 columns, which matches
# the Cluster_* similarity columns plus Cluster_High and Cluster_Sim that
# are selected further down in this notebook.
ds_refs_sim = cpa.add_func_clusters(ds_refs)
info(ds_refs_sim)

Shape                              : [    3560 / 601 ] 


In [4]:
# Persist the fully annotated table; the pair-plot section at the end of the
# notebook reads this same file back in.
u.write_tsv(ds_refs_sim, "../output/ds_refs_sim_to_clusters.tsv")

## Reduced Data Set for DataWarrior

In [5]:
# Names of the per-cluster similarity columns (with the "Cluster_" prefix).
clusters = cpa.get_func_cluster_names(prefix="Cluster_")

# Column layout of the reduced table: measurement identifiers and read-outs,
# the per-cluster similarities, the best-matching cluster with its value,
# and finally the compound annotations and structure.
columns = [
    "Compound_Id", "Well_Id", "Is_Ref", "Conc_uM", "Rel_Cell_Count", "Induction",
    *clusters,
    "Cluster_High", "Cluster_Sim",
    "Trivial_Name", "Known_Act",
    "Chiral", "Smiles",
]

# Independent copy restricted to the selected columns; `ds_refs_sim` itself
# is left untouched.
df = ds_refs_sim[columns].copy()
info(df)

Shape                              : [    3560 /  22 ] 


In [6]:
# Write the reduced table. It goes through the same TSV writer as the other
# outputs but uses a .txt extension (presumably what DataWarrior expects --
# TODO confirm). The next section reads this file back in.
u.write_tsv(df, "../output/ds_refs_sim_to_clusters_dw.txt")

## Add Cluster Similarities to List of Cluster Defining Compounds

In [2]:
# Cluster-defining compounds; the recorded run lists the columns
# Well_Id, Cluster, Induction, Conc_uM and Trivial_Name.
df_cl = u.read_tsv("../input/cluster_cpds.tsv")
# Reduced reference table as written by the DataWarrior section above. It is
# re-read from disk rather than taken from the in-memory `df`.
# NOTE(review): the recorded execution count suggests this section was run
# from a fresh kernel after the import cell -- confirm it still works in a
# top-to-bottom run.
df_ref = u.read_tsv("../output/ds_refs_sim_to_clusters_dw.txt")

read_tsv                           : [     227 /   5 ] ( Well_Id, Cluster, Induction, Conc_uM, Trivial_Name )
read_tsv                           : [    3560 /  22 ] 


In [8]:
# For every functional cluster, look up the similarity of each of its
# defining compounds to that same cluster and collect it in a single
# "Cluster_Sim" column.
clusters = cpa.get_func_cluster_names()
per_cluster_frames = []
for cl in clusters:
    sim_col = f"Cluster_{cl}"

    # Compounds that define the current cluster; every cluster must have some.
    members = df_cl[df_cl["Cluster"] == cl]
    assert len(members) > 0

    # Left join on Well_Id keeps every defining compound, even when the
    # reference table has no matching well (its similarity is then NaN).
    members_with_sim = (
        members
        .merge(df_ref[["Well_Id", sim_col]], on="Well_Id", how="left")
        .rename(columns={sim_col: "Cluster_Sim"})
        .round(decimals={"Cluster_Sim": 0})
    )
    per_cluster_frames.append(members_with_sim)

df_res = pd.concat(per_cluster_frames)
u.write_tsv(df_res, "../output/cluster_cpds_with_cluster_sim.tsv")

## Create Data Set for Web App

In [17]:
# Keep only the rows flagged as references. The copy is taken *after* the
# filter: assigning new columns to the direct result of `query()` makes
# classic pandas emit a SettingWithCopyWarning, and copying the full table
# before filtering only duplicated rows that were discarded right away.
df_wa = df.query("Is_Ref").copy()

# Split Well_Id on ":" into at most three parts. Only the second part
# ("Batch_No") is kept by the web-app column selection in the next cell;
# "C_Id" and "Rest" are helper columns.
# NOTE(review): this assumes every Well_Id contains at least two ":" --
# otherwise the split yields fewer than three columns and the assignment fails.
df_wa[["C_Id", "Batch_No", "Rest"]] = df_wa["Well_Id"].str.split(":", n=2, expand=True)

In [18]:
# Column layout of the web-app export.
# NOTE(review): the Cluster_* columns are hard-coded here, whereas the
# DataWarrior section derives them from
# cpa.get_func_cluster_names(prefix="Cluster_") -- keep the two in sync.
columns = [
    "Compound_Id",
    "Batch_No",
    "Well_Id",
    "Is_Ref",
    "Conc_uM",
    "Rel_Cell_Count",
    "Induction",
    "Cluster_AKT_PI3K_MTOR",
    "Cluster_Aurora",
    "Cluster_BET",
    "Cluster_DNA_Synth",
    "Cluster_HDAC",
    "Cluster_HSP90",
    "Cluster_LCH",
    "Cluster_Protein_Synth",
    "Cluster_Tubulin",
    "Cluster_Uncoupler",
    "Cluster_High",
    "Cluster_Sim",
    "Trivial_Name",
    "Known_Act",
    "Chiral",
    "Smiles",
]

# Build the export frame in one chain instead of assigning a column in place:
# `df_wa` originates from a `query()` filter, and in-place column assignment
# on such a frame triggers SettingWithCopyWarning in classic pandas.
df_wa = (
    df_wa
    # Batch_No comes out of a string split, so convert it to int to make the
    # sort below numeric rather than lexicographic.
    .assign(Batch_No=lambda frame: frame["Batch_No"].astype(int))
    .loc[:, columns]
    .sort_values(["Compound_Id", "Batch_No", "Conc_uM"])
)

In [19]:
# Write the sorted, reference-only table with the web-app column layout.
u.write_tsv(df_wa, "../output/ds_refs_sim_to_clusters_webapp.tsv")

## Cross-Similarity Plot (Pair Plot)

In [2]:
u.timestamp()
# Re-load the full annotated table written earlier in this notebook and keep
# only the rows where Is_Ref is set (recorded run: 3560 -> 3547 rows).
# NOTE: this rebinds `ds_refs_sim` to the *filtered* frame, so from here on
# the name no longer refers to the complete data set.
ds_refs_sim = u.read_tsv("../output/ds_refs_sim_to_clusters.tsv").query('Is_Ref')
info(ds_refs_sim)

Timestamp: 01-Jul-2022 14:52:40
read_tsv                           : [    3560 / 601 ] 
Shape                              : [    3547 / 601 ] 


In [None]:
# Pair plot of the per-cluster similarity columns against each other.
clusters = cpa.get_func_cluster_names(prefix="Cluster_")
tmp = ds_refs_sim[clusters].copy()

# Explicit display labels for the plot axes. They are spelled out rather than
# derived by stripping the "Cluster_" prefix because some labels differ from
# the column suffix (e.g. "AKT/PI3K/MTOR", "Uncoupling").
renames = {
    'Cluster_AKT_PI3K_MTOR': 'AKT/PI3K/MTOR',
    'Cluster_Aurora': 'Aurora',
    'Cluster_BET': 'BET',
    'Cluster_DNA_Synth': 'DNA Synth',
    'Cluster_HDAC': 'HDAC',
    'Cluster_HSP90': 'HSP90',
    'Cluster_LCH': 'LCH',
    'Cluster_Protein_Synth': 'Protein Synth',
    'Cluster_Tubulin': 'Tubulin',
    'Cluster_Uncoupler': 'Uncoupling'
}
tmp = tmp.rename(columns=renames)
pp = sns.pairplot(tmp)

# savefig does not create missing directories; make sure the target exists
# so the cell also works on a fresh checkout.
os.makedirs("plots", exist_ok=True)
plt.savefig("plots/clusters_pairplot.png")
plt.savefig("plots/clusters_pairplot.svg")