# Calculate the Cluster Biosimilarity Matrix

For a given set of measurements, this notebook calculates the similarity to all defined clusters
and displays the result as a matrix.

In addition, two heat maps are generated for the given measurements:

* Full profile heat map with all 579 features
* Heat map of the cluster profile using only the cluster features with the median cluster profile on top

In [None]:
# Example configurations, keyed by example name. Each entry provides:
#   well_Ids        - Well_Ids of the measurements to look up in the data set
#   labels          - display labels for those measurements (same order as well_Ids)
#   hm_full_well_id - Well_Id of an additional measurement that is put on top of
#                     the full-profile heat map; "" means no additional measurement
#   hm_full_label   - display label for that additional measurement
#   cluster_name    - cluster name used for the cluster feature lookup
#                     and for the names of the output files
#   height          - height of the biosimilarity matrix figure
# Leading spaces inside some labels are intentional and must be kept.
CONFIGS = {
    "Tubulin": {
        "well_Ids": ["413124:01:04_01.00", "392587:02:03_00.20"],
        "labels": ["1.0 µM KG 5", "0.2 µM AZ 960"],
        "hm_full_well_id": "245354:03:12_00.10",
        "hm_full_label": "0.1 µM Nocodazole",
        "cluster_name": "Tubulin",
        "height": 1.5
    },
    "HDAC": {
        "well_Ids": [
            "105083:01:14_10.00", 
            "105083:01:14_30.00", 
            "105096:01:10_10.00", 
            "105096:01:10_30.00", 
        ],
        "labels": [
            "10 µM Cpd 1", "30 µM Cpd 1", "10 µM Cpd 2", "30 µM Cpd 2", 
        ],
        "hm_full_well_id": "410804:01:03_02.00",
        "hm_full_label": "2 µM TSA",
        "cluster_name": "HDAC",
        "height": 3.0
    },
    "Protein Synth": {
        "well_Ids": ["411099:01:05_01.00", "344315:01:04_10.00"],
        "labels": [" 1 µM LDN193189", "10 µM Cpd 3"],
        "hm_full_well_id": "247300:01:15_01.00",
        "hm_full_label": "1 µM Cycloheximide",
        "cluster_name": "Protein_Synth",
        "height": 1.5
    },
    "HTH": {
        # Used for cluster similarity matrix only (Fig. 6B)
        "well_Ids": ["394207:01:09_03.00", "394207:01:08_10.00"],
        "labels": [" 3 µM HTH-01-015", "10 µM HTH-01-015"],
        "hm_full_well_id": "",
        "hm_full_label": "",
        "cluster_name": "HTH",
        "height": 1.5
    }
}

In [None]:
# "Tubulin", "HDAC", "Protein Synth", "HTH"
example = "HTH"
config = CONFIGS[example]

# Plain values taken over from the selected configuration
cluster_name = config["cluster_name"]
height = config["height"]
w_ids = config["well_Ids"]

# Display labels are right-justified to a common width of 15 characters
cl_label = example.rjust(15)
labels = [label.rjust(15) for label in config["labels"]]

In [None]:
%reload_ext autoreload
%autoreload 2

import sys
import os.path as op

from typing import Iterable, List

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import ListedColormap
import seaborn as sns

from IPython.display import HTML, display 

# from tqdm.notebook import tqdm
import ipywidgets as ipyw
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from jupy_tools import plt_style, cpa
from jupy_tools import utils as u
from jupy_tools.utils import info
u.timestamp()

## Load Data Set

In [None]:
# Load the reference data set from a TSV file. Later cells select rows by its
# "Well_Id" column and read the per-cluster "Cluster_<name>" columns from it.
ds_refs = u.read_tsv("../output/ds_refs_sim_to_clusters.tsv")

In [None]:
# Short cluster names and the corresponding "Cluster_<name>" column names
clusters_short = cpa.get_func_cluster_names()
clusters_long = [f"Cluster_{name}" for name in clusters_short]
# Maps each long column name back to its short cluster name
renames = dict(zip(clusters_long, clusters_short))

Some specific renames for nicer legends:

In [None]:
# Hand-picked display names that override the default short names
renames.update(
    {
        "Cluster_AKT_PI3K_MTOR": "AKT/PI3K/MTOR",
        "Cluster_DNA_Synth": "DNA Synth",
        "Cluster_Na_K-ATPase": "Na+/K+-ATPase",
        "Cluster_Protein_Synth": "Protein Synth",
        "Cluster_Pyrimidine_Synth": "Pyrimid. Synth",
    }
)
renames


Create a custom colormap where the lowest value is mapped to white:

In [None]:
# Build a white-to-blue colormap: sample the "Blues" map at `n_colors`
# discrete levels and force the lowest level to white.
# `plt.get_cmap` is used because `matplotlib.cm.get_cmap` was deprecated in
# Matplotlib 3.7 and removed in 3.9.
n_colors = 20
blues = plt.get_cmap("Blues", n_colors)
white_blue = blues(np.linspace(0, 1, n_colors))
white_blue[0, :] = 1  # all RGBA channels set to 1 -> opaque white
wb_cm = ListedColormap(white_blue)

In [None]:
columns = ["Well_Id", *clusters_long]
# Keep only the requested measurements and bring them into the order of the
# `w_ids` input list (StackOverflow: https://stackoverflow.com/a/58622999).
# This order determines the row order in the heatmap later on.
df_cpds = (
    ds_refs[ds_refs["Well_Id"].isin(w_ids)]
    .set_index("Well_Id")
    .loc[w_ids]
    .reset_index()
)
print(len(df_cpds))
# Remember the original Well_Id, then show the display label in its place
df_cpds = df_cpds.assign(Well_Id_org=df_cpds["Well_Id"], Well_Id=labels)
df_biosim = df_cpds[columns].copy()
df_biosim

In [None]:
# Cluster biosimilarity matrix: one row per selected measurement, one column
# per cluster (columns renamed to their display names).
# Other color schemes considered: 1 - afmhot_r; 2 - binary
tmp = df_biosim.rename(columns=renames).set_index("Well_Id")
f, ax = plt.subplots(figsize=(14, height))
hm = sns.heatmap(
    tmp,
    annot=True,
    fmt=".0f",
    linewidths=0.5,
    annot_kws={"size": 22},
    cmap=wb_cm,
    # Fixed color range for the annotated values
    vmin=80.0,
    vmax=100.0,
    ax=ax,
)
hm.set_ylabel("")
hm.set_title("Cluster Biosimilarities")
fig = hm.get_figure()
# Save through the figure object instead of pyplot's implicit current figure
fig.savefig(f"plots/ex_{cluster_name.lower()}_biosim.png", bbox_inches="tight")
fig.savefig(f"plots/ex_{cluster_name.lower()}_biosim.svg", bbox_inches="tight")

## Heatmap Full Profile

In [None]:
def heatmap(df_mp: pd.DataFrame, fn="heatmap", features=None):
    """Render a profile heat map of `df_mp` and save it as ``plots/{fn}.svg``.

    Thin wrapper around `cpa.heat_mpl` that scales the image size and the
    width given in the image tag options with the number of features shown.

    Parameters:
        df_mp: Data frame holding the profiles; rows are identified by "Well_Id".
        fn: File name stem (no directory, no extension) of the SVG output.
        features: Features to include; defaults to `cpa.ACT_PROF_FEATURES`.

    Returns:
        None. The object returned by `cpa.heat_mpl` is discarded.
    """
    selected = cpa.ACT_PROF_FEATURES if features is None else features
    n_selected = len(selected)
    # Both sizes grow linearly with the number of features;
    # 579 is the number of features in the full profile.
    img_size = 2.77 + (17.0 * n_selected / 579)
    tag_width = int(160.0 + (1020 * n_selected / 579))
    cpa.heat_mpl(
        df_mp,
        id_prop="Well_Id",
        features=selected,
        show=False,
        biosim=True,
        img_size=img_size,
        img_tag_options=f'style="width: {tag_width}px;"',
        save_to_file=f"plots/{fn}.svg",
    )

In [None]:
# Columns written out as y-axis labels for the full-profile heat map
columns = ["Well_Id_org", "Well_Id", "Induction", "Rel_Cell_Count", "Conc_uM"]
if config["hm_full_well_id"] == "":
    # No additional measurement configured: use the selected measurements only
    df_hm_full = df_cpds.copy()
else:
    # Put the configured measurement, shown under its display label, on top
    tmp = ds_refs[ds_refs["Well_Id"] == config["hm_full_well_id"]].copy()
    tmp = tmp.assign(Well_Id_org=tmp["Well_Id"], Well_Id=config["hm_full_label"])
    df_hm_full = pd.concat([tmp, df_cpds]).reset_index(drop=True)
u.write_tsv(df_hm_full[columns], f"plots/ex_{cluster_name.lower()}_hm_ylabels.tsv")
df_hm_full[columns]

In [None]:
# Full-profile heat map (default feature set) for the rows in `df_hm_full`;
# the output file name is derived from the lower-cased cluster name.
heatmap(df_hm_full, fn=f"ex_{cluster_name.lower()}_hm_full")

## Heatmap Cluster Subprofile

In [None]:
# Cluster sub-profile heat map; skipped for the "HTH" example, which is
# handled separately.
if example != "HTH":
    # Cluster profile row, labelled with the right-aligned example name
    df_cl = cpa.get_func_cluster_features(cluster_name)
    df_cl["Well_Id"] = cl_label
    info(df_cl)

    # NOTE(review): presumably the feature columns contained in `df_cl` - confirm in `cpa`
    cl_feat = cpa.feature_data(df_cl)
    info(cl_feat)
    # Cluster profile first, so that it ends up on top of the heat map
    tmp = pd.concat([df_cl, df_cpds])
    heatmap(tmp, fn=f"ex_{cluster_name.lower()}_hm_cluster", features=cl_feat)

In [None]:
# Special case for the "HTH" example: heat map restricted to the features
# that are NOT part of the "LCH" cluster.
if example == "HTH":
    well_ids = ["394207:01:08_10.00", "392522:03:05_00.20", "392321:04:03_02.00", "410784:01:03_00.20"]
    labels = ["HTH-01-015 10µM", "SNS-314", "ZM-447439", "Barasertib"]

    # Keep only the requested measurements and bring them into the order of
    # the `well_ids` input list (StackOverflow: https://stackoverflow.com/a/58622999).
    # This order determines the row order in the heatmap later on.
    df_cpds = (
        ds_refs[ds_refs["Well_Id"].isin(well_ids)]
        .copy()
        .set_index("Well_Id")
        .loc[well_ids]
        .reset_index()
    )
    print(len(df_cpds))
    # Remember the original Well_Id, then show the display label in its place
    df_cpds = df_cpds.assign(Well_Id_org=df_cpds["Well_Id"], Well_Id=labels)

    df_cl = cpa.get_func_cluster_features("LCH")
    lch_features = cpa.get_func_cluster_features("LCH", include_well_id=False)
    cl_feat = list(lch_features.keys())
    non_cl_feat = cpa.remaining_features(cl_feat)
    print(len(non_cl_feat))
    heatmap(df_cpds, fn=f"ex_{cluster_name.lower()}_hm_cluster_non-lch_feat", features=non_cl_feat)