# Define Masks

<div align="right">Last executed: 02-May-2022</div>

Define cluster masks from lists of compounds.

The cluster assignments are taken from the list of measurements provided by Slava (`Cluster definitions_SZ_15_03_22_for Axel.xlsx`).

In [1]:
%reload_ext autoreload
%autoreload 2

import gc
import time
# import os.path as op

# Type hints
from typing import Iterable, List, Set, Dict, Union, Optional

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression statement of a cell, not only
# the last one (IPython's default setting is "last_expr").
InteractiveShell.ast_node_interactivity = "all"

from jupy_tools import plt_style, cpa
from jupy_tools import utils as u
from jupy_tools.utils import info
# Print a timestamp so the saved output records when the notebook was run.
u.timestamp()

Timestamp: 20-May-2022 15:17:37


## Read Data Set and Define Parameters

In [2]:
# Reference data set; the cells below merge it with the cluster compound
# list on the "Well_Id" column.
df_org = u.read_tsv("../input/ds_refs.tsv")

read_tsv                           : [    3547 / 589 ] 


In [3]:
# NOTE(review): `filter_method` is not referenced in any of the cells visible
# here — confirm whether it is still needed or should be passed on to `cpa`.
filter_method = "signum"
# Passed to `cpa.cluster_features(..., fraction=...)` and embedded (two
# decimals) in the names of the parameter and plot files written below.
fraction = 0.85

# Mapping of measurements to clusters; only these two columns are kept.
well_ids = u.read_tsv("../input/cluster_cpds.tsv")[["Well_Id", "Cluster"]]

read_tsv                           : [     227 /   2 ] ( Well_Id, Cluster )


### Calculate Feature Set and Median Profile for Clusters

In [4]:
# Overview table: number of listed measurements per cluster.
tmp = (
    well_ids
    .sort_values("Cluster")
    .groupby("Cluster")
    .count()
    .reset_index()
    .rename(columns={"Well_Id": "Count"})
)
# Sorted, de-duplicated cluster names; the following cells iterate over them.
clusters = sorted(well_ids["Cluster"].unique())
print(clusters)
tmp

['AKT_PI3K_MTOR', 'Aurora', 'BET', 'DNA_Synth', 'HDAC', 'HSP90', 'LCH', 'Protein_Synth', 'Tubulin', 'Uncoupler']


Unnamed: 0,Cluster,Count
0,AKT_PI3K_MTOR,18
1,Aurora,22
2,BET,31
3,DNA_Synth,29
4,HDAC,30
5,HSP90,4
6,LCH,49
7,Protein_Synth,3
8,Tubulin,34
9,Uncoupler,7


In [5]:
# For every cluster: collect its measurements from the reference data set,
# determine the cluster's feature set and store the feature names in a text
# file (one name per line) for use by the following cells.
for cl in clusters:
    tmp = well_ids[well_ids["Cluster"] == cl]
    # Inner join on "Well_Id": a mismatch between "Measurements" and "Found"
    # in the log line points to unmatched or duplicated Well_Ids.
    df_cl = pd.merge(tmp, df_org, on="Well_Id", how="inner")

    cl_feat = cpa.cluster_features(df_cl, fraction=fraction)
    print(f"Cluster: {cl:13s}    Measurements: {len(tmp):3d}    Found: {len(df_cl):3d}    Features: {len(cl_feat):3d}")

    # The context manager closes (and thereby flushes) the file right away
    # instead of leaving the handle open until garbage collection.
    # The return value of `write` is bound to `_` so that it is not echoed
    # (ast_node_interactivity is set to "all" in this notebook).
    with open(f"../output/parms_{cl}_{fraction:.2f}.txt", "w") as fh:
        _ = fh.write("\n".join(cl_feat))

Cluster: AKT_PI3K_MTOR    Measurements:  18    Found:  18    Features: 435
Cluster: Aurora           Measurements:  22    Found:  22    Features: 358
Cluster: BET              Measurements:  31    Found:  31    Features: 497
Cluster: DNA_Synth        Measurements:  29    Found:  29    Features: 288
Cluster: HDAC             Measurements:  30    Found:  30    Features: 378
Cluster: HSP90            Measurements:   4    Found:   4    Features: 406
Cluster: LCH              Measurements:  49    Found:  49    Features: 504
Cluster: Protein_Synth    Measurements:   3    Found:   3    Features: 409
Cluster: Tubulin          Measurements:  34    Found:  34    Features: 424
Cluster: Uncoupler        Measurements:   7    Found:   7    Features: 415


### Calculate Median Profiles

In [6]:
# For every cluster: load the stored feature list, compute the median profile
# over the cluster's measurements, save it as a heat map (png + svg) and
# write it to a tsv file.
for cl in clusters:
    tmp = well_ids[well_ids["Cluster"] == cl]
    cl_feat = u.load_list(f"../output/parms_{cl}_{fraction:.2f}.txt")
    print(f"Cluster: {cl:13s}    Features: {len(cl_feat):3d}")
    df_cl = pd.merge(tmp, df_org, on="Well_Id", how="inner")
    # Column-wise median over the cluster's measurements, restricted to the
    # cluster's features and stored as a single-row data frame.
    med_prof = df_cl[cl_feat].median().values
    df_mp = pd.DataFrame(data=(med_prof,), columns=cl_feat)
    df_mp["Well_Id"] = cl  # the cluster name serves as the row identifier

    # Image size and HTML width grow linearly with the number of features.
    # NOTE(review): 579 is presumably the size of the full feature set that
    # the base values (2.77 / 17.0 and 160 / 1020) were tuned for — confirm.
    size = 2.77 + (17.0 * len(cl_feat) / 579)
    width = int(160.0 + (1020 * len(cl_feat) / 579))
    # The result is bound to `hm` although it is not used afterwards;
    # presumably to keep it from being echoed — verify before removing.
    hm = cpa.heat_mpl(
        df_mp, id_prop="Well_Id", features=cl_feat, show=False,
        img_size = size, img_tag_options=f'style="width: {width}px;"',
        save_to_file=[f"plots/hm_med_prof_{cl}_{fraction:.2f}.png", f"plots/hm_med_prof_{cl}_{fraction:.2f}.svg"]
    )
    # Release the figure before the next iteration to keep memory usage flat.
    plt.clf()
    plt.close()
    # Bind the number of collected objects to `_`; as a bare expression it
    # would be echoed once per cluster because ast_node_interactivity is "all".
    _ = gc.collect()
    df_mp.to_csv(f"../output/med_prof_{cl}.tsv", sep="\t", index=False)

Cluster: AKT_PI3K_MTOR    Features: 435
  - Re-calculating xticks...


  plt.tight_layout()


2559

Cluster: Aurora           Features: 358
  - Re-calculating xticks...


  plt.tight_layout()


0

Cluster: BET              Features: 497
  - Re-calculating xticks...


  plt.tight_layout()


0

Cluster: DNA_Synth        Features: 288
  - Re-calculating xticks...


  plt.tight_layout()


0

Cluster: HDAC             Features: 378
  - Re-calculating xticks...


  plt.tight_layout()


0

Cluster: HSP90            Features: 406
  - Re-calculating xticks...


  plt.tight_layout()


0

Cluster: LCH              Features: 504
  - Re-calculating xticks...


  plt.tight_layout()


0

Cluster: Protein_Synth    Features: 409
  - Re-calculating xticks...


  plt.tight_layout()


0

Cluster: Tubulin          Features: 424
  - Re-calculating xticks...


  plt.tight_layout()


0

Cluster: Uncoupler        Features: 415
  - Re-calculating xticks...


  plt.tight_layout()


0