In [1]:
import pandas as pd
from collections import OrderedDict

import sys
sys.path.insert(0, '..')

from unpast.utils.io import read_bic_table



import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# All four inputs are gzipped, tab-separated tables whose first column
# is used as the row index.
tsv_kwargs = {"sep": "\t", "index_col": 0}

# TCGA-BRCA: sample annotation and expression matrix
anno_tcga = pd.read_csv("fig_data/TCGA-BRCA.annotation.tsv.gz", **tsv_kwargs)
exprs_tcga = pd.read_csv("fig_data/TCGA-BRCA.exprs.tsv.gz", **tsv_kwargs)

# METABRIC: expression matrix and sample annotation
exprs_meta = pd.read_csv("fig_data/METABRIC.exprs.tsv.gz", **tsv_kwargs)
anno_meta = pd.read_csv("fig_data/METABRIC.annotation.tsv.gz", **tsv_kwargs)

In [None]:
# Read the bicluster tables of both cohorts (TCGA-BRCA first, METABRIC second).
bics_tcga, bics_meta = map(
    read_bic_table,
    (
        "fig_data/TCGA-BRCA.biclusters.tsv",
        "fig_data/METABRIC.biclusters.tsv",
    ),
)
# preview the first three TCGA-BRCA biclusters
bics_tcga.head(3)

### bicluster format:
important columns:
 * "samples" (set) - set of sample ids in a bicluster (same as columns in expression data and rows in annotation)
 * "genes" (set) - set of gene ids in a bicluster (same as rows in expression data)
 * "genes_up", "genes_down" (set) - sets of up- and down-regulated genes
 * "direction" (str) - whether bicluster genes are all up- or down-regulated, or mixed

### pre-select 5 pairs of best matching biclusters

In [None]:
# Ids of the best-matching biclusters, one (TCGA-BRCA id, METABRIC id) pair per entry.
bm_pairs = [(64, 35), (100, 163), (128, 60), (5, 5), (36, 22)]

# split the pairs into the two parallel id lists used further below
bm_ids1 = [tcga_id for tcga_id, meta_id in bm_pairs]
bm_ids2 = [meta_id for tcga_id, meta_id in bm_pairs]

### defining colors for sample metadata

In [None]:
# PAM50 subtype -> color
pam50_colors = OrderedDict([
    ("Her2", "yellow"),
    ("Basal", "lightblue"),
    ("LumA", "lightgreen"),
    ("LumB", "darkgreen"),
    ("Normal", "grey"),
])

# histology: receptor status -> color
# ("Equivocal" and "Indeterminate" are both drawn grey; the author's
#  alternative choices were "orange" and "gold")
receptors = OrderedDict([
    ("Positive", "red"),
    ("Equivocal", "grey"),
    ("Indeterminate", "grey"),
    ("Negative", "green"),
    ("NA", "lightgrey"),
])

# annotation column name -> {category: color}
color_dict = OrderedDict()
color_dict["PAM50"] = pam50_colors
# all three IHC columns share the very same palette object
for ihc_column in ("IHC_HER2", "IHC_ER", "IHC_PR"):
    color_dict[ihc_column] = receptors



### plot top-5 similar biclusters not matching PAM50

In [None]:
from unpast.utils.figs import draw_heatmap2

# Example: heatmap of the five pre-selected TCGA-BRCA biclusters with
# PAM50 and IHC_HER2 annotation bars.
w = 8  # figure width

# select two columns from the annotation table; missing values become the
# explicit "NA" category
# NOTE(review): color_dict["PAM50"] has no "NA" entry - confirm how
# draw_heatmap2 colors samples without a PAM50 label.
annot = anno_tcga.loc[:, ["PAM50", "IHC_HER2"]].fillna("NA")
# expression
data = exprs_tcga
# selected five biclusters, ordered by their number of samples
bics = bics_tcga.loc[bm_ids1, :].sort_values(by="n_samples")

# figure height: one fifth of the total number of bicluster genes, clamped to [3, 10]
# (it would be nice to define the optimal height automatically)
h = min(max(3, bics["n_genes"].sum() / 5), 10)

fig, samples, colors = draw_heatmap2(
    data,  # expression to plot
    bics,  # selected biclusters to plot
    annot=annot,  # sample metadata
    color_dict=color_dict,  # colors for the sample metadata categories
    # other values tried by the author: "black", "redblue",
    # or a list of color names such as ["blue", "yellow", "orange"]
    bicluster_colors="auto",
    figsize=(w, h * 0.8),
    dendrogram_ratio=(0.01, 0.01),  # probably this is not needed
    colors_ratio=(0.015, 0.015),  # ideally - one parameter for both rows and columns
    bic_prefix="b. ",
    legend_n_cols=0,  # annotation legend
    no_cbar=True,  # heatmap colorscale
    col_labels=False,
    # highlight_row_labels=<list of genes> can be passed to highlight row labels
    row_labels=True,
    no_row_colors=False,
    cluster_rows=False,
)
ax = fig.ax_heatmap
# Report the number of columns of the plotted expression matrix (as the other
# heatmap cell does) rather than the number of annotation rows, so the label
# stays correct even if the two tables do not cover the same samples.
# The assignment only suppresses the cell output.
tmp = ax.set_xlabel("TCGA-BRCA, n=%s samples" % data.shape[1], fontsize=18)




### plotting legend separately 

In [None]:
from matplotlib.patches import Patch

# One black-edged, filled patch per PAM50 subtype, in the order of color_dict["PAM50"].
legend_elements = [
    Patch(facecolor=subtype_color, edgecolor='black', label=subtype)
    for subtype, subtype_color in color_dict["PAM50"].items()
]

# Legend layout options
legend_kwargs = {
    "handles": legend_elements,
    "loc": 'center',
    "ncol": 3,
    "handlelength": 5,
    "handleheight": 4,
    "title": '',     # Legend title
    "fontsize": 20,  # Legend items font size
}

# Stand-alone figure that holds nothing but the legend
fig, ax = plt.subplots(figsize=(3, 3))
legend = ax.legend(**legend_kwargs)

# transparent legend frame, empty title with its own font size
legend.get_frame().set_alpha(0)
legend.set_title('', prop={'size': 22})

# hide the empty axes so that only the legend is drawn
ax.axis('off')

plt.show()


### top-5 biclusters with the highest number of shared genes

In [None]:
# Genes to highlight: for every matched pair of biclusters, the genes present
# in both the TCGA and the METABRIC bicluster; all pairs are pooled into one set.
gene_list = set()
for pair_idx, tcga_id in enumerate(bm_ids1):
    meta_id = bm_ids2[pair_idx]
    tcga_genes = bics_tcga.loc[tcga_id, "genes"]
    meta_genes = bics_meta.loc[meta_id, "genes"]
    gene_list.update(tcga_genes.intersection(meta_genes))


In [None]:
# Heatmap of the five selected TCGA-BRCA biclusters with a PAM50 annotation bar;
# row labels of genes shared with the matched METABRIC biclusters are highlighted.

# PAM50 annotation only; missing values become the explicit "NA" category
annot = anno_tcga.loc[:, ["PAM50"]].fillna("NA")

# selected five biclusters, ordered by their number of samples
bics = bics_tcga.loc[bm_ids1, :].sort_values(by="n_samples")

w, h = 5, 8  # figure width and height

tmp = draw_heatmap2(
    exprs_tcga,
    bics,
    figsize=(w, h),
    colors_ratio=(0.02, 0.02),
    dendrogram_ratio=(0.01, 0.01),
    annot=annot,
    color_dict=color_dict,
    bicluster_colors="auto",
    col_labels=False,
    row_labels=True,
    no_row_colors=False,
    legend_n_cols=0,
    no_cbar=True,
    cluster_rows=False,
    highlight_row_labels=list(gene_list),
    bic_prefix="bic. ",
)
# first returned element: the object that exposes the figure and its axes
grid = tmp[0]

fname = "reproduced_5bics_TCGA"  # output base name; saving is disabled below
ax = grid.ax_heatmap
ax.set_xlabel("TCGA-BRCA, n=%s samples" % exprs_tcga.shape[1], fontsize=12)
# Removing the ticks also removes their labels, so no separate call that blanks
# the labels is needed (such a call fails when the number of labels does not
# match the number of existing ticks).
grid.ax_row_colors.set_xticks([])
# smaller fonts for the annotation-bar label and the gene labels
grid.ax_col_colors.set_yticklabels(grid.ax_col_colors.get_yticklabels(), fontsize=10)
ax.set_yticklabels(ax.get_yticklabels(), fontsize=7)
# placeholder for the figure panel label (bottom-left corner, figure coordinates)
grid.fig.text(0.02, 0.03, "X (fig. panel label)", ha='left', va='bottom', fontsize=18)
#plt.savefig("Fig5.svg")

plt.show()