In [1]:
import numpy as np
import pandas as pd
import sys
import os
import random
import copy
from time import time

import matplotlib.pyplot as plt
import seaborn as sns
from itertools import product

from utils.method import read_bic_table
from utils.eval import find_best_matches, make_known_groups
from utils.eval import find_best_matching_biclusters


In [2]:
def make_ref_groups(subtypes, annotation, exprs):
    """Build the reference sample groups and their size-based weights.

    Parameters
    ----------
    subtypes : pandas.DataFrame
        Table indexed by sample id. The columns "PAM50", "SCMOD2" (both via
        ``make_known_groups``) and "claudin_low" are read here.
    annotation : pandas.DataFrame
        Table indexed by sample id. The columns "IHC_HER2", "IHC_ER",
        "IHC_PR" (compared with "Positive") and "IHC_TNBC" (compared with 1)
        are read here.
    exprs : pandas.DataFrame
        Expression table whose column labels are the sample ids.

    Returns
    -------
    known_groups : dict
        ``{classification: {group: set_of_sample_ids}}`` for the
        classifications "PAM50", "Luminal", "Claudin-low", "SCMOD2", "IHC".
    weights : dict
        ``{group: size_of_group / sum_of_all_group_sizes}``.

    Notes
    -----
    Weights are keyed by group name only, so two classifications that share a
    group name would overwrite each other's weight.
    """
    all_samples = set(exprs.columns.values)

    # NOTE(review): make_known_groups is assumed to return {group: set} with
    # "LumA" and "LumB" keys for PAM50 -- confirm against utils.eval.
    pam50 = make_known_groups(
        subtypes, exprs, target_col="PAM50", verbose=False)
    lum = {"Luminal": pam50["LumA"].union(pam50["LumB"])}
    scmod2 = make_known_groups(
        subtypes, exprs, target_col='SCMOD2', verbose=False)

    claudin_ids = subtypes.loc[subtypes['claudin_low'] == 1, :].index.values
    claudin = {"Claudin-low": set(claudin_ids).intersection(all_samples)}

    # Keep only samples that have expression data, as is done for Claudin-low:
    # samples missing from `exprs` can never be recovered by a clustering and
    # would otherwise inflate the group sizes and therefore the weights.
    ihc = {}
    for marker in ["IHC_HER2", "IHC_ER", "IHC_PR"]:
        positive_ids = annotation.loc[
            annotation[marker] == "Positive", :].index.values
        ihc[marker] = set(positive_ids).intersection(all_samples)
    tnbc_ids = annotation.loc[annotation["IHC_TNBC"] == 1, :].index.values
    ihc["IHC_TNBC"] = set(tnbc_ids).intersection(all_samples)

    known_groups = {"PAM50": pam50, "Luminal": lum,
                    "Claudin-low": claudin, "SCMOD2": scmod2, "IHC": ihc}

    # Weight of a group = its size divided by the summed size of all groups.
    sizes = {}
    total = 0
    for groups in known_groups.values():
        for group, members in groups.items():
            sizes[group] = len(members)
            total += len(members)
    weights = {group: size / total for group, size in sizes.items()}

    return known_groups, weights


def calculate_perfromance(results, known_groups, weights, all_samples):
    """Score a clustering against the reference groups.

    For every classification in ``known_groups`` the best matching cluster of
    each group is looked up with ``find_best_matches`` (FDR=0.05). The "J"
    value of each best match is collected per group, and the overall
    performance is the sum of these values weighted by ``weights``.

    Parameters
    ----------
    results : pandas.DataFrame
        Clustering result passed unchanged to ``find_best_matches``.
    known_groups : dict
        ``{classification: {group: samples}}``.
    weights : dict
        ``{group: weight}``; must contain every group that gets a best match.
    all_samples : set
        Sample ids, passed unchanged to ``find_best_matches``.

    Returns
    -------
    dict
        ``{group: J}`` plus the key "overall_performance".
    """
    # The previous version computed len(all_samples) into a local that was
    # never read; it has been dropped.
    per_classification = [
        find_best_matches(
            results, groups, all_samples, FDR=0.05, verbose=False)
        for groups in known_groups.values()
    ]

    best_matches = pd.concat(per_classification, axis=0)["J"].to_dict()

    # The sum is fully evaluated before the new key is inserted, so the
    # "overall_performance" entry never takes part in its own computation.
    best_matches["overall_performance"] = sum(
        j * weights[group] for group, j in best_matches.items())
    return best_matches


def compare_gene_clusters(tcga_result, metabric_result, N):
    """Compare two sets of gene clusters in both directions.

    Best matches are searched TCGA -> METABRIC and METABRIC -> TCGA with
    ``find_best_matching_biclusters``. Rows containing missing values are
    dropped, and only matches sharing more than one gene are kept, sorted by
    the number of shared genes (largest first).

    Parameters
    ----------
    tcga_result, metabric_result
        Cluster tables (one row per cluster).
    N
        Total number of genes, forwarded to ``find_best_matching_biclusters``.

    Returns
    -------
    clust_similarity : dict
        For direction 1 (TCGA -> METABRIC) and 2 (METABRIC -> TCGA): number of
        clusters ("n_*"), fraction of clusters with a kept match
        ("percent_matched_*", a ratio rather than a percentage), total number
        of shared genes ("n_shared_genes_*") and mean J of the kept matches
        ("avg_bm_J_*").
    bm, bm2 : pandas.DataFrame
        The kept best matches for direction 1 and direction 2.
    """
    def _kept_matches(query, reference):
        # best matches of `query` clusters among `reference` clusters
        matches = find_best_matching_biclusters(query, reference, N).dropna()
        shares_genes = matches["n_shared"] > 1
        return matches.loc[shares_genes, :].sort_values(
            by="n_shared", ascending=False)

    bm = _kept_matches(tcga_result, metabric_result)
    bm2 = _kept_matches(metabric_result, tcga_result)

    n_tcga = tcga_result.shape[0]
    n_metabric = metabric_result.shape[0]

    clust_similarity = {
        # number of clusters on each side
        "n_1": n_tcga,
        "n_2": n_metabric,
        # fraction of clusters that have a kept best match
        "percent_matched_1": bm.shape[0] / n_tcga,
        "percent_matched_2": bm2.shape[0] / n_metabric,
        # genes shared with the best match, summed over the kept matches
        "n_shared_genes_1": bm.loc[:, "n_shared"].sum(),
        "n_shared_genes_2": bm2.loc[:, "n_shared"].sum(),
        # mean J over the kept matches
        "avg_bm_J_1": bm.loc[:, "J"].mean(),
        "avg_bm_J_2": bm2.loc[:, "J"].mean(),
    }

    return clust_similarity, bm, bm2


In [3]:
# Input locations and short dataset names for the two cohorts.
exprs_file_t = "/root/projects/data/real_data/TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.log2_exprs_z_v6.tsv"
basename_t = "TCGA"

exprs_file_m = "/root/projects/data/real_data/METABRIC_1904_17Kgenes.log2_exprs_z_v6.tsv"
basename_m = "METABRIC"


def _read_indexed_tsv(path):
    """Read a tab-separated table, using its first column as the index."""
    return pd.read_csv(path, sep="\t", index_col=0)


# Subtype/signature and annotation tables, METABRIC first and then TCGA.
m_subtypes = _read_indexed_tsv(
    "/root/projects/data/real_data/METABRIC_1904_17Kgenes.subtypes_and_signatures_v6.tsv")
m_annotation = _read_indexed_tsv(
    "/root/projects/data/real_data/METABRIC_1904.annotation_v6.tsv")

t_subtypes = _read_indexed_tsv(
    "/root/projects/data/real_data/TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.subtypes_and_signatures_v6.tsv")
t_annotation = _read_indexed_tsv(
    "/root/projects/data/real_data/TCGA-BRCA_1079.Xena_TCGA_PanCan.annotation_v6.tsv")

# Expression tables; values are capped to the range [-3, 3].
exprs_t = _read_indexed_tsv(exprs_file_t).clip(lower=-3, upper=3)
exprs_m = _read_indexed_tsv(exprs_file_m).clip(lower=-3, upper=3)

# Reference groups and their weights, one pair per cohort.
known_groups_t, weights_t = make_ref_groups(t_subtypes, t_annotation, exprs_t)
known_groups_m, weights_m = make_ref_groups(m_subtypes, m_annotation, exprs_m)

In [4]:
from re import sub

# Cluster labels for TCGA: one row per sample, one column per run.
result_t = pd.read_csv(
    "/root/projects/data/outputs/clusters_tcga.tsv", sep="\t")
# Replace every "." in the row labels with "-" (presumably so that they match
# the sample ids used by the expression table -- confirm). The pattern is a
# raw string: "\." in a normal string literal is an invalid escape sequence
# and triggers a warning on current Python versions.
result_t.index = [sub(r"\.", "-", x) for x in result_t.index]
result_t.head(2)


Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X18,X19,X20,X21,X22,X23,X24,X25,X26,X27
TCGA-3C-AAAU-01,5,3,5,5,5,5,2,5,5,5,...,3,3,5,1,5,5,5,5,5,5
TCGA-3C-AALI-01,2,3,2,2,2,2,1,1,2,1,...,1,2,2,2,2,2,3,2,3,2


In [7]:
# Cluster labels for METABRIC: one row per sample, one column per run.
result_m = pd.read_csv(
    "/root/projects/data/outputs/clusters_mbr.tsv", sep="\t")
# Replace every "." in the row labels with "-". The pattern is a raw string:
# "\." in a normal string literal is an invalid escape sequence and triggers
# a warning on current Python versions.
result_m.index = [sub(r"\.", "-", x) for x in result_m.index]
result_m.head(2)


Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X18,X19,X20,X21,X22,X23,X24,X25,X26,X27
MB-0000,4,4,4,4,4,4,1,4,4,4,...,4,4,4,4,4,4,4,4,4,4
MB-0002,4,1,4,4,4,4,1,5,5,5,...,5,4,4,4,4,4,4,4,5,5


In [12]:
def _clusters_to_table(labels):
    """Turn one run's per-sample labels into a one-column cluster table.

    `labels` is a Series of cluster labels indexed by sample id. The result
    has one row per cluster label 1..5 and a "samples" column holding the ids
    of the samples assigned to that cluster.
    """
    table = pd.DataFrame([], columns=["samples"], index=range(1, 6))
    table.loc[:, "samples"] = [
        labels.index.values[labels == cluster_id]
        for cluster_id in range(1, 6)]
    return table


subt_t = []
subt_m = []

# Runs are taken from the METABRIC table; the TCGA table is expected to have
# a column of the same name for every run.
for run in result_m.columns:

    # TCGA: best matches between clusters and known subtypes; the overall
    # performance is the weighted sum of the per-subtype J values.
    new_result_t = _clusters_to_table(result_t.loc[:, run])
    performance_t = calculate_perfromance(
        new_result_t, known_groups_t, weights_t,
        set(exprs_t.columns.values))
    subt_t.append(performance_t)

    # METABRIC: same evaluation against the METABRIC reference groups.
    new_result_m = _clusters_to_table(result_m.loc[:, run])
    performance_m = calculate_perfromance(
        new_result_m, known_groups_m, weights_m,
        set(exprs_m.columns.values))
    subt_m.append(performance_m)


In [13]:
# Write the per-run performance records of each cohort to a tab-separated
# file (one row per run).
for records, out_path in (
    (subt_t, "/root/projects/data/outputs/subts_TCGA.tsv"),
    (subt_m, "/root/projects/data/outputs/subts_METABRIC.tsv"),
):
    pd.DataFrame.from_records(records).to_csv(out_path, sep="\t")