In [1]:
import sys,os
import numpy as np
import pandas as pd

from utils.method import read_bic_table

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the annotation table of the selected cohort; the first column of the
# file becomes the DataFrame index.
cohort = "BRCA"
annot = pd.read_csv(
    "data/PanCan/" + cohort + ".annotation.tsv.gz",
    sep="\t",
    index_col=0,
)

# Combined late-stage column: element-wise sum of the stage_3 and stage_4 columns.
annot["stage_3+4"] = annot["stage_3"].add(annot["stage_4"])

# Column totals, shown as a quick sanity check of the annotation table.
annot[["stage_2", "stage_3", "stage_4", "stage_3+4", "age", "sex"]].sum()

stage_2        622.0
stage_3        250.0
stage_4         20.0
stage_3+4      270.0
age          64019.0
sex           1086.0
dtype: float64

In [3]:
# Read the bicluster table of the current cohort and keep only the biclusters
# whose "detected_n_times" value is greater than 1.
# NOTE: the "biclsuters" spelling in the file name is left untouched on purpose;
# it presumably matches the name of the file on disk.
fname = cohort+".consensus_kmeans.42.bin=kmeans,pval=0.01,clust=WGCNA,ds=0,dch=0.995.biclsuters.tsv"
# The input folder was hard-coded to one user's home directory. It can now be
# overridden with the BICLUSTERS_DIR environment variable; when the variable is
# not set, the original location is used, so existing runs behave as before.
folder = os.environ.get("BICLUSTERS_DIR", "/home/olya/Downloads/"+cohort+"/")

# os.path.join copes with folders given with or without a trailing separator.
biclusters = read_bic_table(os.path.join(folder, fname))
print("All biclusters:", biclusters.shape[0])
biclusters = biclusters.loc[biclusters["detected_n_times"]>1,:]
print("Detected 2+ times:", biclusters.shape[0])

All biclusters: 149
Detected 2+ times: 110


# Survival analysis

In [4]:
from utils.eval import  add_survival

# Column names handed to add_survival through its `covariates` argument.
covariates = [
    "age",
    "stage_2",
    "stage_3",
    "stage_4",
]

In [None]:
# Extend the bicluster table with survival analysis results, calling
# add_survival once per (event column, time column) pair, in this order.
for event_col, time_col in (
    ("OS", "OS.time"),
    ("DFI", "DFI.time"),
    ("PFI", "PFI.time"),
):
    biclusters = add_survival(
        biclusters,
        annot,
        event=event_col,
        surv_time=time_col,
        covariates=covariates,
    )

biclusters

perfect separation for biclsuter of  12/1061 samples variances: 0.00 0.01


In [None]:
# Biclusters whose smallest BH-adjusted p-value among OS, DFI and PFI is below
# 0.05, restricted to the summary columns and ordered by the adjusted OS p-value.
(
    biclusters
    .loc[
        biclusters[["OS.p_value_BH", "DFI.p_value_BH", "PFI.p_value_BH"]].min(axis=1) < 0.05,
        ["SNR", "n_genes", "n_samples", "genes", "direction",
         "OS.p_value_BH", "DFI.p_value_BH", "PFI.p_value_BH"],
    ]
    .sort_values(by="OS.p_value_BH")
)

In [None]:
#" ".join(sorted(biclusters.loc[47,"genes"]))


### Association with sex

In [None]:
# Split the annotation index by the "sex" column: rows with 0 go to `males`,
# rows with 1 go to `females`.
males = set(annot[annot["sex"] == 0].index.values)
females = set(annot[annot["sex"] == 1].index.values)
# number of rows in the annotation table (all samples)
N = len(annot)
print("%s males, %s females" % (len(males), len(females)))

In [None]:
from utils.eval import add_sex

# Show the table returned by add_sex, ordered by its "sex.pval_BH" column.
(
    add_sex(biclusters, males=males, females=females)
    .sort_values(by=["sex.pval_BH"])
)

In [None]:
from statsmodels.stats.multitest import fdrcorrection
from utils.eval import test_sample_overlap

# For the male and the female sample set in turn, apply test_sample_overlap to
# every bicluster row, prefix the resulting columns with "male." / "female.",
# and add a Benjamini-Hochberg adjusted version of the "<prefix>.pval" column.
sample_sets = {"male":males,"female":females}
dfs =[]
for sex in ["male","female"]:
    sample_set = sample_sets[sex]
    df = biclusters.apply(lambda row: test_sample_overlap(row, sample_set, N),axis=1)
    df.columns = [sex+"."+x for x in df.columns]
    # fdrcorrection returns (rejected, adjusted p-values); only the adjusted
    # p-values are used here.
    adj_pval = fdrcorrection(df[sex+".pval"].values, alpha=0.05)[1]
    df[sex+".pval_BH"] =  adj_pval
    dfs.append(df)
sex_stats = pd.concat(dfs,axis=1)
# Drop columns with the same labels left over from a previous run of this cell
# before concatenating; otherwise every re-run appends duplicate columns.
# On a first run nothing is dropped and the result matches a plain concat.
biclusters = pd.concat([biclusters.drop(columns=sex_stats.columns, errors="ignore"), sex_stats],axis=1)
# Per-bicluster minimum of the two adjusted p-values.
biclusters["sex.pval_BH"] = biclusters.loc[:,["male.pval_BH","female.pval_BH"]].min(axis=1)

biclusters.sort_values(by=["sex.pval_BH"])