In [1]:
%load_ext autoreload
%autoreload 2

### CNVAR

In [11]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import chromATAC as ca
from chromATAC.integrated import IntData, save_collision
from functools import reduce
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import matplotlib.gridspec as gridspec
import h5py


# Resolutions used throughout this notebook: passed as `resolutions=RES` when
# layers/filters are registered and as `RES[0]` when layers are collided/saved.
# NOTE(review): units are presumably base pairs per bin — confirm.
RES = [1e03]

def Clustered_TEs(chromosome_layer, **kwargs):
    """Flag, per chromosome, which entries of a layer are also named in the 'TEs' layer.

    Parameters
    ----------
    chromosome_layer : mapping
        Must provide ``chromosome_layer['index'][c]`` (an iterable of names)
        for every chromosome number ``c`` in 1..24.
    **kwargs
        resolution : key into ``ind.chr.layers`` selecting which 'TEs' layer
            to compare against.

    Returns
    -------
    (conditions, annotations) : tuple of dict
        ``conditions[c]`` is an int array with 1 where the entry name is among
        the TE names of chromosome ``c`` and 0 otherwise; ``annotations[c]``
        maps 0/1 to a human-readable label.

    Notes
    -----
    Reads the notebook-level ``ind`` object.
    NOTE(review): TE names are stripped of their ``'...>'`` prefix, but the
    names of ``chromosome_layer`` are compared verbatim (CTeCore_filter strips
    both sides). Confirm that the verbatim comparison is intended.
    """
    resolution = kwargs.get('resolution')
    ann = {0:'TE with no defined CORTE in any chromosome', 1:'TE with defined CORTEs'}
    te_index = ind.chr.layers[resolution]['TEs']['index']
    conditions = {}
    annotations = {}
    for c in range(1, 25):
        # A set gives constant-time membership instead of scanning an array
        # once per entry of the layer.
        te_names = {i.split('>')[-1] for i in te_index[c]}
        conditions[c] = np.array([int(i in te_names) for i in chromosome_layer['index'][c]])
        annotations[c] = ann
    return conditions, annotations

def get_cancertype(chromosome_layer, **kwargs):
    """Encode the cancer type of every sample in the 'TCGA' layer, per chromosome.

    Parameters
    ----------
    chromosome_layer : mapping
        Not consulted: the sample names are read from
        ``ind.chr.layers[resolution]['TCGA']['index']`` instead.
        NOTE(review): confirm this is intended; tcga_cancertype reads the
        layer it is given.
    **kwargs
        resolution : key into ``ind.chr.layers``.

    Returns
    -------
    (conditions, annotations) : tuple of dict
        ``conditions[c]`` is a list of integer codes, one per TCGA sample of
        chromosome ``c``; ``annotations[c]`` maps each code back to its
        project name (the text after ``'TCGA-'``).

    Notes
    -----
    Reads the notebook-level ``tcga_met``, ``ind`` and ``ca``.
    NOTE(review): ``tcga_met`` is not defined in the visible notebook (only
    ``tcga_meta`` is) — confirm which frame is meant.
    """
    resolution = kwargs.get('resolution')
    # Compute the project list once and derive both lookup directions from it.
    projects = tcga_met[' Project'].apply(lambda x: x.split('TCGA-')[-1]).unique()
    ann = dict(enumerate(projects))
    mapping = {v: i for i, v in ann.items()}
    tcga_index = ind.chr.layers[resolution]['TCGA']['index']
    conditions = {}
    annotations = {}
    for c in ca.info.CHROMOSOMES['numericals'].values():
        # The project code is the part of the sample name between '>' and the first '_'.
        conditions[c] = [mapping[i.split('_')[0].split('>')[-1]] for i in tcga_index[c]]
        annotations[c] = ann
    return conditions, annotations
    
def CTeCore_filter(chromosome_layer, **kwargs):
    """Flag, per chromosome, the layer entries whose test count reaches a threshold.

    Parameters
    ----------
    chromosome_layer : mapping
        Must provide ``chromosome_layer['index'][c]`` (an iterable of names)
        for every chromosome number ``c`` in 1..24.
    **kwargs
        index : numpy array of names, one per row of ``test_res`` (it is
            indexed with a boolean mask).
        test_res : 2-D numpy array; column ``c-1`` holds the values for
            chromosome ``c``.
        majority : threshold a value must reach (``>=``) to be selected.
            Defaults to 3.

    Returns
    -------
    (conditions, annotations) : tuple of dict
        ``conditions[c]`` is an int array with 1 where the entry's name (text
        after the last ``'>'``) is among the selected names for chromosome
        ``c`` and 0 otherwise; ``annotations[c]`` maps 0/1 to a label.
    """
    index = kwargs.get('index')
    test_res = kwargs.get('test_res')
    maj = kwargs.get('majority', 3)
    ann = {0:'TEs with No Significant Difference between #Cores and #Elements in this Chromosome', 1:'TEs with Significant Difference between #Cores and #Elements in this Chromosome'}
    conditions = {}
    annotations = {}
    for c in range(1, 25):
        # A set gives constant-time membership and stays well-typed when no
        # row passes the threshold (an empty numpy array would default to float).
        selected = {i.split('>')[-1] for i in index[test_res[:, c-1] >= maj]}
        conditions[c] = np.array([int(i.split('>')[-1] in selected) for i in chromosome_layer['index'][c]])
        annotations[c] = ann
    return conditions, annotations

def GTeCore_filter(genome_layer, **kwargs):
    """Flag, once per chromosome, the genome-layer entries whose test count reaches a threshold.

    Unlike CTeCore_filter, the layer has a single index (``genome_layer['index']``)
    that is compared against the selection of every chromosome.

    Parameters
    ----------
    genome_layer : mapping
        Must provide ``genome_layer['index']``, an iterable of names.
    **kwargs
        index : numpy array of names, one per row of ``test_res`` (it is
            indexed with a boolean mask).
        test_res : 2-D numpy array; column ``c-1`` holds the values for
            chromosome ``c``.
        majority : threshold a value must reach (``>=``) to be selected.
            Defaults to 3.

    Returns
    -------
    (conditions, annotations) : tuple of dict
        Keyed by chromosome number 1..24. ``conditions[c]`` is an int array
        (1 = name selected for chromosome ``c``, else 0); ``annotations[c]``
        maps 0/1 to a label.
    """
    index = kwargs.get('index')
    test_res = kwargs.get('test_res')
    maj = kwargs.get('majority', 3)
    ann = {0:'TEs with No Significant Difference between #Cores and #Elements in this Chromosome', 1:'TEs with Significant Difference between #Cores and #Elements in this Chromosome'}
    # The stripped layer names do not depend on the chromosome: compute them once.
    layer_names = [i.split('>')[-1] for i in genome_layer['index']]
    conditions = {}
    annotations = {}
    for c in range(1, 25):
        selected = {i.split('>')[-1] for i in index[test_res[:, c-1] >= maj]}
        conditions[c] = np.array([int(name in selected) for name in layer_names])
        annotations[c] = ann
    return conditions, annotations

def save_correlation(omic, layers, resolution, c='all', path=".", cmp_method='sim'):
    """Write the 'score' and 'pvalue' tables of a layer comparison to CSV files.

    For every requested chromosome the files are written to
    ``<path>/<cmp_method>/<chromosome>/<chromosome>_<key>_<metric>.csv``.

    Parameters
    ----------
    omic : object
        Must expose ``omic.layers[resolution]['co'][layer_key][cmp_method][chrom][metric]``
        as a mapping of key -> object with a ``to_csv`` method, where
        ``layer_key`` is ``tuple(sorted(set(layers)))``.
    layers : iterable of str
        Layer names identifying the comparison.
    resolution : hashable
        Key into ``omic.layers``.
    c : 'all', list, or single chromosome
        ``'all'`` expands to ``ca.info.CHROMOSOMES['names']``; a non-list
        value is treated as one chromosome.
    path : str
        Base output directory.
    cmp_method : str
        Comparison method; used both as a lookup key and as a directory name.
    """
    metrs = ['score', 'pvalue']
    if c == 'all':
        c = ca.info.CHROMOSOMES['names']
    if not isinstance(c, list):
        c = [c]
    layer_key = tuple(sorted(set(layers)))
    for chrom in c:
        # Build each chromosome's directory from the unchanged base `path`.
        # Reassigning `path` inside the loop nested every chromosome under the
        # previous one (sim/chr1/sim/chr2/...).
        out_dir = os.path.join(path, cmp_method, str(chrom))
        os.makedirs(out_dir, exist_ok=True)
        for m in metrs:
            for k, v in omic.layers[resolution]['co'][layer_key][cmp_method][chrom][m].items():
                v.to_csv(os.path.join(out_dir, f'{chrom}_{k}_{m}.csv'))
def tcga_mapping(name):
    """Prefix ``name`` with the project code of the first matching TCGA file.

    A row of the notebook-level ``tcga_met`` matches when its file id (the
    text after the last ``'-'`` in the part of 'File Name' before the first
    ``'_'``) is a substring of ``name``. The project code is the text after
    ``'TCGA-'`` in the ' Project' column.

    Returns ``"<project code>_<name>"``. Raises IndexError when no row matches.
    """
    file_ids = tcga_met['File Name'].apply(lambda x: x.split('_')[0].split('-')[-1])
    matching_rows = np.where([i in name for i in file_ids])[0]
    project_codes = tcga_met.iloc[matching_rows][' Project'].apply(lambda x:x.split('TCGA-')[-1])
    first_code = project_codes.values[0]
    return f"{first_code}_{name}"
    
def TE_family(chromosome_layer, **kwargs):
    """Encode every layer entry as the position of its name in the 'TEs' index.

    The reference list of names is taken from chromosome 1 of the 'TEs' layer
    stored under key ``1e06`` of the notebook-level ``ind.chr.layers``; names
    are the text after the last ``'>'``.
    NOTE(review): the resolution is hard-coded to 1e06 while ``RES`` in this
    notebook only lists 1e03 — confirm that a 1e06 'TEs' layer exists.

    Parameters
    ----------
    chromosome_layer : mapping
        Must provide ``chromosome_layer['index'][c]`` for every chromosome
        number ``c`` in 1..24.

    Returns
    -------
    (conditions, annotations) : tuple of dict
        ``conditions[c]`` is an int array of codes; ``annotations[c]`` maps
        each code back to its name. Raises KeyError for a name that is not in
        the reference list.
    """
    reference_names = [te.split('>')[-1] for te in ind.chr.layers[1e06]['TEs']['index'][1]]
    ann = dict(enumerate(reference_names))
    con = {family: code for code, family in enumerate(reference_names)}
    conditions, annotations = {}, {}
    for c in range(1, 25):
        codes = [con[i.split('>')[-1]] for i in chromosome_layer['index'][c]]
        conditions[c] = np.array(codes)
        annotations[c] = ann
    return conditions, annotations
    
def Cnv_cancertype(chromosome_layer, **kwargs):
    """Encode the cancer type of the layer's samples using the notebook-level ``cnv_meta``.

    Parameters
    ----------
    chromosome_layer : mapping
        Must provide ``chromosome_layer['index'][c]`` for every chromosome
        number ``c`` in 1..24.

    Returns
    -------
    (conditions, annotations) : tuple of dict
        ``conditions[c]`` is an array of integer codes for the samples found
        in ``cnv_meta['name']``; samples that are not found are skipped, so
        the array can be shorter than the layer index. ``annotations[c]`` maps
        each code back to its cancer type.

    Notes
    -----
    The membership check uses the text after the last ``'>'`` verbatim, while
    the row lookup additionally drops everything from ``'.bed'`` onwards.
    ``.item()`` raises ValueError unless exactly one row matches the lookup.
    """
    cancer_types = cnv_meta['cancer_type'].unique()
    ann = dict(enumerate(cancer_types))
    con = {cancer: code for code, cancer in enumerate(cancer_types)}
    conditions, annotations = {}, {}
    for c in tqdm(range(1, 25)):
        codes = []
        for sample in chromosome_layer['index'][c]:
            if sample.split(">")[-1] not in cnv_meta['name'].values:
                continue
            sample_name = sample.split('.bed')[0].split(">")[-1]
            matching_rows = np.where(cnv_meta['name']==sample_name)[0]
            cancer = cnv_meta.iloc[matching_rows]['cancer_type'].values.item()
            codes.append(con[cancer])
        conditions[c] = np.array(codes)
        annotations[c] = ann
    return conditions, annotations
        
def normalize_cotes(matrix, **kwargs):
    """Divide every row of ``matrix`` by the ``n_cotes`` value of its index entry.

    Parameters
    ----------
    matrix : array-like
        Rows are divided by one value each (the values are broadcast as a column).
    **kwargs
        index : iterable of names, one per row; the text after ``'CoTEs>'``
            is looked up in the notebook-level ``n_cotes`` frame.

    Returns
    -------
    The row-normalised matrix. Names missing from ``n_cotes`` are reindexed to
    NaN, so their rows become NaN.
    """
    row_names = [i.split('CoTEs>')[-1] for i in kwargs.get('index')]
    per_row_counts = n_cotes.reindex(row_names)['n_cotes'].values
    return matrix/per_row_counts[:, np.newaxis]
    
def cnv_class_filter(row, **kwargs):
    """Look up one row's (sample, CN) cell in the notebook-level ``sample_sig`` table.

    Parameters
    ----------
    row : pandas.Series
        Must have a 'CN' label; the row key is read positionally.
    **kwargs
        index_col : integer position in ``row`` of the value used as the row
            label of ``sample_sig``.

    Returns
    -------
    bool
        Truthiness of ``sample_sig.loc[<row key>, <CN value>]``.
    """
    key_position = kwargs.get('index_col')
    row_key = row.iloc[key_position]
    cn_value = row.loc['CN']
    return bool(sample_sig.loc[row_key, cn_value])

def tcga_cancertype(chromosome_layer, **kwargs):
    """Encode the cancer type (TCGA project) of every sample in a layer, per chromosome.

    Parameters
    ----------
    chromosome_layer : mapping
        Must provide ``chromosome_layer['index'][c]`` for every chromosome
        number ``c`` in ``ca.info.CHROMOSOMES['numericals'].values()``; the
        sample id is the text after the last ``'>'`` of each entry.
    **kwargs
        Accepted for compatibility with the other filters; not used.

    Returns
    -------
    (conditions, annotations) : tuple of dict
        ``conditions[c]`` is an array of integer codes, one per sample;
        ``annotations[c]`` maps each code back to its project code.

    Notes
    -----
    Reads the notebook-level ``tcga_meta`` (indexed by sample id, with a
    ' Project' column), ``tcga_samples`` and ``ca``. Samples are looked up in
    ``tcga_meta`` restricted to the ids derived from ``tcga_samples['samples']``.
    """
    def _project_code(project):
        # One rule for both building and querying the mapping. The lookup used
        # split('-') while the keys used split('TCGA-'), which raised KeyError
        # for any project not of the form 'TCGA-XXX'.
        return project.split('TCGA-')[-1]

    projects = tcga_meta[' Project'].apply(_project_code).unique()
    ann = dict(enumerate(projects))
    mapping = {v: i for i, v in ann.items()}
    sample_ids = [x.split('_')[0].split('-')[-1] for x in tcga_samples['samples']]
    df = tcga_meta.reindex(sample_ids)
    conditions = {}
    annotations = {}
    for chrom in ca.info.CHROMOSOMES['numericals'].values():
        samples = [i.split('>')[-1] for i in chromosome_layer['index'][chrom]]
        conditions[chrom] = df.loc[samples][' Project'].apply(lambda x: mapping[_project_code(x)]).values
        annotations[chrom] = ann
    return conditions, annotations
    
def tcga_mapping(name):
    """Return ``"<project code>_<name>"`` for the first TCGA file whose id occurs in ``name``.

    NOTE(review): this re-defines the identical ``tcga_mapping`` declared
    earlier in this cell; the later definition is the one left bound.

    The file id of a row of the notebook-level ``tcga_met`` is the text after
    the last ``'-'`` in the part of 'File Name' before the first ``'_'``; the
    project code is the text after ``'TCGA-'`` in ' Project'. Raises
    IndexError when no file id is a substring of ``name``.
    """
    ids = tcga_met['File Name'].apply(lambda x: x.split('_')[0].split('-')[-1])
    is_match = [i in name for i in ids]
    positions = np.where(is_match)[0]
    matched = tcga_met.iloc[positions]
    code = matched[' Project'].apply(lambda x:x.split('TCGA-')[-1]).values[0]
    return f"{code}_{name}"

def tcga_gain(matrix):
    """Return a sign-flipped copy of ``matrix``; the input is left untouched.

    NOTE(review): despite the name, this negates the signal. The 'loss'
    collision later in the notebook passes a ``tcga_loss`` kernel that is not
    defined anywhere in the visible notebook — confirm whether this function
    is the kernel that call expects.
    """
    duplicate = matrix.copy()
    return -1 * duplicate

def cnv_gain(matrix):
    """Return a copy of ``matrix`` in which every value <= 2 is set to 0.

    Values above 2 are kept as they are (they are not binarised). The input
    is not modified.
    """
    gains = matrix.copy()
    not_a_gain = gains <= 2
    gains[not_a_gain] = 0
    return gains
    
def cnv_loss(matrix):
    """Return a copy of ``matrix`` in which every value >= 2 is set to 0.

    Values below 2 are kept as they are (they are not binarised). The input
    is not modified.
    """
    losses = matrix.copy()
    not_a_loss = losses >= 2
    losses[not_a_loss] = 0
    return losses

In [6]:
# Root of the project data; every other path in this notebook is derived from it.
parent_dir = '/Users/mossishahi/Code/lupien/IntData/'

# os.path.join avoids the doubled slash that plain concatenation produced for
# te_dir (parent_dir already ends with '/'), so both directories are built the
# same way.
te_dir = os.path.join(parent_dir, 'data/V2-TEs/non_olap')

cote_dir = os.path.join(parent_dir, 'data/V2-TEs/cores1000')

In [7]:
# Table read from n_clusters_per_TE.tsv (two unnamed columns, labelled here as
# 'TE' and 'n_cotes'), indexed by TE name with the '_Merged...' suffix removed.
# normalize_cotes looks rows up in this table by name.
n_cotes = (
    pd.read_csv(cote_dir+'/n_clusters_per_TE.tsv', sep='\t', names=['TE', 'n_cotes'])
    .assign(TE=lambda frame: frame['TE'].apply(lambda x: x.split('_Merged')[0]))
    .set_index('TE')
)

In [9]:
# Build the integrated data object and register the two TE-derived layers.
# Both are loaded the same way: sample names are the file names with the
# '_Merged.bed' suffix removed, at every resolution in RES.
ind = IntData()
for layer_name, layer_dir in (("TEs", te_dir), ("CoTEs", cote_dir)):
    ind.add_layer(layer_name,
                  input=layer_dir,
                  index_mapper=lambda x: x.split("_Merged.bed")[0],
                  resolutions=RES)

loading files from: /Users/mossishahi/Code/lupien/IntData//data/V2-TEs/non_olap


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 975/975 [04:34<00:00,  3.55it/s]


loading files from: /Users/mossishahi/Code/lupien/IntData/data/V2-TEs/cores1000


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 456/456 [02:03<00:00,  3.70it/s]


In [10]:
# Register the TCGA layer. Sample ids are the text after the last '-' in the
# part of the file name before the first '_'.
# NOTE(review): feature_type/overlap_method/feature_column are passed through
# to IntData.add_layer unchanged; presumably column 4 of each file holds the
# signal and overlapping features keep the maximum — confirm against chromATAC.
tcga_layer_options = dict(
    resolutions=RES,
    input=parent_dir+'/data/TCGA/filtered_TCGA',
    index_mapper=lambda x: x.split('_')[0].split('-')[-1],
    feature_type='signal',
    overlap_method='max',
    feature_column=4,
)
ind.add_layer("TCGA", **tcga_layer_options)

loading files from: /Users/mossishahi/Code/lupien/IntData//data/TCGA/filtered_TCGA


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 434/434 [05:27<00:00,  1.32it/s]


In [22]:
# Names of the filtered TCGA '.bed' files, truncated at '_peaks.'.
tcga_bed_dir = parent_dir+'/data/TCGA/filtered_TCGA/'
files = []
for bed_name in os.listdir(tcga_bed_dir):
    if bed_name.endswith('.bed'):
        files.append(bed_name.split('_peaks.')[0])

In [23]:
# TCGA metadata indexed by sample id, where the id is the text after the last
# '-' in the part of 'File Name' before the first '_'.
tcga_meta = (
    pd.read_csv(parent_dir+'/data/TCGA/GDC_identifiers_no_duplicates.tsv', sep='\t')
    .assign(id=lambda meta: meta['File Name'].apply(lambda x: x.split('_')[0].split('-')[-1]))
    .set_index('id')
)

# One row per '.bed' file found on disk.
tcga_samples = pd.DataFrame(files, columns=['samples'])

# Metadata restricted to (and ordered like) the files on disk.
sample_ids = [x.split('_')[0].split('-')[-1] for x in tcga_samples['samples']]
df=tcga_meta.reindex(sample_ids)

In [25]:
# Register `tcga_cancertype` under the name 'Cancertype' for the 'TCGA' layer
# at every resolution in RES. The collide_layers calls below refer to this
# name in their `groups=` argument (presumably to group TCGA samples by
# cancer type — confirm against chromATAC).
ind.chr.add_filter('Cancertype', tcga_cancertype, ['TCGA'], resolutions=RES)

In [None]:
specificity = False

# Both collisions run without a `chrom=` restriction and differ only in the
# kernels applied and the key they are stored under.
shared_collision_args = dict(
    resolution=RES[0],
    specificity=specificity,
    classifier=lambda x: 'TCGA' in x,
)
cancer_types = tcga_meta[' Project'].apply(lambda x: x.split('TCGA-')[-1]).unique()

# 'gain': CoTEs rows normalised by normalize_cotes; no kernel on TCGA.
ind.chr.collide_layers(['CoTEs', 'TCGA'],
                       groups={'TCGA': {'Cancertype': list(cancer_types)}},
                       kernels={'CoTEs': normalize_cotes},
                       coll_key='gain',
                       by={'TCGA':'group', 'CoTEs':'sample'},
                       **shared_collision_args)

# 'loss': same, plus a kernel on the TCGA layer.
# NOTE(review): `tcga_loss` is not defined anywhere in the visible notebook
# (only `tcga_gain`, which negates its input, is). This call raises NameError
# on a fresh kernel — confirm which kernel is intended.
ind.chr.collide_layers(['CoTEs', 'TCGA'],
                       groups={'TCGA': {'Cancertype': list(cancer_types)}},
                       kernels={'TCGA': tcga_loss, 'CoTEs': normalize_cotes},
                       coll_key='loss',
                       by={'TCGA':'group', 'CoTEs':'sample'},
                       **shared_collision_args)

In [None]:
# Write both collisions computed above ('gain' and 'loss') to the experiment
# directory. `specificity=False` is now passed as a keyword after the
# positional arguments: it had been inserted between them without commas,
# which made both calls syntax errors.
for coll_key in ('gain', 'loss'):
    save_collision(ind.chr,
                   ['CoTEs', 'TCGA'],
                   coll_key,
                   resolution=RES[0],
                   specificity=False,
                   parent=os.path.join(parent_dir, 'analysis/experiment200G-TCGA/'))