In [20]:
import pandas as pd
import os, time, logging
import numpy as np
from src.data import Hicmat, plot_data, preprocess_data
from src.tad_algo import TopDom, TADtree
from src.utils import *
import CTCF

In [99]:
def bedPicks(file, chrom, resolution, min_signal=140):
    """Return the bins of one chromosome that carry a strong peak.

    The file is read as a headerless, tab-separated BED / narrowPeak table.
    Peaks located on `chrom` whose signal value (7th column) is at least
    `min_signal` are kept; the start and the end of every kept peak are
    converted to bin indices by dividing the coordinate by `resolution` and
    rounding to the nearest integer.

    Parameters
    ----------
    file : str, path object or file-like
        Anything accepted by ``pandas.read_csv``.
    chrom : str
        Chromosome name exactly as written in the first column (e.g. 'chr1').
    resolution : int
        Bin size, in the same unit as the coordinates of the file.
    min_signal : float, optional
        Minimum signal value a peak needs to be kept. Defaults to 140, the
        threshold that used to be hard-coded in this function.

    Returns
    -------
    list of int
        Sorted, duplicate-free bin indices (peak starts and peak ends mixed).

    Raises
    ------
    KeyError
        If rows exist for `chrom` but none of them has a signal value, i.e.
        the file has fewer than 7 columns.
    """
    columns = ['chrom', 'chStart', 'chEnd', 'name', 'score', 'strand',
               'sigValue', 'pValue', 'qValue', 'peak']

    # No `comment=` character: the former comment='t' cut every line at its
    # first lowercase "t", which corrupts names such as "chr4_ctg9_hap1".
    # Passing `names` fixes the table width, so a leading "track"/"browser"
    # line (no tabs) is read as a one-field row padded with NaN and is then
    # discarded by the chromosome filter below. Short files (fewer than 10
    # columns) are padded with NaN columns instead of failing.
    df = pd.read_csv(file, sep='\t', header=None, names=columns,
                     dtype={'chrom': str})

    # Keep only the requested chromosome.
    peaks = df[df['chrom'] == chrom]
    if not peaks.empty and peaks['sigValue'].isna().all():
        raise KeyError("sigValue: no signal-value column (7th field) found "
                       "for chromosome %r" % (chrom,))

    # Comparing the float directly gives the same result as the former
    # int(sigValue) >= 140 test whenever the threshold is an integer.
    strong = peaks[peaks['sigValue'] >= min_signal]

    # Both ends of each peak become a bin; Series.round rounds halves to the
    # nearest even integer, like the scalar round() used previously.
    edges = pd.concat([strong['chStart'], strong['chEnd']])
    bins = (edges / resolution).round(0).astype('int64')
    return sorted(set(bins.tolist()))

In [38]:
def _as_lookup(items):
    """Return `items` as a set when its elements are hashable, else unchanged."""
    try:
        return set(items)
    except TypeError:
        return items


def _pick_boundary(low, high, ctcf_bins):
    """Pick the boundary inside the overlap [low, high].

    Returns ``(boundary, found)``: the first bin of the overlap present in
    `ctcf_bins` with ``found=True``, otherwise the middle of the overlap with
    ``found=False``.
    """
    for boundary in range(low, high + 1):
        if boundary in ctcf_bins:
            return boundary, True
    return low + int((high - low) / 2), False


def consensus(list1, list2, resolution, ctcf, k):
    """Build consensus TADs from two TAD lists and smooth their overlaps.

    A TAD ``(start, end)`` of `list1` is kept when `list2` contains a TAD whose
    start and end are each within `gap` bins of it; the kept TAD is the union
    of the two (smallest start, largest end). One entry is appended per
    matching offset pair, so a TAD can appear more than once.

    Afterwards every pair of kept TADs that partially overlap is separated:
    the shared boundary is the first bin of the overlap listed in `ctcf`, or
    the middle of the overlap when no such bin exists.

    Parameters
    ----------
    list1, list2 : sequence of (start, end)
        TADs in bin units. `list2` is matched with tuple equality.
    resolution : int
        Bin size; selects the tolerance: `k` at 100000, 2 at 50000,
        4 at 25000. Note that `k` is ignored for the last two.
    ctcf : iterable of int
        Bins considered as CTCF-supported boundaries.
    k : int
        Tolerance (in bins) used at 100 kb.

    Returns
    -------
    list of (start, end)
        Consensus TADs, in the order the matches were found.

    Raises
    ------
    KeyError
        If `resolution` is not 100000, 50000 or 25000.

    Side effects
    ------------
    Prints the number of overlaps resolved with a `ctcf` bin.
    """
    resolution_to_gap = {100000: k,
                         50000: 2,
                         25000: 4}
    gap = resolution_to_gap[resolution]

    # Hash-based membership instead of scanning lists inside nested loops.
    reference = _as_lookup(list2)
    ctcf_bins = _as_lookup(ctcf)

    true_TADs = []
    offsets = range(-gap, gap + 1)
    for tad in list1:
        for i in offsets:
            for j in offsets:
                if (tad[0] + i, tad[1] + j) in reference:
                    true_TADs.append((min(tad[0], tad[0] + i),
                                      max(tad[1], tad[1] + j)))

    # smoothing
    counter = 0
    n = len(true_TADs)
    for i in range(n - 1):
        for j in range(i + 1, n):
            first, second = true_TADs[i], true_TADs[j]
            if first[0] < second[0] and first[1] < second[1] and first[1] > second[0]:
                # `first` starts earlier: the overlap is [second start, first end].
                true_boundary, found = _pick_boundary(second[0], first[1], ctcf_bins)
                if found:
                    counter += 1
                true_TADs[i] = (first[0], true_boundary)
                true_TADs[j] = (true_boundary, second[1])
            elif first[0] > second[0] and first[1] > second[1] and first[0] < second[1]:
                # `second` starts earlier: the overlap is [first start, second end].
                # The scan used to run over [second start, first end], i.e. the
                # union of both TADs, so a CTCF bin outside the overlap could
                # be selected and collapse one TAD to zero length.
                true_boundary, found = _pick_boundary(first[0], second[1], ctcf_bins)
                if found:
                    counter += 1
                true_TADs[i] = (true_boundary, first[1])
                true_TADs[j] = (second[0], true_boundary)
    print(counter)
    return true_TADs

In [100]:
# Path of the CTCF peak file, relative to the notebook folder.
fileCTCF = os.path.join('..', 'CTCF', 'ENCFF796WRU.bed')
# Bins (100 kb) of chr1 that hold the start or end of a peak kept by bedPicks.
l_peak_GM = bedPicks(fileCTCF, 'chr1', 100000)

In [47]:
# Read the TADtree output for chr1 and keep only its 2nd and 3rd columns,
# which are accessed below under the names 'start' and 'end'.
tads_by_tadtree = pd.read_csv(
    '../CHROMOSOMES/GM12878/100kb/TADtree_outputs/chr1/N399.txt',
    delimiter='\t',
).iloc[:, [1,2]]

# One (start, end) tuple per row, in file order.
tadtree = list(zip(tads_by_tadtree['start'].to_numpy(),
                   tads_by_tadtree['end'].to_numpy()))
print(tadtree)

[(13, 16), (17, 19), (21, 23), (23, 26), (23, 32), (27, 32), (35, 37), (37, 60), (40, 45), (55, 60), (60, 80), (62, 65), (67, 70), (70, 79), (80, 91), (81, 84), (85, 89), (91, 93), (93, 96), (96, 100), (96, 105), (100, 105), (105, 111), (105, 121), (107, 110), (112, 118), (114, 117), (121, 123), (123, 127), (127, 142), (138, 142), (142, 149), (142, 158), (149, 154), (154, 157), (158, 161), (161, 164), (164, 166), (166, 168), (168, 173), (173, 202), (174, 177), (179, 187), (179, 192), (189, 192), (192, 195), (196, 200), (202, 205), (202, 210), (205, 209), (210, 223), (211, 216), (216, 219), (220, 223), (225, 233), (230, 233), (233, 238), (235, 238), (238, 241), (241, 243), (244, 250), (245, 248), (250, 256), (252, 255), (256, 258), (258, 262), (258, 264), (264, 268), (269, 272), (274, 277), (274, 281), (277, 280), (282, 285), (287, 290), (291, 296), (291, 311), (298, 301), (311, 340), (312, 315), (318, 321), (322, 325), (326, 329), (329, 333), (333, 336), (337, 340), (340, 352), (342, 3

In [94]:
# Folder of the 100 kb GM12878 data and the chr1 matrix file expected inside it.
folder = os.path.join('..', 'CHROMOSOMES', 'GM12878', '100kb')
data_path = os.path.join(folder, 'chr1_100kb.npy')
resolution=100000
# Run the preprocessing only when the .npy file is missing
# (presumably preprocess_data creates it — confirm in src.data).
if not os.path.isfile(data_path):
    preprocess_data(folder, resolution)
hic_mat = Hicmat(data_path, resolution)
# NOTE(review): the meaning of `threshold` is defined in src.data.Hicmat — confirm there.
hic_mat.filter(threshold = 1)
topdom = TopDom()
# The next cell divides these coordinates by `resolution`, so they are
# presumably expressed in base pairs at this point — TODO confirm.
topdom_tads = topdom.getTADs(hic_mat, window=10)

TopDom Step 1 : Generating binSignals by computing bin-level contact frequencies
TopDom Step 2 : Detect TD boundaries based on binSignals
TopDom Step 3 : Statistical Filtering of false positive TD boundaries
TopDom : Exporting TADs


In [95]:
# Convert every TopDom TAD to bin units by dividing both ends by `resolution`
# (truncating division result with int). The list is updated in place, so
# running this cell twice divides the coordinates twice.
topdom_tads[:] = [
    (int(tad[0]/resolution), int(tad[1]/resolution))
    for tad in topdom_tads
]
print(topdom_tads)

[(6, 34), (34, 60), (60, 80), (80, 129), (137, 171), (171, 192), (192, 210), (210, 243), (243, 256), (256, 268), (268, 296), (299, 312), (312, 340), (340, 352), (352, 369), (369, 402), (402, 415), (415, 436), (436, 452), (452, 468), (468, 494), (494, 508), (508, 537), (537, 555), (555, 573), (573, 589), (589, 604), (604, 626), (626, 652), (652, 672), (672, 703), (703, 731), (731, 763), (763, 817), (817, 843), (843, 868), (868, 891), (891, 905), (905, 917), (917, 933), (933, 944), (944, 958), (958, 974), (974, 986), (986, 1002), (1002, 1035), (1043, 1064), (1064, 1090), (1090, 1129), (1129, 1154), (1154, 1166), (1166, 1185), (1185, 1206), (1447, 1458), (1495, 1520), (1520, 1533), (1533, 1552), (1552, 1581), (1581, 1591), (1591, 1614), (1614, 1634), (1634, 1650), (1650, 1668), (1668, 1682), (1682, 1698), (1698, 1714), (1714, 1750), (1750, 1769), (1769, 1782), (1782, 1796), (1796, 1828), (1828, 1863), (1863, 1892), (1892, 1905), (1905, 1944), (1944, 1967), (1967, 1996), (1996, 2030), (203

In [16]:
# Consensus between TADtree and TopDom TADs at 100 kb, with a tolerance of
# 2 bins on each boundary; overlaps are resolved with the CTCF bins.
consensus_GM12878 = consensus(tadtree, topdom_tads, 100000, l_peak_GM, 2)

In [17]:
# Show the consensus TADs as (start bin, end bin) tuples.
print(consensus_GM12878)

[(60, 80), (311, 340), (340, 352), (415, 436), (451, 468), (905, 919), (1520, 1533), (1581, 1592), (2030, 2059), (2100, 2114), (2114, 2134), (2318, 2335)]


In [12]:
# Same print as the previous cell, but with a lower execution count and a
# different output: NOTE(review) this output comes from an earlier run —
# re-run or remove the cell.
print(consensus_GM12878)

[(59, 80), (171, 192), (311, 340), (451, 468), (537, 555), (589, 604), (1682, 1699), (2030, 2059), (2114, 2134), (2287, 2307)]


TopDom ne trouve pas beaucoup de TADs, et ceux qu'il trouve ne correspondent pas à ceux de TADtree.

In [18]:
# Number of TADs returned by TopDom.
len(topdom_tads)

99

In [19]:
# Sweep the 100 kb tolerance k from 0 to 6 bins and print how many consensus
# TADs each value yields. consensus() also prints its own counter on each call.
for z in range(7):
    print(len(consensus(tadtree, topdom_tads, 100000, l_peak_GM, z)))

3
8
12
18
20
26
40


In [26]:
# Load the Arrowhead domain list for chromosome '1' at 100 kb.
# read_arrowhead_result is not defined in this notebook; presumably it comes
# from the star import of src.utils — confirm.
# NOTE(review): the file name refers to HMEC while the Hi-C matrix and the
# TADtree output used above are stored under GM12878 — confirm the cell line.
true_results = read_arrowhead_result(os.path.join('..', 'RESULTS', 'GSE63525_HMEC_Arrowhead_domainlist.txt'), '1', 100000)

In [104]:
# Consensus between TADtree TADs and the Arrowhead domains (tolerance of
# 2 bins at 100 kb). The number shown below is the counter printed by consensus().
consensus1 = consensus(tadtree, true_results, 100000, l_peak_GM, 2)

14


In [91]:
# Display the CTCF peak bins in increasing order (the whole list is shown).
sorted(l_peak_GM)

[3,
 8,
 9,
 10,
 11,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 28,
 34,
 35,
 36,
 37,
 38,
 39,
 42,
 46,
 47,
 48,
 55,
 56,
 57,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 83,
 84,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 95,
 96,
 97,
 98,
 99,
 100,
 102,
 103,
 104,
 105,
 107,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 126,
 127,
 132,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 160,
 161,
 162,
 164,
 165,
 166,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185,
 186,
 187,
 188,
 189,
 190,
 191,
 192,
 193,
 194,
 195,
 196,
 197,
 198,
 199,
 200,
 201,
 202,
 204,
 205,
 206,
 207,
 210,
 212,
 213,
 214,
 215,
 216,
 217,
 218,
 220,
 221,
 223,
 224,
 226,
 227,
 229,
 230,
 231,
 232,
 234

In [102]:
# Fraction of the distinct TADtree boundary bins that are also CTCF peak bins.
c = set()   # every distinct boundary bin
b = []      # boundary bins found in l_peak_GM, in first-seen order
for tad in tadtree:
    for edge in (tad[0], tad[1]):
        c.add(edge)
        if edge in l_peak_GM and edge not in b:
            b.append(edge)
a = len(b)  # number of distinct boundaries matching a CTCF peak bin
a/len(c)

0.6101010101010101

In [48]:
# Number of TADs read from the TADtree output.
len(tadtree)

374

In [51]:
# Number of distinct boundary bins held in `c`. NOTE(review): `c` is
# reassigned by every boundary/CTCF overlap cell, so this value depends on
# which of those cells ran last.
len(c)

495

In [101]:
# Number of distinct bins returned by bedPicks for chr1.
len(l_peak_GM)

1075

In [103]:
# Fraction of the distinct TopDom boundary bins that are also CTCF peak bins.
c = set()   # every distinct boundary bin
b = []      # boundary bins found in l_peak_GM, in first-seen order
for tad in topdom_tads:
    for edge in (tad[0], tad[1]):
        c.add(edge)
        if edge in l_peak_GM and edge not in b:
            b.append(edge)
a = len(b)  # number of distinct boundaries matching a CTCF peak bin
a/len(c)

0.4716981132075472

In [105]:
# Fraction of the distinct consensus1 boundary bins that are also CTCF peak bins.
c = set()   # every distinct boundary bin
b = []      # boundary bins found in l_peak_GM, in first-seen order
for tad in consensus1:
    for edge in (tad[0], tad[1]):
        c.add(edge)
        if edge in l_peak_GM and edge not in b:
            b.append(edge)
a = len(b)  # number of distinct boundaries matching a CTCF peak bin
a/len(c)

0.6449086161879896

In [106]:
# Fraction of the distinct Arrowhead boundary bins that are also CTCF peak bins.
c = set()   # every distinct boundary bin
b = []      # boundary bins found in l_peak_GM, in first-seen order
for tad in true_results:
    for edge in (tad[0], tad[1]):
        c.add(edge)
        if edge in l_peak_GM and edge not in b:
            b.append(edge)
a = len(b)  # number of distinct boundaries matching a CTCF peak bin
a/len(c)

0.6057233704292527