Implementation of a DanQ-like network on sequences landing in peaks of H3K27ac ChIP-Seq data from three different cell types (neuron, glia and microglia) sorted from the dorsolateral prefrontal cortex of postmortem healthy brains

Package dependencies: ucscgenome, intervaltree, pandas

In [1]:
import ucscgenome
import pandas as pd
import os
import intervaltree

DNA_ALPHABET = {"N":0, "A":1, "G":2, "C":3, "T":4}


hg19 = ucscgenome.Genome('hg19', cache_dir='/home/taeyoonp/multiple_alignment-python/seqfiles/', use_web=False)
peak_file_dir = '/projects/pfenninggroup/jemmie/3expHg/rawData/bam/case_control_peaks_stringent/'

label_names = {
               'G':os.path.join(peak_file_dir,'controls_Glia_peaks_peaks.narrowPeak'),
               'M':os.path.join(peak_file_dir,'controls_Microglia_peaks_peaks.narrowPeak'),
               'N':os.path.join(peak_file_dir,'controls_Neuron_peaks_peaks.narrowPeak')
              }

In [2]:
def parse_narrow_peak_bed_file(narrow_peak_file, reference_genome_name):
    # headers specific to narrow peak format (order matters)
    narrow_peak_headers = ["chromosome_name", "chromosome_start_index", "chromosome_end_index",
                           "region_name", "score", "strand", "signal_value", "p_value", "q_value", "peak"]

    with open(narrow_peak_file, 'r') as narrow_peak_bed_csv:
        narrow_peak_df = pd.read_csv(narrow_peak_bed_csv, sep='\t', header=None, names=narrow_peak_headers)
        narrow_peak_df['reference_genome'] = reference_genome_name
    return narrow_peak_df.to_dict(orient="records")

In [3]:
def calcOverlapLength(first_start, first_end, second_start, second_end):
    if first_start > second_end:
        return 0
    elif second_start > first_end:
        return 0
    elif first_start >= second_start:
        if first_end <= second_end:
            return first_end - first_start
        else:
            return second_end - first_start
    elif second_start >= first_start:
        if second_end <= first_end:
            return second_end - second_start
        else:
            return first_end - second_start
    


In [4]:
def are_coordinates_half_in_peaks(trees_by_chromosome, chromosome, start, stop):
    if not chromosome in trees_by_chromosome:
        return False
    
    half_length = (stop-start)/2
    
    chromosome_peak_tree = trees_by_chromosome[chromosome]
    
    overlapPeaks = chromosome_peak_tree[start:stop]
    
    overlapLength = 0
    
    for overlapPeak in overlapPeaks:
        currOverlap = calcOverlapLength(overlapPeak.begin, overlapPeak.end, start, stop)
        overlapLength+=currOverlap
        
    return overlapLength >= half_length

In [5]:
def construct_interval_tree(peak_data):
    trees_by_chromosome = dict()
    for peak in peak_data:
        chromosome = peak['chromosome_name']
        start_index = peak['chromosome_start_index']
        end_index = peak['chromosome_end_index']
        
        if chromosome not in trees_by_chromosome:
            trees_by_chromosome[chromosome] = intervaltree.IntervalTree()
        trees_by_chromosome[chromosome].addi(start_index, end_index)
    return trees_by_chromosome

In [6]:
data_dict = {}

for label in label_names:
    data_dict[label] = parse_narrow_peak_bed_file(label_names[label], hg19.genome_file)

In [7]:
peak_coordinate_trees = {}

for label in data_dict:
    # sorted_peak_coordinates[label] = get_sorted_peak_coordinates(data_dict[label])
    peak_coordinate_trees[label] = construct_interval_tree(data_dict[label])

In [8]:
bin_size = 200
flank_size = 400

for chrom in hg19:
    chrom_size = hg19.sequence_sizes()[chrom]
    
    countNumTrue = {}
    countNumUniqueTrue = 0    
    for i in xrange(0, chrom_size, bin_size):
        #sequence = hg19[chrom][i:i+bin_size]
        response_variables = {}
        
        for label in data_dict:
            response_variables[label] = are_coordinates_half_in_peaks(peak_coordinate_trees[label], chrom, i, i+bin_size)
            if response_variables[label]:
                if not label in countNumTrue:
                    countNumTrue[label] = 0
                countNumTrue[label] = countNumTrue[label] + 1
        
        isOneTrue = False
        for label in response_variables:
            if response_variables[label]:
                isOneTrue = True
        
        if isOneTrue:
            countNumUniqueTrue+=1
    
    print chrom
    print countNumTrue
    print countNumUniqueTrue
                
        
            

chr19_gl000208_random
{}
0
chr8_gl000197_random
{}
0
chr6_apd_hap1
{}
0
chr13
{'M': 19517, 'G': 21604, 'N': 19070}
37628
chr12
{'M': 39018, 'G': 38712, 'N': 48613}
78776
chr11
{'M': 49899, 'G': 52412, 'N': 53089}
92649
chr10
{'M': 45000, 'G': 45732, 'N': 41200}
80274
chr17
{'M': 44685, 'G': 43642, 'N': 41415}
73169
chr16
{'M': 30383, 'G': 30405, 'N': 31079}
54468
chr15
{'M': 29104, 'G': 30740, 'N': 27857}
52610
chr14
{'M': 29644, 'G': 30262, 'N': 27242}
52310
chr19
{'M': 34928, 'G': 32432, 'N': 30873}
52185
chr18
{'M': 15886, 'G': 17231, 'N': 18510}
33414
chr9_gl000198_random
{}
0
chrUn_gl000239
{}
0
chrUn_gl000238
{}
0
chrUn_gl000233
{}
0
chrUn_gl000232
{}
0
chrUn_gl000231
{}
0
chrUn_gl000230
{}
0
chrUn_gl000237
{}
0
chrUn_gl000236
{}
0
chrUn_gl000235
{}
0
chrUn_gl000234
{}
0
chr6_qbl_hap6
{}
0
chr11_gl000202_random
{}
0
chr17_gl000206_random
{}
0
chr6_cox_hap2
{}
0
chr4_gl000193_random
{}
0
chrUn_gl000248
{}
0
chrUn_gl000249
{}
0
chrUn_gl000246
{}
0
chrUn_gl000247
{}
0
chrUn_gl000244

In [9]:
def get_sorted_peak_coordinates(peak_data):
    all_peak_coordinates = dict()
    for peak in peak_data:
        chromosome = peak['chromosome_name']
        start_index = peak['chromosome_start_index']
        end_index = peak['chromosome_end_index']
        
        if chromosome not in all_peak_coordinates:
            all_peak_coordinates[chromosome] = []
        all_peak_coordinates[chromosome].append([start_index, end_index])

    for chromosome in all_peak_coordinates:
        all_peak_coordinates[chromosome] = sorted(all_peak_coordinates[chromosome], key=lambda x: (x[0],x[1]))
    return all_peak_coordinates

In [10]:
def testCalcOverlapLength():
    peak_start = 100
    peak_end = 700
    coord_start = 500
    coord_end = 700
    assert(calcOverlapLength(peak_start, peak_end, coord_start, coord_end)==200)
    coord_start = 0
    coord_end = 99
    assert(calcOverlapLength(peak_start, peak_end, coord_start, coord_end)==0)
    coord_start = 0
    coord_end = 400
    assert(calcOverlapLength(peak_start, peak_end, coord_start, coord_end)==300)
    coord_start= 200
    coord_end = 400
    assert(calcOverlapLength(peak_start, peak_end, coord_start, coord_end)==200)
    coord_start = 50
    coord_end = 740
    assert(calcOverlapLength(peak_start, peak_end, coord_start, coord_end)==600)
    coord_start = 0
    coord_end = 100
    assert(calcOverlapLength(peak_start, peak_end, coord_start, coord_end)==0)
    coord_start = 700
    coord_end = 900
    assert(calcOverlapLength(peak_start, peak_end, coord_start, coord_end)==0)
    coord_start = 0
    coord_end = 101
    assert(calcOverlapLength(peak_start, peak_end, coord_start, coord_end)==1)
    coord_start = 699
    coord_end = 899
    assert(calcOverlapLength(peak_start, peak_end, coord_start, coord_end)==1)    
    
testCalcOverlapLength()

In [15]:
def test_are_coordinates_half_in_peaks():    
    """THIS TEST WAS SUPPOSED TO WORK FOR A PREVIOUS FUNCTION VERSION"""
    sorted_peak_coordinates = {'chr1': [[540603, 540681],
                                      [713280, 713471],
                                      [713888, 714341],
                                      [714450, 714755],
                                      [762430, 762597],
                                      [762657, 763009],
                                      [785042, 785218],
                                      [785399, 785647]],
                                'chr10': [[177214, 177361],
                                      [9972654, 9972743],
                                      [9973249, 9973451],
                                      [9973703, 9974065],
                                      [9974388, 9974520]],
                                 'chr11': [[187562, 187823],
                                  [188147, 188324],
                                  [189338, 190324],
                                  [190409, 190945],
                                  [191958, 193127],
                                  [193181, 193802],
                                  [193857, 194098],
                                  [194152, 194274],
                                  [194361, 194833],
                                  [195038, 196019],
                                  [196845, 196947],
                                  [199618, 199785],
                                  [206704, 209406],
                                  [209480, 209604],
                                  [210355, 210658],
                                  [210895, 210968],
                                  [211227, 211305],
                                  [211594, 211695],
                                  [212835, 212908],
                                  [213103, 213179],
                                  [213230, 213393],
                                  [214011, 214438],
                                  [215456, 215536],
                                  [215767, 216267],
                                  [216336, 216460],
                                  [217046, 217144],
                                  [218204, 219831],
                                  [220558, 220810],
                                  [221072, 223807],
                                  [224043, 224957],
                                  [226100, 226275],
                                  [227488, 227577],
                                  [227932, 228052],
                                  [235354, 235456],
                                  [235523, 237520],
                                  [288598, 288700],
                                  [288781, 288918],
                                  [355736, 355995],
                                  [356240, 356419],
                                  [368586, 369026],
                                  [369099, 369294],
                                  [370071, 372365],
                                  [373145, 373254],
                                  [373582, 374143],
                                  [374239, 374614],
                                  [375220, 376089],
                                  [376441, 378182],
                                  [379687, 379835],
                                  [381818, 382050],
                                  [382239, 382341],
                                  [382433, 382650],
                                  [384800, 384883],
                                  [385698, 386675],
                                  [386769, 388769],
                                  [392943, 393017],
                                  [395881, 396106],
                                  [396202, 396522],
                                  [396574, 397632],
                                  [397699, 397824],
                                  [406597, 406732],
                                  [406792, 407089],
                                  [447937, 449038],
                                  [449614, 449688],
                                  [449827, 451974],
                                  [455313, 455907],
                                  [456490, 457993],
                                  [458074, 458151],
                                  [460394, 460486],
                                  [461374, 461459],
                                  [461717, 461942],
                                  [462562, 462670],
                                  [464316, 464396],
                                  [464448, 464702],
                                  [467997, 470642],
                                  [470901, 471083],
                                  [471269, 471691],
                                  [471813, 473174],
                                  [473284, 473361],
                                  [473429, 473532],
                                  [474112, 474292],
                                  [476399, 476550],
                                  [476793, 477900],
                                  [479081, 479225],
                                  [479341, 479473],
                                  [487532, 487610],
                                  [487707, 487817],
                                  [487912, 488051],
                                  [488615, 489262],
                                  [490030, 490196],
                                  [490394, 490493],
                                  [490648, 490788],
                                  [490856, 491674],
                                  [492660, 492743],
                                  [492981, 493195],
                                  [494194, 494410],
                                  [494578, 494865],
                                  [494928, 495881],
                                  [495939, 496278],
                                  [498152, 498373],
                                  [498568, 498806],
                                  [499076, 499238],
                                  [499302, 499441],
                                  [499517, 499690],
                                  [499781, 500298],
                                  [501083, 501174],
                                  [501610, 501710],
                                  [501844, 503116],
                                  [503234, 503395],
                                  [503624, 504822],
                                  [506154, 507346],
                                  [507473, 507883],
                                  [507997, 508076],
                                  [518468, 518776],
                                  [519032, 519108],
                                  [533615, 533731],
                                  [534107, 535414],
                                  [535629, 536469],
                                  [536710, 537288],
                                  [537370, 538225],
                                  [538385, 538482],
                                  [554688, 554809],
                                  [554912, 554985],
                                  [555463, 555613],
                                  [555798, 555871],
                                  [560308, 560427],
                                  [560520, 560754],
                                  [561018, 561760],
                                  [567871, 569759],
                                  [574985, 575100],
                                  [575173, 575901],
                                  [576129, 576752],
                                  [576905, 577403],
                                  [581610, 581787],
                                  [582457, 582533],
                                  [598178, 598472],
                                  [605392, 605481],
                                  [605542, 605688],
                                  [606044, 606165],
                                  [606316, 606771],
                                  [627113, 627250],
                                  [629048, 629203],
                                  [629475, 629605],
                                  [632494, 632623],
                                  [636919, 637208],
                                  [637264, 637678],
                                  [637755, 637924],
                                  [638135, 638330],
                                  [638676, 638986],
                                  [639089, 640680],
                                  [640795, 641207],
                                  [641269, 641396],
                                  [641474, 641653],
                                  [642684, 642828],
                                  [645639, 645785],
                                  [645862, 646010],
                                  [657719, 657805],
                                  [659373, 659473],
                                  [659735, 659808],
                                  [661062, 661147],
                                  [665413, 666282],
                                  [678746, 678888],
                                  [679341, 679713],
                                  [680344, 680433],
                                  [688882, 689014],
                                  [689785, 689892],
                                  [690406, 690524],
                                  [690699, 692591],
                                  [692884, 693088],
                                  [693218, 696855],
                                  [696927, 697179],
                                  [697237, 697367],
                                  [697421, 697706],
                                  [697758, 698183],
                                  [698274, 699233],
                                  [699788, 699891],
                                  [700003, 700187],
                                  [702014, 702459],
                                  [702546, 703740],
                                  [705028, 707706],
                                  [719666, 719806],
                                  [720652, 721647],
                                  [721712, 721834],
                                  [721910, 722600],
                                  [725511, 726402],
                                  [726575, 726706],
                                  [726779, 726910],
                                  [727737, 727842],
                                  [727923, 728169],
                                  [728500, 728700],
                                  [729341, 729446],
                                  [737645, 737857],
                                  [746434, 748304],
                                  [748377, 749086],
                                  [759919, 760564],
                                  [761595, 763202],
                                  [763340, 765869],
                                  [765933, 766568],
                                  [766973, 767132],
                                  [767386, 767488],
                                  [771719, 772165],
                                  [772435, 773741],
                                  [773793, 773875],
                                  [776484, 776561],
                                  [776621, 778465],
                                  [779200, 779289],
                                  [779593, 779700],
                                  [779898, 779972],
                                  [781657, 782002],
                                  [782179, 783313],
                                  [783464, 783735],
                                  [784036, 784209],
                                  [784420, 784783],
                                  [784879, 785411],
                                  [785742, 787327],
                                  [787389, 787514],
                                  [787577, 787753],
                                  [787994, 788318],
                                  [788617, 790287],
                                  [790364, 790599],
                                  [790769, 790949],
                                  [791427, 791559],
                                  [792918, 798735],
                                  [799113, 799975],
                                  [800045, 800345],
                                  [800428, 800636],
                                  [800702, 800861],
                                  [800989, 801282],
                                  [801369, 801442],
                                  [801906, 802062],
                                  [802731, 802806],
                                  [802860, 802936],
                                  [803021, 803150],
                                  [803256, 803366],
                                  [803472, 803707],
                                  [803895, 805844],
                                  [808638, 810571],
                                  [818389, 820786],
                                  [820880, 820998],
                                  [821096, 821474],
                                  [821609, 821811],
                                  [822023, 822278],
                                  [832265, 833924],
                                  [834364, 835366]]
                            }
    chromosome = 'chr1'
    start = 540470
    stop = 540670
    assert(are_coordinates_half_in_peaks(sorted_peak_coordinates, chromosome, start, stop)[0]==False)
    
    chromosome = 'chr1'
    start = 540510
    stop = 540710
    assert(are_coordinates_half_in_peaks(sorted_peak_coordinates, chromosome, start, stop)[0]==False)
    
    import time
    origTime = time.time()
    chromosome = 'chr11'
    start = 834310
    stop = 834510
    print time.time() - origTime
    print are_coordinates_half_in_peaks(sorted_peak_coordinates, chromosome, start, stop)[1]
    assert(are_coordinates_half_in_peaks(sorted_peak_coordinates, chromosome, start, stop)[0])
    
    origTime = time.time()
    chromosome = 'chr11'
    start = 187800
    stop = 188000
    assert(are_coordinates_half_in_peaks(sorted_peak_coordinates, chromosome, start, stop)[0]==False)
    print time.time()-origTime
    
    chromosome = 'chr11'
    start = 187500
    stop = 835368
    print are_coordinates_half_in_peaks(sorted_peak_coordinates, chromosome, start, stop)[1]
    assert(are_coordinates_half_in_peaks(sorted_peak_coordinates, chromosome, start, stop)[0]==False)
    
    print sum([val[1]-val[0] for val in sorted_peak_coordinates['chr11']])
    
test_are_coordinates_half_in_peaks()

TypeError: 'bool' object has no attribute '__getitem__'