# Notebook to create a consensus peak bed file from the aggregated ARC and ATAC peak sets

In [1]:
# shell escape: print the system date/time when this cell runs (start of the notebook)
!date

Mon Jul 31 12:01:39 EDT 2023


#### import libraries

In [2]:
from pandas import Categorical, concat, read_csv
from pybedtools import BedTool

#### set notebook variables

In [3]:
# naming
# project label; used below as the prefix of the consensus output file name
proj_name = 'aging_phase2'

# directories
# root directory that every input and output path in this notebook is built from
wrk_dir = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase2'

# in files
# peak BED files for the aggregated ARC and ATAC runs; the ARC file's '#' header
# is also parsed further down for its '# primary_contig=' lines
arc_bed_file = f'{wrk_dir}/src_data/arc/aging_phase2_arc_aggr/outs/atac_peaks.bed'
atac_bed_file = f'{wrk_dir}/src_data/atac/aging_phase2_atac_aggr/outs/peaks.bed'

# out files
# destination of the merged (consensus) peak set written at the end of the notebook
consensus_bed_file = f'{wrk_dir}/src_data/{proj_name}_consensus_atac_peaks.bed'

# variables
# when True, cells additionally display the head of their intermediate tables
DEBUG = True
# passed as `d` to the BedTool merge call below; presumably the maximum gap (bp)
# between peaks that are still merged into one interval -- confirm against bedtools docs
max_bp_dist = 2

### load the ARC peaks

In [4]:
# Load the ARC peak BED into a three-column frame; lines starting with '#'
# (the file header) are skipped via `comment`.
# NOTE: the delimiter must be a raw string -- '\s' in a plain string literal is
# an invalid escape sequence and triggers a Deprecation/SyntaxWarning.
arc_df = read_csv(arc_bed_file, comment='#', delimiter=r'\s+', header=None)
arc_df.columns = ['chrom', 'start', 'end']
print(f'arc peaks shape {arc_df.shape}')
# look at the lengths (end - start per peak)
arc_lengths = arc_df.end - arc_df.start
display(arc_lengths.describe())

if DEBUG:
    display(arc_df.head())

arc peaks shape (134600, 3)


count    134600.000000
mean        851.666263
std          87.127190
min         114.000000
25%         829.000000
50%         873.000000
75%         902.000000
max        1672.000000
dtype: float64

Unnamed: 0,chrom,start,end
0,chr1,9771,10662
1,chr1,180569,181343
2,chr1,191022,191921
3,chr1,628945,629664
4,chr1,629696,630272


### load the ATAC peaks

In [5]:
# Load the ATAC peak BED into a three-column frame; lines starting with '#'
# (the file header) are skipped via `comment`.
# NOTE: the delimiter must be a raw string -- '\s' in a plain string literal is
# an invalid escape sequence and triggers a Deprecation/SyntaxWarning.
atac_df = read_csv(atac_bed_file, comment='#', delimiter=r'\s+', header=None)
atac_df.columns = ['chrom', 'start', 'end']
# label fixed: this cell reports the ATAC frame, not the ARC one
print(f'atac peaks shape {atac_df.shape}')
# look at the lengths (end - start per peak)
atac_lengths = atac_df.end - atac_df.start
display(atac_lengths.describe())

if DEBUG:
    display(atac_df.head())

arc peaks shape (214269, 3)


count    214269.000000
mean        837.433497
std          95.600131
min         126.000000
25%         811.000000
50%         865.000000
75%         898.000000
max        2108.000000
dtype: float64

Unnamed: 0,chrom,start,end
0,chr1,180641,181334
1,chr1,191027,191926
2,chr1,628886,629684
3,chr1,631954,632845
4,chr1,633587,634583


### find and keep the contig sort order; it should match the reference that was used

In [6]:
# Collect the contig names, in file order, from the '# primary_contig=' lines
# of the commented header at the top of the ARC peak file.
contig_tag = '# primary_contig='
contigs = []
with open(arc_bed_file, 'r') as bed_handle:
    for header_line in bed_handle:
        # the first non-comment line ends the header, so stop reading there
        if not header_line.startswith('#'):
            break
        if header_line.startswith(contig_tag):
            contigs.append(header_line.replace(contig_tag, '').strip())
print(contigs)

['chr1', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr2', 'chr20', 'chr21', 'chr22', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chrX', 'chrY', 'KI270728.1', 'KI270727.1', 'GL000009.2', 'GL000194.1', 'GL000205.2', 'GL000195.1', 'GL000219.1', 'KI270734.1', 'GL000213.1', 'GL000218.1', 'KI270731.1', 'KI270721.1', 'KI270726.1', 'KI270711.1', 'KI270713.1']


### concatenate the peak sets

In [7]:
# Stack the ARC and ATAC peaks into one table and order the rows by every
# column in turn (left to right), keeping the original row labels.
peaks_df = (
    concat([arc_df, atac_df])
    .pipe(lambda stacked: stacked.sort_values(by=list(stacked.columns), axis=0))
)
print(f'peaks shape {peaks_df.shape}')

if DEBUG:
    display(peaks_df.head())

peaks shape (348869, 3)


Unnamed: 0,chrom,start,end
134557,GL000194.1,55817,56628
214221,GL000194.1,55831,56657
134558,GL000194.1,58189,58986
214222,GL000194.1,58214,58991
134559,GL000194.1,59547,60495


### merge peaks into consensus
- using bedtools

In [8]:
# Wrap the combined peak table as a BedTool so bedtools operations can be applied.
peaks_bed = BedTool.from_dataframe(peaks_df)
# report the interval count, then the number of fields, one per line
print(peaks_bed.count(), peaks_bed.field_count(), sep='\n')
if DEBUG:
    peaks_preview = peaks_bed.to_dataframe().head()
    display(peaks_preview)

348869
3


Unnamed: 0,chrom,start,end
0,GL000194.1,55817,56628
1,GL000194.1,55831,56657
2,GL000194.1,58189,58986
3,GL000194.1,58214,58991
4,GL000194.1,59547,60495


In [9]:
# Merge the combined peaks into consensus intervals, handing the configured
# `max_bp_dist` to the merge call as its `d` argument.
consensus_peaks_bed = peaks_bed.merge(d=max_bp_dist)
# report the merged interval count, then the number of fields, one per line
print(consensus_peaks_bed.count(), consensus_peaks_bed.field_count(), sep='\n')
if DEBUG:
    consensus_preview = consensus_peaks_bed.to_dataframe().head()
    display(consensus_preview)

221430
3


Unnamed: 0,chrom,start,end
0,GL000194.1,55817,56657
1,GL000194.1,58189,58991
2,GL000194.1,59547,60505
3,GL000194.1,67008,67944
4,GL000194.1,68209,69035


### compute peak interval summary stats

In [10]:
# Convert the merged intervals back to a pandas frame and summarise the
# distribution of interval lengths (end minus start).
consensus_peaks_df = consensus_peaks_bed.to_dataframe()
lengths = consensus_peaks_df['end'] - consensus_peaks_df['start']

length_summary = lengths.describe()
display(length_summary)

count    221430.000000
mean        857.106336
std         102.872170
min         114.000000
25%         825.000000
50%         882.000000
75%         916.000000
max        2327.000000
dtype: float64

In [11]:
# preview the first five consensus peaks before they are re-ordered
consensus_peaks_df.head(n=5)

Unnamed: 0,chrom,start,end
0,GL000194.1,55817,56657
1,GL000194.1,58189,58991
2,GL000194.1,59547,60505
3,GL000194.1,67008,67944
4,GL000194.1,68209,69035


### order the consensus peaks by the reference contig order

In [12]:
# Re-order the consensus peaks so chromosomes follow the order recorded in
# `contigs` (parsed from the ARC file header), then by start position.

# Contigs that are not in `contigs` become NaN categories and silently sort to
# the end of the table; report them so a reference mismatch is not hidden.
unlisted_contigs = sorted(set(consensus_peaks_df.chrom) - set(contigs))
if unlisted_contigs:
    print(f'WARNING: contigs missing from the reference order, sorted last: {unlisted_contigs}')

# the ordered categorical is only a temporary sort key; building it inside the
# chain avoids leaving a helper column on the frame if the cell is interrupted
consensus_peaks_df = (
    consensus_peaks_df
    .assign(chrom_class=Categorical(consensus_peaks_df.chrom,
                                    categories=contigs,
                                    ordered=True))
    .sort_values(by=['chrom_class', 'start'])
    .drop(columns=['chrom_class'])
)
print(f'sorted consensus peaks shape {consensus_peaks_df.shape}')
if DEBUG:
    display(consensus_peaks_df.head())

sorted consensus peaks shape (221430, 3)


Unnamed: 0,chrom,start,end
60,chr1,9771,10662
61,chr1,180569,181343
62,chr1,191022,191926
63,chr1,628886,629684
64,chr1,629696,630272


### save the consensus bed

In [13]:
%%time
consensus_peaks_df.to_csv(consensus_bed_file, index=False, 
                          header=False, sep='\t')

CPU times: user 201 ms, sys: 9.04 ms, total: 210 ms
Wall time: 231 ms


In [14]:
# shell escape: print the system date/time when this cell runs (end of the notebook)
!date

Mon Jul 31 12:01:44 EDT 2023
