Script to extract the FAIRE condition-specific peaks from the consensus_peaks.mLb.clN.boolean.txt file (an output of the nf-core/atacseq pipeline). Change peak_type and min_reps to get the combination you want. Defaults here: peak_type = 'broad', min_reps = 1.


In [2]:
import pandas as pd
import os

# ---- Settings to tune ------------------------------------------------------
# The nfcore/atac pipeline outputs two types of peaks: broad and narrow.
# The value chosen here selects the matching sub-directory in the path below.
peak_type = 'broad'
# Minimum number of replicates of a condition (here male/partheno) in which a
# peak must be flagged True for that condition to count as having the peak.
min_reps = 1

# ---- Paths -----------------------------------------------------------------
# Everything is resolved from the current working directory, i.e. the directory
# holding the scripts; it is expected to contain RNA, FAIRE and integrated.
cwd = os.getcwd()
FAIRE_directory = f'{cwd}/FAIRE/'

# Boolean consensus table for the selected peak type.
consensus_peaks_path = f'{FAIRE_directory}{peak_type}/bwa/merged_library/macs2/{peak_type}_peak/consensus/consensus_peaks.mLb.clN.boolean.txt'
# Load the tab-separated consensus table; anything following a '#' on a line
# is treated as a comment and ignored by the parser.
consensus_peaks = pd.read_csv(
    consensus_peaks_path,
    comment='#',
    sep='\t',
)
# Display the loaded table as the cell output.
consensus_peaks

Unnamed: 0,chr,start,end,interval_id,num_peaks,num_samples,Male_REP1.mLb.clN.bool,Male_REP2.mLb.clN.bool,Partheno_REP1.mLb.clN.bool,Partheno_REP2.mLb.clN.bool,...,Male_REP1.mLb.clN.start,Male_REP2.mLb.clN.start,Partheno_REP1.mLb.clN.start,Partheno_REP2.mLb.clN.start,Partheno_REP3.mLb.clN.start,Male_REP1.mLb.clN.end,Male_REP2.mLb.clN.end,Partheno_REP1.mLb.clN.end,Partheno_REP2.mLb.clN.end,Partheno_REP3.mLb.clN.end
0,scaffold_1,32808,35384,Interval_1,4,4,True,True,True,False,...,32808,32889,33104,,34024,35302,35384,35347,,34270
1,scaffold_1,43510,43769,Interval_2,2,2,True,True,False,False,...,43510,43516,,,,43698,43769,,,
2,scaffold_1,47327,49271,Interval_3,6,4,True,True,True,False,...,47334;48673,47331,47327;48733,,47795,47922;49130,49271,47925;49251,,49228
3,scaffold_1,92727,93096,Interval_4,2,2,False,True,True,False,...,,92806,92727,,,,93096,93022,,
4,scaffold_1,126129,126676,Interval_5,3,3,True,True,False,False,...,126130,126235,,,126129,126635,126676,,,126609
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44246,scaffold_89,2804,2991,Interval_44247,1,1,False,False,True,False,...,,,2804,,,,,2991,,
44247,scaffold_9,86879,87165,Interval_44248,2,2,False,True,True,False,...,,86879,86921,,,,87128,87165,,
44248,scaffold_9,89987,90307,Interval_44249,1,1,False,True,False,False,...,,89987,,,,,90307,,,
44249,scaffold_99,12669,14125,Interval_44250,3,3,True,True,True,False,...,12741,12669,12899,,,13432,13677,14125,,


Fetching the boolean columns for each condition

In [3]:
# Edit the two lists below so they name the per-replicate boolean columns of
# your own table (one entry per replicate).

# First condition: male replicates.
male_bool_columns = [
    'Male_REP1.mLb.clN.bool',
    'Male_REP2.mLb.clN.bool',
]

# Second condition: partheno replicates.
partheno_bool_columns = [
    'Partheno_REP1.mLb.clN.bool',
    'Partheno_REP2.mLb.clN.bool',
    'Partheno_REP3.mLb.clN.bool',
]

# Separating condition specific peaks
To get the **specific** peaks for the males, we keep only the rows that have at least min_reps True values in the Male_REP[sample].mLb.clN.bool columns AND no (~) True value in any of the Partheno_REP[sample].mLb.clN.bool columns.

In [4]:
# Male-specific peaks: enough male replicates flag the interval as True ...
male_specific_enough_male_reps = consensus_peaks[male_bool_columns].sum(axis=1) >= min_reps
# ... and not a single partheno replicate does.
male_specific_no_partheno_rep = ~consensus_peaks[partheno_bool_columns].any(axis=1)

consensus_peaks_male = consensus_peaks[male_specific_enough_male_reps & male_specific_no_partheno_rep]
consensus_peaks_male

Unnamed: 0,chr,start,end,interval_id,num_peaks,num_samples,Male_REP1.mLb.clN.bool,Male_REP2.mLb.clN.bool,Partheno_REP1.mLb.clN.bool,Partheno_REP2.mLb.clN.bool,...,Male_REP1.mLb.clN.start,Male_REP2.mLb.clN.start,Partheno_REP1.mLb.clN.start,Partheno_REP2.mLb.clN.start,Partheno_REP3.mLb.clN.start,Male_REP1.mLb.clN.end,Male_REP2.mLb.clN.end,Partheno_REP1.mLb.clN.end,Partheno_REP2.mLb.clN.end,Partheno_REP3.mLb.clN.end
1,scaffold_1,43510,43769,Interval_2,2,2,True,True,False,False,...,43510,43516,,,,43698,43769,,,
7,scaffold_1,205473,205653,Interval_8,1,1,True,False,False,False,...,205473,,,,,205653,,,,
24,scaffold_1,666042,666280,Interval_25,1,1,False,True,False,False,...,,666042,,,,,666280,,,
36,scaffold_1,856129,856490,Interval_37,2,2,True,True,False,False,...,856129,856200,,,,856490,856409,,,
45,scaffold_1,1054779,1055762,Interval_46,1,1,False,True,False,False,...,,1054779,,,,,1055762,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44206,scaffold_553,3414,3767,Interval_44207,1,1,False,True,False,False,...,,3414,,,,,3767,,,
44230,scaffold_75,21132,21352,Interval_44231,1,1,False,True,False,False,...,,21132,,,,,21352,,,
44237,scaffold_81,300,688,Interval_44238,1,1,False,True,False,False,...,,300,,,,,688,,,
44244,scaffold_88,18958,19913,Interval_44245,2,2,True,True,False,False,...,19225,18958,,,,19454,19913,,,


In [5]:
# Partheno-specific peaks: enough partheno replicates flag the interval as True ...
partheno_specific_enough_partheno_reps = consensus_peaks[partheno_bool_columns].sum(axis=1) >= min_reps
# ... and not a single male replicate does.
partheno_specific_no_male_rep = ~consensus_peaks[male_bool_columns].any(axis=1)

consensus_peaks_partheno = consensus_peaks[partheno_specific_enough_partheno_reps & partheno_specific_no_male_rep]
consensus_peaks_partheno

Unnamed: 0,chr,start,end,interval_id,num_peaks,num_samples,Male_REP1.mLb.clN.bool,Male_REP2.mLb.clN.bool,Partheno_REP1.mLb.clN.bool,Partheno_REP2.mLb.clN.bool,...,Male_REP1.mLb.clN.start,Male_REP2.mLb.clN.start,Partheno_REP1.mLb.clN.start,Partheno_REP2.mLb.clN.start,Partheno_REP3.mLb.clN.start,Male_REP1.mLb.clN.end,Male_REP2.mLb.clN.end,Partheno_REP1.mLb.clN.end,Partheno_REP2.mLb.clN.end,Partheno_REP3.mLb.clN.end
5,scaffold_1,179494,179833,Interval_6,1,1,False,False,False,True,...,,,,179494,,,,,179833,
6,scaffold_1,200955,201912,Interval_7,2,2,False,False,True,True,...,,,200955,201018,,,,201912,201884,
8,scaffold_1,223529,224363,Interval_9,3,3,False,False,True,True,...,,,223609,223529,223700,,,223815,224363,223887
9,scaffold_1,238353,238580,Interval_10,1,1,False,False,False,True,...,,,,238353,,,,,238580,
11,scaffold_1,346999,347489,Interval_12,2,2,False,False,True,False,...,,,346999,,347053,,,347311,,347489
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44236,scaffold_8,119575,119755,Interval_44237,1,1,False,False,False,False,...,,,,,119575,,,,,119755
44242,scaffold_82,1035,1377,Interval_44243,2,2,False,False,True,False,...,,,1060,,1035,,,1377,,1297
44243,scaffold_86,10021,10370,Interval_44244,2,2,False,False,True,False,...,,,10023,,10021,,,10355,,10370
44245,scaffold_89,45,887,Interval_44246,2,2,False,False,True,False,...,,,82,,45,,,887,,804


# Getting the intergroup peaks
To get the **intergroup** peaks between males and partheno, we keep only the rows that have at least min_reps True values in the Male_REP[sample].mLb.clN.bool columns AND at least min_reps True values in the Partheno_REP[sample].mLb.clN.bool columns. This way a peak can be False in one partheno replicate, for example, and still be considered an intergroup peak.

In [6]:
# Count, per interval, how many replicates of each condition flag it as True.
intergroup_male_rep_count = consensus_peaks[male_bool_columns].sum(axis=1)
intergroup_partheno_rep_count = consensus_peaks[partheno_bool_columns].sum(axis=1)

# Intergroup peaks reach the min_reps threshold in both conditions.
consensus_peaks_intergroup = consensus_peaks[
    (intergroup_male_rep_count >= min_reps)
    & (intergroup_partheno_rep_count >= min_reps)
]
consensus_peaks_intergroup

Unnamed: 0,chr,start,end,interval_id,num_peaks,num_samples,Male_REP1.mLb.clN.bool,Male_REP2.mLb.clN.bool,Partheno_REP1.mLb.clN.bool,Partheno_REP2.mLb.clN.bool,...,Male_REP1.mLb.clN.start,Male_REP2.mLb.clN.start,Partheno_REP1.mLb.clN.start,Partheno_REP2.mLb.clN.start,Partheno_REP3.mLb.clN.start,Male_REP1.mLb.clN.end,Male_REP2.mLb.clN.end,Partheno_REP1.mLb.clN.end,Partheno_REP2.mLb.clN.end,Partheno_REP3.mLb.clN.end
0,scaffold_1,32808,35384,Interval_1,4,4,True,True,True,False,...,32808,32889,33104,,34024,35302,35384,35347,,34270
2,scaffold_1,47327,49271,Interval_3,6,4,True,True,True,False,...,47334;48673,47331,47327;48733,,47795,47922;49130,49271,47925;49251,,49228
3,scaffold_1,92727,93096,Interval_4,2,2,False,True,True,False,...,,92806,92727,,,,93096,93022,,
4,scaffold_1,126129,126676,Interval_5,3,3,True,True,False,False,...,126130,126235,,,126129,126635,126676,,,126609
10,scaffold_1,280480,281073,Interval_11,4,4,False,True,True,True,...,,280493,280480,280524,280507,,281073,280711,280932,280687
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44240,scaffold_81,6562,7497,Interval_44241,3,3,True,True,True,False,...,6737,6562,6654,,,7221,7497,7218,,
44241,scaffold_81,13541,14019,Interval_44242,3,3,True,True,True,False,...,13541,13740,13797,,,13892,14019,13974,,
44247,scaffold_9,86879,87165,Interval_44248,2,2,False,True,True,False,...,,86879,86921,,,,87128,87165,,
44249,scaffold_99,12669,14125,Interval_44250,3,3,True,True,True,False,...,12741,12669,12899,,,13432,13677,14125,,


# Exporting

In [28]:
# Pair every peak table with its destination file; the file name records the
# peak type, the peak group and the min_reps threshold used.
export_targets = [
    (consensus_peaks_male, f'{FAIRE_directory}DAR_{peak_type}_male_minreps{min_reps}.bed'),
    (consensus_peaks_partheno, f'{FAIRE_directory}DAR_{peak_type}_partheno_minreps{min_reps}.bed'),
    (consensus_peaks_intergroup, f'{FAIRE_directory}DAR_{peak_type}_intergroup_minreps{min_reps}.bed'),
]

# Write each table tab-separated, without header or index, with missing
# values spelled out as 'NA'.
for peak_table, bed_path in export_targets:
    peak_table.to_csv(bed_path, sep='\t', index=False, na_rep='NA', header=False)