In [1]:
import numpy as np
import pandas as pd
from os import environ

In [None]:
#############################################################
### Fill in this information before running this section ####

#Switches: set one to True to run the matching "if run_..." step further down in this cell
run_hintatac_fixbeds = False
run_hintatac_fixmpbsbeds = False
run_hintatac_split_footprints = False
run_hintatac_split_mpbs = False

#Set number of acores
acores = 9

#Suffix shared by the per-acore output directories. It is built from acores (like
#Mfuzz_acorebed_pattern below) so the directory names cannot drift away from the
#cluster count when acores is changed.
acoresplit_suffix = '_' + str(acores) + 'acoresplit/'

#Footprint bed files: input directory, per-acore output directory, and the file
#name suffixes before/after the trailing-tab fix
hintatac_dir = 'hintatac/'
hintatac_split_dir = 'hintatac' + acoresplit_suffix
hintatac_footprint_pattern = '_hintatac_all.bed'
hintatac_footprint_fixedpattern = '_hintatac_all_fixed.bed'

#mpbs bed files: same layout as the footprint settings above
match_dir = 'match_all/'
match_split_dir = 'match_all' + acoresplit_suffix
match_pattern = '_hintatac_all_mpbs.bed'
match_fixedpattern = '_hintatac_all_fixed_mpbs.bed'

#Stage labels; every input file name starts with 'S' + label
stage_array = ['13', '14', '15', '17', '18', '19', '19plus', '20', '21', '22', '23', '24', '25', '26', '27']
Sstage_array = ['S' + i for i in stage_array]

#The '*_*' placeholder is replaced with the acore number by the split steps below
Mfuzz_acorebed_pattern = 'Mfuzz_' + str(acores) + 'acores_acore*_*_peaks.bed'

#############################################################
#############################################################

#HINT-ATAC generates a trailing tab for all its files, which bedtools does not like
#Run these to fix that
#The tab is stripped in Python instead of with sed 's/\t$//': \t inside a sed regex
#is a GNU extension, and the unquoted shell interpolation broke on unusual paths.
#A missing input file now raises FileNotFoundError instead of leaving an empty
#"fixed" file behind.
if run_hintatac_fixbeds:
    hintatac_footprint_list = [hintatac_dir + stage + hintatac_footprint_pattern for stage in Sstage_array]

    for footprint_file in hintatac_footprint_list:
        fixed_file = footprint_file.replace(hintatac_footprint_pattern, hintatac_footprint_fixedpattern)

        #double check to make sure you don't erase the footprint file!
        if fixed_file == footprint_file:
            raise ValueError("Fixed file and footprint file are identically named!")

        #Binary mode copies every byte unchanged except the removed tab
        with open(footprint_file, 'rb') as raw_bed, open(fixed_file, 'wb') as fixed_bed:
            for line in raw_bed:
                #set the newline (if any) aside so only the line content is inspected
                line_end = b'\n' if line.endswith(b'\n') else b''
                content = line[:len(line) - len(line_end)]
                #drop exactly one trailing tab, the same as sed 's/\t$//'
                if content.endswith(b'\t'):
                    content = content[:-1]
                fixed_bed.write(content + line_end)

#HINT-ATAC generates a trailing tab for all its files, which bedtools does not like
#Run these to fix that
#The tab is stripped in Python instead of with sed 's/\t$//': \t inside a sed regex
#is a GNU extension, and the unquoted shell interpolation broke on unusual paths.
#A missing input file now raises FileNotFoundError instead of leaving an empty
#"fixed" file behind.
if run_hintatac_fixmpbsbeds:
    match_list = [match_dir + stage + match_pattern for stage in Sstage_array]

    for match_file in match_list:
        fixed_file = match_file.replace(match_pattern, match_fixedpattern)

        #double check to make sure you don't erase the match file!
        if fixed_file == match_file:
            raise ValueError("Fixed file and match file are identically named!")

        #Binary mode copies every byte unchanged except the removed tab
        with open(match_file, 'rb') as raw_bed, open(fixed_file, 'wb') as fixed_bed:
            for line in raw_bed:
                #set the newline (if any) aside so only the line content is inspected
                line_end = b'\n' if line.endswith(b'\n') else b''
                content = line[:len(line) - len(line_end)]
                #drop exactly one trailing tab, the same as sed 's/\t$//'
                if content.endswith(b'\t'):
                    content = content[:-1]
                fixed_bed.write(content + line_end)

if run_hintatac_split_footprints:
    #Generate string of all hintatac footprint bed file names
    hintatac_footprint_list = [hintatac_dir + stage + hintatac_footprint_fixedpattern for stage in Sstage_array]
    hintatac_footprint_string = ' '.join(hintatac_footprint_list)
    
    !printf "Collecting overlaps for all footprint files listed: {hintatac_footprint_string}\n"
    
    #Iterate through number of acores
    for acore_num in np.arange(1, acores + 1):
        
        #Generate names for files
        Mfuzz_acorebed = Mfuzz_acorebed_pattern.replace('*_*', str(acore_num))
        splitbed_filename = hintatac_split_dir + 'Sall_hintatac_all_acore' + str(acore_num) + '.bed'
        
        #Bedtools intersect each individual acore bed file vs. all footprint addresses
        !bedtools intersect -a {Mfuzz_acorebed} -b {hintatac_footprint_string} -wb > {splitbed_filename}
        
        wb_filename = splitbed_filename.replace('.bed', '_wb.bed')
        !printf "getting jus the columns we want from "{splitbed_filename}" as "{wb_filename}" using pandas\n"
        temp_df = pd.DataFrame(pd.read_csv(splitbed_filename, header = None, sep = '\t'))
        out_df = temp_df[[7, 8, 9, 10, 11, 12]]
        out_df.to_csv(wb_filename, header = None, index = None, sep = '\t')
        
        #Use sort to remove non-unique coordinates, output to a new file
        splitbed_uniquename = wb_filename.replace('_wb.bed', '_unique.bed')
        !printf "removing lines from "{wb_filename}" with non-unique coordinates and output to "{splitbed_uniquename}"\n"
        !sort -k1,1 -k2,2n -k3,3n -u {wb_filename} > {splitbed_uniquename}

if run_hintatac_split_mpbs:
    #Generate string of all hintatac footprint bed file names
    match_list = [match_dir + stage + match_fixedpattern for stage in Sstage_array]
    match_string = ' '.join(match_list)
    
    !printf "Collecting overlaps for all mpbs files listed: {match_string}\n"
    
    #Iterate through number of acores
    for acore_num in np.arange(1, acores + 1):
        
        #Generate names for files
        Mfuzz_acorebed = Mfuzz_acorebed_pattern.replace('*_*', str(acore_num))
        splitbed_filename = match_split_dir + 'Sall_hintatac_all_acore' + str(acore_num) + '_mpbs.bed'
        
        #Bedtools intersect each individual acore bed file vs. all footprint addresses
        !bedtools intersect -a {Mfuzz_acorebed} -b {match_string} -wb > {splitbed_filename}
        
        wb_filename = splitbed_filename.replace('_mpbs.bed', '_wb_mpbs.bed')
        !printf "getting jus the columns we want from "{splitbed_filename}" as "{wb_filename}" using pandas\n"
        temp_df = pd.DataFrame(pd.read_csv(splitbed_filename, header = None, sep = '\t'))
        out_df = temp_df[[7, 8, 9, 10, 11, 12]]
        out_df.to_csv(wb_filename, header = None, index = None, sep = '\t')
        
        #Use sort to remove non-unique coordinates, output to a new file
        #Also keep any matched lines
        splitbed_uniquename = wb_filename.replace('_wb_mpbs.bed', '_unique_mpbs.bed')
        !printf "removing lines from "{wb_filename}" with non-unique entries and output to "{splitbed_uniquename}"\n"
        !sort -k1,1 -k2,2n -k3,3n -k4,4n -k5,5n -u {wb_filename} > {splitbed_uniquename}

In [None]:
#############################################################
### Fill in this information before running this section ####

#Switch: set to True to run the rgt-motifanalysis enrichment step at the bottom of this cell
run_hintatac_enrichment_mfuzz = False

acores = 9

#Suffix shared by the per-acore directories; built from acores so the directory
#names cannot drift away from the cluster count when acores is changed
acoresplit_suffix = '_' + str(acores) + 'acoresplit/'

#Per-acore footprint beds; the '*' is replaced with the acore number below
hintatac_splitloc = 'hintatac' + acoresplit_suffix
hintatac_splitbed = 'Sall_hintatac_all_acore*_unique.bed'

#Directory passed to --matching-location
match_splitloc = 'match_all' + acoresplit_suffix

#Directory passed to --output-location
enrich_splitloc = 'enrichment_all' + acoresplit_suffix

#############################################################
#############################################################

if run_hintatac_enrichment_mfuzz:
    bed_array = [hintatac_splitloc + hintatac_splitbed.replace('*', str(acore_num)) for acore_num in np.arange(1, acores + 1)]
    bed_string = ' '.join(bed_array)
    !rgt-motifanalysis enrichment --organism=ph5 --matching-location {match_splitloc} --motif-dbs ~/rgtdata/motifs/jaspar_core_nr --output-location {enrich_splitloc} random_regions_smaller.bed {bed_string}

In [None]:
#############################################################
### Fill in this information before running this section ####

#Switches: set one to True to run the matching "if run_..." step further down in this cell
run_hintatac_fixbeds = False
run_hintatac_fixmpbsbeds = False
run_hintatac_split_footprints = False
run_hintatac_split_mpbs = False

#Set number of acores
acores = 9

#Suffix shared by the per-acore output directories. It is built from acores (like
#Mfuzz_acorebed_pattern below) so the directory names cannot drift away from the
#cluster count when acores is changed.
acoresplit_suffix = '_' + str(acores) + 'acoresplit/'

#Footprint bed files: input directory, per-acore output directory, and the file
#name suffixes before/after the trailing-tab fix
hintatac_dir = 'hintatac/'
hintatac_split_dir = 'hintatac_specific' + acoresplit_suffix
hintatac_footprint_pattern = '_hintatac_specific.bed'
hintatac_footprint_fixedpattern = '_hintatac_specific_fixed.bed'

#mpbs bed files: same layout as the footprint settings above
match_dir = 'match_specific/'
match_split_dir = 'match_specific' + acoresplit_suffix
match_pattern = '_hintatac_specific_mpbs.bed'
match_fixedpattern = '_hintatac_specific_fixed_mpbs.bed'

#Stage labels; every input file name starts with 'S' + label
stage_array = ['13', '14', '15', '17', '18', '19', '19plus', '20', '21', '22', '23', '24', '25', '26', '27']
Sstage_array = ['S' + i for i in stage_array]

#The '*_*' placeholder is replaced with the acore number by the split steps below
Mfuzz_acorebed_pattern = 'Mfuzz_' + str(acores) + 'acores_acore*_*_peaks.bed'

#############################################################
#############################################################

#HINT-ATAC generates a trailing tab for all its files, which bedtools does not like
#Run these to fix that
#The tab is stripped in Python instead of with sed 's/\t$//': \t inside a sed regex
#is a GNU extension, and the unquoted shell interpolation broke on unusual paths.
#A missing input file now raises FileNotFoundError instead of leaving an empty
#"fixed" file behind.
if run_hintatac_fixbeds:
    hintatac_footprint_list = [hintatac_dir + stage + hintatac_footprint_pattern for stage in Sstage_array]

    for footprint_file in hintatac_footprint_list:
        fixed_file = footprint_file.replace(hintatac_footprint_pattern, hintatac_footprint_fixedpattern)

        #double check to make sure you don't erase the footprint file!
        if fixed_file == footprint_file:
            raise ValueError("Fixed file and footprint file are identically named!")

        #Binary mode copies every byte unchanged except the removed tab
        with open(footprint_file, 'rb') as raw_bed, open(fixed_file, 'wb') as fixed_bed:
            for line in raw_bed:
                #set the newline (if any) aside so only the line content is inspected
                line_end = b'\n' if line.endswith(b'\n') else b''
                content = line[:len(line) - len(line_end)]
                #drop exactly one trailing tab, the same as sed 's/\t$//'
                if content.endswith(b'\t'):
                    content = content[:-1]
                fixed_bed.write(content + line_end)

#HINT-ATAC generates a trailing tab for all its files, which bedtools does not like
#Run these to fix that
#The tab is stripped in Python instead of with sed 's/\t$//': \t inside a sed regex
#is a GNU extension, and the unquoted shell interpolation broke on unusual paths.
#A missing input file now raises FileNotFoundError instead of leaving an empty
#"fixed" file behind.
if run_hintatac_fixmpbsbeds:
    match_list = [match_dir + stage + match_pattern for stage in Sstage_array]

    for match_file in match_list:
        fixed_file = match_file.replace(match_pattern, match_fixedpattern)

        #double check to make sure you don't erase the match file!
        if fixed_file == match_file:
            raise ValueError("Fixed file and match file are identically named!")

        #Binary mode copies every byte unchanged except the removed tab
        with open(match_file, 'rb') as raw_bed, open(fixed_file, 'wb') as fixed_bed:
            for line in raw_bed:
                #set the newline (if any) aside so only the line content is inspected
                line_end = b'\n' if line.endswith(b'\n') else b''
                content = line[:len(line) - len(line_end)]
                #drop exactly one trailing tab, the same as sed 's/\t$//'
                if content.endswith(b'\t'):
                    content = content[:-1]
                fixed_bed.write(content + line_end)

if run_hintatac_split_footprints:
    #Generate string of all hintatac footprint bed file names
    hintatac_footprint_list = [hintatac_dir + stage + hintatac_footprint_fixedpattern for stage in Sstage_array]
    hintatac_footprint_string = ' '.join(hintatac_footprint_list)
    
    !printf "Collecting overlaps for all footprint files listed: {hintatac_footprint_string}\n"
    
    #Iterate through number of acores
    for acore_num in np.arange(1, acores + 1):
        
        #Generate names for files
        Mfuzz_acorebed = Mfuzz_acorebed_pattern.replace('*_*', str(acore_num))
        splitbed_filename = hintatac_split_dir + 'Sall_hintatac_specific_acore' + str(acore_num) + '.bed'
        
        #Bedtools intersect each individual acore bed file vs. all footprint addresses
        !bedtools intersect -a {Mfuzz_acorebed} -b {hintatac_footprint_string} -wb > {splitbed_filename}
        
        wb_filename = splitbed_filename.replace('.bed', '_wb.bed')
        !printf "getting jus the columns we want from "{splitbed_filename}" as "{wb_filename}" using pandas\n"
        temp_df = pd.DataFrame(pd.read_csv(splitbed_filename, header = None, sep = '\t'))
        out_df = temp_df[[7, 8, 9, 10, 11, 12]]
        out_df.to_csv(wb_filename, header = None, index = None, sep = '\t')
        
        #Use sort to remove non-unique coordinates, output to a new file
        splitbed_uniquename = wb_filename.replace('_wb.bed', '_unique.bed')
        !printf "removing lines from "{wb_filename}" with non-unique coordinates and output to "{splitbed_uniquename}"\n"
        !sort -k1,1 -k2,2n -k3,3n -u {wb_filename} > {splitbed_uniquename}

if run_hintatac_split_mpbs:
    #Generate string of all hintatac footprint bed file names
    match_list = [match_dir + stage + match_fixedpattern for stage in Sstage_array]
    match_string = ' '.join(match_list)
    
    !printf "Collecting overlaps for all mpbs files listed: {match_string}\n"
    
    #Iterate through number of acores
    for acore_num in np.arange(1, acores + 1):
        
        #Generate names for files
        Mfuzz_acorebed = Mfuzz_acorebed_pattern.replace('*_*', str(acore_num))
        splitbed_filename = match_split_dir + 'Sall_hintatac_specific_acore' + str(acore_num) + '_mpbs.bed'
        
        #Bedtools intersect each individual acore bed file vs. all footprint addresses
        !bedtools intersect -a {Mfuzz_acorebed} -b {match_string} -wb > {splitbed_filename}
        
        wb_filename = splitbed_filename.replace('_mpbs.bed', '_wb_mpbs.bed')
        !printf "getting jus the columns we want from "{splitbed_filename}" as "{wb_filename}" using pandas\n"
        temp_df = pd.DataFrame(pd.read_csv(splitbed_filename, header = None, sep = '\t'))
        out_df = temp_df[[7, 8, 9, 10, 11, 12]]
        out_df.to_csv(wb_filename, header = None, index = None, sep = '\t')
        
        #Use sort to remove non-unique coordinates, output to a new file
        #Also keep any matched lines
        splitbed_uniquename = wb_filename.replace('_wb_mpbs.bed', '_unique_mpbs.bed')
        !printf "removing lines from "{wb_filename}" with non-unique entries and output to "{splitbed_uniquename}"\n"
        !sort -k1,1 -k2,2n -k3,3n -k4,4n -k5,5n -u {wb_filename} > {splitbed_uniquename}

In [None]:
#############################################################
### Fill in this information before running this section ####

#Switch: set to True to run the rgt-motifanalysis enrichment step at the bottom of this cell
run_hintatac_enrichment_mfuzz = True

acores = 9

#Suffix shared by the per-acore directories; built from acores so the directory
#names cannot drift away from the cluster count when acores is changed
acoresplit_suffix = '_' + str(acores) + 'acoresplit/'

#Per-acore footprint beds; the '*' is replaced with the acore number below
hintatac_splitloc = 'hintatac_specific' + acoresplit_suffix
hintatac_splitbed = 'Sall_hintatac_specific_acore*_unique.bed'

#Directory passed to --matching-location
match_splitloc = 'match_specific' + acoresplit_suffix

#Directory passed to --output-location
enrich_splitloc = 'enrichment_specific' + acoresplit_suffix

#############################################################
#############################################################

if run_hintatac_enrichment_mfuzz:
    bed_array = [hintatac_splitloc + hintatac_splitbed.replace('*', str(acore_num)) for acore_num in np.arange(1, acores + 1)]
    bed_string = ' '.join(bed_array)
    !rgt-motifanalysis enrichment --organism=ph5 --matching-location {match_splitloc} --motif-dbs ~/rgtdata/motifs/jaspar_core_nr --output-location {enrich_splitloc} random_regions_smaller.bed {bed_string}