# Setup
0. Added the following lines to `data.config.user` and moved all the files to the right place

```
[ph5]
genome: ph5/phaw_5.0.fa
chromosome_sizes: ph5/phaw_5.0.chrom.sizes
gene_regions: ph5/mikado.loci.sorted.transcripts.bed
annotation: ph5/mikado.loci.sorted.gtf
gene_alias: ph5/DAS.mikado.loci.fasta.gene_trans_map
```

1. downloaded all non-redundant CORE JASPAR sequences as .jaspar files from <http://jaspar.genereg.net/downloads/>
2. Used the R script `prepare_for_rgt.Rmd` to convert all JASPAR sequences to the format RGT likes
3. `prepare_for_rgt.Rmd` also generates a nice .mtf file for us to use
4. upload to correct directory in `~/rgtdata/motifs`
5. move the .mtf file to the right place

```
mv jaspar_core_nr.mtf ../
```

6. go back to `~/rgtdata` and run this:

```
python setupLogoData.py --all
```

7. modify `data.config.user` to add a line specifying `jaspar_core_nr` as the default database
```
repositories: jaspar_core_nr
```

In [None]:
#############################################################
### Fill in this information before running this section ####

# Step switches: set one to True to execute the matching `if` block below.
# -- footprinting and signal tracks
run_hintatac_footprint = False
run_hintatac_tracks_rerun = False
run_hintatac_wigtobigwig = False
# -- motif matching
run_hintatac_motifmatch = False
run_hintatac_motifmatch_random = False
run_hintatac_motifmatch_manualrandom = False
run_hintatac_motifmatch_manualrandom_smaller = False
# -- motif enrichment
run_hintatac_enrichment = False
run_hintatac_enrichment_smaller = False
run_hintatac_enrichment_smaller_all = False

# Stage labels; the steps below prepend 'S' to each when building file names
stage_array = '13 14 15 17 18 19 19plus 20 21 22 23 24 25 26 27'.split()

# Per-stage file-name templates: '*' is replaced by the 'S'-prefixed stage label
bam_pattern = '*AB_bothruns_q10.Genrich_resorted.bam'
narrowPeak_pattern = '*A_bothruns_q10.Genrich_sorted.ATAC.q005.narrowPeak'

# BED file used for the "allpeaks" runs of every stage
allpeaks_bed = 'OmniATAC_bothruns_q005_allpeaks.igv_new.bed'

# Output directories for the stage-specific analyses
match_specific = 'match_specific'
enrichment_specific = 'enrichment_specific'
enrichment_specific_small = 'enrichment_specific_small'

# Output directories for the allpeaks analyses
match_all = 'match_all'
enrichment_all = 'enrichment_all'
enrichment_all_small = 'enrichment_all_small'

# Manually generated random-region BED files used as backgrounds
manualrandom_specific = 'random_regions_forspecific.bed'
manualrandom_all = 'random_regions_forall.bed'

#############################################################
#############################################################

if run_hintatac_footprint:
    for stage in stage_array:
        stage_prefix = 'S' + stage 
        stage_bam = bam_pattern.replace('*', stage_prefix)
        stage_abbrev = stage_prefix + '_hintatac'
        stage_narrowPeak = 'Genrich_sorted_bothruns/q005_narrowPeak/' + narrowPeak_pattern.replace('*', stage_prefix)
        
        !echo '### performing stage specific analysis of '{stage_prefix}'\n'
        stage_specific = stage_abbrev + '_specific'
        !rgt-hint footprinting --atac-seq --paired-end --organism=ph5 --output-location=./hintatac --output-prefix={stage_specific} {stage_bam} {stage_narrowPeak}
        
        stage_specific_bc = stage_specific + '_bc'
        !rgt-hint tracks --bc --bigWig --organism=ph5 {stage_bam} {stage_narrowPeak} --output-prefix={stage_specific_bc}
        
        !echo '### performing allpeaks analysis of '{stage_prefix}'\n'
        stage_all = stage_abbrev + '_all'
        !rgt-hint footprinting --atac-seq --paired-end --organism=ph5 --output-location=./hintatac --output-prefix={stage_all} {stage_bam} {allpeaks_bed}
        
        stage_all_bc = stage_all + '_bc'
        !rgt-hint tracks --bc --bigWig --organism=ph5 {stage_bam} {allpeaks_bed} --output-prefix={stage_all_bc}

if run_hintatac_tracks_rerun:
    #used vim /global/home/users/dasun/.local/lib/python3.7/site-packages/rgt/HINT/signalProcessing.py
    #to comment out the for loop beginning with "for read in self.bam.fetch"
    #as per this GitHub thread: https://github.com/CostaLab/reg-gen/issues/162
    #also adjusted so that if p1, p2, p1_w, or p2_w are <= 0, sets area to 0
    for stage in stage_array:
        stage_prefix = 'S' + stage 
        stage_bam = bam_pattern.replace('*', stage_prefix)
        stage_abbrev = stage_prefix + '_hintatac'
        stage_narrowPeak = 'Genrich_sorted_bothruns/q005_narrowPeak/' + narrowPeak_pattern.replace('*', stage_prefix)
        
        !echo '### performing stage specific analysis of '{stage_prefix}'\n'
        stage_specific = stage_abbrev + '_specific'
        stage_specific_bc = stage_specific + '_bc'
        !rgt-hint tracks --bc --organism=ph5 {stage_bam} {stage_narrowPeak} --output-prefix={stage_specific_bc}
        
        !echo '### performing allpeaks analysis of '{stage_prefix}'\n'
        stage_all = stage_abbrev + '_all'
        stage_all_bc = stage_all + '_bc'
        !rgt-hint tracks --bc --organism=ph5 {stage_bam} {allpeaks_bed} --output-prefix={stage_all_bc}

if run_hintatac_wigtobigwig:
    #had to run this separately because the current install via HINT-ATAC for wigToBigWig is somehow broken
    #downloaded wigToBigWig from UCSC using wget http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/wigToBigWig
    #chmod +x wigToBigWig
    for stage in stage_array:
        all_name = 'S' + stage + '_hintatac_all_bc.wig'
        all_bw_name = all_name.replace('.wig', '.bw')
        !/global/scratch/dasun/wigToBigWig {all_name} phaw_5.0.chrom.sizes {all_bw_name}
        
        specific_name = 'S' + stage + '_hintatac_specific_bc.wig'
        specific_bw_name = specific_name.replace('.wig', '.bw')
        !/global/scratch/dasun/wigToBigWig {specific_name} phaw_5.0.chrom.sizes {specific_bw_name}
        
if run_hintatac_motifmatch:
    specific_bed_array = ['./hintatac/S' + stage + '_hintatac_specific.bed' for stage in stage_array]
    specific_bed_string = ' '.join(specific_bed_array)
    !rgt-motifanalysis matching --motif-dbs ~/rgtdata/motifs/jaspar_core_nr --organism=ph5 --input-files {specific_bed_string}
    
    all_bed_array = ['./hintatac/S' + stage + '_hintatac_all.bed' for stage in stage_array]
    all_bed_string = ' '.join(all_bed_array)
    !rgt-motifanalysis matching --motif-dbs ~/rgtdata/motifs/jaspar_core_nr --organism=ph5 --input-files {all_bed_string}

if run_hintatac_motifmatch_random:
    specific_bed_array = ['./hintatac/S' + stage + '_hintatac_specific.bed' for stage in stage_array]
    specific_bed_string = ' '.join(specific_bed_array)
    !rgt-motifanalysis matching --motif-dbs ~/rgtdata/motifs/jaspar_core_nr --organism=ph5 --rand-proportion 5 --output-location {match_specific} --input-files {specific_bed_string} 
    
    all_bed_array = ['./hintatac/S' + stage + '_hintatac_all.bed' for stage in stage_array]
    all_bed_string = ' '.join(all_bed_array)
    !rgt-motifanalysis matching --motif-dbs ~/rgtdata/motifs/jaspar_core_nr --organism=ph5 --rand-proportion 5 --output-location {match_all} --input-files {all_bed_string} 

if run_hintatac_motifmatch_manualrandom:
    !echo '### running motif matching on random peaks from '{manualrandom_specific}'\n'
    !rgt-motifanalysis matching --motif-dbs ~/rgtdata/motifs/jaspar_core_nr --organism=ph5 --output-location {match_specific} --input-files {manualrandom_specific}
    !echo '### running motif matching on random peaks from '{manualrandom_all}'\n'
    !rgt-motifanalysis matching --motif-dbs ~/rgtdata/motifs/jaspar_core_nr --organism=ph5 --output-location {match_all} --input-files {manualrandom_all}
    
if run_hintatac_enrichment:
    specific_bed_array = ['./hintatac/S' + stage + '_hintatac_specific.bed' for stage in stage_array]
    specific_bed_string = ' '.join(specific_bed_array)
    !rgt-motifanalysis enrichment --organism=ph5 --matching-location {match_specific} --motif-dbs ~/rgtdata/motifs/jaspar_core_nr --output-location {enrichment_specific} random_regions_forspecific.bed {specific_bed_string}
    
    all_bed_array = ['./hintatac/S' + stage + '_hintatac_all.bed' for stage in stage_array]
    all_bed_string = ' '.join(all_bed_array)
    !rgt-motifanalysis enrichment --organism=ph5 --matching-location {match_all} --motif-dbs ~/rgtdata/motifs/jaspar_core_nr --output-location {enrichment_all} random_regions_forspecific.bed {all_bed_string}
    
if run_hintatac_motifmatch_manualrandom_smaller:
    !echo '### running motif matching on random peaks from random_regions_smaller.bed\n'
    !rgt-motifanalysis matching --motif-dbs ~/rgtdata/motifs/jaspar_core_nr --organism=ph5 --output-location {match_specific} --input-files random_regions_smaller.bed
    !cp match_specific/random_regions_smaller_mbps.bed match_all/random_regions_smaller_mbps.bed
    
if run_hintatac_enrichment_smaller:
    specific_bed_array = ['./hintatac/S' + stage + '_hintatac_specific.bed' for stage in stage_array]
    specific_bed_string = ' '.join(specific_bed_array)
    !rgt-motifanalysis enrichment --organism=ph5 --matching-location {match_specific} --motif-dbs ~/rgtdata/motifs/jaspar_core_nr --output-location {enrichment_specific_small} random_regions_smaller.bed {specific_bed_string}

if run_hintatac_enrichment_smaller_all:
    all_bed_array = ['./hintatac/S' + stage + '_hintatac_all.bed' for stage in stage_array]
    all_bed_string = ' '.join(all_bed_array)
    !rgt-motifanalysis enrichment --organism=ph5 --matching-location {match_all} --motif-dbs ~/rgtdata/motifs/jaspar_core_nr --output-location {enrichment_all_small} random_regions_smaller.bed {all_bed_string}

In [None]:
#############################################################
### Fill in this information before running this section ####

# Step switches: set one to True to execute the matching `if` block below
run_hintatac_split_footprints = False
run_hintatac_split_mbps = False

# Number of acores; acore files numbered 1..acores are processed
acores = 9

# Footprint BEDs: input directory, output directory for the per-acore split, file suffix
# NOTE: both *_9acoresplit directory names embed "9" literally; keep them in sync with `acores`
hintatac_dir = 'hintatac/'
hintatac_split_dir = 'hintatac_9acoresplit/'
hintatac_footprint_pattern = '_hintatac_all.bed'

# Motif-match BEDs: input directory, output directory for the per-acore split, file suffix
match_dir = 'match_all/'
match_split_dir = 'match_all_9acoresplit/'
match_pattern = '_hintatac_all_mpbs.bed'

# Stage labels and their 'S'-prefixed form used in file names
stage_array = '13 14 15 17 18 19 19plus 20 21 22 23 24 25 26 27'.split()
Sstage_array = ['S{}'.format(label) for label in stage_array]

# Per-acore peak BED template; the steps below replace '*_*' with the acore number
Mfuzz_acorebed_pattern = 'Mfuzz_{}acores_acore*_*_peaks.bed'.format(acores)

#############################################################
#############################################################

if run_hintatac_split_footprints:
    #Generate string of all hintatac footprint bed file names
    hintatac_footprint_list = [hintatac_dir + stage + hintatac_footprint_pattern for stage in Sstage_array]
    hintatac_footprint_string = ' '.join(hintatac_footprint_list)
    
    !printf "Collecting overlaps for all footprint files listed: "{hintatac_footprint_string}"\n"
    
    #Iterate through number of acores
    for acore_num in np.arange(1, acores + 1):
        
        #Generate names for files
        Mfuzz_acorebed = Mfuzz_acorebed_pattern.replace('*_*', str(acore_num))
        splitbed_filename = hintatac_split_dir + 'Sall_hintatac_all_acore' + str(acore_num) + '.bed'
        
        #Bedtools intersect each individual acore bed file vs. all footprint addresses
        !bedtools intersect -a {Mfuzz_acorebed} -b {hintatac_footprint_string} -sorted -wb > {splitbed_filename}
        
        #Use sort to remove non-unique coordinates, output to a new file
        splitbed_uniquename = splitbed_filename.replace('.bed', '.unique.bed')
        !printf "removing lines from "{splitbed_filename}" with non-unique coordinates and output to "{splitbed_uniquename}
        !sort -k1,1 -k2,2n -k3,3n -u {splitbed_filename} > {splitbed_uniquename}

if run_hintatac_split_mbps:
    #Generate string of all hintatac footprint bed file names
    match_list = [match_dir + stage + match_pattern for stage in Sstage_array]
    match_string = ' '.join(match_list)
    
    !printf "Collecting overlaps for all mbps files listed: "{match_string}"\n"
    
    #Iterate through number of acores
    for acore_num in np.arange(1, acores + 1):
        
        #Generate names for files
        Mfuzz_acorebed = Mfuzz_acorebed_pattern.replace('*_*', str(acore_num))
        splitbed_filename = match_split_dir + 'Sall_hintatac_all_acore' + str(acore_num) + '_mbps.bed'
        
        #Bedtools intersect each individual acore bed file vs. all footprint addresses
        !bedtools intersect -a {Mfuzz_acorebed} -b {match_string} -sorted -wb > {splitbed_filename}
        
        #Use sort to remove non-unique coordinates, output to a new file
        #Also keep any matched lines
        splitbed_uniquename = splitbed_filename.replace('.bed', '.unique.bed')
        !printf "removing lines from "{splitbed_filename}" with non-unique entries and output to "{splitbed_uniquename}
        !sort -k1,1 -k2,2n -k3,3n, -k4,4n, -k5,5n -u {splitbed_filename} > {splitbed_uniquename}