In [1]:
#To convert this to a Python script, use:
#jupyter nbconvert --to script Omni-ATAC_all-analysis.ipynb

#To run on a SLURM scheduler or other shared computing system, you may need to load modules.
#For Savio, utilize the script ATAC-Seq_phaw5.0.sh

#To upload this to SLURM, along with other necessary files, use
#scp Omni-ATAC_all-analysis.py Omni-ATAC_all-analysis.sh dasun@dtn.brc.berkeley.edu:/global/scratch/dasun/Omni-ATAC-Seq

In [2]:
#Configuration of Python script
import subprocess
import os
import os.path
import fnmatch
import pandas as pd
import numpy as np
import json

## Read concatenation and adapter trimming per barcode

This section pipes each run's per-barcode reads through qcat, merges the resulting per-barcode FASTQ files across all runs, and runs NanoPlot on each merged file.

In [None]:
#############################################################
### Fill in this information before running this section ####

#Decide if you want to run this section
run_read_concatenation = False
run_read_merging = False
run_NanoPlot = False

outer_folder_list = ['run1_basecalled_barcodes/', 'run2_basecalled_barcodes/', 'run3_basecalled_barcodes/']
inner_folder_list = ['barcode01/', 'barcode02/', 'barcode03/', 'barcode04/']
barcode_list = ['barcode01', 'barcode02', 'barcode03', 'barcode04']

#############################################################
#############################################################

#checks if you want to trim your reads
if run_read_concatenation == True:
    for folder in outer_folder_list:
        for subfolder in inner_folder_list:
            current = './' + folder + subfolder
            current_fastq = current + '*.fastq'
            output = './' + folder + subfolder.replace('/', '_qcat/')
            !cat $current_fastq | qcat -b $output

if run_read_merging == True:
    #generate new folder list with qcat suffix
    new_inner_folder_list = [i.replace('/', '_qcat/') for i in inner_folder_list]
    
    #create empty dict to put all files for given barcode in
    barcode_dict = {barcode:[] for barcode in barcode_list}
    
    #iterate through everything
    for folder in outer_folder_list:
        for subfolder in new_inner_folder_list:
            for barcode in barcode_list:
                
                #simulate file name
                simulated_directory = folder + subfolder
                simulated_file = simulated_directory + barcode + '.fastq'
                print(simulated_file)
                
                #check if file exists
                if os.path.exists(simulated_file):
                    !printf 'Found file '$simulated_file' in '$simulated_directory'\n'
                    !printf 'Adding '$simulated_file' to dictionary \n'
                    barcode_dict[barcode] = barcode_dict[barcode] + [simulated_file]
                    print(barcode_dict)
                    with open('test.txt', 'a+') as test:
                        test.write(json.dumps(barcode_dict))
                        test.write(" \n")
    
    for barcode in barcode_dict:
        string = ' '.join(barcode_dict[barcode])
        output = barcode + '_allruns.fastq'
        !printf 'Joining all of '$barcode' files together into '$output' \n'
        !cat $string > $output

if run_NanoPlot == True:
    for barcode in barcode_list:
        file = barcode + '_allruns.fastq'
        folder = barcode + '_Nanoplot'
        !NanoPlot --verbose --fastq $file -o $folder -p $barcode

## Map reads and generate StringTie2 transcriptome 

This section aligns the combined reads (`allbarcodes_allruns.fastq`) to the `phaw_5.0.fa` genome with minimap2 in spliced-alignment mode (`-ax splice`).

In [None]:
#############################################################
### Fill in this information before running this section ####

#Decide if you want to run this section
run_minimap2 = False

#############################################################
#############################################################

#checks if you want to trim your reads
if run_minimap2 == True:
    !minimap2 -ax splice phaw_5.0.fa allbarcodes_allruns.fastq > allbarcodes_allruns.sam


## Generate error-corrected reads using Canu



In [None]:
#############################################################
### Fill in this information before running this section ####

#Decide if you want to run this section
run_canu_correction = False
run_canu_trimming = False
run_canu_assembly = False

#############################################################
#############################################################

#checks if you want to trim your reads
if run_canu_correction == True:
    !canu -correct -p ab_ar_corr -d allbarcodes_allruns_correction genomeSize=30k minReadLength=150 minOverlapLength=150 useGrid=False -nanopore-raw allbarcodes_allruns.fastq

if run_canu_trimming == True:
    !canu -trim -p ab_ar_trim -d allbarcodes_allruns_trim genomeSize=30k minReadLength=150 minOverlapLength=150 useGrid=False -nanopore -raw allbarcodes_allruns.fastq
    
if run_canu_assembly == True:
    !canu -p ab_ar_asmb -d allbarcodes_allruns_assembly genomeSize=30k minReadLength=150 minOverlapLength=150 useGrid=False -nanopore-raw allbarcodes_allruns.fastq

In [None]:
#############################################################
### Fill in this information before running this section ####

#Decide if you want to run this section
run_hisat2 = True
run_clean_hisat = True
run_stringtie2_mix = True

genome_file = 'phaw_5.0.fa'
index_base = genome_file.replace('.fa', '')

#############################################################
#############################################################

if run_hisat2 == True:
    r1 = 'S13A1_S84_R1_001_rd1_1_all_1.fq.gz_ext_all_reads.normalized_K25_C50_pctSD10000.fq'
    r2 = 'S13A1_S84_R2_001_rd1_2_all_2.fq.gz_ext_all_reads.normalized_K25_C50_pctSD10000.fq'
    Sfile = 'Phaw_RNA_hisat2.dta.sam'
    !hisat2 --dta -x $index_base -1 $r1 -2 $r2 -S $Sfile

if run_clean_hisat == True:
    start_sam = 'Phaw_RNA_hisat2.dta.sam'
    cleaned_bam = start_sam.replace('.sam', '.cleaned.bam')
    !samtools view -b -F 4 $start_sam > $cleaned_bam
    sorted_bam = cleaned_bam.replace('.cleaned.', '.sorted.')
    sorted_pref = cleaned_bam.replace('.cleaned.bam', '')
    !samtools sort -T $sorted_pref -o $sorted_bam $cleaned_bam
    
if run_stringtie2_mix == True:
    hisat2 = 'Phaw_RNA_hisat2.dta.sorted.bam'
    minimap2 = 'allbarcodes_allruns.sorted.bam'
    output = 'Par-haw_Nanopore_stringtie2_mix.gtf'
    !stringtie $hisat2 $minimap2 --mix -o $output