# RNA-Seq Analysis

In [None]:
#To convert this to a Python script, use:
#jupyter nbconvert --to script Omni-ATAC_all-analysis.ipynb

#To run on a SLURM scheduler or other shared computing system, you may need to load modules.
#For Savio, utilize the script ATAC-Seq_phaw5.0.sh

#To upload this to SLURM, along with other necessary files, use
#scp Omni-ATAC_all-analysis.py Omni-ATAC_all-analysis.sh dasun@dtn.brc.berkeley.edu:/global/scratch/dasun/Omni-ATAC-Seq

In [1]:
#Configuration of Python script
import subprocess
import os
import os.path
import fnmatch
import pandas as pd
import numpy as np

## Read Trimming with trim-galore

This section of the script trims the Illumina adapters off the sequences.

In [None]:
#############################################################
### Fill in this information before running this section ####

#Decide if you want to run this section
run_read_trimming = False

#Write a base pattern for your raw fastq file names
#This pattern should occur in all of your file names
base_pattern = 'S*_R1_001.fastq.gz'

#############################################################
#############################################################

#checks if you want to trim your reads
if run_read_trimming == True:

    #Scan through all of the files in the folder specified within the single quotes in the line below
    #for each file listed in a sorted list of the directory's contents
    for file in sorted(os.listdir('.')):
        
        #determines if the file is a .fastq.gz file
        if fnmatch.fnmatch(file, base_pattern):
            
            #gets the name of that file
            one = file
            
            #simulates the name of the other read of that file
            two = file.replace('R1_001','R2_001')
            
            #checks if there is not an appropriate pair for the file, if so, then exits and returns an error
            if not os.path.exists(two):
                raise Exception('A matching paired file for', file, 'was not found')
                
            #sends progress message to stdout
            !echo 'Performing trim-galore on' $one 'and' $two
            
            #performs trim_galore on the pair of reads identified
            #removing automatically detected adapters and performing FastQC
            !trim_galore --illumina --fastqc --paired --stringency 5 --gzip --retain_unpaired $one $two

    #Moves the .html and .zip files into a folder called "FASTQC"
    ![ -d "FASTQC" ] && echo 'FASTQC folder already exists' || echo 'Making FASTQC folder'; mkdir FASTQC
    !mv *.zip FASTQC
    !mv *.html FASTQC
    !mv *.txt FASTQC

    #Make the directory "untrimmed" if it does not already exist
    ![ -d "untrimmed" ] && echo 'untrimmed directrory already exists' || echo 'Making untrimmed folder'; mkdir untrimmed
    !mv *.fastq.gz untrimmed

## Read Trimming with trim-galore (round 2)

After FASTQC, we observed an additional adapter sequence left over from the TaKaRa SMART-Seq kit:

AAGCAGTGGTATCAACGCAGAGTAC

To remove this, and perform quality trimming, we perform a second round of trim-galore trimming.

In [None]:
#############################################################
### Fill in this information before running this section ####

#Decide if you want to run this section
run_read_trimming_rd2 = False

#Write a base pattern for your raw fastq file names
#This pattern should occur in all of your file names
base_pattern = 'S*_R1_001_val_1.fq.gz'

#############################################################
#############################################################

#checks if you want to trim your reads
if run_read_trimming_rd2 == True:

    #Scan through all of the files in the folder specified within the single quotes in the line below
    #for each file listed in a sorted list of the directory's contents
    for file in sorted(os.listdir('.')):
        
        #determines if the file is a .fastq.gz file
        if fnmatch.fnmatch(file, base_pattern):
            
            #gets the name of that file
            one = file
            
            #simulates the name of the other read of that file
            two = file.replace('R1', 'R2').replace('val_1', 'val_2')
            
            #generates new names for the files
            new_one = one.replace('val', 'rd1')
            new_two = two.replace('val', 'rd1')
            
            #checks if there is not an appropriate pair for the file, if so, then exits and returns an error
            if not os.path.exists(two):
                raise Exception('A matching paired file for', file, 'was not found')
            
            #renames the original files
            !echo 'Renaming ' $one ' to ' $new_one
            !mv $one $new_one
            !echo 'Renaming ' $two ' to ' $new_two
            !mv $two $new_two

            #sends progress message to stdout
            !echo 'Performing trim-galore round 2 on' $new_one 'and' $new_two
            
            #performs trim_galore on the pair of reads identified
            #removing automatically detected adapters and performing FastQC again
            !trim_galore -q 20 --fastqc --paired -a AAGCAGTGGTATCAACGCAGAGTAC --stringency 5 --gzip --retain_unpaired $new_one $new_two

## Trinity RNA-Seq de novo Assembly
Use the code below to build the Trinity samples file and to run the *de novo* assembly.
Each step is switched on separately with its own flag; genome-guided assembly is handled in the next section.

In [4]:
#############################################################
### Fill in this information before running this section ####

#Decide if you want to make a sample file
run_make_Trinity_samplefile = False
run_Trinity_denovo = False

#Write a dictionary with prefixes and conditions
samples_file_name = '20210429_samples_file.txt'
base_pattern_2 = '*_val_1.fq.gz'

#############################################################
#############################################################

#checks if you want to generate a sample file
if run_make_Trinity_samplefile == True:

    #make empty dataframe for handling data
    dataframe = pd.DataFrame(columns=['Condition', 'Replicate', 'FwdSeq', 'RevSeq'])

    #search through current directory
    for file in sorted(os.listdir('.')):
        
        #check if it's read1
        if fnmatch.fnmatch(file, base_pattern_2):
            
            prefix = file.split('_')[0]
            
            #simulate the name of read2
            read1, read2 = file, file.replace('R1', 'R2').replace('val_1', 'val_2').replace('rd1_1', 'rd1_2')
            unpaired1 = read1.replace('val', 'unpaired')
            unpaired2 = read2.replace('val', 'unpaired')
            
            #check if the other read exists in the current directory
            #if not, move on to examining next file
            if not os.path.exists(read2):
                raise Exception('A matching paired file for', read1, 'was not found')
                continue
            if not os.path.exists(unpaired1):
                raise Exception('A unpaired read1 file for', unpaired1, 'was not found')
                continue
            if not os.path.exists(unpaired2):
                raise Exception('A unpaired read2 file for', unpaired2, 'was not found')
                continue
            
            #generate "full" datasets for left and right read pairs so Trinity will handle them
            newname1 = read1.replace('val_1', 'all_1')
            newname2 = read2.replace('val_2', 'all_2')
            
            !cat $read1 $unpaired1 > $newname1
            !cat $read2 $unpaired2 > $newname2
                
            #tell the user that a pair has been discovered
            print('Adding rows for', newname1, newname2)
            
            #append data to the dataframe
            dataframe = dataframe.append({'Condition': prefix[0:3], 
                                        'Replicate': prefix,
                                        'FwdSeq': newname1, 
                                        'RevSeq': newname2}, ignore_index = True)

    #write sample file
    print('Writing sample file to', samples_file_name)
    dataframe.to_csv(samples_file_name, sep='\t', index = False, header = False)

if run_Trinity_denovo == True:
    !Trinity --seqType fq --max_memory 1500G --samples_file $samples_file_name --CPU 32 --output trinity_out_dir

# Trinity RNA-Seq Genome Guided Assembly

In [None]:
#############################################################
### Fill in this information before running this section ####

#Decide which steps of the genome-guided workflow you want to run.
#Each flag gates one `if` block later in this cell.
run_bowtie2_align = False
run_combine_bams = False
run_Trinity_genomeguided = False
run_insilico_mapping = False
run_Trinity_gg_insilico = False
run_Trinity_gg_hisat2 = False

#minimum mapping quality; passed to `samtools view -q` after each bowtie2 alignment
#and embedded in the resulting bam file names
qscore = 10

#set the maximum intron size for genome-guided assembly
max_intron = 300000

#Set the name of your genome .fasta file; the names below are derived from it.
#NOTE(review): the derived names replace every '.fa' substring, so a genome file
#ending in '.fasta' would produce names such as '...bt2libsta' - confirm before renaming.
genome_file = 'phaw_5.0.fa'
#basename of the bowtie2 index (passed to bowtie2-build and to bowtie2 -x)
bt2lib = genome_file.replace('.fa', '.bt2lib')
#marker file touched after the index build step
bt2libmarker = genome_file.replace('.fa', '.bt_marker.txt')
#flag recording whether the marker file is present; the alignment section
#skips bowtie2-build while this is True
bt2marker_exists = True
#name of the chrom.sizes file written alongside the index
chromsizes = genome_file.replace('.fa', '.chrom.sizes')
#merged (unsorted) and sorted bam files produced by the combine step
subfinalfile = 'Sall_RNA_unsorted.bam'
finalfile = 'Sall_RNA_sorted.bam'

#############################################################
#############################################################

#checks if you want to trim your reads
if run_bowtie2_align == True:

    #check if the bt2 marker has already been flagged
    !printf 'checking if index file has already been generated for '$genome_file'\n'
    for file in sorted(os.listdir('.')):
        if fnmatch.fnmatch(file, bt2libmarker):
            bt2marker_exists = True
            !printf 'genome index file already exists for '$genome_file'\n'

    #search through current directory
    for file in sorted(os.listdir('.')):
        #if the genome file is found and there is no bt2marker_exists flag
        if fnmatch.fnmatch(file, genome_file) and bt2marker_exists == False:
            #check for files in the current directory
            !printf 'no index file exists for '$genome_file'\n'
            !printf 'generating bt2 index for '$genome_file'\n'
            bt2_output = genome_file.replace('.fa', '_bowtie2-build-output.txt')
            !bowtie2-build -f --threads 12 $genome_file $bt2lib > $bt2_output
        
            #generate chrom.sizes file
            fainame = genome_file + '.fai'
            !samtools faidx $genome_file
            !cut -f1,2 $fainame > $chromsizes

            #make a marker file to indicate that the bowtie2 library has already been built
            !touch $bt2libmarker

    #check for files in the current directory
    for file in sorted(os.listdir('.')):
    
        #initialize and/or blank variables
        one, two, unone, untwo = '', '', '', ''
    
        #determine the file name and its partners, if it is R1 trimmed read
        if fnmatch.fnmatch(file,'S*rd1_1_val_1.fq.gz'):
            one = file
            two = file.replace('R1', 'R2').replace('val_1', 'val_2').replace('rd1_1', 'rd1_2')
            unone = one.replace('val', 'unpaired')
            untwo = two.replace('val', 'unpaired')
            print('Starting bowtie2 alignment on ', one, two, unone, untwo)
        else: continue
        
        #get prefix of samples
        prefix = one.split('_')[0]
        
        #name sam file
        samname = prefix + '_RNA.sam'
    
        #run bowtie2
        !printf 'running bowtie2 on '$prefix'\n'
        !bowtie2 --local --very-sensitive-local --threads 40 --time -x $bt2lib -1 $one -2 $two -U $unone,$untwo -S $samname

        #name bam files
        bampref = samname.replace('.sam','_sorted')
        bamname = samname.replace('.sam','_q' + str(qscore) + '.bam')
        !printf 'running samtools sort on '$prefix'\n'
        !samtools view -bS -q $qscore $samname | samtools sort -T $bampref -o $bamname

if run_combine_bams == True:
    
    #initialize container for all bam file names
    bam_list = ''
    bam_num = 0
    
    for file in sorted(os.listdir('.')):
        if fnmatch.fnmatch(file,'S*_RNA_q10.bam'):
            if bam_num == 0:
                bam_list = file
                bam_num = bam_num + 1
                !printf 'Found file '$file' and added to list at position '$bam_num'\n'
            else:
                bam_list = bam_list + ' ' + file
                bam_num = bam_num + 1
                !printf 'Found file '$file' and added to list at position '$bam_num'\n'
        else: continue
    
    !printf 'Final list of bams is: $bam_list\n'
    
    #merge all bam files into one
    finalbampref = finalfile.split('.')[0]
    !printf 'Merging $bam_list into $subfinalfile\n'
    !samtools merge $subfinalfile $bam_list
    !printf 'Sorting $subfinalfile into $finalfile\n'
    !samtools sort -T $finalbampref -o $finalfile $subfinalfile

if run_Trinity_genomeguided == True:
    !Trinity --genome_guided_bam $finalfile --genome_guided_max_intron $max_intron --max_memory 1500G --CPU 32 --bflyHeapSpaceMax 20G --bflyHeapSpaceInit 4G --bflyCalculateCPU --output trinity_out_dir_GG

if run_insilico_mapping == True:
    r1 = 'S13A1_S84_R1_001_rd1_1_all_1.fq.gz_ext_all_reads.normalized_K25_C50_pctSD10000.fq'
    r2 = 'S13A1_S84_R2_001_rd1_2_all_2.fq.gz_ext_all_reads.normalized_K25_C50_pctSD10000.fq'
    samname = 'Phaw_RNA_insilico.sam'
    !printf 'running bowtie2 on '$r1', '$r2'\n'
    !bowtie2 --local --very-sensitive-local --threads 40 --time -x $bt2lib -1 $r1 -2 $r2 -S $samname
    bampref = samname.replace('.sam','_sorted')
    bamname = samname.replace('.sam','_q' + str(qscore) + '.bam')
    !printf 'running samtools sort on '$prefix'\n'
    !samtools view -bS -q $qscore $samname | samtools sort -T $bampref -o $bamname

if run_Trinity_gg_insilico == True:
    finalfile2 = 'Phaw_RNA_insilico_q10.bam'
    !Trinity --genome_guided_bam $finalfile2 --genome_guided_max_intron $max_intron --max_memory 1500G --CPU 32 --output trinity_out_dir_GG_insilico

if run_Trinity_gg_hisat2 == True:
    subfinalfile = 'Phaw_RNA_hisat2.cleaned.bam'
    finalbampref = subfinalfile.replace('.bam', '')
    finalfile = subfinalfile.replace('.bam', '.sorted.bam')
    !samtools sort -T $finalbampref -o $finalfile $subfinalfile
    !Trinity --genome_guided_bam $finalfile --genome_guided_max_intron $max_intron --max_memory 1500G --CPU 32 --output trinity_out_dir_GG_hisat2

# Make individual transcriptomes per developmental stage

In [None]:
#############################################################
### Fill in this information before running this section ####

#Decide if you want to make per-stage sample files and/or run the per-stage assemblies
run_make_Trinity_samplefile_singles = False
run_Trinity_denovo_singles = False

#Name template for the per-stage samples files (the '*' is replaced by the stage)
#and the pattern matching the combined read 1 files
samples_file_basename = '20210429_*_samples_file.txt'
base_pattern_3 = '*_all_1.fq.gz'

#############################################################
#############################################################

#per-stage sample tables (stage -> dataframe) and the stages that are expected
file_dict = {}
stages = ['S13', 'S19', 'S21', 'S23']

#Build one Trinity samples file per developmental stage
if run_make_Trinity_samplefile_singles:

    #rows are collected per stage in plain lists and converted to dataframes once;
    #DataFrame.append was removed in pandas 2.0 and copied the frame on every call
    sample_columns = ['Condition', 'Replicate', 'FwdSeq', 'RevSeq']
    stage_rows = {stage: [] for stage in stages}

    #search through current directory
    for file in sorted(os.listdir('.')):

        #only combined read 1 files drive the loop
        if not fnmatch.fnmatch(file, base_pattern_3):
            continue

        #the sample prefix is everything before the first underscore,
        #and its first three characters name the stage (e.g. 'S13')
        prefix = file.split('_')[0]
        stage = prefix[0:3]

        #a file from a stage that is not configured is reported by name
        if stage not in stage_rows:
            raise KeyError('Stage {} (from file {}) is not listed in stages'.format(stage, file))

        #simulate the name of read2
        read1, read2 = file, file.replace('R1', 'R2').replace('all_1', 'all_2').replace('rd1_1', 'rd1_2')

        #a samples file pointing at a missing read 2 would only fail later inside Trinity
        if not os.path.exists(read2):
            raise FileNotFoundError('A matching paired file for {} was not found'.format(read1))

        #tell the user that a pair has been discovered
        print('Adding rows for', read1, 'and', read2)

        stage_rows[stage].append({'Condition': stage,
                                  'Replicate': prefix,
                                  'FwdSeq': read1,
                                  'RevSeq': read2})

    #write one sample file per stage (tab-separated, no header, no index)
    for dict_stage, rows in stage_rows.items():
        file_dict[dict_stage] = pd.DataFrame(rows, columns=sample_columns)
        sample_file_forstage = samples_file_basename.replace('_*_', '_' + dict_stage + '_')
        print('Writing sample file to', sample_file_forstage)
        file_dict[dict_stage].to_csv(sample_file_forstage, sep='\t', index = False, header = False)

if run_Trinity_denovo_singles == True:
    samples_file_list = []
    counter = 0

    for file in sorted(os.listdir('.')):
        #check if it's read1
        if fnmatch.fnmatch(file, samples_file_basename):
            if counter == 0:
                samples_file_list = [file]
                counter = counter + 1
            else:
                samples_file_list = samples_file_list + [file]
                counter = counter + 1
    
    for file in samples_file_list:
        output_dir = 'trinity_out_dir_' + file.split('_')[1]
        !printf 'Generating Trinity run for $samples_file_list'
        !Trinity --seqType fq --max_memory 1500G --samples_file $file --CPU 32 --output $output_dir

In [1]:
#############################################################
### Fill in this information before running this section ####

#Decide if you want to make a sample file
run_kallisto = False
run_kallisto_matrix = True

#Write a dictionary with prefixes and conditions
samples_file = '20210429_samples_file.txt'
transcript_file = 'DAS.mikado.loci.fasta'
gene_trans_map_file = 'DAS.mikado.loci.fasta.gene_trans_map'
output_dir = transcript_file + '_run'

stages = ['S13A1', 'S13B1', 'S13C1', 'S19A1', 'S19B1', 'S19C1', 'S21A', 'S21B', 'S21C', 'S23A', 'S23B', 'S23C']
basedirs_list = [i + '/abundance.tsv' for i in stages]
basedirs = ' '.join(basedirs_list)

#############################################################
#############################################################

#set global variables
if run_kallisto == True:
    !/clusterfs/vector/home/groups/software/sl-7.x86_64/modules/trinity/2.5.1/util/align_and_estimate_abundance.pl --transcripts {transcript_file} --samples_file {samples_file} --seqType fq --est_method kallisto --output_dir {output_dir} --gene_trans_map {gene_trans_map_file} --prep_reference --thread_count 12
    
if run_kallisto_matrix == True:
    !/clusterfs/vector/home/groups/software/sl-7.x86_64/modules/trinity/2.5.1/util/abundance_estimates_to_matrix.pl --est_method kallisto --gene_trans_map {gene_trans_map_file} --name_sample_by_basedir {basedirs}


'S13A1/abundance.tsv S13B1/abundance.tsv S13C1/abundance.tsv S15A1/abundance.tsv S15B1/abundance.tsv S15C1/abundance.tsv S21A/abundance.tsv S21B/abundance.tsv S21C/abundance.tsv S23A/abundance.tsv S23B/abundance.tsv S23C/abundance.tsv'