# Metatranscriptomic workflow

## All scripts and commands used for the pre-processing and mapping of metatranscriptomic data

#### All code was run on ACE servers. Code chunks beginning with #!/bin/bash were run as separate scripts

### Concatenate samples

In [None]:
# Concatenate all per-run files of each sample into one file per mate.
# The sample ID is the third "_"-separated field of each PB* directory name;
# parallel substitutes it for {}. The command is single-quoted so the whole
# "cat ... && echo ..." chain is run by parallel (not split by the calling shell).
# Mate 2 is concatenated as well because the trimming step reads
# {}_2_concatenated.fq.gz -- assumes mate-2 files are named *_2.fq.gz (TODO confirm).
ls -d PB* | cut -d '_' -f 3 | parallel -j 15 'cat PBJC_20190115_{}/*_1.fq.gz > {}_1_concatenated.fq.gz && cat PBJC_20190115_{}/*_2.fq.gz > {}_2_concatenated.fq.gz && echo "{}_done"'

### Trim adaptors and low quality seqs

In [None]:
#!/bin/bash

# Trim adaptors and low-quality sequence from the concatenated paired-end reads
# with SeqPurge (part of ngs-bits). Run from the directory holding the
# *_concatenated.fq.gz files.

# load programs (both come from environment modules; no conda env is involved,
# so there is nothing to deactivate at the end of this script)
module load ngs-bits
module load parallel/20180222

# trim seqs
# One SeqPurge job per sample: the sample ID is the text before the first "_"
# of every *_1_*.gz file and replaces {} below.
# Up to 5 jobs run at once, each with 6 threads.
# The summary is written to a per-sample file; a single shared file name would
# be overwritten by whichever job finishes last.
ls *_1_*.gz | cut -d '_' -f 1 | parallel -j 5 \
SeqPurge \
-in1 {}_1_concatenated.fq.gz \
-in2 {}_2_concatenated.fq.gz \
-out1 /srv/projects3/sponge_metatranscriptome/Data/Cut_adapt/{}_1_trimmed.fq.gz \
-out2 /srv/projects3/sponge_metatranscriptome/Data/Cut_adapt/{}_2_trimmed.fq.gz \
-a1 AAGTCGGAGGCCAAGCGGTCTTAGGAAGACAA \
-a2 AAGTCGGATCGTAGCCATGTCGTTCTGTGAGCCAAGGAGTTG \
-threads 6 \
-out3 {}_singleton \
-summary {}_summary

### Run SortMeRNA to remove rRNA contamination

In [None]:
#!/bin/bash

# Separate rRNA reads from the trimmed paired-end reads with SortMeRNA,
# one job per sample.

# load programs
module load parallel/20180222
conda activate sortmerna_4.2.0

# run SortmeRNA
# The sample ID (text before the first "_" of each *_1_*.gz file) replaces {}.
# The whole "echo ... && sortmerna ..." chain is passed to parallel as ONE quoted
# string. Left unquoted, the shell splits the line at &&, so parallel would only
# run the echo and sortmerna would then run a single time with literal "{}".
# Up to 4 jobs run at once, each with 12 threads.
ls *_1_*.gz | cut -d '_' -f 1 | parallel -j 4 \
"echo sorting_{} && \
sortmerna \
--ref /srv/db/sortmerna/4.0.0/rRNA_databases/rfam-5.8s-database-id98.fasta \
--ref /srv/db/sortmerna/4.0.0/rRNA_databases/rfam-5s-database-id98.fasta \
--ref /srv/db/sortmerna/4.0.0/rRNA_databases/silva-arc-16s-id95.fasta \
--ref /srv/db/sortmerna/4.0.0/rRNA_databases/silva-arc-23s-id98.fasta \
--ref /srv/db/sortmerna/4.0.0/rRNA_databases/silva-bac-16s-id90.fasta \
--ref /srv/db/sortmerna/4.0.0/rRNA_databases/silva-bac-23s-id98.fasta \
--ref /srv/db/sortmerna/4.0.0/rRNA_databases/silva-euk-18s-id95.fasta \
--ref /srv/db/sortmerna/4.0.0/rRNA_databases/silva-euk-28s-id98.fasta \
--reads /srv/projects3/sponge_metatranscriptome/Data/SeqPurge/Trimmed_reads/{}_1_trimmed.fq.gz \
--reads /srv/projects3/sponge_metatranscriptome/Data/SeqPurge/Trimmed_reads/{}_2_trimmed.fq.gz \
--workdir /srv/projects3/sponge_metatranscriptome/Data/Sortmerna/{}/ \
--fastx \
--sam \
--aligned {}_aligned_rrna \
--other {}_non-aligned_rrna \
--paired_in \
--out2 \
--threads 12"

conda deactivate

## Map metatranscriptomic reads to reference MAGs

Note: run separately for each host species, i.e., map the reads from one host only to the MAGs derived from that same species.

### First dereplicate MAGs from host species

#### Carterio

Get checkm quality

In [None]:
#!/bin/bash

# CheckM quality assessment of the Carterio MAGs, followed by export of the
# short-format table that the dereplication step takes as input.

# load programs
conda activate checkm-genome-1.1.3

# locations
MAG_ROOT=/srv/projects3/sponge_metatranscriptome/Data/Reference_MAGs
BIN_DIR=${MAG_ROOT}/Carterio/
CHECKM_DIR=${MAG_ROOT}/checkm_car

# checkm quality (genomes are the *.fna files in BIN_DIR)
checkm lineage_wf -x fna -t 16 "${BIN_DIR}" "${CHECKM_DIR}"

# Create checkm file
# Note: coverm cluster (dereplication) needs the short checkm output
# -o 1 = short, -o 2 = long
checkm qa \
    -o 1 \
    -f "${CHECKM_DIR}/checkm_car_short.tsv" \
    --tab_table \
    -t 16 \
    "${CHECKM_DIR}/lineage.ms" \
    "${CHECKM_DIR}"

conda deactivate

Run coverm cluster to dereplicate genomes

In [None]:
#!/bin/bash

# Dereplicate the Carterio MAGs with coverm cluster, using the CheckM short
# table for genome quality.

# load programs
conda activate coverm-0.6.0

# locations
MAG_ROOT=/srv/projects3/sponge_metatranscriptome/Data/Reference_MAGs
GENOME_DIR=${MAG_ROOT}/Carterio/

# dereplicate genomes
# Representatives are written to the Dereplicated sub-directory of GENOME_DIR.
coverm cluster \
    -t 16 \
    --precluster-method finch \
    --ani 95 \
    --min-completeness 50 \
    --max-contamination 10 \
    -x fna \
    --genome-fasta-directory "${GENOME_DIR}" \
    --checkm-tab-table "${MAG_ROOT}/checkm_car/checkm_car_short.tsv" \
    --output-representative-fasta-directory "${GENOME_DIR}Dereplicated"

conda deactivate

#### Ircinia blue

Get checkm quality

In [None]:
#!/bin/bash

# CheckM quality assessment of the Ircinia blue MAGs, followed by export of the
# short-format table that the dereplication step takes as input.

# load programs
conda activate checkm-genome-1.1.3

# locations
MAG_ROOT=/srv/projects3/sponge_metatranscriptome/Data/Reference_MAGs
BIN_DIR=${MAG_ROOT}/Iblue/
CHECKM_DIR=${MAG_ROOT}/checkm_iblue

# checkm quality (genomes are the *.fa files in BIN_DIR)
checkm lineage_wf -x fa -t 16 "${BIN_DIR}" "${CHECKM_DIR}"

# Create checkm file (-o 1 selects the short output format)
checkm qa \
    -o 1 \
    -f "${CHECKM_DIR}/checkm_iblue_short.tsv" \
    --tab_table \
    -t 16 \
    "${CHECKM_DIR}/lineage.ms" \
    "${CHECKM_DIR}"

conda deactivate

Run coverm cluster to dereplicate genomes

In [None]:
#!/bin/bash

# Dereplicate the Ircinia blue MAGs with coverm cluster, using the CheckM short
# table for genome quality.

# load programs
conda activate coverm-0.6.0

# dereplicate genomes
# The CheckM table is read from checkm_iblue/, the directory the Ircinia blue
# "checkm qa" step writes checkm_iblue_short.tsv to (not checkm_car/, which
# holds the Carterio results).
coverm cluster \
--genome-fasta-directory /srv/projects3/sponge_metatranscriptome/Data/Reference_MAGs/Iblue/ \
-x fa \
--ani 95 \
--checkm-tab-table /srv/projects3/sponge_metatranscriptome/Data/Reference_MAGs/checkm_iblue/checkm_iblue_short.tsv \
--output-representative-fasta-directory /srv/projects3/sponge_metatranscriptome/Data/Reference_MAGs/Iblue/Dereplicated \
--precluster-method finch \
--min-completeness 50 \
--max-contamination 10 \
-t 16

conda deactivate

#### Ircinia ramosa

Get checkm quality

In [None]:
#!/bin/bash

# CheckM quality assessment of the Ircinia ramosa MAGs, followed by export of
# the short-format table that the dereplication step takes as input.

# load programs
conda activate checkm-genome-1.1.3

# locations
MAG_ROOT=/srv/projects3/sponge_metatranscriptome/Data/Reference_MAGs
BIN_DIR=${MAG_ROOT}/I_ramosa
CHECKM_DIR=${MAG_ROOT}/checkm_iramosa

# get checkm quality (genomes are the *.fa files in BIN_DIR)
checkm lineage_wf -x fa -t 16 "${BIN_DIR}" "${CHECKM_DIR}"

# Create checkm file (-o 1 selects the short output format; single thread here)
checkm qa \
    -o 1 \
    -f "${CHECKM_DIR}/checkm_irc_short.tsv" \
    --tab_table \
    -t 1 \
    "${CHECKM_DIR}/lineage.ms" \
    "${CHECKM_DIR}"

conda deactivate

Run coverm cluster to dereplicate genomes

In [None]:
#!/bin/bash

# Dereplicate the Ircinia ramosa MAGs with coverm cluster, using the CheckM
# short table for genome quality.

# load programs
conda activate coverm-0.6.0

# locations
MAG_ROOT=/srv/projects3/sponge_metatranscriptome/Data/Reference_MAGs
GENOME_DIR=${MAG_ROOT}/I_ramosa

# dereplicate genomes (single thread)
# Representatives are written to the Derep_95 sub-directory of GENOME_DIR.
coverm cluster \
    -t 1 \
    --precluster-method finch \
    --ani 95 \
    --min-completeness 50 \
    --max-contamination 10 \
    -x fa \
    --genome-fasta-directory "${GENOME_DIR}" \
    --checkm-tab-table "${MAG_ROOT}/checkm_iramosa/checkm_irc_short.tsv" \
    --output-representative-fasta-directory "${GENOME_DIR}/Derep_95"

conda deactivate

### Concatenate into one reference file

In [None]:
# Build one combined reference FASTA per host from its dereplicated MAGs.
# Paths are relative: run from the Reference_MAGs directory.
# Each combined file is written into the host's dereplicated-MAG directory,
# because that is where the mapping (coverm make) and annotation (enrichm)
# commands read it from.

#Carterio
# The directory is "Dereplicated" (the coverm cluster output directory).
# NOTE: the combined file has the same .fna extension as the Carterio MAGs, so
# it is removed first to keep a re-run from feeding it back into cat. Any later
# step that scans this directory for *.fna genomes will also see it.
rm -f Carterio/Dereplicated/combined_mags_carterio.fna
cat Carterio/Dereplicated/*.fna > Carterio/Dereplicated/combined_mags_carterio.fna

#Ircinia blue
cat Iblue/Dereplicated/*.fa > Iblue/Dereplicated/combined_mags_iblue.fna

#Ircinia ramosa
cat I_ramosa/Derep_95/*.fa > I_ramosa/Derep_95/combined_mags_iramosa.fna

### Make bam files using coverM

In [None]:
#!/bin/bash

# Map each host's reads to that host's combined MAG reference with coverm make,
# producing BAM files in a per-host output directory.

# load programs
conda activate coverm-0.6.0

# locations
PROJECT=/srv/projects3/sponge_metatranscriptome
MAG_ROOT=${PROJECT}/Data/Reference_MAGs
BAM_ROOT=${PROJECT}/Bam_files

# map_reads REFERENCE OUTPUT_DIR READ_FILE...
# Runs coverm make with 20 threads; the read files are passed to -c.
map_reads() {
    local reference=$1
    local out_dir=$2
    shift 2
    coverm make -t 20 -r "${reference}" -o "${out_dir}" -c "$@"
}

# Carterio
map_reads "${MAG_ROOT}/Carterio/Dereplicated/combined_mags_carterio.fna" \
    "${BAM_ROOT}/Car" \
    "${PROJECT}"/Data/Shangjin/Car*

# Ircinia blue
map_reads "${MAG_ROOT}/Iblue/Dereplicated/combined_mags_iblue.fna" \
    "${BAM_ROOT}/IrB" \
    "${PROJECT}"/Data/Shangjin/IrB*

# Ircinia ramosa
map_reads "${MAG_ROOT}/I_ramosa/Derep_95/combined_mags_iramosa.fna" \
    "${BAM_ROOT}/Irc" \
    "${PROJECT}"/Data/Inka/irc*

conda deactivate

### Sort BAM files using samtools

In [None]:
#!/bin/bash

# Name-sort every BAM file found one level below Bam_files/ and write the
# result to Bam_files/Sorted/<sample>_sorted.bam.

# load programs
conda activate samtools_1.11

BAM_ROOT=/srv/projects3/sponge_metatranscriptome/Bam_files
SORTED_DIR=${BAM_ROOT}/Sorted

# samtools does not create the output directory itself
mkdir -p "${SORTED_DIR}"

# sort bam files

#Note: bam files are sorted by name (-n) for input to htseq,
# this makes them incompatible with samtools index command

for BAM in "${BAM_ROOT}"/*/*.bam; do
    # an unmatched glob is left as the literal pattern; skip it
    [ -e "$BAM" ] || continue
    # the glob also matches Sorted/*.bam; never re-sort our own output
    case "$BAM" in
        "${SORTED_DIR}"/*) continue ;;
    esac
# get variables
    # sample prefix = third "."-separated field of the file name, cut at the first "_"
    PREF=$(basename "$BAM" | cut -d "." -f 3 | cut -d "_" -f 1)
    echo "sorting $BAM"
    echo "basename is $PREF"
# run samtools
    # NOTE(review): -m is the memory limit PER THREAD, so -m 40G with -@ 12 may
    # use several hundred GB in total -- confirm the server can provide this.
    samtools sort "$BAM" -o "${SORTED_DIR}/${PREF}_sorted.bam" -n -m 40G -@ 12
done

conda deactivate

### Annotate reference MAGs for analysis

In [None]:
#!/bin/bash

# Annotate the combined (concatenated) MAG references with EnrichM
# (KO, Pfam and CAZy annotations).

# load programs
conda activate enrichm_0.6.3

# locations
PROJECT=/srv/projects3/sponge_metatranscriptome
MAG_ROOT=${PROJECT}/Data/Reference_MAGs
ANNOT_ROOT=${PROJECT}/Annotation/Combined_mags

# annotation settings shared by both runs
ANNOTATE_FLAGS=(--ko --pfam --cazy --threads 16 --force --suffix .fna)

# For Carterio combined MAGs
enrichm annotate \
    --output "${ANNOT_ROOT}/Carterio" \
    --genome_file "${MAG_ROOT}/Carterio/Dereplicated/combined_mags_carterio.fna" \
    "${ANNOTATE_FLAGS[@]}"

# For Iblue combined MAGs
enrichm annotate \
    --output "${ANNOT_ROOT}/Iblue" \
    --genome_file "${MAG_ROOT}/Iblue/Dereplicated/combined_mags_iblue.fna" \
    "${ANNOTATE_FLAGS[@]}"

## Killed - VERY SLOW FOR IBLUE - STILL RUNNING AFTER 9 DAYS

conda deactivate

Try annotating each genome separately, then concatenating .gff files

In [None]:
#!/bin/bash

# Annotate each dereplicated MAG separately with EnrichM (KO, Pfam, CAZy), then
# concatenate the per-genome GFF files into one combined GFF per host.

# load programs
conda activate enrichm_0.6.3

PROJECT=/srv/projects3/sponge_metatranscriptome
MAG_ROOT=${PROJECT}/Data/Reference_MAGs

# combine_gffs GFF_DIR COMBINED_NAME
# Concatenates every *.gff in GFF_DIR into GFF_DIR/COMBINED_NAME.
# Absolute paths are used so the result does not depend on the current
# directory. Any previous combined file is removed first, so a re-run neither
# appends to old content nor feeds the combined file back into itself.
combine_gffs() {
    local gff_dir=$1
    local combined=$1/$2
    local gff
    rm -f "$combined"
    for gff in "$gff_dir"/*.gff; do
        # an unmatched glob is left as the literal pattern; skip it
        [ -e "$gff" ] || continue
        echo "$gff"
        cat "$gff" >> "$combined"
    done
}

# Ircinia blue
IBLUE_OUT=${PROJECT}/Annotation/Dereplicated_mags/Iblue

enrichm annotate \
--output "${IBLUE_OUT}" \
--genome_directory "${MAG_ROOT}/Iblue/Dereplicated/" \
--ko \
--pfam \
--cazy \
--threads 20 \
--force \
--suffix .fa

# per-genome GFFs are read from the annotations_gff sub-directory of the output
combine_gffs "${IBLUE_OUT}/annotations_gff" iblue_combined.gff


# Ircinia ramosa
# Trying above method (annotate then concatenate) to save computing time
# The output directory is Annotate_iramosa because that is where the
# htseq-count step reads annotations_gff/iramosa_combined.gff from.
IRAMOSA_OUT=${PROJECT}/Annotate_iramosa

enrichm annotate \
--output "${IRAMOSA_OUT}" \
--genome_directory "${MAG_ROOT}/I_ramosa/Derep_95" \
--ko \
--pfam \
--cazy \
--threads 20 \
--force \
--suffix .fa

combine_gffs "${IRAMOSA_OUT}/annotations_gff" iramosa_combined.gff

conda deactivate

### Run HTseq-count to count number of reads that map to genes

In [None]:
#!/bin/bash

# Count, per sample, the reads that map to annotated CDS features with
# htseq-count. One output table is written per sorted BAM file.

# load programs (htseq comes from an environment module; no conda env is
# activated here, so there is nothing to deactivate at the end)
module load htseq/0.9.1

PROJECT=/srv/projects3/sponge_metatranscriptome
SORTED_DIR=${PROJECT}/Bam_files/Sorted

# count_reads GFF STRANDED OUT_DIR BAM...
#   GFF       annotation file passed to htseq-count
#   STRANDED  value for htseq-count -s (no / yes / reverse)
#   OUT_DIR   directory receiving <sample>_htseq.csv
#   BAM...    sorted BAM files to count
# The sample prefix is the BAM file name up to its first "_"; taking it from
# the basename keeps it independent of how deep the directory path is.
count_reads() {
    local gff=$1
    local stranded=$2
    local out_dir=$3
    shift 3
    local bam pref
    mkdir -p "$out_dir"
    for bam in "$@"; do
        # an unmatched glob is left as the literal pattern; skip it
        [ -e "$bam" ] || continue
        # Check variable
        pref=$(basename "$bam" | cut -d '_' -f 1)
        echo "$bam"
        echo "$pref"
        # Run htseq
        htseq-count "$bam" \
            "$gff" \
            -f bam \
            -s "$stranded" \
            -t CDS \
            -i seq_id \
            --additional-attr annotations > "${out_dir}/${pref}_htseq.csv"
    done
}

# Note: counting transcripts from each sample separately

#Carterio
count_reads \
    "${PROJECT}/Annotation/Combined_mags/Carterio/annotations_gff/combined_mags_carterio.gff" \
    no \
    "${PROJECT}/HTseq/Carterio" \
    "${SORTED_DIR}"/Car*.bam

#IBlue
count_reads \
    ~/Metatranscriptomics/Annotation/Dereplicated_mags/Iblue/annotations_gff/iblue_combined.gff \
    no \
    "${PROJECT}/HTseq/Iblue" \
    "${SORTED_DIR}"/IrB*.bam

#I ramosa
# NOTE(review): counted with -s reverse (the other hosts use -s no) and written
# to HTseq/ itself rather than a per-host sub-directory -- confirm both are intended.
count_reads \
    "${PROJECT}/Annotate_iramosa/annotations_gff/iramosa_combined.gff" \
    reverse \
    "${PROJECT}/HTseq" \
    "${SORTED_DIR}"/irc*.bam

### Create mag-contig mapping file

In [None]:
# Build a two-column, tab-separated table (MAG name, contig name) for the
# contigs that appear in the htseq-count output.

#First get contigs to grep
# Column 1 of the htseq table is the feature ID; its first six "_"-separated
# fields are kept as the contig name.
# Rows whose ID starts with "__" are htseq-count's summary counters (e.g.
# __no_feature), not features, so they are dropped.
# sort -u removes every duplicate (plain uniq only collapses adjacent ones).
awk '$1 !~ /^__/ {print $1}' IrB4_htseq.csv | cut -d '_' -f 1-6 | sort -u > to_grep.txt

# grep through each contig in each mag and save output
# /dev/null is listed as an extra file so grep always prefixes the file name,
# even if the glob matches a single MAG. The table is written with ">" so a
# re-run replaces it instead of appending duplicate rows.
# NOTE(review): patterns match as substrings, so a contig name that is a prefix
# of another contig name matches both -- check the table if names can nest.
grep -F -f to_grep.txt 5*.fa /dev/null > mag_contig_iblue

#note: -f Obtains patterns from a file -F interprets patterns from the file as a fixed string

#Convert to tab separated file and clean MAG names
# grep lines look like "<MAG>.fa:><contig>". Replacing the literal ".fa:>" with
# a tab does both steps at once and only touches the separator; the dot is
# escaped and there is no g flag, so "fa" inside a contig name is left alone.
sed -i -e 's/\.fa:>/\t/' mag_contig_iblue

### Check mapping success using coverm genome coverage

In [None]:
#!/bin/bash

# Check mapping success: run coverm genome per host to report the relative
# abundance of reads mapping to each dereplicated MAG.

# load programs
conda activate coverm-0.6.0

#Get proportion of reads that map to genomes

# settings shared by all three hosts
MAPPING_OPTS=(
    -t 20
    -m relative_abundance
    --min-read-aligned-percent 0.75
    --min-read-percent-identity 0.95
    --output-format dense
)

#Carterio (no -x given, so coverm's default genome extension applies)
coverm genome "${MAPPING_OPTS[@]}" \
    -d ~/Metatranscriptomics/Reference_MAGs/Carterio/Dereplicated \
    -c ~/Metatranscriptomics/Sorted_reads_Shangjin/Car*.fq.gz \
    -o carterio_metatranscriptome_mapping.tsv

#IBlue
coverm genome "${MAPPING_OPTS[@]}" \
    -x fa \
    -d ~/Metatranscriptomics/Reference_MAGs/Iblue/Dereplicated \
    -c ~/Metatranscriptomics/Sorted_reads_Shangjin/IrB*.fq.gz \
    -o iblue_metatranscriptome_mapping.tsv

#I ramosa
coverm genome "${MAPPING_OPTS[@]}" \
    -x fa \
    -d /srv/projects3/sponge_metatranscriptome/Data/Reference_MAGs/I_ramosa/Derep_95 \
    -c /srv/projects3/sponge_metatranscriptome/Data/Inka/*.fq.gz \
    -o ~/Metatranscriptomics/Coverage/iramosa_metatranscriptome_mapping.tsv

conda deactivate