# Anvio workflow

resource: https://merenlab.org/2016/06/22/anvio-tutorial-v2/

In [None]:
# INSTALLATION
# Make conda available on the cluster, then enter the anvi'o v8 environment.
module load conda/latest
conda activate anvio-8

# One-time setup of the reference data that the annotation steps rely on.
# SCG taxonomy data:
anvi-setup-scg-taxonomy

# NCBI COGs data (queried later by anvi-run-ncbi-cogs), set up with 11 threads:
anvi-setup-ncbi-cogs -T 11

# KEGG data (queried later by anvi-run-kegg-kofams):
anvi-setup-kegg-data

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=100G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/anvio/mcav/slurm-anvio-%j.out  # %j = job ID
#
# Anvi'o workflow for one species (set SAMPLENAME below):
#   1. build a contigs database from the fixed contig FASTA and annotate it
#      (HMMs, NCBI COGs, KEGG KOfams), then write contig stats;
#   2. profile every sample BAM listed in the sample-ID file;
#   3. merge the single-sample profiles into one merged profile.
# NOTE(review): 24 cores are requested above, but no step below is given more
# than 6 threads -- consider lowering -c or raising the thread counts.

module load conda/latest
conda activate anvio-8

# Stop right away if the environment did not activate; otherwise every step
# below would fail with "command not found" and the job would still look finished.
command -v anvi-profile >/dev/null || { echo "ERROR: anvi'o is not on PATH (conda activate failed?)" >&2; exit 1; }

#Contig database from assembled genomes. stores information related to your sequences: positions of open reading frames, k-mer frequencies for each contig, functional and taxonomic annotation of genes, etc.
#set parameters:
SAMPLENAME="mcav"
CONTIGPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/mapping/${SAMPLENAME}"
CONTIGFILE="${SAMPLENAME}.contigs-fixed.fsa"
BAMPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/mapping/${SAMPLENAME}"
DBPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/anvio/${SAMPLENAME}"
OUTDIR="${DBPATH}/profiles"
# Derived paths, so that changing SAMPLENAME is the only edit needed per species.
CONTIGS_DB="${DBPATH}/${SAMPLENAME}.contigs.db"
SAMPLE_LIST="${BAMPATH}/032024_${SAMPLENAME}_sampleids.txt"
mkdir -p "$DBPATH" "$OUTDIR"

#from merged, fixed contig fasta file created in previous step..need for downstream analysis
#default k-mer frequency is 4
anvi-gen-contigs-database -f "${CONTIGPATH}/${CONTIGFILE}" --project-name "$SAMPLENAME" -o "$CONTIGS_DB"

#integrate HMMs into the database:
anvi-run-hmms -c "$CONTIGS_DB" --num-threads 6

#setup our COG database
#anvi-setup-ncbi-cogs -T 11 --just-do-it

#this runs NCBI COGs against your contigs.db, integrating gene functions
anvi-run-ncbi-cogs -c "$CONTIGS_DB" -T 4

#ADD KEGG-KOFAM (-T = threads that Anvi'O is allowed to use)
anvi-run-kegg-kofams -c "$CONTIGS_DB" -T 4

#ADD CONTIG STATS
anvi-display-contigs-stats "$CONTIGS_DB" --report-as-text --as-markdown -o "${DBPATH}/anvio_stats.txt"

# BAM files are referenced relative to BAMPATH, so a failed cd must abort the job.
cd "$BAMPATH" || { echo "ERROR: cannot cd to ${BAMPATH}" >&2; exit 1; }
[[ -r "$SAMPLE_LIST" ]] || { echo "ERROR: sample list not readable: ${SAMPLE_LIST}" >&2; exit 1; }

# Read the whitespace-separated sample IDs into an array up front: no glob
# expansion of the IDs, and the profiling commands cannot consume the list via stdin.
sample_ids=()
while read -r -a fields || (( ${#fields[@]} )); do
    sample_ids+=("${fields[@]}")
done < "$SAMPLE_LIST"
(( ${#sample_ids[@]} )) || { echo "ERROR: no sample IDs found in ${SAMPLE_LIST}" >&2; exit 1; }

#create sample profiles
#anvi-profile fails if the output dir already exists, so each sample gets its own dir and it is not made ahead of time.
#sample name can't start with an number, hence the species name in front
#keep parameters consistent in order to merge to larger profile
profile_dbs=()
failed=0
for f in "${sample_ids[@]}"; do
    if [[ -e "${OUTDIR}/${f}/PROFILE.db" ]]; then
        # Lets a resubmitted job pick up where it stopped. Delete the sample's
        # directory first if it should be profiled again.
        echo "skipping $f: ${OUTDIR}/${f}/PROFILE.db already exists"
        profile_dbs+=("${OUTDIR}/${f}/PROFILE.db")
        continue
    fi
    echo "processing $f file"
    if anvi-profile -c "$CONTIGS_DB" \
            -i "${f}.bam" \
            --min-percent-identity 95 \
            --min-contig-length 1500 \
            --sample-name "${SAMPLENAME}_${f}" \
            --output-dir "${OUTDIR}/${f}"; then
        profile_dbs+=("${OUTDIR}/${f}/PROFILE.db")
    else
        echo "ERROR: anvi-profile failed for sample $f" >&2
        failed=1
    fi
done

# Only report success (and only merge) when every sample actually profiled.
if (( failed )); then
    echo "ERROR: one or more samples failed to profile; not merging" >&2
    exit 1
fi
echo "Anvio profiling: All samples sucessfully completed!"

#merge single sample profiles to one profile
# The explicit list (rather than the glob $OUTDIR/*/PROFILE.db) keeps a merged
# profile left in $OUTDIR by an earlier run from being fed back into the merge.
if ! anvi-merge "${profile_dbs[@]}" \
            -c "$CONTIGS_DB" \
            -o "${OUTDIR}/${SAMPLENAME}_profile_merged"; then
    echo "ERROR: anvi-merge failed" >&2
    exit 1
fi

echo "Anvio profiling: All profiles merged"
conda deactivate

# JOB-ID: 27300405, 27353332,27359919, 27374000
# bash script file name: /nikea/COL/bash_scripts/Col_anvio.sh

The job initially failed at the "create sample profiles" step. After a lot of troubleshooting, it ran: the output directory passed to `anvi-profile` must be unique for each sample and must not be created ahead of time. The mcav run was done piecemeal while getting the script to work (that is why there are 4 job IDs). The whole script should run smoothly for the other species!

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=50G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/anvio/mcav/slurm-anvio-%j.out  # %j = job ID
#
# Anvi'o workflow for one species (set SAMPLENAME below):
#   1. build a contigs database from the fixed contig FASTA and annotate it
#      (HMMs, NCBI COGs, KEGG KOfams), then write contig stats;
#   2. profile every sample BAM listed in the sample-ID file;
#   3. merge the single-sample profiles into one merged profile.
# NOTE(review): the SLURM log path above still points at the mcav directory
# although this job processes ofav -- looks like a copy/paste leftover. If it is
# changed, the target directory must already exist when the job is submitted.
# NOTE(review): 24 cores are requested above, but no step below is given more
# than 6 threads -- consider lowering -c or raising the thread counts.

module load conda/latest
conda activate anvio-8

# Stop right away if the environment did not activate; otherwise every step
# below would fail with "command not found" and the job would still look finished.
command -v anvi-profile >/dev/null || { echo "ERROR: anvi'o is not on PATH (conda activate failed?)" >&2; exit 1; }

#Contig database from assembled genomes. stores information related to your sequences: positions of open reading frames, k-mer frequencies for each contig, functional and taxonomic annotation of genes, etc.
#set parameters:
SAMPLENAME="ofav"
CONTIGPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/mapping/${SAMPLENAME}"
CONTIGFILE="${SAMPLENAME}.contigs-fixed.fsa"
BAMPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/mapping/${SAMPLENAME}"
DBPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/anvio/${SAMPLENAME}"
OUTDIR="${DBPATH}/profiles"
# Derived paths, so that changing SAMPLENAME is the only edit needed per species.
CONTIGS_DB="${DBPATH}/${SAMPLENAME}.contigs.db"
SAMPLE_LIST="${BAMPATH}/032024_${SAMPLENAME}_sampleids.txt"
mkdir -p "$DBPATH" "$OUTDIR"

#from merged, fixed contig fasta file created in previous step..need for downstream analysis
#default k-mer frequency is 4
anvi-gen-contigs-database -f "${CONTIGPATH}/${CONTIGFILE}" --project-name "$SAMPLENAME" -o "$CONTIGS_DB"

#integrate HMMs into the database:
anvi-run-hmms -c "$CONTIGS_DB" --num-threads 6

#this runs NCBI COGs against your contigs.db, integrating gene functions
anvi-run-ncbi-cogs -c "$CONTIGS_DB" -T 4

#ADD KEGG-KOFAM (-T = threads that Anvi'O is allowed to use)
anvi-run-kegg-kofams -c "$CONTIGS_DB" -T 4

#ADD CONTIG STATS
anvi-display-contigs-stats "$CONTIGS_DB" --report-as-text --as-markdown -o "${DBPATH}/anvio_stats.txt"

# BAM files are referenced relative to BAMPATH, so a failed cd must abort the job.
cd "$BAMPATH" || { echo "ERROR: cannot cd to ${BAMPATH}" >&2; exit 1; }

#make sure sample list is in the folder w/ BAM files
[[ -r "$SAMPLE_LIST" ]] || { echo "ERROR: sample list not readable: ${SAMPLE_LIST}" >&2; exit 1; }

# Read the whitespace-separated sample IDs into an array up front: no glob
# expansion of the IDs, and the profiling commands cannot consume the list via stdin.
sample_ids=()
while read -r -a fields || (( ${#fields[@]} )); do
    sample_ids+=("${fields[@]}")
done < "$SAMPLE_LIST"
(( ${#sample_ids[@]} )) || { echo "ERROR: no sample IDs found in ${SAMPLE_LIST}" >&2; exit 1; }

#create sample profiles
#anvi-profile fails if the output dir already exists, so each sample gets its own dir and it is not made ahead of time.
#sample name can't start with an number, hence the species name in front
#keep parameters consistent in order to merge to larger profile
profile_dbs=()
failed=0
for f in "${sample_ids[@]}"; do
    if [[ -e "${OUTDIR}/${f}/PROFILE.db" ]]; then
        # Lets a resubmitted job pick up where it stopped. Delete the sample's
        # directory first if it should be profiled again.
        echo "skipping $f: ${OUTDIR}/${f}/PROFILE.db already exists"
        profile_dbs+=("${OUTDIR}/${f}/PROFILE.db")
        continue
    fi
    echo "processing $f file"
    if anvi-profile -c "$CONTIGS_DB" \
            -i "${f}.bam" \
            --min-percent-identity 95 \
            --min-contig-length 1500 \
            --sample-name "${SAMPLENAME}_${f}" \
            --output-dir "${OUTDIR}/${f}"; then
        profile_dbs+=("${OUTDIR}/${f}/PROFILE.db")
    else
        echo "ERROR: anvi-profile failed for sample $f" >&2
        failed=1
    fi
done

# Only report success (and only merge) when every sample actually profiled.
if (( failed )); then
    echo "ERROR: one or more samples failed to profile; not merging" >&2
    exit 1
fi
echo "Anvio profiling: All samples sucessfully completed!"

#merge single sample profiles to one profile
# The explicit list (rather than the glob $OUTDIR/*/PROFILE.db) keeps a merged
# profile left in $OUTDIR by an earlier run from being fed back into the merge.
if ! anvi-merge "${profile_dbs[@]}" \
            -c "$CONTIGS_DB" \
            -o "${OUTDIR}/${SAMPLENAME}_profile_merged"; then
    echo "ERROR: anvi-merge failed" >&2
    exit 1
fi

echo "Anvio profiling: All profiles merged"
conda deactivate

# JOB-ID: 27377112
# bash script file name: /nikea/COL/bash_scripts/Col_anvio.sh