# Gene annotation and read mapping for per-sample information (COL_032024 sequencing data)

Annotating metagenome-assembled genomes (MAGs) with MetaCerberus: https://github.com/raw-lab/MetaCerberus

In [None]:
# INSTALLATION -- MetaCerberus in a prefix-based conda environment.
# Everything lives in the scratch workspace that was created in 4Col_taxonomy.
WORKSPACE="/scratch3/workspace/nikea_ulrich_uml_edu-gtdb"
ENV_PREFIX="${WORKSPACE}/envs/metacerberus"

module load conda/latest
cd "${WORKSPACE}"

# Build the environment non-interactively from conda-forge + bioconda, then enter it.
conda create --yes --prefix "${ENV_PREFIX}" --channel conda-forge --channel bioconda metacerberus
conda activate "${ENV_PREFIX}"

# Run MetaCerberus' own --setup step, then its --download step.
metacerberus.py --setup
metacerberus.py --download

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=50G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH -t 48:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/annotation/mcav/slurm-metacerberus-%j.out  # %j = job ID

# MetaCerberus annotation job for the bins of one sample.
# NOTE: every metacerberus.py call below is currently commented out
# (troubleshooting in progress), so as written this job only activates the
# environment, changes into the bin directory and deactivates again.

module load conda/latest
conda activate /scratch3/workspace/nikea_ulrich_uml_edu-gtdb/envs/metacerberus

#for troubleshooting purposes, copied mcav bins to /scratch3

# Set parameters
SAMPLENAME="mcav"
BINS="/scratch3/workspace/nikea_ulrich_uml_edu-gtdb"
#BINPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/binning/${SAMPLENAME}/${SAMPLENAME}_DASTool_bins"
OUT="/scratch3/workspace/nikea_ulrich_uml_edu-gtdb/metacerberus_output"
#OUTDIR="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/annotation/${SAMPLENAME}"
#mkdir -p $OUT

# Abort if the bin directory is missing: the commands below use relative file
# names, so once they are re-enabled they must not run from the wrong directory.
cd "$BINS" || exit 1

#single-bin test run:
#metacerberus.py --prodigal metabat2.5.fa --hmm ALL --dir_out $OUT/metabat2.5.fa


#this all works fine below, but was hoping to get to the issue with doing indiv. still no dice
#make list of bins
#ls *.fa > "$SAMPLENAME"_bins.txt

#run metacerberus
#for f in $(cat "$SAMPLENAME"_bins.txt) 
#do
  #  echo "processing $f"
#metacerberus.py --prodigal "$f" --hmm ALL --dir_out $OUT/"$f"
#done
    
  #optional MetaCerberus flags (copied from its help text; defaults in brackets):
  ##--minscore MINSCORE   Score cutoff for parsing HMMER results [60]
  ##--evalue EVALUE       E-value cutoff for parsing HMMER results [1e-09]
  ##--remove-n-repeats    Remove N repeats, splitting contigs [False]

conda deactivate

# JOB-ID:
# bash script file name: /nikea/COL/bash_scripts/Col_metacerberus.sh

Let's try DRAM instead.

In [None]:
#INSTALLATION
# Attempt 1: build the environment from the environment.yaml in the DRAM repository.
wget https://raw.githubusercontent.com/WrightonLabCSU/DRAM/master/environment.yaml
module load conda/latest
conda env create -f environment.yaml --prefix /scratch3/workspace/nikea_ulrich_uml_edu-gtdb/envs/DRAM
conda activate /scratch3/workspace/nikea_ulrich_uml_edu-gtdb/envs/DRAM
DRAM-setup.py prepare_databases --output_dir DRAM_data #this has a documented issue (broken script in DRAM-setup.py) new version should be coming out in 2025
# Clean up the broken environment. Leave it first (it is still active), and
# address it with --prefix: it was created with --prefix, and -n/--name only
# accepts an environment name, not a path.
conda deactivate
conda env remove --prefix /scratch3/workspace/nikea_ulrich_uml_edu-gtdb/envs/DRAM -y
#instead let's try installing with bioconda, there's evidence from others that this version has been fixed

# Attempt 2: empty prefix environment, then install DRAM from bioconda.
module load conda/latest
conda create --prefix /scratch3/workspace/nikea_ulrich_uml_edu-gtdb/envs/dram -y
conda activate /scratch3/workspace/nikea_ulrich_uml_edu-gtdb/envs/dram
# conda-forge is listed before bioconda (same channel order as the MetaCerberus
# install) so that bioconda's dependencies resolve from conda-forge.
conda install -c conda-forge -c bioconda dram -y
DRAM-setup.py prepare_databases --output_dir DRAM_data #worked but needs to be in a bash script because of memory issues

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=400G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/DRAM-setup/slurm-DRAM-setup-%j.out  # %j = job ID

# TODO: script body not written yet -- only the SLURM header exists so far.
# The log file is named after DRAM-setup (not metacerberus) so that it matches
# the DRAM-setup directory it is written to.

**TODO:** Finish the script here if you decide to go with DRAM. Maybe eggNOG annotations are the way to go for now...

Map the reads of each sample back to the MAG catalogue and retrieve mapping statistics. This gives relative-abundance information, i.e. a measure of how abundant or rare each bacterium was in each sample.

In [None]:
# CoverM is added to the existing anvi'o environment (anvio-8), not a new one.
module load conda/latest
conda activate anvio-8
conda install --channel bioconda coverm

CoverM: https://wwood.github.io/CoverM/coverm-genome.html

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=180G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/annotation/dlab/slurm-MAG-mapping-%j.out  # %j = job ID

# FIRST ATTEMPT (kept as a record): map each sample's reads to the concatenated
# MAG database with bowtie2, convert each alignment to a sorted/indexed BAM, then
# summarise with CoverM. Recorded outcome for this version (job 28168719): the
# CoverM step did not run; the error it printed is kept in the next cell.

module load conda/latest
conda activate anvio-8


# --- parameters -------------------------------------------------------------
# SAMPLENAME selects the reads, bins, output directories and sample-ID list.
SAMPLENAME="dlab"
READSPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/assembly/${SAMPLENAME}/repaired"
# This version reads the DAS Tool bins directly from the binning directory.
MAGPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/binning/${SAMPLENAME}/${SAMPLENAME}_DASTool_bins"
# WORKDIR receives the final BAM files and the CoverM tables.
WORKDIR="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/annotation/${SAMPLENAME}"
mkdir -p "$WORKDIR"
# MAGDB holds the concatenated MAG FASTA and its bowtie2 index.
MAGDB="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/annotation/${SAMPLENAME}/MAG_db"
mkdir -p "$MAGDB"
# XTRAFILES (scratch) receives the intermediate .sam files; they are not deleted by this script.
XTRAFILES="/scratch3/workspace/nikea_ulrich_uml_edu-gtdb/annotation/${SAMPLENAME}"
mkdir -p "$XTRAFILES"
# Text file with one sample ID per line, read by the loop below.
LISTPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/"
SAMPLELIST="032024_${SAMPLENAME}_sampleids.txt"


#concatenate MAGs into 1 file to create a MAG database
cat $MAGPATH/*.fa > $MAGDB/"$SAMPLENAME"_mags.fsa

# The index is built (and later looked up by bowtie2 -x) relative to $MAGDB.
cd $MAGDB
#index MAG database with bowtie2
bowtie2-build --large-index --threads 11 "$SAMPLENAME"_mags.fsa "$SAMPLENAME"_index

# One iteration per sample ID listed in $LISTPATH/$SAMPLELIST.
while IFS= read -r SAMPLEID; do
    #align the sample's paired reads to the MAG database and collect the alignments in a .sam file
    bowtie2 --threads 11 -x "$SAMPLENAME"_index -1 $READSPATH/"${SAMPLEID}"_host_removed_R1.tagged_filter_ready.fastq.gz -2 $READSPATH/"${SAMPLEID}"_host_removed_R2.tagged_filter_ready.fastq.gz -S $XTRAFILES/"${SAMPLEID}".sam
    #make sure to point it to the index not the FIXEDCON file (-x parameter)
    
    #convert the sam file to a bam file (-F 4 excludes reads flagged as unmapped); it's neither sorted nor indexed yet, so we use an anvi'o script to do so:
    samtools view -F 4 -b -S $XTRAFILES/"${SAMPLEID}".sam -o $WORKDIR/"${SAMPLEID}"-RAW.bam
   
    #sort and index the bam file
    anvi-init-bam $WORKDIR/"${SAMPLEID}"-RAW.bam -o $WORKDIR/"${SAMPLEID}".bam
    
    #the unsorted intermediate is no longer needed
    rm $WORKDIR/"${SAMPLEID}"-RAW.bam
done < "$LISTPATH/${SAMPLELIST}"
# NOTE: printed unconditionally -- no exit status is checked inside the loop.
echo "Mapping success!"

#extract stats with CoverM
# In this version -b is given the $WORKDIR directory itself and no genome
# definition option is passed; this is the step that did not run.
coverm genome \
    -b $WORKDIR/ \
    -m relative_abundance \
    --min-covered-fraction 0 \
    -o $WORKDIR/mapping_rate

# Second table: read counts and covered fraction per genome (same inputs as above).
coverm genome \
    -b $WORKDIR/ \
    -m count covered_fraction \
    --min-covered-fraction 0 \
    -o $WORKDIR/count_table

conda deactivate

#JOB ID: 28168719 (coverm didn't run)
#bash script: nikea/COL/bash_scripts/Col_MAG_mapping.sh

In [None]:
error: the following required arguments were not provided:
  --separator <separator>
  --genome-fasta-files <genome-fasta-files>...
  --genome-fasta-directory <genome-fasta-directory>
  --genome-fasta-list <genome-fasta-list>
  --genome-definition <genome-definition>

Added a bin identifier to each contig name in the MAGs (copies in /scratch3) so that the MAGs can still be told apart once they are concatenated into a single MAG database.

Made the changes and ran the job again.

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=180G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/annotation/dlab/slurm-MAG-mapping-%j.out  # %j = job ID

# Map every sample's reads back to the concatenated MAG database with bowtie2,
# turn each alignment into a sorted + indexed BAM (anvi-init-bam), then
# summarise per-genome abundance and coverage with CoverM.
#
# Inputs : paired reads in $READSPATH, bins (*.fa) in $MAGPATH,
#          sample IDs (one per line) in $LISTPATH/$SAMPLELIST
# Outputs: $MAGDB/<sample>_mags.fsa + bowtie2 index, $XTRAFILES/<id>.sam,
#          $WORKDIR/<id>.bam, $WORKDIR/mapping_rate, $WORKDIR/count_table
# Exits non-zero (before CoverM) if any input is missing or any sample fails.

module load conda/latest
conda activate anvio-8


SAMPLENAME="dlab"
READSPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/assembly/${SAMPLENAME}/repaired"
# Bins with the bin identifier added to every contig name (copies on scratch).
MAGPATH="/scratch3/workspace/nikea_ulrich_uml_edu-gtdb/bins/${SAMPLENAME}"
WORKDIR="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/annotation/${SAMPLENAME}"
MAGDB="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/annotation/${SAMPLENAME}/MAG_db"
XTRAFILES="/scratch3/workspace/nikea_ulrich_uml_edu-gtdb/annotation/${SAMPLENAME}"
LISTPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/"
SAMPLELIST="032024_${SAMPLENAME}_sampleids.txt"
# Use every core SLURM granted (-c above); fall back to the previous
# hard-coded value of 11 when the script is run outside SLURM.
THREADS="${SLURM_CPUS_PER_TASK:-11}"

mkdir -p "$WORKDIR" "$MAGDB" "$XTRAFILES"

# Fail early with a clear message instead of building an empty database or
# looping over nothing.
if [[ ! -r "$LISTPATH/${SAMPLELIST}" ]]; then
    echo "Sample list not readable: $LISTPATH/${SAMPLELIST}" >&2
    exit 1
fi
shopt -s nullglob
MAGFILES=("$MAGPATH"/*.fa)
shopt -u nullglob
if (( ${#MAGFILES[@]} == 0 )); then
    echo "No .fa bins found in $MAGPATH" >&2
    exit 1
fi

#concatenate MAGs into 1 file to create a MAG database
cat "${MAGFILES[@]}" > "$MAGDB/${SAMPLENAME}_mags.fsa"

# The index is built and looked up by relative name, so the cd must succeed.
cd "$MAGDB" || exit 1
#index MAG database with bowtie2
bowtie2-build --large-index --threads "$THREADS" "${SAMPLENAME}_mags.fsa" "${SAMPLENAME}_index" || exit 1

# Align one sample and produce $WORKDIR/<id>.bam (sorted + indexed).
# $1 = sample ID. Returns non-zero as soon as any step fails.
map_sample() {
    local sample="$1"
    local sam="$XTRAFILES/${sample}.sam"
    local raw_bam="$WORKDIR/${sample}-RAW.bam"
    local status

    #align reads to the MAG database and collect the alignments in a .sam file
    #make sure to point it to the index not the FIXEDCON file (-x parameter)
    bowtie2 --threads "$THREADS" -x "${SAMPLENAME}_index" \
        -1 "$READSPATH/${sample}_host_removed_R1.tagged_filter_ready.fastq.gz" \
        -2 "$READSPATH/${sample}_host_removed_R2.tagged_filter_ready.fastq.gz" \
        -S "$sam" || return 1

    #convert the sam file to a bam file; it is neither sorted nor indexed yet
    # NOTE(review): -F 4 drops unmapped reads from the BAM. Confirm that the
    # "unmapped" fraction CoverM reports for relative_abundance is still
    # meaningful for BAMs filtered this way.
    samtools view -F 4 -b -S "$sam" -o "$raw_bam" || { rm -f "$raw_bam"; return 1; }

    #sort and index the bam file with the anvi'o helper
    anvi-init-bam "$raw_bam" -o "$WORKDIR/${sample}.bam"
    status=$?

    # Always remove the unsorted intermediate so a failed sample cannot leave a
    # *-RAW.bam behind for the $WORKDIR/*.bam glob used by CoverM.
    rm -f "$raw_bam"
    return "$status"
}

FAILED=0
# "|| [[ -n ... ]]" keeps the last ID when the list has no trailing newline.
while IFS= read -r SAMPLEID || [[ -n "$SAMPLEID" ]]; do
    SAMPLEID="${SAMPLEID%$'\r'}"        # tolerate Windows (CRLF) line endings
    [[ -z "$SAMPLEID" ]] && continue    # skip blank lines

    if ! map_sample "$SAMPLEID"; then
        echo "Mapping failed for sample ${SAMPLEID}" >&2
        FAILED=$((FAILED + 1))
    fi
done < "$LISTPATH/${SAMPLELIST}"

# Only claim success (and only run CoverM) when every sample mapped.
if (( FAILED > 0 )); then
    echo "Mapping failed for ${FAILED} sample(s); skipping CoverM." >&2
    exit 1
fi
echo "Mapping success!"

#extract stats with CoverM
# -s . : the genome (bin) name is taken from the contig name, split at "."
# NOTE(review): if a bin identifier itself contains "." (e.g. a bin named like
# metabat2.5), confirm that distinct bins are not merged into one genome.
coverm genome \
    -b "$WORKDIR"/*.bam \
    -s . \
    -m relative_abundance \
    --min-covered-fraction 0 \
    -t "$THREADS" \
    -o "$WORKDIR/mapping_rate"

coverm genome \
    -b "$WORKDIR"/*.bam \
    -s . \
    -m count covered_fraction \
    --min-covered-fraction 0 \
    -t "$THREADS" \
    -o "$WORKDIR/count_table"

conda deactivate

#JOB ID:28171168, 28172108(coverm)
#bash script: nikea/COL/bash_scripts/Col_MAG_mapping.sh