# 2.1 Prokaryote metagenome-assembled genomes (MAGs)

## Software and versions used in this study

- BBMap v37.93
- CONCOCT v0.4.1
- MetaBAT v2.12.1
- MaxBin v2.2.4
- DAS_Tool v1.1.1
- CheckM v1.2.1
- GTDB-TK v2.4.0 (database v214)
- dRep v1.4.3
- DRAM v1.3.5

## Additional custom scripts

Note: custom scripts have been tested in Python v3.11.6 and R v4.2.1 and may not be stable in other versions.

- scripts/general/compile_dram_annotations.py

*Required python packages: argparse, pandas, numpy, os, glob*

***

## Prokaryote MAGs

Further detail on the process of binning assembled contigs to recover prokaryote MAGs is available [here](https://github.com/GenomicsAotearoa/environmental_metagenomics).

#### Calculate coverage (per assembly)

Contig coverage is used as input by some of the binning software.

In [None]:
# Map each sample's quality-filtered reads back to that sample's assembly
# with BBMap, then sort the alignments into BAM for the coverage steps.
# Work from the project root: every DNA/... path below is relative to it
# (a bare `cd` would move to $HOME instead).
cd /workdir
cov_dir=DNA/2.prokaryote_mags/1.initial_binning/0.prep/1.coverage
mkdir -p "${cov_dir}"

for i in {1..9}; do
    # nodisk: build the reference index in memory rather than writing it out
    bbmap.sh t=30 \
    ref=DNA/1.assembly.m1000/S${i}.assembly.m1000.fasta nodisk \
    in1=DNA/1.Qual_filtered_trimmomatic/S${i}_R1.fastq \
    in2=DNA/1.Qual_filtered_trimmomatic/S${i}_R2.fastq \
    out="${cov_dir}/S${i}.sam"
    # convert to sorted bam
    samtools sort -@ 10 -o "${cov_dir}/S${i}.bam" "${cov_dir}/S${i}.sam"
done

#### Binning prep

In [None]:
# Build the per-sample coverage inputs needed by each binner.
# All DNA/... paths are relative to the project root.
cd /workdir
cov_dir=DNA/2.prokaryote_mags/1.initial_binning/0.prep/1.coverage
concoct_dir=DNA/2.prokaryote_mags/1.initial_binning/0.prep/2.concoct_prep

# MetaBat coverage table (via jgi_summarize_bam_contig_depths from MetaBAT)
for i in {1..9}; do
    jgi_summarize_bam_contig_depths \
    --outputDepth "${cov_dir}/S${i}.metabat.txt" \
    "${cov_dir}/S${i}.bam"
done

# MaxBin coverage table (extract columns 1 and 4 from the MetaBat coverage table).
# Written next to the MetaBat table, which is where the MaxBin step reads it from.
for i in {1..9}; do
    cut -f1,4 "${cov_dir}/S${i}.metabat.txt" > "${cov_dir}/S${i}.maxbin.txt"
done

# CONCOCT prep: fragment contigs and generate coverage table
mkdir -p "${concoct_dir}"
for i in {1..9}; do
    # cut contigs into 10 kb fragments; the BED file records fragment coordinates
    cut_up_fasta.py DNA/1.assembly.m1000/S${i}.assembly.m1000.fasta -c 10000 -o 0 --merge_last \
    -b "${concoct_dir}/S${i}.10k.bed" \
    > "${concoct_dir}/S${i}.10k.fa"
    # the BAM is indexed before the coverage table is generated from it
    samtools index "${cov_dir}/S${i}.bam"
    concoct_coverage_table.py \
    "${concoct_dir}/S${i}.10k.bed" \
    "${cov_dir}/S${i}.bam" \
    > "${concoct_dir}/S${i}.10k.txt"
done


#### Recover initial bins

Binning via MetaBAT, MaxBin, and CONCOCT

In [None]:
# Run the three binners (MetaBAT, MaxBin, CONCOCT) on each assembly.
cd /workdir/DNA/2.prokaryote_mags/1.initial_binning
# All binner output is written under 1.binning/ (the directory the later
# collation and DAS_Tool prep steps read from).
mkdir -p 1.binning

# MetaBat
for i in {1..9}; do
    # make sure the per-sample output directory exists before writing bins into it
    mkdir -p 1.binning/1.metabat.S${i}
    metabat2 -t 10 -s 50000 \
    -i ../../1.assembly.m1000/S${i}.assembly.m1000.fasta \
    -a 0.prep/1.coverage/S${i}.metabat.txt \
    -o 1.binning/1.metabat.S${i}/S${i}.metabat
done

# MaxBin
for i in {1..9}; do
    mkdir -p 1.binning/1.maxbin.S${i}
    run_MaxBin.pl -thread 10 \
    -contig ../../1.assembly.m1000/S${i}.assembly.m1000.fasta \
    -abund 0.prep/1.coverage/S${i}.maxbin.txt \
    -out 1.binning/1.maxbin.S${i}/S${i}.maxbin
done

# CONCOCT
for i in {1..9}; do
    mkdir -p 1.binning/1.concoct.S${i}
    # Trailing '/' on -b: CONCOCT then treats it as an output directory rather
    # than a filename prefix, so clustering_gt1000.csv lands inside it.
    concoct \
    --composition_file 0.prep/2.concoct_prep/S${i}.10k.fa \
    --coverage_file 0.prep/2.concoct_prep/S${i}.10k.txt \
    -b 1.binning/1.concoct.S${i}/
    # Cluster the fragments back into their original form
    merge_cutup_clustering.py \
    1.binning/1.concoct.S${i}/clustering_gt1000.csv \
    > 1.binning/1.concoct.S${i}/clustering_merged.csv
    # Create the bins (-p so re-running the cell does not fail on an existing directory)
    mkdir -p 1.binning/1.concoct.S${i}/bin_files/
    extract_fasta_bins.py \
    ../../1.assembly.m1000/S${i}.assembly.m1000.fasta \
    1.binning/1.concoct.S${i}/clustering_merged.csv \
    --output_path 1.binning/1.concoct.S${i}/bin_files/
done

Collate bin files

In [None]:
# Copy the bins from all three binners into one directory per assembly,
# giving every file a common .fna extension.
cd /workdir/DNA/2.prokaryote_mags/1.initial_binning/1.binning

# Paths are given relative to this directory instead of cd-ing into each
# binner's folder, so every loop iteration starts from the same place.
for i in {1..9}; do
    mkdir -p "2.bin_files.S${i}"
    # metabat
    for x in 1.metabat.S${i}/*.fa; do
        [ -e "${x}" ] || continue   # glob matched nothing
        o=$(basename "${x}" .fa)
        cp "${x}" "2.bin_files.S${i}/${o}.fna"
    done
    # maxbin
    for x in 1.maxbin.S${i}/*.fasta; do
        [ -e "${x}" ] || continue
        o=$(basename "${x}" .fasta)
        cp "${x}" "2.bin_files.S${i}/${o}.fna"
    done
    # concoct (prefix with sample and binner name so the source of each bin stays identifiable)
    for x in 1.concoct.S${i}/bin_files/*.fa; do
        [ -e "${x}" ] || continue
        o=$(basename "${x}" .fa)
        cp "${x}" "2.bin_files.S${i}/S${i}.concoct.${o}.fna"
    done
done

#### Dereplicate within assemblies via DAS_Tool



Prep metadata files for DAS_Tool

In [None]:
# Build one tab-separated "contig<TAB>bin" table per assembly and binner,
# for use as DAS_Tool input.
cd /workdir/DNA/2.prokaryote_mags
mkdir -p 2.bin_dereplication_DAS_Tool/0.prep

for i in {1..9}; do
    for x in metabat maxbin concoct; do
        # For every FASTA header line: contig ID = first whitespace-delimited
        # token without the leading '>'; bin name = file name with the
        # directory path and the .fna extension removed.
        awk 'BEGIN {OFS="\t"}
             /^>/ {
                 bin = FILENAME
                 sub(/^.*\//, "", bin)
                 sub(/\.fna$/, "", bin)
                 print substr($1, 2), bin
             }' 1.initial_binning/1.binning/2.bin_files.S${i}/*${x}* \
            > 2.bin_dereplication_DAS_Tool/0.prep/dastool_metadata.S${i}.${x}.txt
    done
done


Run DAS_Tool on bins from each assembly

In [None]:
# Run DAS_Tool once per assembly on the bin sets from the three binners.
cd /workdir/DNA/2.prokaryote_mags/2.bin_dereplication_DAS_Tool

for i in {1..9}; do
    # -o is the output basename: "S${i}" gives S${i}_DASTool_bins/, the
    # directory name the bin-compilation step reads from.
    DAS_Tool -t 36 --write_bins 1 --search_engine blast -l metabat,maxbin,concoct \
    -i 0.prep/dastool_metadata.S${i}.metabat.txt,0.prep/dastool_metadata.S${i}.maxbin.txt,0.prep/dastool_metadata.S${i}.concoct.txt \
    -c ../../1.assembly.m1000/S${i}.assembly.m1000.fasta \
    -o S${i}
done

Compile DASTool_bins from all assemblies for downstream use

In [None]:
# Gather the DAS_Tool bins from all assemblies into one directory.
cd /workdir/DNA/2.prokaryote_mags/2.bin_dereplication_DAS_Tool
mkdir -p DASTool_All_bins

for i in {1..9}; do
    # Bins are copied with the .fa extension, which is what the GTDB-Tk
    # (-x fa) and DRAM (*.fa) steps look for.
    # NOTE(review): assumes DAS_Tool v1.1.1 writes its bins as *.fa - confirm.
    cp S${i}_DASTool_bins/*.fa DASTool_All_bins/
done

#### Prokaryote MAG quality and completeness via CheckM

In [None]:
# Estimate completeness and contamination of all DAS_Tool bins with CheckM.
cd /workdir/DNA/2.prokaryote_mags
mkdir -p 3.checkm

# -x fa: same bin extension that the GTDB-Tk (-x fa) and DRAM (*.fa) steps use
# on these genomes.
# --tab_table -f: write the summary as a tab-separated file.
checkm lineage_wf -t 10 --pplacer_threads 10 -x fa \
--tab_table -f 3.checkm/genomes_checkm.txt \
2.bin_dereplication_DAS_Tool/DASTool_All_bins \
3.checkm

#### Prokaryote MAG taxonomy prediction

Assign taxonomy to prokaryote MAGs via gtdb-tk

In [None]:
# Taxonomic classification of the compiled DAS_Tool bins with GTDB-Tk.
cd /workdir/DNA/2.prokaryote_mags
tax_dir=3.taxonomy
mkdir -p ${tax_dir}

# The mash database path sits inside the taxonomy output directory.
gtdbtk classify_wf \
    --cpus 8 \
    -x fa \
    --genome_dir 2.bin_dereplication_DAS_Tool/DASTool_All_bins \
    --out_dir ${tax_dir} \
    --mash_db ${tax_dir}/mash_database


#### Dereplicate across assemblies via dRep

For this study, primary and secondary clustering were run at 90% and 98% identity, respectively, and filtering by completeness or contamination was excluded within dRep.

Note: a CheckM results summary was generated prior to running dRep, with the columns "genome,completeness,contamination".

In [None]:
# Dereplicate the DAS_Tool bins across assemblies with dRep, supplying CheckM
# completeness/contamination via --genomeInfo.
cd /workdir/DNA/2.prokaryote_mags

# File extension of the bins in DASTool_All_bins (as used by the GTDB-Tk and
# DRAM steps); used below for both the genome names and the -g glob.
bin_ext=fa

# Generate checkM summary file from the table written by `checkm lineage_wf -f`.
# Columns 1, 12 and 13 of the CheckM table are taken as genome, completeness
# and contamination.
# The extension is appended to the bin ID so that the "genome" column holds the
# genome file name, which is what dRep matches --genomeInfo entries against.
mkdir -p 4.bin_dereplication_dRep/0.prep
{
    echo "genome,completeness,contamination"
    awk -F'\t' -v ext="${bin_ext}" 'NR > 1 {print $1 "." ext "," $12 "," $13}' \
        3.checkm/genomes_checkm.txt
} > 4.bin_dereplication_dRep/0.prep/genomes_checkm_for_drep.csv

# Set up for dRep
mkdir -p /workdir/DNA/2.prokaryote_mags/4.bin_dereplication_dRep/1.dRep_out
cd /workdir/DNA/2.prokaryote_mags/2.bin_dereplication_DAS_Tool/DASTool_All_bins

# Run dRep with primary clustering at 90% and secondary clustering at 98% identity.
# --completeness 0 --contamination 100 set the quality thresholds to their
# extremes, so no genome is removed on quality grounds inside dRep.
dRep dereplicate ../../4.bin_dereplication_dRep/1.dRep_out -g *."${bin_ext}" \
-p 12 --completeness 0 --contamination 100 --length 50000  -pa 0.90 -sa 0.98 --S_algorithm gANI \
--genomeInfo ../../4.bin_dereplication_dRep/0.prep/genomes_checkm_for_drep.csv


#### Prokaryote MAG gene prediction and annotation via DRAM

This is presented as a loop here, but it makes sense to run subset batches of the MAGs in parallel (e.g. via a slurm array) if you have the resources available.

In [None]:
# Annotate every dereplicated MAG with DRAM (one output directory per MAG),
# then merge the per-MAG results with the custom compile script.
cd /workdir/DNA/2.prokaryote_mags
dram_dir=5.gene_annotation/1.dram_annotate_dRep_mags
mkdir -p ${dram_dir}/individual_mags/

for genome in 4.bin_dereplication_dRep/1.dRep_out/dereplicated_genomes/*.fa; do
    # MAG ID = file name without directory and without the .fa extension
    genome_name=${genome##*/}
    genome_name=${genome_name%.fa}
    DRAM.py annotate --threads 24 \
        -i ${genome} \
        -o ${dram_dir}/individual_mags/${genome_name}
done

# compile annotations, trna, and rrna results
scripts/general/compile_dram_annotations.py \
    -i ${dram_dir}/individual_mags/ \
    -o ${dram_dir}/collated_dram_
***