# Binning

Goal: bin the assembled contigs with a combination of binners (MetaBAT, MaxBin, CONCOCT), then use DAS Tool to integrate the results of these binning algorithms into an optimized, non-redundant set of bins from a single assembly.

Software:

MetaBAT2: https://bitbucket.org/berkeleylab/metabat/src/master/README.md

CONCOCT: https://github.com/BinPro/CONCOCT; https://concoct.readthedocs.io/en/latest/usage.html

CheckM: https://github.com/Ecogenomics/CheckM/wiki

DAS_tool: https://github.com/cmks/DAS_Tool

In [None]:
#INSTALLATION
# Creates the "binning" conda environment (Python 3.7) and installs MetaBAT2,
# CheckM and DAS Tool into it.
module load conda/latest
conda create -n binning python=3.7
conda activate binning
# Bioconda packages rely on dependencies from conda-forge, so both channels are
# passed (conda-forge first, as the Bioconda setup guide recommends).
# Installing the three tools in a single command lets the solver choose
# versions that are compatible with each other instead of solving three times.
conda install -c conda-forge -c bioconda metabat2 checkm-genome das_tool
# NOTE(review): CheckM also needs its reference data set up (see the CheckM
# wiki linked above) - confirm this is already done on the cluster.

## Metabat2
https://bitbucket.org/berkeleylab/metabat/src/master/README.md

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=50G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/binning/mcav/slurm-metabat2binning-%j.out  # %j = job ID

# Purpose: bin the contigs of one sample with MetaBAT2 (depth table -> binning)
# and then assess the resulting bins with CheckM.
# Every step is checked: the job stops with a non-zero exit code at the first
# failure instead of running the later steps on missing or partial output.

module load conda/latest
conda activate binning

# Print a message to stderr and stop the job with the given exit code.
fail() {
  local code=$1
  shift
  echo "$*" >&2
  exit "$code"
}

#set parameters for binning:
SAMPLENAME="mcav"
BINDIR="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/binning/${SAMPLENAME}/MetaBAT2_bins"
CONTIGPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/mapping/${SAMPLENAME}"
CONTIGFILE="${SAMPLENAME}.contigs-fixed.fsa"
DEPTHFILE="$BINDIR/MetaBAT2_depth.txt"
mkdir -p "$BINDIR" || fail 1 "Exit code 1: could not create $BINDIR, exiting."

# Use the cores SLURM actually allocated (-c above). Outside SLURM fall back to
# the previous behaviour: MetaBAT2 picks the thread count itself (0) and CheckM
# uses 3 threads.
METABAT_THREADS="${SLURM_CPUS_PER_TASK:-0}"
CHECKM_THREADS="${SLURM_CPUS_PER_TASK:-3}"

# Collect the BAM files explicitly so that "no BAM found" is reported clearly
# instead of handing the literal pattern "*.bam" to the depth tool.
shopt -s nullglob
BAMFILES=("$CONTIGPATH"/*.bam)
shopt -u nullglob
(( ${#BAMFILES[@]} > 0 )) || fail 2 "Exit code 2: no .bam files found in $CONTIGPATH, exiting."
[[ -s "$CONTIGPATH/$CONTIGFILE" ]] || fail 2 "Exit code 2: contig file $CONTIGPATH/$CONTIGFILE is missing or empty, exiting."

#create depth file for MetaBat2
jgi_summarize_bam_contig_depths --outputDepth "$DEPTHFILE" "${BAMFILES[@]}" \
  || fail 3 'Exit code 3: failed to create the MetaBAT2 depth file, exiting.'

#MetaBat2 script with minimum contig length (-m, has to be >=1500).
# NOTE(review): the original comment also said "no min bin size", but no -s is
# passed, so the default minimum bin size listed below (-s 200000) applies -
# confirm which behaviour is intended.
metabat2 -i "$CONTIGPATH/$CONTIGFILE" -a "$DEPTHFILE" \
  -o "$BINDIR/metabat2" -m 1500 -t "$METABAT_THREADS" \
  || fail 4 'Exit code 4: MetaBAT2 failed to run, exiting.'

# MetaBAT2 (v2:2.17)
# default parameters:
#-m [ --minContig ] arg (=2500)    Minimum size of a contig for binning (should be >=1500).
#  --maxP arg (=95)                  Percentage of 'good' contigs considered for binning decided by connection
#                                    among contigs. The greater, the more sensitive.
#  --minS arg (=60)                  Minimum score of a edge for binning (should be between 1 and 99). The 
#                                    greater, the more specific.
#  --maxEdges arg (=200)             Maximum number of edges per node. The greater, the more sensitive.
#  --pTNF arg (=0)                   TNF probability cutoff for building TNF graph. Use it to skip the 
#                                    preparation step. (0: auto).
#  -x [ --minCV ] arg (=1)           Minimum mean coverage of a contig in each library for binning.
#  --minCVSum arg (=1)               Minimum total effective mean coverage of a contig (sum of depth over 
#                                    minCV) for binning.
#  -s [ --minClsSize ] arg (=200000) Minimum size of a bin as the output.
#  -t [ --numThreads ] arg (=0)      Number of threads to use (0: use all cores).

#this runs CheckM immediately after and puts the results alongside your bins
# (-x fa: only files ending in .fa inside $BINDIR are treated as bins)
checkm lineage_wf -x fa -t "$CHECKM_THREADS" "$BINDIR/" "$BINDIR/checkm-bins-stats" \
  || fail 5 'Exit code 5: CheckM failed, exiting.'

# JOB-ID: 
# bash script file name: 

## CONCOCT
https://concoct.readthedocs.io/en/latest/installation.html

In [None]:
#INSTALLATION (recommended it is installed in an isolated env)
# Register the three channels with conda, in the same order as before
# (defaults, then bioconda, then conda-forge).
for channel in defaults bioconda conda-forge; do
  conda config --add channels "$channel"
done

# Build a dedicated environment named concoct_env that holds CONCOCT on Python 3.
conda create --name concoct_env python=3 concoct

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=50G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/binning/dlab/slurm-concoctbinning-%j.out  # %j = job ID

# Purpose: bin the contigs of one sample with CONCOCT
# (cut up contigs -> coverage table -> clustering -> merge -> extract bins)
# and then assess the extracted bins with CheckM.
# Each step stops the job with its own non-zero exit code when it fails.

module load conda/latest
conda activate concoct_env

# Print a message to stderr and stop the job with the given exit code.
# The code must be passed to `exit` explicitly: a bare `exit` right after
# `echo` returns echo's status (0), so a failed job would end with status 0.
fail() {
  local code=$1
  shift
  echo "$*" >&2
  exit "$code"
}

#set parameters
SAMPLENAME="dlab"
BINPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/binning/${SAMPLENAME}/CONCOCT_bins"
CONTIGPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/mapping/${SAMPLENAME}"
CONTIGFILE="${SAMPLENAME}.contigs-fixed.fsa"
BAMPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/mapping/${SAMPLENAME}"
TEMPDIR="/project/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL_files/binning/concoct_${SAMPLENAME}_temp"
mkdir -p "$BINPATH" "$TEMPDIR" || fail 1 'Exit code 1: could not create the output directories, exiting.'

# Use the cores SLURM allocated (-c above); fall back to the former value of 3
# when the script is run outside SLURM.
THREADS="${SLURM_CPUS_PER_TASK:-3}"

# Intermediate files.
# The cut-up FASTA lives in $TEMPDIR, not in $BINPATH: CheckM below treats every
# *.fa file in $BINPATH as a bin, so a "<sample>_contigs_cut.fa" stored there
# would be scored as if it were a bin. Delete any copy left in $BINPATH by an
# earlier run before re-running CheckM.
CUTFASTA="$TEMPDIR/${SAMPLENAME}_contigs_cut.fa"
CUTBED="$BINPATH/${SAMPLENAME}_contigs_cut.bed"
COVERAGE="$BINPATH/coverage_table_${SAMPLENAME}.tsv"
MERGED="$TEMPDIR/${SAMPLENAME}_clustering_merged.csv"

# Collect the BAM files explicitly so that "no BAM found" is reported clearly
# instead of handing the literal pattern "*.bam" to the coverage script.
shopt -s nullglob
BAMFILES=("$BAMPATH"/*.bam)
shopt -u nullglob
(( ${#BAMFILES[@]} > 0 )) || fail 1 "Exit code 1: no .bam files found in $BAMPATH, exiting."

#creates the CONCOCT depth file
#this part cuts up the contigs into 10kb pieces for CONCOCT to use 
cut_up_fasta.py "$CONTIGPATH/$CONTIGFILE" -c 10000 -o 0 --merge_last -b "$CUTBED" > "$CUTFASTA" \
  || fail 1 'Exit code 1: failed to cut up contigs, exiting.'

#estimate contig coverage
concoct_coverage_table.py "$CUTBED" "${BAMFILES[@]}" > "$COVERAGE" \
  || fail 2 'Exit code 2: failed to create coverage file, exiting.'

#run CONCOCT
concoct --composition_file "$CUTFASTA" --coverage_file "$COVERAGE" -t "$THREADS" -b "$TEMPDIR" \
  || fail 3 'Exit code 3: CONCOCT failed to run, exiting.'
merge_cutup_clustering.py "$TEMPDIR/clustering_gt1000.csv" > "$MERGED" \
  || fail 4 'Exit code 4: failed to merge clusters, exiting.'
extract_fasta_bins.py "$CONTIGPATH/$CONTIGFILE" "$MERGED" --output_path "$BINPATH" \
  || fail 5 'Exit code 5: Bins were not extracted, exiting.'

# Checkm is in binning env so switching environments 
conda deactivate

conda activate binning

#this runs CheckM immediately after and puts the results alongside your bins
checkm lineage_wf -x fa -t "$THREADS" "$BINPATH" "$BINPATH/CheckM_stats" \
  || fail 6 'Exit code 6: CheckM failed, exiting.'

# JOB-ID:
# bash script file name: 

## DAS Tool
https://github.com/cmks/DAS_Tool

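The `binning` conda env should already have das_tool installed (see the installation cell at the top). DAS Tool does not read the bin FASTA files directly: the MetaBAT2 and CONCOCT results first have to be converted into tab-separated contigs-to-bin tables (one line per contig: contig ID, then bin ID), which is what the next cells do.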

In [None]:
# Activate the "binning" env (the one das_tool was installed into) before
# running the conversion commands in the following cells.
conda activate binning

In [None]:
# Build the MetaBAT2 contigs-to-bin table for DAS Tool from the .fa files in MetaBAT2_bins/.
# Both paths are relative: run this from the sample's binning directory
# (.../COL/binning/<sample>/) so that metabat2.contigs2bin.tsv is written where
# the DAS Tool job script expects to find it.
Fasta_to_Contig2Bin.sh -i MetaBAT2_bins/ -e fa > ./metabat2.contigs2bin.tsv

In [None]:
# Convert CONCOCT's merged clustering CSV into the tab-separated contigs-to-bin
# table for DAS Tool: every comma becomes "<TAB>concoct.", which gives each bin
# ID a "concoct." prefix.
# The CSV heading line (contig_id,cluster_id) is skipped, so the output has no
# header row to delete by hand. If the first line is not that heading it is
# kept and converted like any other line.
#the clustering csv file is in the temp folder over in projects (hence the long file path)
perl -ne 'next if $. == 1 && /^contig_id,cluster_id\s*$/; s/,/\tconcoct./g; print' /project/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL_files/binning/concoct_dlab_temp/dlab_clustering_merged.csv > ./concoct.contigs2bin.tsv

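*DAS Tool needs concoct.contigs2bin.tsv without a heading row. Check the first line of the file: if it is the heading (`contig_id` / `concoct.cluster_id`), remove that row before running DAS Tool — either download the file, delete the first row and re-upload it, or run `sed -i '1d' concoct.contigs2bin.tsv` directly on the cluster.*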

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=50G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/binning/dlab/slurm-das_tool-%j.out  # %j = job ID

# Purpose: combine the CONCOCT and MetaBAT2 contigs-to-bin tables of one sample
# with DAS Tool, then assess the bins DAS Tool writes with CheckM.
# Inputs are validated first and every step is checked, so CheckM is never
# started on a bin directory that DAS Tool failed to produce.

module load conda/latest
conda activate binning

# Print a message to stderr and stop the job with the given exit code.
fail() {
  local code=$1
  shift
  echo "$*" >&2
  exit "$code"
}

# Set parameters
SAMPLENAME="dlab"
CONCOCTPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/binning/${SAMPLENAME}/concoct.contigs2bin.tsv"
METABATPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/binning/${SAMPLENAME}/metabat2.contigs2bin.tsv"
CONTIGPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/mapping/${SAMPLENAME}"
CONTIGFILE="${SAMPLENAME}.contigs-fixed.fsa"
OUTDIR="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/binning/${SAMPLENAME}"

# Use the cores SLURM allocated (-c above); outside SLURM fall back to the
# former hard-coded values (11 for DAS Tool, 3 for CheckM).
DASTOOL_THREADS="${SLURM_CPUS_PER_TASK:-11}"
CHECKM_THREADS="${SLURM_CPUS_PER_TASK:-3}"

# Make sure all three inputs exist and are non-empty before starting.
for input in "$CONCOCTPATH" "$METABATPATH" "$CONTIGPATH/$CONTIGFILE"; do
  [[ -s "$input" ]] || fail 1 "Exit code 1: input file $input is missing or empty, exiting."
done

# The CONCOCT table must not start with the CSV heading row; warn if it is
# still there so the cause is visible in the job log.
if head -n 1 "$CONCOCTPATH" | grep -q '^contig_id'; then
  echo "Warning: $CONCOCTPATH still starts with a heading row; remove it before running DAS Tool." >&2
fi

#Run DAS Tool
DAS_Tool -i "$CONCOCTPATH,$METABATPATH" \
  -l concoct,metabat2 \
  -c "$CONTIGPATH/$CONTIGFILE" \
  -t "$DASTOOL_THREADS" \
  --write_bin_evals \
  --write_bins \
  -o "$OUTDIR/${SAMPLENAME}" \
  || fail 2 'Exit code 2: DAS Tool failed to run, exiting.'

# -i input list: tab separated table of contigs-bins 
#--score_threshold default is 0.5


# Set parameters
# Directory where this script expects the bins written by DAS Tool
# (the -o prefix above followed by "_DASTool_bins").
BINPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/binning/${SAMPLENAME}/${SAMPLENAME}_DASTool_bins"
[[ -d "$BINPATH" ]] || fail 3 "Exit code 3: DAS Tool bin directory $BINPATH not found, exiting."

#Run CheckM
checkm lineage_wf -x fa -t "$CHECKM_THREADS" "$BINPATH" "$BINPATH/CheckM_stats" \
  || fail 4 'Exit code 4: CheckM failed, exiting.'

# JOB-ID:
# bash script file name: 