### Colombia 012025, 032024 Binning & Taxonomy

#### Metabat2, Concoct, Maxbin2

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=180G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH --qos=long # extend time limit to longer than 2 days if needed
#SBATCH -t 72:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/slurms/slurm-binning-mcav-%j.out  # %j = job ID

# Bins one sample's assembly with MetaBAT2, CONCOCT and MaxBin2, and runs
# CheckM (lineage_wf) on each resulting bin set.
# To process another sample, change SAMPLENAME below (and the -o log name above).

# Print the given message to stderr and abort the job with the given status.
# A bare `exit` after `echo` would return echo's status (0), so the scheduler
# would record a failed run as successful; the code is therefore passed explicitly.
#   $1   - exit status to terminate with
#   $2.. - message to print
fail() {
  local code=$1
  shift
  echo "$*" >&2
  exit "$code"
}

module load conda/latest
conda activate binning

#set parameters for binning:
SAMPLENAME="mcav"
READSPATH="/scratch4/workspace/nikea_ulrich_uml_edu-col_data/all_sequences_concat"
F_READS="${SAMPLENAME}_all_reads_R1.fastq.gz"
R_READS="${SAMPLENAME}_all_reads_R2.fastq.gz"
CONTIGPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/assembly/assemblies_all"
CONTIGFILE="${SAMPLENAME}.contigs-fixed.fsa"
BAMPATH="/scratch4/workspace/nikea_ulrich_uml_edu-col_data/mapping/all_bams/${SAMPLENAME}"
METABINDIR="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/binning/${SAMPLENAME}/MetaBAT2_bins"
mkdir -p "$METABINDIR"
MAXBINDIR="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/binning/${SAMPLENAME}/Maxbin2_bins"
mkdir -p "$MAXBINDIR"

#run MetaBAT2
#create depth file for MetaBAT2 from every BAM in $BAMPATH
jgi_summarize_bam_contig_depths --outputDepth "$METABINDIR/MetaBAT2_depth.txt" "$BAMPATH"/*.bam

#MetaBAT2 with minimum contig length -m 1500 (has to be >=1500); all other options are left at their defaults
metabat2 -i "$CONTIGPATH/$CONTIGFILE" -a "$METABINDIR/MetaBAT2_depth.txt" \
  -o "$METABINDIR/metabat2" -m 1500

#run CheckM on metabat2 bins
# NOTE: -t 3 here (and for CONCOCT below) is well under the 24 cores requested with -c
checkm lineage_wf -x fa -t 3 "$METABINDIR/" "$METABINDIR/checkm-bins-stats"

conda deactivate
echo "deactivated binning environment"

#run CONCOCT (has its own environment)
module load conda/latest
conda activate concoct_env
echo "activated concoct environment"

CONCBINDIR="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/binning/${SAMPLENAME}/Concoct_bins"
mkdir -p "$CONCBINDIR"
CTEMPDIR="/scratch4/workspace/nikea_ulrich_uml_edu-col_data/binning/concoct_${SAMPLENAME}_temp"
mkdir -p "$CTEMPDIR"

#cut the contigs into 10 kb pieces; writes the BED file (-b) and the cut-up FASTA (stdout) used by CONCOCT
cut_up_fasta.py "$CONTIGPATH/$CONTIGFILE" -c 10000 -o 0 --merge_last \
  -b "$CONCBINDIR/${SAMPLENAME}_contigs_cut.bed" > "$CONCBINDIR/${SAMPLENAME}_contigs_cut.fa"

#estimate contig coverage (creates the CONCOCT coverage table)
concoct_coverage_table.py "$CONCBINDIR/${SAMPLENAME}_contigs_cut.bed" "$BAMPATH"/*.bam \
  > "$CONCBINDIR/coverage_table_${SAMPLENAME}.tsv" \
  || fail 2 'Exit code 2: failed to create coverage file, exiting.'

#run CONCOCT
concoct --composition_file "$CONCBINDIR/${SAMPLENAME}_contigs_cut.fa" \
  --coverage_file "$CONCBINDIR/coverage_table_${SAMPLENAME}.tsv" -t 3 -b "$CTEMPDIR" \
  || fail 3 'Exit code 3: CONCOCT failed to run, exiting.'
merge_cutup_clustering.py "$CTEMPDIR/clustering_gt1000.csv" \
  > "$CTEMPDIR/${SAMPLENAME}_clustering_merged.csv" \
  || fail 4 'Exit code 4: failed to merge clusters, exiting.'
extract_fasta_bins.py "$CONTIGPATH/$CONTIGFILE" "$CTEMPDIR/${SAMPLENAME}_clustering_merged.csv" \
  --output_path "$CONCBINDIR" \
  || fail 5 'Exit code 5: Bins were not extracted, exiting.'

conda deactivate
echo "deactivated conconct environment"

conda activate binning
echo "activated binning environment"

#run CheckM on CONCOCT bins
checkm lineage_wf -x fa -t 3 "$CONCBINDIR/" "$CONCBINDIR/checkm-bins-stats"

#run Maxbin2
run_MaxBin.pl -contig "$CONTIGPATH/$CONTIGFILE" -reads "$READSPATH/$F_READS" \
  -reads2 "$READSPATH/$R_READS" -out "$MAXBINDIR/maxbin2" -thread 16

#run CheckM on Maxbin2 bins (this run passes -x fasta rather than -x fa)
checkm lineage_wf -x fasta -t 3 "$MAXBINDIR/" "$MAXBINDIR/checkm-bins-stats"

conda deactivate

# JOB-ID: 50506413, (t1 script- for troubleshooting steps)  50542337, 50556190
# bash script file name: /nikea/COL/bash_scripts/Col_binning.sh

Once the script was running smoothly for the mcav samples, I submitted it for the other samples: \
dlab: 50556320 \
ofav: 50556315 \
pstr: 50556317 

In [None]:
# MetaBAT2 (v2:2.17)
# default parameters:
#-m [ --minContig ] arg (=2500)    Minimum size of a contig for binning (should be >=1500).
#  --maxP arg (=95)                  Percentage of 'good' contigs considered for binning decided by connection
#                                    among contigs. The greater, the more sensitive.
#  --minS arg (=60)                  Minimum score of a edge for binning (should be between 1 and 99). The 
#                                    greater, the more specific.
#  --maxEdges arg (=200)             Maximum number of edges per node. The greater, the more sensitive.
#  --pTNF arg (=0)                   TNF probability cutoff for building TNF graph. Use it to skip the 
#                                    preparation step. (0: auto).
#  -x [ --minCV ] arg (=1)           Minimum mean coverage of a contig in each library for binning.
#  --minCVSum arg (=1)               Minimum total effective mean coverage of a contig (sum of depth over 
#                                    minCV) for binning.
#  -s [ --minClsSize ] arg (=200000) Minimum size of a bin as the output.
#  -t [ --numThreads ] arg (=0)      Number of threads to use (0: use all cores).

Resources for MaxBin2: https://gtpb.github.io/AM22/pages/07-binning/7.binning.html \
https://nf-co.re/modules/maxbin2

### DAS Tool: dereplicating the sets of bins produced by multiple binning tools

In [None]:
#prep all the input files for das_tool
salloc -c 6 -p cpu
conda activate binning

# Sample to prepare; change this to build the tables for another sample.
SAMPLENAME="mcav"
BINROOT="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/binning/${SAMPLENAME}"
CTEMPDIR="/scratch4/workspace/nikea_ulrich_uml_edu-col_data/binning/concoct_${SAMPLENAME}_temp"

#Convert fasta output from MetaBat2, CONCOCT, and Maxbin2 into the correct format for DAS tool
# Each table is written into its binner's directory, which is where the
# DAS Tool job script looks for <binner>.contigs2bin.tsv.
Fasta_to_Contig2Bin.sh -i "$BINROOT/MetaBAT2_bins/" -e fa > "$BINROOT/MetaBAT2_bins/metabat2.contigs2bin.tsv"
Fasta_to_Contig2Bin.sh -i "$BINROOT/Maxbin2_bins/" -e fasta > "$BINROOT/Maxbin2_bins/maxbin2.contigs2bin.tsv"

# CONCOCT: drop the first row (heading) of the merged clustering CSV, then turn
# each "contig,cluster" pair into "contig<TAB>concoct.cluster".
sed '1d' "$CTEMPDIR/${SAMPLENAME}_clustering_merged.csv" \
  | perl -pe 's/,/\tconcoct./g;' > "$BINROOT/Concoct_bins/concoct.contigs2bin.tsv"

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=100G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH -t 24:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/slurms/slurm-das_tool-mcav-%j.out  # %j = job ID

# Runs DAS Tool on the CONCOCT, MetaBAT2 and MaxBin2 contig-to-bin tables of
# one sample, then runs CheckM (lineage_wf) on the bins DAS Tool writes out.

module load conda/latest
conda activate binning

# Sample and locations
sample="mcav"
bin_root="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/binning/${sample}"
assembly="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/assembly/assemblies_all/${sample}.contigs-fixed.fsa"
das_out="${bin_root}/Das_Tool"

# Tab-separated contig-to-bin tables (one per binner) and their labels.
# The two arrays must stay in the same order.
contig2bin_tables=(
  "${bin_root}/Concoct_bins/concoct.contigs2bin.tsv"
  "${bin_root}/MetaBAT2_bins/metabat2.contigs2bin.tsv"
  "${bin_root}/Maxbin2_bins/maxbin2.contigs2bin.tsv"
)
binner_labels=(concoct metabat2 maxbin2)

mkdir -p "$das_out"

# DAS Tool takes -i and -l as comma-separated lists; join the arrays with ","
# inside a subshell so the IFS change does not leak into the rest of the script.
table_list=$(IFS=,; printf '%s' "${contig2bin_tables[*]}")
label_list=$(IFS=,; printf '%s' "${binner_labels[*]}")

# Run DAS Tool (--score_threshold is left at its default of 0.5)
das_tool_args=(
  -i "$table_list"
  -l "$label_list"
  -c "$assembly"
  -t 11
  --write_bin_evals
  --write_bins
  -o "${das_out}/${sample}"
)
DAS_Tool "${das_tool_args[@]}"

# Run CheckM on the bins written by DAS Tool
checkm lineage_wf -x fa -t 3 "${das_out}/${sample}_DASTool_bins" "${das_out}/CheckM_stats"

# JOB-ID: 50571591
# bash script file name: /nikea/COL/bash_scripts/Col_das_tool.sh

other job IDs: \
ofav: 50571677 \
pstr: 50571690 \
dlab: 50571691

In [None]:
#DLAB
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
  Bin Id                     Marker lineage            # genomes   # markers   # marker sets    0     1    2    3    4   5+   Completeness   Contamination   Strain heterogeneity  
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
  metabat2.9         o__Sphingomonadales (UID3310)         26         569           293         5    548   14   2    0   0       99.53            3.04               5.00          
  maxbin2.015_sub   c__Alphaproteobacteria (UID3305)      564         349           230         4    334   11   0    0   0       98.91            3.52               0.00          
  metabat2.22            k__Bacteria (UID2495)            2993        140            85         1    134   5    0    0   0       98.82            3.64              80.00          
  metabat2.7             s__algicola (UID2847)             33         496           263         20   466   10   0    0   0       98.49            1.81              10.00          
  metabat2.74            k__Bacteria (UID3187)            2258        188           117         4    182   2    0    0   0       96.58            1.71               0.00          
  concoct.75        c__Gammaproteobacteria (UID4267)      119         544           284         21   517   6    0    0   0       94.85            1.06              33.33          
  metabat2.43       c__Alphaproteobacteria (UID3305)      564         349           230         28   320   1    0    0   0       90.26            0.43              100.00         
  metabat2.54       c__Alphaproteobacteria (UID3305)      564         349           230         31   307   10   1    0   0       88.63            3.68              61.54          
  concoct.66_sub       o__Cytophagales (UID2936)           47         454           336         53   378   21   2    0   0       88.40            5.57               3.70          
  metabat2.42       c__Alphaproteobacteria (UID3305)      564         349           230         48   292   9    0    0   0       87.40            2.72              44.44          
  metabat2.73          p__Cyanobacteria (UID2192)          79         584           458        131   444   9    0    0   0       77.68            1.28              44.44          
  concoct.50_sub    c__Alphaproteobacteria (UID3305)      564         349           230         78   199   58   12   2   0       76.00           26.07              81.13          
  metabat2.38            k__Bacteria (UID2570)            433         274           183         56   216   2    0    0   0       75.91            0.56               0.00          
  metabat2.44           o__Rhizobiales (UID3642)          107         484           316        165   311   8    0    0   0       65.37            1.47              25.00          
  concoct.145_sub        k__Bacteria (UID2570)            433         274           183         77   182   13   1    1   0       61.91            6.58              45.45          
  concoct.118_sub     o__Actinomycetales (UID1530)        622         259           152        108   138   13   0    0   0       61.56            5.27              23.08          
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
#MCAV
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
  Bin Id                    Marker lineage            # genomes   # markers   # marker sets    0     1    2    3   4   5+   Completeness   Contamination   Strain heterogeneity  
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
  metabat2.24         p__Cyanobacteria (UID2192)          79         584           458         9    569   6    0   0   0       98.33            0.84              16.67          
  metabat2.3            k__Bacteria (UID3187)            2258        188           117         6    179   3    0   0   0       94.87            2.56              33.33          
  concoct.13_sub   c__Alphaproteobacteria (UID3305)      564         349           230         88   236   25   0   0   0       74.12            9.16              44.00          
  metabat2.9           o__Rhizobiales (UID3447)          356         416           249        114   298   4    0   0   0       70.73            0.72              25.00          
  concoct.58          o__Cytophagales (UID2936)           47         454           336        164   278   12   0   0   0       62.04            2.64              25.00          
  metabat2.19           k__Bacteria (UID2570)            433         274           183         84   184   6    0   0   0       57.81            1.36               0.00          
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
#OFAV
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
  Bin Id                    Marker lineage            # genomes   # markers   # marker sets   0     1    2    3   4   5+   Completeness   Contamination   Strain heterogeneity  
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
  concoct.150      c__Gammaproteobacteria (UID4267)      119         544           284        60   467   17   0   0   0       86.32            3.27              64.71          
  concoct.61_sub        k__Bacteria (UID2565)            2921        152            93        17   117   18   0   0   0       84.75           11.01              61.11          
  metabat2.3        f__Flavobacteriaceae (UID2845)        53         548           298        76   461   10   1   0   0       83.74            2.31              38.46          
  concoct.46            k__Bacteria (UID2570)            433         274           183        89   180   5    0   0   0       57.81            1.38              20.00          
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
#PSTR
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
  Bin Id                    Marker lineage            # genomes   # markers   # marker sets    0     1    2    3   4   5+   Completeness   Contamination   Strain heterogeneity  
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
  metabat2.17       o__Sphingomonadales (UID3310)         26         569           293         2    550   15   2   0   0       99.63            3.27               0.00          
  concoct.14_sub       o__Rhizobiales (UID3642)          107         484           316         8    469   7    0   0   0       99.30            1.92              28.57          
  concoct.18       c__Alphaproteobacteria (UID3305)      564         349           230        111   230   8    0   0   0       64.94            2.16              75.00          
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### GTDBtk - assigning taxonomy to bins

In [None]:
#INSTALLATION
# GTDB-Tk is pinned to v2.5.2 (2.6.0 is not working with the db Unity has downloaded).
gtdbtk_env="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/.conda/envs/gtdbtk-2.5.2"
gtdbtk_db="/datasets/bio/gtdb/release226/226.0/auxillary_files/gtdbtk_package/full_package/release226"

# Create the environment at an explicit prefix, with Python pinned to 3.12.
conda create -p "$gtdbtk_env" \
  -c conda-forge -c bioconda \
  gtdbtk=2.5.2 python=3.12
conda activate "$gtdbtk_env"
python --version # should report 3.12.12, not 3.14

# Store the reference-data location inside the environment, then deactivate and
# reactivate the env so the path change takes effect.
conda env config vars set GTDBTK_DATA_PATH="$gtdbtk_db"
conda deactivate
conda activate "$gtdbtk_env"

# Confirm that the installation worked.
gtdbtk check_install

In [None]:
#!/bin/bash
#SBATCH -c 24  # Number of Cores per Task
#SBATCH --mem=150G  # Requested Memory
#SBATCH -p cpu  # Partition
#SBATCH -t 48:00:00  # Job time limit
#SBATCH --mail-type=ALL
#SBATCH -o /work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/slurms/slurm-gtdb_dlab-%j.out  # %j = job ID

# Assigns GTDB taxonomy to the DAS Tool bins of one sample with GTDB-Tk classify_wf.
# To process another sample, change SAMPLENAME below (and the -o log name above).

module load conda/latest
conda activate /work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/.conda/envs/gtdbtk-2.5.2

# Set parameters
SAMPLENAME="dlab"
BINPATH="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/binning/${SAMPLENAME}/Das_Tool/${SAMPLENAME}_DASTool_bins"
OUTDIR="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/COL/taxonomy/combined_data_MAGs/${SAMPLENAME}"
mkdir -p "$OUTDIR"

# Use the cores SLURM allocated (-c above) instead of leaving --cpus unset;
# falls back to 1 when the script is run outside a SLURM job.
CPUS="${SLURM_CPUS_PER_TASK:-1}"

#Run gtdb-tk
# --pplacer_cpus 1 keeps the placement step single-threaded, as it was when
# --cpus was not given. NOTE(review): raise it only after confirming the
# memory request is sufficient.
gtdbtk classify_wf -x fa --genome_dir "$BINPATH/" --out_dir "$OUTDIR/gtdb_out" \
  --cpus "$CPUS" --pplacer_cpus 1

#This will process all genomes in the directory <my_genomes> using both bacterial and archaeal marker sets and place the results in <output_dir>.
#Genomes must be in FASTA format (gzip with the extension .gz is acceptable)

# JOB-ID: 50572538 (this one worked: 50573688)
# bash script file name: /nikea/COL/bash_scripts/Col_gtdb_taxonomy.sh

Despite all of my troubleshooting of GTDB-Tk last week for the cordap genome assembly, this job still came back with:

```
EXCEPTION: ValueError
  MESSAGE: "__StageLogger" object has no field "version"
```

https://github.com/Ecogenomics/GTDBTk/issues/669 suggests downgrading from Python 3.14 to Python 3.12, so I will try that.

In [None]:
# Interactive session used to move the existing GTDB-Tk environment to Python 3.12.
salloc -c 6 -p cpu -t 04:00:00
module load conda/latest

gtdbtk_env="/work/pi_sarah_gignouxwolfsohn_uml_edu/nikea/.conda/envs/gtdbtk-2.5.2"
gtdbtk_db="/datasets/bio/gtdb/release226/226.0/auxillary_files/gtdbtk_package/full_package/release226"

conda activate "$gtdbtk_env"

# Re-solve the active environment with Python pinned to 3.12.
conda install \
  -c conda-forge -c bioconda \
  gtdbtk=2.5.2 python=3.12
conda env config vars set GTDBTK_DATA_PATH="$gtdbtk_db"

python --version     # reported 3.12.12
gtdbtk test          # comes back with the ValueError MESSAGE: "__StageLogger" object has no field "version"
gtdbtk check_install # completes without error


Tried running Col_gtdb_taxonomy.sh again (job 50573688).
I think this worked! So changing to Python 3.12 was necessary; I will update the installation steps above to match.

Ran it for the other samples: \
ofav: 50573739 \
pstr: 50573737 \
mcav: 50573747

