diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9e0d2429..932a6d86 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,22 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## 2.5.4 - [2024-02-12]
+
+### `Added`
+
+### `Changed`
+
+- [#581](https://github.com/nf-core/mag/pull/581) - Added explicit licence text to headers of all custom scripts (reported by @FriederikeHanssen and @maxibor, fix by @jfy133)
+
+### `Fixed`
+
+- [#583](https://github.com/nf-core/mag/pull/583) - Fix GTDB database input when directory supplied (fix by @jfy133)
+
+### `Dependencies`
+
+### `Deprecated`
+
 ## 2.5.3 - [2024-02-05]
 
 ### `Added`
diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
index 234d1d8d..ff9fc6fb 100644
--- a/assets/multiqc_config.yml
+++ b/assets/multiqc_config.yml
@@ -1,7 +1,7 @@
 report_comment: >
-  This report has been generated by the <a href="https://github.com/nf-core/mag/releases/tag/2.5.3" target="_blank">nf-core/mag</a>
+  This report has been generated by the <a href="https://github.com/nf-core/mag/releases/tag/2.5.4" target="_blank">nf-core/mag</a>
   analysis pipeline. For information about how to interpret these results, please see the
-  <a href="https://nf-co.re/mag/2.5.3/docs/output" target="_blank">documentation</a>.
+  <a href="https://nf-co.re/mag/2.5.4/docs/output" target="_blank">documentation</a>.
 report_section_order:
   "nf-core-mag-methods-description":
     order: -1000
diff --git a/bin/combine_tables.py b/bin/combine_tables.py
index b867ed73..a2dcf986 100755
--- a/bin/combine_tables.py
+++ b/bin/combine_tables.py
@@ -1,5 +1,9 @@
 #!/usr/bin/env python
 
+## Originally written by Daniel Straub and Sabrina Krakau and released under the MIT license.
+## See git repository (https://github.com/nf-core/mag) for full license text.
+
+
 import sys
 import argparse
 import os.path
@@ -8,11 +12,25 @@
 
 def parse_args(args=None):
     parser = argparse.ArgumentParser()
-    parser.add_argument("-d", "--depths_summary", required=True, metavar="FILE", help="Bin depths summary file.")
-    parser.add_argument("-b", "--busco_summary", metavar="FILE", help="BUSCO summary file.")
-    parser.add_argument("-c", "--checkm_summary", metavar="FILE", help="CheckM summary file.")
-    parser.add_argument("-q", "--quast_summary", metavar="FILE", help="QUAST BINS summary file.")
-    parser.add_argument("-g", "--gtdbtk_summary", metavar="FILE", help="GTDB-Tk summary file.")
+    parser.add_argument(
+        "-d",
+        "--depths_summary",
+        required=True,
+        metavar="FILE",
+        help="Bin depths summary file.",
+    )
+    parser.add_argument(
+        "-b", "--busco_summary", metavar="FILE", help="BUSCO summary file."
+    )
+    parser.add_argument(
+        "-c", "--checkm_summary", metavar="FILE", help="CheckM summary file."
+    )
+    parser.add_argument(
+        "-q", "--quast_summary", metavar="FILE", help="QUAST BINS summary file."
+    )
+    parser.add_argument(
+        "-g", "--gtdbtk_summary", metavar="FILE", help="GTDB-Tk summary file."
+    )
     parser.add_argument("-a", "--cat_summary", metavar="FILE", help="CAT table file.")
     parser.add_argument(
         "-o",
@@ -45,7 +63,14 @@ def parse_cat_table(cat_table):
         for line in f:
             maxcol = max(maxcol, len(line.split("\t")))
 
-    header = ["bin", "classification", "reason", "lineage", "lineage scores", "full lineage names"]
+    header = [
+        "bin",
+        "classification",
+        "reason",
+        "lineage",
+        "lineage scores",
+        "full lineage names",
+    ]
 
     df = pd.read_table(
         cat_table,
@@ -55,7 +80,11 @@ def parse_cat_table(cat_table):
         skiprows=1,
     )
     # merge all rank columns into a single column
-    df["CAT_rank"] = df.filter(regex="rank_\d+").apply(lambda x: ";".join(x.dropna()), axis=1).str.lstrip()
+    df["CAT_rank"] = (
+        df.filter(regex="rank_\d+")
+        .apply(lambda x: ";".join(x.dropna()), axis=1)
+        .str.lstrip()
+    )
     # remove rank_* columns
     df.drop(df.filter(regex="rank_\d+").columns, axis=1, inplace=True)
 
@@ -65,21 +94,34 @@ def parse_cat_table(cat_table):
 
 def main(args=None):
     args = parse_args(args)
-    if not args.busco_summary and not args.checkm_summary and not args.quast_summary and not args.gtdbtk_summary:
-        sys.exit("No summary specified! Please specify at least BUSCO, CheckM or QUAST summary.")
+    if (
+        not args.busco_summary
+        and not args.checkm_summary
+        and not args.quast_summary
+        and not args.gtdbtk_summary
+    ):
+        sys.exit(
+            "No summary specified! Please specify at least BUSCO, CheckM or QUAST summary."
+        )
 
     # GTDB-Tk can only be run in combination with BUSCO or CheckM
     if args.gtdbtk_summary and not (args.busco_summary or args.checkm_summary):
-        sys.exit("Invalid parameter combination: GTDB-TK summary specified, but no BUSCO or CheckM summary!")
+        sys.exit(
+            "Invalid parameter combination: GTDB-TK summary specified, but no BUSCO or CheckM summary!"
+        )
 
     # handle bin depths
     results = pd.read_csv(args.depths_summary, sep="\t")
-    results.columns = ["Depth " + str(col) if col != "bin" else col for col in results.columns]
+    results.columns = [
+        "Depth " + str(col) if col != "bin" else col for col in results.columns
+    ]
    bins = results["bin"].sort_values().reset_index(drop=True)
 
     if args.busco_summary:
         busco_results = pd.read_csv(args.busco_summary, sep="\t")
-        if not bins.equals(busco_results["GenomeBin"].sort_values().reset_index(drop=True)):
+        if not bins.equals(
+            busco_results["GenomeBin"].sort_values().reset_index(drop=True)
+        ):
             sys.exit("Bins in BUSCO summary do not match bins in bin depths summary!")
         results = pd.merge(
             results, busco_results, left_on="bin", right_on="GenomeBin", how="outer"
@@ -107,7 +149,9 @@ def main(args=None):
         ]
         checkm_results = pd.read_csv(args.checkm_summary, usecols=use_columns, sep="\t")
         checkm_results["Bin Id"] = checkm_results["Bin Id"] + ".fa"
-        if not bins.equals(checkm_results["Bin Id"].sort_values().reset_index(drop=True)):
+        if not bins.equals(
+            checkm_results["Bin Id"].sort_values().reset_index(drop=True)
+        ):
             sys.exit("Bins in CheckM summary do not match bins in bin depths summary!")
         results = pd.merge(
             results, checkm_results, left_on="bin", right_on="Bin Id", how="outer"
@@ -116,7 +160,9 @@ def main(args=None):
 
     if args.quast_summary:
         quast_results = pd.read_csv(args.quast_summary, sep="\t")
-        if not bins.equals(quast_results["Assembly"].sort_values().reset_index(drop=True)):
+        if not bins.equals(
+            quast_results["Assembly"].sort_values().reset_index(drop=True)
+        ):
             sys.exit("Bins in QUAST summary do not match bins in bin depths summary!")
         results = pd.merge(
             results, quast_results, left_on="bin", right_on="Assembly", how="outer"
@@ -134,7 +180,13 @@ def main(args=None):
         cat_results = parse_cat_table(args.cat_summary)
         if len(set(cat_results["bin"].to_list()).difference(set(bins))) > 0:
             sys.exit("Bins in CAT summary do not match bins in bin depths summary!")
-        results = pd.merge(results, cat_results[["bin", "CAT_rank"]], left_on="bin", right_on="bin", how="outer")
+        results = pd.merge(
+            results,
+            cat_results[["bin", "CAT_rank"]],
+            left_on="bin",
+            right_on="bin",
+            how="outer",
+        )
 
     results.to_csv(args.out, sep="\t")
 
diff --git a/bin/domain_classification.R b/bin/domain_classification.R
index eb64b312..33530ca5 100755
--- a/bin/domain_classification.R
+++ b/bin/domain_classification.R
@@ -1,7 +1,7 @@
 #!/usr/bin/env Rscript
 
-# Written by Jim Downie and released under the MIT license.
-# See git repository (https://github.com/nf-core/mag) for full license text.
+## Written by Jim Downie and released under the MIT license.
+## See git repository (https://github.com/nf-core/mag) for full license text.
 
 library(optparse)
 library(tidyverse)
diff --git a/bin/filter_ssu.py b/bin/filter_ssu.py
index 7e89989b..5e4675e4 100755
--- a/bin/filter_ssu.py
+++ b/bin/filter_ssu.py
@@ -1,5 +1,8 @@
 #!/usr/bin/env python
 
+## Originally written by Hadrien Gourlé and released under the MIT license.
+## See git repository (https://github.com/nf-core/mag) for full license text.
+
 from __future__ import print_function
 
 import os
@@ -28,10 +31,16 @@ def filter(args):
 
 
 def main():
-    parser = argparse.ArgumentParser(prog="filter_ssu.py", usage="filter ssu hits from refinem")
+    parser = argparse.ArgumentParser(
+        prog="filter_ssu.py", usage="filter ssu hits from refinem"
+    )
     parser.add_argument("--evalue", help="evalue threshold")
-    parser.add_argument("ssu", metavar="ssu.tsv", help="ssu tsv file generated by refinem")
-    parser.add_argument("output", metavar="output.tsv", default="output.tsv", help="output file name")
+    parser.add_argument(
+        "ssu", metavar="ssu.tsv", help="ssu tsv file generated by refinem"
+    )
+    parser.add_argument(
+        "output", metavar="output.tsv", default="output.tsv", help="output file name"
+    )
     parser.set_defaults(func=filter)
 
     args = parser.parse_args()
diff --git a/bin/get_mag_depths.py b/bin/get_mag_depths.py
index 55d73ac4..43ce3539 100755
--- a/bin/get_mag_depths.py
+++ b/bin/get_mag_depths.py
@@ -1,5 +1,8 @@
 #!/usr/bin/env python
 
+## Originally written by Sabrina Krakau and released under the MIT license.
+## See git repository (https://github.com/nf-core/mag) for full license text.
+
 import sys
 import argparse
 import os.path
@@ -14,7 +17,12 @@ def parse_args(args=None):
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        "-b", "--bins", required=True, nargs="+", metavar="FILE", help="Bins: FASTA containing all contigs."
+        "-b",
+        "--bins",
+        required=True,
+        nargs="+",
+        metavar="FILE",
+        help="Bins: FASTA containing all contigs.",
     )
     parser.add_argument(
         "-d",
@@ -23,9 +31,15 @@ def parse_args(args=None):
         metavar="FILE",
         help="(Compressed) TSV file containing contig depths for each sample: contigName, contigLen, totalAvgDepth, sample1_avgDepth, sample1_var [, sample2_avgDepth, sample2_var, ...].",
     )
-    parser.add_argument("-a", "--assembler", required=True, type=str, help="Assembler name.")
-    parser.add_argument("-i", "--id", required=True, type=str, help="Sample or group id.")
-    parser.add_argument("-m", "--binner", required=True, type=str, help="Binning method.")
+    parser.add_argument(
+        "-a", "--assembler", required=True, type=str, help="Assembler name."
+    )
+    parser.add_argument(
+        "-i", "--id", required=True, type=str, help="Sample or group id."
+    )
+    parser.add_argument(
+        "-m", "--binner", required=True, type=str, help="Binning method."
+    )
 
     return parser.parse_args(args)
 
@@ -56,7 +70,9 @@ def main(args=None):
 
     # Initialize output files
     n_samples = len(sample_names)
-    with open(args.assembler + "-" + args.binner + "-" + args.id + "-binDepths.tsv", "w") as outfile:
+    with open(
+        args.assembler + "-" + args.binner + "-" + args.id + "-binDepths.tsv", "w"
+    ) as outfile:
         print("bin", "\t".join(sample_names), sep="\t", file=outfile)
 
     # for each bin, access contig depths and compute mean bin depth (for all samples)
@@ -77,10 +93,15 @@ def main(args=None):
                     all_depths[sample].append(contig_depths[sample])
 
         binname = os.path.basename(file)
-        with open(args.assembler + "-" + args.binner + "-" + args.id + "-binDepths.tsv", "a") as outfile:
+        with open(
+            args.assembler + "-" + args.binner + "-" + args.id + "-binDepths.tsv", "a"
+        ) as outfile:
             print(
                 binname,
-                "\t".join(str(statistics.median(sample_depths)) for sample_depths in all_depths),
+                "\t".join(
+                    str(statistics.median(sample_depths))
+                    for sample_depths in all_depths
+                ),
                 sep="\t",
                 file=outfile,
             )
diff --git a/bin/get_mag_depths_summary.py b/bin/get_mag_depths_summary.py
index 6dbc6f75..69433371 100755
--- a/bin/get_mag_depths_summary.py
+++ b/bin/get_mag_depths_summary.py
@@ -1,5 +1,8 @@
 #!/usr/bin/env python
 
+## Originally written by Sabrina Krakau and released under the MIT license.
+## See git repository (https://github.com/nf-core/mag) for full license text.
+
 import sys
 import argparse
 import pandas as pd
diff --git a/bin/multiqc_to_custom_tsv.py b/bin/multiqc_to_custom_tsv.py
index 6488e31d..4388fb26 100755
--- a/bin/multiqc_to_custom_tsv.py
+++ b/bin/multiqc_to_custom_tsv.py
@@ -1,5 +1,8 @@
 #!/usr/bin/env python
-# copied from nf-core/viralrecon and adjusted
+
+## Copied from nf-core/viralrecon and adjusted
+## See git repository (https://github.com/nf-core/viralrecon) for full license text.
+
 
 import os
 import sys
@@ -9,9 +12,7 @@
 
 
 def parse_args(args=None):
-    Description = (
-        "Create custom spreadsheet for pertinent MultiQC bowtie 2 metrics generated by the nf-core/mag pipeline."
-    )
+    Description = "Create custom spreadsheet for pertinent MultiQC bowtie 2 metrics generated by the nf-core/mag pipeline."
Epilog = "Example usage: python multiqc_to_custom_tsv.py" parser = argparse.ArgumentParser(description=Description, epilog=Epilog) parser.add_argument( @@ -86,7 +87,9 @@ def metrics_dict_to_file(FileFieldList, MultiQCDataDir, OutFile, se): for yamlFile, mappingList in FileFieldList: yamlFile = os.path.join(MultiQCDataDir, yamlFile) if os.path.exists(yamlFile): - MetricsDict = yaml_fields_to_dict(YAMLFile=yamlFile, AppendDict=MetricsDict, FieldMappingList=mappingList) + MetricsDict = yaml_fields_to_dict( + YAMLFile=yamlFile, AppendDict=MetricsDict, FieldMappingList=mappingList + ) FieldList += [x[0] for x in mappingList] else: print("WARNING: File does not exist: {}".format(yamlFile)) @@ -96,7 +99,15 @@ def metrics_dict_to_file(FileFieldList, MultiQCDataDir, OutFile, se): with open(OutFile, "w") as fout: if se: fout.write( - "{}\n".format("\t".join(["Sample", "SE reads not mapped (kept)", "SE reads mapped (discarded)"])) + "{}\n".format( + "\t".join( + [ + "Sample", + "SE reads not mapped (kept)", + "SE reads mapped (discarded)", + ] + ) + ) ) else: fout.write( @@ -118,7 +129,10 @@ def metrics_dict_to_file(FileFieldList, MultiQCDataDir, OutFile, se): [ k, str(MetricsDict[k][FieldList[0]]), - str(MetricsDict[k][FieldList[1]] + MetricsDict[k][FieldList[2]]), + str( + MetricsDict[k][FieldList[1]] + + MetricsDict[k][FieldList[2]] + ), ] ) ) diff --git a/bin/plot_mag_depths.py b/bin/plot_mag_depths.py index aab38473..d3782845 100755 --- a/bin/plot_mag_depths.py +++ b/bin/plot_mag_depths.py @@ -1,5 +1,8 @@ #!/usr/bin/env python +# Originally written by Sabrina Krakau and released under the MIT license. +# See git repository (https://github.com/nf-core/mag) for full license text. + import sys import argparse import os.path @@ -26,7 +29,9 @@ def parse_args(args=None): metavar="FILE", help="File in TSV format containing group information for samples: sample, group", ) - parser.add_argument("-o", "--out", required=True, metavar="FILE", type=str, help="Output file.") + parser.add_argument( + "-o", "--out", required=True, metavar="FILE", type=str, help="Output file." + ) return parser.parse_args(args) @@ -43,12 +48,19 @@ def main(args=None): # compute centered log-ratios # divide df by sample-wise geometric means gmeans = stats.gmean(df, axis=0) # apply on axis=0: 'index' - df = np.log(df.div(gmeans, axis="columns")) # divide column-wise (axis=1|'columns'), take natural logorithm + df = np.log( + df.div(gmeans, axis="columns") + ) # divide column-wise (axis=1|'columns'), take natural logorithm df.index.name = "MAGs" df.columns.name = "Samples" # prepare colors for group information - color_map = dict(zip(groups["group"].unique(), sns.color_palette(n_colors=len(groups["group"].unique())))) + color_map = dict( + zip( + groups["group"].unique(), + sns.color_palette(n_colors=len(groups["group"].unique())), + ) + ) # plot plt.figure() diff --git a/bin/run_busco.sh b/bin/run_busco.sh index 9e022e87..4f8d6c86 100755 --- a/bin/run_busco.sh +++ b/bin/run_busco.sh @@ -1,5 +1,9 @@ #! /usr/bin/env bash +# Originally written by Sabrina Krakau and James Fellows Yates and released +# under the MIT license. +# See git repository (https://github.com/nf-core/mag) for full license text. 
+
 p=$1
 cp_augustus_config=$2
 db=$3
@@ -148,7 +152,7 @@ if [ -f BUSCO/logs/prodigal_out.log ]; then
 fi
 
 # output value of most_spec_db
-echo ${most_spec_db} > info_most_spec_db.txt
+echo ${most_spec_db} >info_most_spec_db.txt
 
 # if needed delete temporary BUSCO files
 if [ ${busco_clean} = "Y" ]; then
diff --git a/bin/split_fasta.py b/bin/split_fasta.py
index 87cb9dfa..c5fb6e87 100755
--- a/bin/split_fasta.py
+++ b/bin/split_fasta.py
@@ -1,5 +1,9 @@
 #!/usr/bin/env python
 
+## Originally written by Daniel Straub and Sabrina Krakau and released
+## under the MIT license.
+## See git repository (https://github.com/nf-core/mag) for full license text.
+
 # USAGE: ./split_fasta.py <*.unbinned.fa(.gz)>
 
 import pandas as pd
@@ -45,10 +49,14 @@
                 )
             # contigs to retain and pool
             elif length >= min_length_to_retain_contig:
-                pooled.append(SeqRecord(Seq(sequence, generic_dna), id=name, description=""))
+                pooled.append(
+                    SeqRecord(Seq(sequence, generic_dna), id=name, description="")
+                )
             # remaining sequences
             else:
-                remaining.append(SeqRecord(Seq(sequence, generic_dna), id=name, description=""))
+                remaining.append(
+                    SeqRecord(Seq(sequence, generic_dna), id=name, description="")
+                )
 else:
     with open(input_file) as f:
         fasta_sequences = SeqIO.parse(f, "fasta")
@@ -64,10 +72,14 @@
                 )
             # contigs to retain and pool
             elif length >= min_length_to_retain_contig:
-                pooled.append(SeqRecord(Seq(sequence, generic_dna), id=name, description=""))
+                pooled.append(
+                    SeqRecord(Seq(sequence, generic_dna), id=name, description="")
+                )
             # remaining sequences
            else:
-                remaining.append(SeqRecord(Seq(sequence, generic_dna), id=name, description=""))
+                remaining.append(
+                    SeqRecord(Seq(sequence, generic_dna), id=name, description="")
+                )
 
 # Sort sequences above threshold by length
 df_above_threshold.sort_values(by=["length"], ascending=False, inplace=True)
@@ -80,7 +92,9 @@
         out = SeqRecord(Seq(row["seq"], generic_dna), id=row["id"], description="")
         SeqIO.write(out, out_base + "." + str(index + 1) + ".fa", "fasta")
     else:
-        pooled.append(SeqRecord(Seq(row["seq"], generic_dna), id=row["id"], description=""))
+        pooled.append(
+            SeqRecord(Seq(row["seq"], generic_dna), id=row["id"], description="")
+        )
 
 print("write " + out_base + ".pooled.fa")
 SeqIO.write(pooled, out_base + ".pooled.fa", "fasta")
diff --git a/bin/summary_busco.py b/bin/summary_busco.py
index b4a8c99b..9701783b 100755
--- a/bin/summary_busco.py
+++ b/bin/summary_busco.py
@@ -1,6 +1,10 @@
 #!/usr/bin/env python
 
-# USAGE: ./summary.busco.py -sd -ss -f
+## Originally written by Daniel Straub, Sabrina Krakau, and Hadrien Gourlé
+## and released under the MIT license.
+## See git repository (https://github.com/nf-core/mag) for full license text.
+
+## USAGE: ./summary.busco.py -sd -ss -f
 
 import re
 import sys
@@ -12,10 +16,18 @@ def parse_args(args=None):
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        "-a", "--auto", default=False, action="store_true", help="BUSCO run in auto lineage selection mode."
+        "-a",
+        "--auto",
+        default=False,
+        action="store_true",
+        help="BUSCO run in auto lineage selection mode.",
    )
     parser.add_argument(
-        "-sd", "--summaries_domain", nargs="+", metavar="FILE", help="List of BUSCO summary files for domains."
+ "-sd", + "--summaries_domain", + nargs="+", + metavar="FILE", + help="List of BUSCO summary files for domains.", ) parser.add_argument( "-ss", @@ -45,8 +57,14 @@ def parse_args(args=None): def main(args=None): args = parse_args(args) - if not args.summaries_domain and not args.summaries_specific and not args.failed_bins: - sys.exit("Either --summaries_domain, --summaries_specific or --failed_bins must be specified!") + if ( + not args.summaries_domain + and not args.summaries_specific + and not args.failed_bins + ): + sys.exit( + "Either --summaries_domain, --summaries_specific or --failed_bins must be specified!" + ) # "# Summarized benchmarking in BUSCO notation for file /path/to/MEGAHIT-testset1.contigs.fa" # " C:0.0%[S:0.0%,D:0.0%],F:0.0%,M:100.0%,n:148" @@ -173,15 +191,30 @@ def main(args=None): pd.NA, ] else: - results = [failed_bin, pd.NA, "0.0", "0.0", "0.0", "0.0", "100.0", pd.NA] + results = [ + failed_bin, + pd.NA, + "0.0", + "0.0", + "0.0", + "0.0", + "100.0", + pd.NA, + ] failed.append(results) df_failed = pd.DataFrame(failed, columns=columns) # merge results if args.auto: - df_final = df_domain.merge(df_specific, on="GenomeBin", how="outer").append(df_failed) + df_final = df_domain.merge(df_specific, on="GenomeBin", how="outer").append( + df_failed + ) # check if 'Domain' is 'NA', but 'Specific lineage dataset' given -> 'Viruses' - df_final.loc[pd.isna(df_final["Domain"]) & pd.notna(df_final["Specific lineage dataset"]), "Domain"] = "Viruses" + df_final.loc[ + pd.isna(df_final["Domain"]) + & pd.notna(df_final["Specific lineage dataset"]), + "Domain", + ] = "Viruses" else: df_final = df_specific.append(df_failed) diff --git a/bin/summary_gtdbtk.py b/bin/summary_gtdbtk.py index 44bb7d1d..7ae43a09 100755 --- a/bin/summary_gtdbtk.py +++ b/bin/summary_gtdbtk.py @@ -1,5 +1,8 @@ #!/usr/bin/env python +# Originally written by Sabrina Krakau and released under the MIT license. +# See git repository (https://github.com/nf-core/mag) for full license text. + import re import sys import argparse @@ -16,7 +19,13 @@ def parse_args(args=None): type=str, help="File extension passed to GTDB-TK and substracted by GTDB-Tk from bin names in results files.", ) - parser.add_argument("-s", "--summaries", nargs="+", metavar="FILE", help="List of GTDB-tk summary files.") + parser.add_argument( + "-s", + "--summaries", + nargs="+", + metavar="FILE", + help="List of GTDB-tk summary files.", + ) parser.add_argument( "-fi", "--filtered_bins", @@ -54,8 +63,15 @@ def parse_args(args=None): def main(args=None): args = parse_args(args) - if not args.summaries and not args.filtered_bins and not args.failed_bins and not args.qc_discarded_bins: - sys.exit("Either --summaries, --filtered_bins, --failed_bins or --qc_discarded_bins must be specified!") + if ( + not args.summaries + and not args.filtered_bins + and not args.failed_bins + and not args.qc_discarded_bins + ): + sys.exit( + "Either --summaries, --filtered_bins, --failed_bins or --qc_discarded_bins must be specified!" + ) columns = [ "user_genome", @@ -117,7 +133,9 @@ def main(args=None): for file in args.summaries: df_summary = pd.read_csv(file, sep="\t")[columns] # add by GTDB-Tk substracted file extension again to bin names (at least until changed consistently in rest of pipeline) - df_summary["user_genome"] = df_summary["user_genome"].astype(str) + "." + args.extension + df_summary["user_genome"] = ( + df_summary["user_genome"].astype(str) + "." 
+ args.extension + ) df_summary.set_index("user_genome", inplace=True) df_final = df_final.append(df_summary, verify_integrity=True) @@ -153,7 +171,9 @@ def main(args=None): filtered.append(bin_results) df_filtered = pd.DataFrame(filtered, columns=columns) - df_filtered["user_genome"] = df_filtered["user_genome"].astype(str) + "." + args.extension + df_filtered["user_genome"] = ( + df_filtered["user_genome"].astype(str) + "." + args.extension + ) df_filtered.set_index("user_genome", inplace=True) df_final = df_final.append(df_filtered, verify_integrity=True) @@ -189,12 +209,16 @@ def main(args=None): failed.append(bin_results) df_failed = pd.DataFrame(failed, columns=columns) - df_failed["user_genome"] = df_failed["user_genome"].astype(str) + "." + args.extension + df_failed["user_genome"] = ( + df_failed["user_genome"].astype(str) + "." + args.extension + ) df_failed.set_index("user_genome", inplace=True) df_final = df_final.append(df_failed, verify_integrity=True) # write output - df_final.reset_index().rename(columns={"index": "user_genome"}).to_csv(args.out, sep="\t", index=False) + df_final.reset_index().rename(columns={"index": "user_genome"}).to_csv( + args.out, sep="\t", index=False + ) if __name__ == "__main__": diff --git a/conf/test.config b/conf/test.config index 43a7f18a..28984d13 100644 --- a/conf/test.config +++ b/conf/test.config @@ -29,5 +29,6 @@ params { busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2024-01-08.tar.gz" busco_clean = true skip_gtdbtk = true + gtdbtk_min_completeness = 0 skip_concoct = true } diff --git a/conf/test_adapterremoval.config b/conf/test_adapterremoval.config index dc00d319..ca9fed10 100644 --- a/conf/test_adapterremoval.config +++ b/conf/test_adapterremoval.config @@ -29,6 +29,7 @@ params { max_unbinned_contigs = 2 busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2024-01-08.tar.gz" skip_gtdbtk = true + gtdbtk_min_completeness = 0 clip_tool = 'adapterremoval' skip_concoct = true bin_domain_classification = true diff --git a/conf/test_ancient_dna.config b/conf/test_ancient_dna.config index bc441be8..5e935321 100644 --- a/conf/test_ancient_dna.config +++ b/conf/test_ancient_dna.config @@ -28,6 +28,7 @@ params { max_unbinned_contigs = 2 busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2024-01-08.tar.gz" skip_gtdbtk = true + gtdbtk_min_completeness = 0 ancient_dna = true binning_map_mode = 'own' skip_spades = false diff --git a/conf/test_bbnorm.config b/conf/test_bbnorm.config index 02e764c3..a31f6b7b 100644 --- a/conf/test_bbnorm.config +++ b/conf/test_bbnorm.config @@ -35,6 +35,7 @@ params { busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2024-01-08.tar.gz" busco_clean = true skip_gtdbtk = true + gtdbtk_min_completeness = 0 bbnorm = true coassemble_group = true } diff --git a/conf/test_binrefinement.config b/conf/test_binrefinement.config index 79105ec4..54144244 100644 --- a/conf/test_binrefinement.config +++ b/conf/test_binrefinement.config @@ -29,6 +29,7 @@ params { max_unbinned_contigs = 2 busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2024-01-08.tar.gz" skip_gtdbtk = true + gtdbtk_min_completeness = 0 refine_bins_dastool = true refine_bins_dastool_threshold = 0 // TODO not using 'both' until #489 merged diff --git a/conf/test_busco_auto.config b/conf/test_busco_auto.config index 6479012f..48f6b7b5 100644 --- a/conf/test_busco_auto.config +++ b/conf/test_busco_auto.config @@ -25,6 +25,7 @@ params { 
     min_length_unbinned_contigs = 1
     max_unbinned_contigs = 2
     skip_gtdbtk = true
+    gtdbtk_min_completeness = 0
     skip_prokka = true
     skip_prodigal = true
     skip_quast = true
diff --git a/conf/test_host_rm.config b/conf/test_host_rm.config
index 30cae576..afd4e687 100644
--- a/conf/test_host_rm.config
+++ b/conf/test_host_rm.config
@@ -26,5 +26,6 @@ params {
     max_unbinned_contigs = 2
     busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2024-01-08.tar.gz"
     skip_gtdbtk = true
+    gtdbtk_min_completeness = 0
     skip_concoct = true
 }
diff --git a/conf/test_hybrid.config b/conf/test_hybrid.config
index 3ca0608e..a9f7ee07 100644
--- a/conf/test_hybrid.config
+++ b/conf/test_hybrid.config
@@ -25,5 +25,6 @@ params {
     max_unbinned_contigs = 2
     busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2024-01-08.tar.gz"
     skip_gtdbtk = true
+    gtdbtk_min_completeness = 0
     skip_concoct = true
 }
diff --git a/conf/test_hybrid_host_rm.config b/conf/test_hybrid_host_rm.config
index 7a0e4a15..531a89d3 100644
--- a/conf/test_hybrid_host_rm.config
+++ b/conf/test_hybrid_host_rm.config
@@ -27,4 +27,5 @@ params {
     skip_binqc = true
     skip_concoct = true
     skip_gtdbtk = true
+    gtdbtk_min_completeness = 0
 }
diff --git a/conf/test_nothing.config b/conf/test_nothing.config
index 53df219f..cd3f6311 100644
--- a/conf/test_nothing.config
+++ b/conf/test_nothing.config
@@ -39,5 +39,6 @@ params {
     skip_prokka = true
     skip_binqc = true
     skip_gtdbtk = true
+    gtdbtk_min_completeness = 0
     skip_concoct = true
 }
diff --git a/conf/test_virus_identification.config b/conf/test_virus_identification.config
index e15fab7d..dba55db9 100644
--- a/conf/test_virus_identification.config
+++ b/conf/test_virus_identification.config
@@ -28,6 +28,7 @@ params {
     reads_minlength = 150
     coassemble_group = true
     skip_gtdbtk = true
+    gtdbtk_min_completeness = 0
     skip_binning = true
     skip_prokka = true
     skip_spades = true
diff --git a/nextflow.config b/nextflow.config
index 6a2da3b5..ed9d4e27 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -379,7 +379,7 @@ manifest {
     description = """Assembly, binning and annotation of metagenomes"""
     mainScript = 'main.nf'
     nextflowVersion = '!>=23.04.0'
-    version = '2.5.3'
+    version = '2.5.4'
     doi = '10.1093/nargab/lqac007'
 }
 
diff --git a/subworkflows/local/gtdbtk.nf b/subworkflows/local/gtdbtk.nf
index a201e370..95e343c8 100644
--- a/subworkflows/local/gtdbtk.nf
+++ b/subworkflows/local/gtdbtk.nf
@@ -66,13 +66,16 @@ workflow GTDBTK {
         // Expects to be tar.gz!
         ch_db_for_gtdbtk = GTDBTK_DB_PREPARATION ( gtdb ).db
     } else if ( gtdb.isDirectory() ) {
-        // Make up meta id to match expected channel cardinality for GTDBTK
+        // The classifywf module expects a list of the _contents_ of the GTDB
+        // database rather than the directory itself (the reason is unclear),
+        // so we list the directory contents before putting them into a
+        // channel, then group again to pass them to the module, making up a
+        // meta id to match the expected channel cardinality for GTDBTK.
+        gtdb_dir = gtdb.listFiles()
         ch_db_for_gtdbtk = Channel
-            .of(gtdb)
-            .map{
-                [ it.toString().split('/').last(), it ]
-            }
-            .collect()
+            .of(gtdb_dir)
+            .map{['gtdb', it]}
+            .groupTuple()
     } else {
         error("Unsupported object given to --gtdb, database must be supplied as either a directory or a .tar.gz file!")
    }
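
For review, a minimal standalone sketch of the channel shape the new directory branch in `subworkflows/local/gtdbtk.nf` produces. The database path and the trailing `view()` are purely illustrative and not part of the patch; the key point is that the listed files are emitted one by one, tagged with the same made-up id, and regrouped into a single tuple:

```nextflow
// Sketch only, assuming a hypothetical local GTDB directory.
gtdb = file('/path/to/gtdb_db')   // what --gtdb would point at

gtdb_dir = gtdb.listFiles()       // array holding the database *contents*

Channel
    .of(gtdb_dir)                 // the array is spread: one emission per file
    .map { ['gtdb', it] }         // tag every file with the same meta id
    .groupTuple()                 // regroup: [ 'gtdb', [ file1, file2, ... ] ]
    .view()                       // single emission with the expected cardinality
```

This mirrors the fix in #583: passing the bare directory produced a channel whose cardinality did not match what GTDBTK_CLASSIFYWF expects, whereas listing the contents and regrouping yields one `[ id, files ]` tuple.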