From bf344af99665f33cd3c47ee1e15a927ad13608d4 Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Tue, 11 Apr 2023 15:43:48 +0200 Subject: [PATCH 1/9] v1.6.0, genome_updater 0.6.0 --- CMakeLists.txt | 2 +- libs/genome_updater | 2 +- setup.py | 2 +- src/ganon/config.py | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 078ae5fa..079263f8 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,7 @@ # ============================================================================= cmake_minimum_required( VERSION 3.10 FATAL_ERROR ) -project( ganon VERSION 1.5.0 LANGUAGES CXX ) +project( ganon VERSION 1.6.0 LANGUAGES CXX ) # ----------------------------------------------------------------------------- # build setup diff --git a/libs/genome_updater b/libs/genome_updater index a512fae6..f75766cd 160000 --- a/libs/genome_updater +++ b/libs/genome_updater @@ -1 +1 @@ -Subproject commit a512fae629fb0732d1b9e9c4837968dcbcd23f3d +Subproject commit f75766cd9d013d0b658315b81ff4525cb99a0ce5 diff --git a/setup.py b/setup.py index d28b1b4e..c5d7d15e 100755 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ def read(filename): setup( name="ganon", - version="1.5.0", + version="1.6.0", url="https://www.github.com/pirovc/ganon", license='MIT', author="Vitor C. 
Piro", diff --git a/src/ganon/config.py b/src/ganon/config.py index 440b6c1b..e54975ce 100644 --- a/src/ganon/config.py +++ b/src/ganon/config.py @@ -8,7 +8,7 @@ class Config: - version = "1.5.0" + version = "1.6.0" path_exec = {"build": "", "classify": "", "get_seq_info": "", "genome_updater": ""} empty = False @@ -142,7 +142,7 @@ def __init__(self, which: str=None, **kwargs): classify_group_other.add_argument("-t", "--threads", type=unsigned_int(minval=1), metavar="", default=1, help="Number of sub-processes/threads to use") classify_group_other.add_argument("-l", "--hierarchy-labels", type=str, nargs="*", metavar="", help="Hierarchy definition of --db-prefix files to be classified. Can also be a string, but input will be sorted to define order (e.g. 1 1 2 3). The default value reported without hierarchy is 'H1'") classify_group_other.add_argument("-r", "--ranks", type=str, nargs="*", metavar="", default=[], help="Ranks to report taxonomic abundances (.tre). empty will report default ranks [" + ", ".join(self.choices_default_ranks) + "]. This file can be re-generated with the 'ganon report' command for other types of abundances (reads, matches) with further filtration and output options") - classify_group_other.add_argument("-a", "--reassign", action="store_true", help="Reassign reads with multiple matches with an EM-algorithm. Will enforce --output-all. This file can be re-generated with the 'ganon reassign'.") + classify_group_other.add_argument("-a", "--reassign", action="store_true", help="Reassign reads with multiple matches with an EM algorithm. Will enforce --output-all. 
This file can be re-generated with the 'ganon reassign'.") classify_group_other.add_argument("--verbose", action="store_true", help="Verbose output mode") classify_group_other.add_argument("--quiet", action="store_true", help="Quiet output mode") @@ -265,7 +265,7 @@ def __init__(self, which: str=None, **kwargs): classify.set_defaults(which="classify") reassign = subparsers.add_parser("reassign", - help="Reassign reads with multiple matches to their target with an EM algorith", + help="Reassign reads with multiple matches with an EM algorithm", parents=[reassign_parser], formatter_class=formatter_class) reassign.set_defaults(which="reassign") From b89d2a8ed715f68695219dfcd85909fe6f60c288 Mon Sep 17 00:00:00 2001 From: pirovc <4673375+pirovc@users.noreply.github.com> Date: Tue, 11 Apr 2023 17:13:10 +0200 Subject: [PATCH 2/9] input_recursive as default on build (#239) --- src/ganon/build_update.py | 6 ++- src/ganon/config.py | 1 + src/ganon/util.py | 18 +++++++-- tests/ganon/integration/test_build_custom.py | 39 ++++++++++++++++---- tests/ganon/integration/test_report.py | 2 +- tests/ganon/integration/test_update.py | 12 +++--- tests/ganon/utils.py | 14 ++++--- 7 files changed, 67 insertions(+), 25 deletions(-) diff --git a/src/ganon/build_update.py b/src/ganon/build_update.py index 9154d473..e0217c6c 100644 --- a/src/ganon/build_update.py +++ b/src/ganon/build_update.py @@ -60,6 +60,7 @@ def build(cfg): "-o " + files_output_folder, "-M " + cfg.taxonomy if cfg.taxonomy=="gtdb" else "", "-m", + "-N", "-i" if resume_download else "", "-s" if cfg.quiet else "", "-w" if not cfg.verbose else "", @@ -73,6 +74,7 @@ def build(cfg): build_custom_params = {"input": [input_folder], "input_extension": "fna.gz", + "input_recursive": True, "input_target": "file", "level": "assembly", "ncbi_file_info": [assembly_summary]} @@ -133,6 +135,7 @@ def update(cfg): run_genome_updater_cmd = " ".join([cfg.path_exec['genome_updater'], "-o " + files_output_folder, "-m", + "-N", "-s" if 
cfg.quiet else "", "-w" if not cfg.verbose else ""]) run(run_genome_updater_cmd, quiet=cfg.quiet) @@ -145,6 +148,7 @@ def update(cfg): build_custom_params = {"input": [input_folder], "input_extension": "fna.gz", + "input_recursive": True, "input_target": "file", "level": "assembly", "ncbi_file_info": [assembly_summary]} @@ -231,7 +235,7 @@ def build_custom(cfg, which_call: str="build_custom"): # Retrieve and check input files or folders if cfg.input: - input_files = validate_input_files(cfg.input, cfg.input_extension, cfg.quiet) + input_files = validate_input_files(cfg.input, cfg.input_extension, cfg.quiet, input_recursive=cfg.input_recursive) if not input_files: print_log("ERROR: No valid input files found", cfg.quiet) return False diff --git a/src/ganon/config.py b/src/ganon/config.py index e54975ce..d96a2f75 100644 --- a/src/ganon/config.py +++ b/src/ganon/config.py @@ -76,6 +76,7 @@ def __init__(self, which: str=None, **kwargs): build_custom_required_args = build_custom_parser.add_argument_group("required arguments") build_custom_required_args.add_argument("-i", "--input", type=str, nargs="*", metavar="", help="Input file(s) and/or folder(s). Mutually exclusive --input-file.") build_custom_required_args.add_argument("-e", "--input-extension", type=str, default="fna.gz", metavar="", help="Required if --input contains folder(s). Wildcards/Shell Expansions not supported (e.g. *).") + build_custom_required_args.add_argument("-c", "--input-recursive", action="store_true", help="Look for files recursively in folder(s) provided with --input") build_custom_args = build_custom_parser.add_argument_group("custom arguments") build_custom_args.add_argument("-n", "--input-file", type=file_exists, metavar="", help="Manually set information for input files: file [target node specialization specialization name]. target is the sequence identifier if --input-target sequence (file can be repeated for multiple sequences). 
if --input-target file and target is not set, filename is used. node is the taxonomic identifier. Mutually exclusive --input") diff --git a/src/ganon/util.py b/src/ganon/util.py index 52297b92..214cfd00 100644 --- a/src/ganon/util.py +++ b/src/ganon/util.py @@ -61,7 +61,7 @@ def rm_files(files): os.remove(f) -def validate_input_files(input_files_folder, input_extension, quiet): +def validate_input_files(input_files_folder, input_extension, quiet, input_recursive: bool = False): """ given a list of input files and/or folders and an file extension check for valid files and return them in a set @@ -75,13 +75,23 @@ def validate_input_files(input_files_folder, input_extension, quiet): print_log("--input-extension is required when using folders in the --input. Skipping: " + i, quiet) continue files_in_dir = 0 - for file in os.listdir(i): - if file.endswith(input_extension): - f = os.path.join(i, file) + + if input_recursive: + for path in Path(i).rglob('*' + input_extension): + f = str(path.joinpath()) if check_file(f): files_in_dir += 1 valid_input_files.add(f) + else: + for file in os.listdir(i): + if file.endswith(input_extension): + f = os.path.join(i, file) + if check_file(f): + files_in_dir += 1 + valid_input_files.add(f) + print_log(str(files_in_dir) + " valid file(s) [--input-extension " + input_extension + + (", --input-recursive" if input_recursive else "") + "] found in " + i, quiet) else: print_log("Skipping invalid file/folder: " + i, quiet) diff --git a/tests/ganon/integration/test_build_custom.py b/tests/ganon/integration/test_build_custom.py index 85ca40f2..12e39c0f 100644 --- a/tests/ganon/integration/test_build_custom.py +++ b/tests/ganon/integration/test_build_custom.py @@ -33,7 +33,7 @@ def setUpClass(self): def test_input_folder(self): """ - ganon build-custom with folder as --input with --extension + ganon build-custom with folder as --input with --input-extension """ params = self.default_params.copy() params["db_prefix"] = self.results_dir + 
"test_input_folder" @@ -66,11 +66,34 @@ def test_input_folder(self): cfg = Config("build-custom", **params) self.assertFalse(run_ganon(cfg, params["db_prefix"]), "ganon build-custom ran but it should fail") + def test_input_folder_recursive(self): + """ + ganon build-custom with folder as --input with --input-extension and --input-recursive + """ + params = self.default_params.copy() + params["db_prefix"] = self.results_dir + "test_input_folder_recursive" + params["input"] = data_dir + "build-custom/files/" + params["input_extension"] = "fna.gz" + params["input_recursive"] = True + + cfg = Config("build-custom", **params) + self.assertTrue(run_ganon(cfg, params["db_prefix"]), "ganon build-custom run failed") + res = build_sanity_check_and_parse(vars(cfg)) + self.assertIsNotNone(res, "ganon build-custom sanity check failed") + + # list files from base folder and "more" (got recursively) + files = list_files_folder(params["input"], ext=params["input_extension"], recursive=True) + + self.assertTrue(res["target"]["file"].isin(files).all(), "Files missing from target") + self.assertEqual(len(files), res["target"].shape[0], "Wrong number of files on target") + self.assertTrue(res["info"]["file"].isin(files).all(), "Files missing from info") + self.assertEqual(len(files), res["info"].shape[0], "Wrong number of files on info") + def test_input_single_file(self): """ ganon build-custom with one file as --input (will set --input-target sequence) """ - files = list_files_folder(data_dir + "build-custom/files/", "fna.gz") + files = list_files_folder(data_dir + "build-custom/files/", ext="fna.gz") params = self.default_params.copy() params["db_prefix"] = self.results_dir + "test_input_single_file" params["input"] = files[0] @@ -90,7 +113,7 @@ def test_input_files(self): """ ganon build-custom with files as --input """ - files = list_files_folder(data_dir + "build-custom/files/", "fna.gz") + files = list_files_folder(data_dir + "build-custom/files/", ext="fna.gz") params = 
self.default_params.copy() params["db_prefix"] = self.results_dir + "test_input_files" params["input"] = files @@ -116,9 +139,9 @@ def test_input_files(self): def test_input_folders_files(self): """ - ganon build-custom with files and folders as --input with --extension + ganon build-custom with files and folders as --input with --input-extension """ - files = list_files_folder(data_dir + "build-custom/files/", "fna.gz") + files = list_files_folder(data_dir + "build-custom/files/", ext="fna.gz") folder = data_dir + "build-custom/files/more/" params = self.default_params.copy() params["db_prefix"] = self.results_dir + "test_input_folders_files" @@ -129,7 +152,7 @@ def test_input_folders_files(self): res = build_sanity_check_and_parse(vars(cfg)) self.assertIsNotNone(res, "ganon build-custom sanity check failed") - files.extend(list_files_folder(folder, params["input_extension"])) + files.extend(list_files_folder(folder, ext=params["input_extension"])) self.assertTrue(res["target"]["file"].isin(files).all(), "Files missing from target") self.assertEqual(len(files), res["target"].shape[0], "Wrong number of files on target") self.assertTrue(res["info"]["file"].isin(files).all(), "Files missing from info") @@ -220,7 +243,7 @@ def test_input_target_file(self): res = build_sanity_check_and_parse(vars(cfg)) self.assertIsNotNone(res, "ganon build-custom sanity check failed") - files = list_files_folder(params["input"], "fna.gz") + files = list_files_folder(params["input"], ext="fna.gz") self.assertTrue(res["target"]["file"].isin(files).all(), "Files missing from target") self.assertEqual(len(files), res["target"].shape[0], "Wrong number of files on target") self.assertTrue(res["info"]["file"].isin(files).all(), "Files missing from info") @@ -238,7 +261,7 @@ def test_input_target_sequence(self): res = build_sanity_check_and_parse(vars(cfg)) self.assertIsNotNone(res, "ganon build-custom sanity check failed") - sequences = list_sequences(list_files_folder(params["input"], 
"fna.gz")) + sequences = list_sequences(list_files_folder(params["input"], ext="fna.gz")) self.assertTrue(res["target"]["sequence"].isin(sequences).all(), "Files missing from target") self.assertEqual(len(sequences), res["target"].shape[0], "Wrong number of files on target") self.assertTrue(res["info"]["target"].isin(sequences).all(), "Files missing from info") diff --git a/tests/ganon/integration/test_report.py b/tests/ganon/integration/test_report.py index 8e275f79..bd196833 100644 --- a/tests/ganon/integration/test_report.py +++ b/tests/ganon/integration/test_report.py @@ -502,7 +502,7 @@ def test_multiple_rep_files_folder(self): res = report_sanity_check_and_parse(vars(cfg)) self.assertIsNotNone(res, "ganon report has inconsistent results") # should have two outputs - self.assertEqual(len(res), len(list_files_folder(params["input"][0], params["input_extension"])), + self.assertEqual(len(res), len(list_files_folder(params["input"][0], ext=params["input_extension"])), "ganon report did not generate multiple report files") def test_multiple_rep_files_split_hierachy(self): diff --git a/tests/ganon/integration/test_update.py b/tests/ganon/integration/test_update.py index fc7cad37..8d0d2cf0 100644 --- a/tests/ganon/integration/test_update.py +++ b/tests/ganon/integration/test_update.py @@ -60,12 +60,12 @@ def test_output_db_prefix(self): sleep(1) update_params = {"db_prefix": self.results_dir + "test_output_db_prefix", - "output_db_prefix": self.results_dir + "test_output_db_prefix2", - "threads": 1, - "write_info_file": True, - "keep_files": True, - "verbose": True, - "quiet": False} + "output_db_prefix": self.results_dir + "test_output_db_prefix2", + "threads": 1, + "write_info_file": True, + "keep_files": True, + "verbose": True, + "quiet": False} cfg = Config("update", **update_params) # Run ganon build self.assertTrue(run_ganon(cfg, update_params["output_db_prefix"]), "ganon update run failed") diff --git a/tests/ganon/utils.py b/tests/ganon/utils.py index 
6975244d..a7300ed5 100644 --- a/tests/ganon/utils.py +++ b/tests/ganon/utils.py @@ -37,11 +37,15 @@ def check_files(prefix, extensions): return True -def list_files_folder(folder, ext: str=None): +def list_files_folder(folder, ext: str="", recursive: bool=False): file_list = [] - for file in os.listdir(folder): - if ext is None or file.endswith(ext): - file_list.append(folder + file) + if recursive: + for path in Path(folder).rglob('*' + ext): + file_list.append(str(path.joinpath())) + else: + for file in os.listdir(folder): + if ext is None or file.endswith(ext): + file_list.append(folder + file) return file_list @@ -114,7 +118,7 @@ def build_sanity_check_and_parse(params, skipped_targets: bool=False): input_files = [] for i in params["input"]: if os.path.isdir(i): - input_files.extend(list_files_folder(i, params["input_extension"])) + input_files.extend(list_files_folder(i, ext=params["input_extension"], recursive=params["input_recursive"])) else: input_files.append(i) From 7e08b5ebc0f43fb6df010b5198db96ccb83c9e83 Mon Sep 17 00:00:00 2001 From: pirovc <4673375+pirovc@users.noreply.github.com> Date: Mon, 17 Apr 2023 09:28:17 +0200 Subject: [PATCH 3/9] Feature/code classify (#241) * errors parse files * size_t, const * skip too long reads * all size_t but counting agents * more size_t * big int mode compile time, some moves * add argument to provide alternative root node for LCA --- CMakeLists.txt | 6 + src/ganon-build/GanonBuild.cpp | 71 +++-- src/ganon-classify/CommandLineParser.cpp | 11 +- src/ganon-classify/GanonClassify.cpp | 300 ++++++++++-------- .../include/ganon-classify/Config.hpp | 15 +- src/utils/include/utils/LCA.hpp | 12 +- tests/utils/LCA.test.cpp | 2 +- 7 files changed, 230 insertions(+), 187 deletions(-) mode change 100755 => 100644 tests/utils/LCA.test.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 079263f8..939820d7 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,6 +23,7 @@ endif() option( VERBOSE_CONFIG "Verbose mode for quick 
build setup debugging" OFF ) option( CONDA "Flag for compilation in conda env." OFF ) +option( LONGREADS "Uses uint32_t for count in ganon-classify. Useful for very long reads (>65535bp)" OFF ) option( INCLUDE_DIRS "Include directories to look for libraries" "" ) # ----------------------------------------------------------------------------- @@ -81,6 +82,10 @@ if ( NOT CONDA ) add_compile_options( -static -march=native ) endif() +if( LONGREADS ) + add_compile_options(-DLONGREADS) +endif() + # ----------------------------------------------------------------------------- # dependencies and 3rd party libraries # ----------------------------------------------------------------------------- @@ -149,6 +154,7 @@ if( VERBOSE_CONFIG ) message( STATUS " INCLUDE_DIRS : ${INCLUDE_DIRS}" ) message( STATUS " CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}" ) message( STATUS " CONDA : ${CONDA}" ) + message( STATUS " LONGREADS : ${LONGREADS}" ) get_directory_property( dirCompileOptions COMPILE_OPTIONS ) message( STATUS " COMPILE_OPTIONS : ${dirCompileOptions}" ) diff --git a/src/ganon-build/GanonBuild.cpp b/src/ganon-build/GanonBuild.cpp index 9aab4539..518c443c 100644 --- a/src/ganon-build/GanonBuild.cpp +++ b/src/ganon-build/GanonBuild.cpp @@ -15,6 +15,7 @@ #include #include +#include #include #include #include @@ -215,45 +216,53 @@ void count_hashes( SafeQueue< InputFileMap >& ifm_queue, // For all files of a target for ( auto& [file, seqids] : ifm.files ) { - - // open file - seqan3::sequence_file_input< raptor::dna4_traits, seqan3::fields< seqan3::field::id, seqan3::field::seq > > - fin{ file }; - - if ( seqids.empty() ) + try { - robin_hood::unordered_set< uint64_t > hashes; + // open file + seqan3::sequence_file_input< raptor::dna4_traits, + seqan3::fields< seqan3::field::id, seqan3::field::seq > > + fin{ file }; - // File as target - generate all hashes from file with possible multiple sequences - // before counting and storing - for ( auto const& [header, seq] : fin ) - { - 
total.sequences++; - total.length_bp += seq.size(); - const auto mh = seq | minimiser_view | std::views::common; - hashes.insert( mh.begin(), mh.end() ); - } - hashes_count[ifm.target] += hashes.size(); - detail::store_hashes( ifm.target, hashes, config.tmp_output_folder ); - } - else - { - // Sequence as target - count and store hashes for each sequence - for ( auto const& [header, seq] : fin ) + if ( seqids.empty() ) { - const auto seqid = get_seqid( header ); - // If header is present for this target - if ( seqids.count( seqid ) ) + robin_hood::unordered_set< uint64_t > hashes; + + // File as target - generate all hashes from file with possible multiple sequences + // before counting and storing + for ( auto const& [header, seq] : fin ) { - robin_hood::unordered_set< uint64_t > hashes; total.sequences++; total.length_bp += seq.size(); const auto mh = seq | minimiser_view | std::views::common; hashes.insert( mh.begin(), mh.end() ); - hashes_count[seqid] = hashes.size(); - detail::store_hashes( seqid, hashes, config.tmp_output_folder ); } + hashes_count[ifm.target] += hashes.size(); + detail::store_hashes( ifm.target, hashes, config.tmp_output_folder ); } + else + { + // Sequence as target - count and store hashes for each sequence + for ( auto const& [header, seq] : fin ) + { + const auto seqid = get_seqid( header ); + // If header is present for this target + if ( seqids.count( seqid ) ) + { + robin_hood::unordered_set< uint64_t > hashes; + total.sequences++; + total.length_bp += seq.size(); + const auto mh = seq | minimiser_view | std::views::common; + hashes.insert( mh.begin(), mh.end() ); + hashes_count[seqid] = hashes.size(); + detail::store_hashes( seqid, hashes, config.tmp_output_folder ); + } + } + } + } + catch ( seqan3::parse_error const& e ) + { + std::cerr << "Error parsing file [" << file << "]. " << e.what() << std::endl; + continue; } } } @@ -448,8 +457,8 @@ void optimal_hashes( double const max_fp, /* * given a max. 
false positive or filter_size, iterate over possible capacities for a bin (single bloom filter) * and calculate the respective size, considering split bins - * selects the parameters generating the "best" between smallest filter/fp and n. of bins and fill the ibf_config - * struct + * selects the parameters generating the "best" between smallest filter/fp and n. of bins and fill the + * ibf_config struct */ // Target with the highest number of minimizers diff --git a/src/ganon-classify/CommandLineParser.cpp b/src/ganon-classify/CommandLineParser.cpp index 8ab6b52f..2b0bb3c9 100644 --- a/src/ganon-classify/CommandLineParser.cpp +++ b/src/ganon-classify/CommandLineParser.cpp @@ -32,9 +32,10 @@ std::optional< Config > CommandLineParser::parse( int argc, char** argv ) ( "hibf", "Input is an Hierarchical IBF (.hibf) generated from raptor.", cxxopts::value< bool >()) ( "skip-lca", "Skip LCA step.", cxxopts::value< bool >()) + ( "tax-root-node", "Define alternative root node for LCA. Default: 1", cxxopts::value< std::string >()) ( "t,threads", "Number of threads", cxxopts::value< uint16_t >()) - ( "n-batches", "Number of batches of n-reads to hold in memory. Default: 1000", cxxopts::value< uint32_t >()) - ( "n-reads", "Number of reads for each batch. Default: 400", cxxopts::value< uint32_t >()) + ( "n-batches", "Number of batches of n-reads to hold in memory. Default: 1000", cxxopts::value< size_t >()) + ( "n-reads", "Number of reads for each batch. 
Default: 400", cxxopts::value< size_t >()) ( "verbose", "Verbose output mode", cxxopts::value< bool >()) ( "quiet", "Quiet output mode (only outputs errors and warnings to the STDERR)", cxxopts::value< bool >()) ( "h,help", "Print help" ) @@ -95,12 +96,14 @@ std::optional< Config > CommandLineParser::parse( int argc, char** argv ) config.hibf = args["hibf"].as< bool >(); if ( args.count( "skip-lca" ) ) config.skip_lca = args["skip-lca"].as< bool >(); + if ( args.count( "tax-root-node" ) ) + config.tax_root_node = args["tax-root-node"].as< std::string >(); if ( args.count( "threads" ) ) config.threads = args["threads"].as< uint16_t >(); if ( args.count( "n-reads" ) ) - config.n_reads = args["n-reads"].as< uint32_t >(); + config.n_reads = args["n-reads"].as< size_t >(); if ( args.count( "n-batches" ) ) - config.n_batches = args["n-batches"].as< uint32_t >(); + config.n_batches = args["n-batches"].as< size_t >(); if ( args.count( "verbose" ) ) config.verbose = args["verbose"].as< bool >(); if ( args.count( "quiet" ) ) diff --git a/src/ganon-classify/GanonClassify.cpp b/src/ganon-classify/GanonClassify.cpp index 1d7cb4b2..2dd8b0bf 100644 --- a/src/ganon-classify/GanonClassify.cpp +++ b/src/ganon-classify/GanonClassify.cpp @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -32,19 +33,24 @@ #include #include + namespace GanonClassify { namespace detail { +#ifdef LONGREADS +typedef uint32_t TIntCount; +#else +typedef uint16_t TIntCount; +#endif + typedef raptor::hierarchical_interleaved_bloom_filter< seqan3::data_layout::uncompressed > THIBF; typedef seqan3::interleaved_bloom_filter< seqan3::data_layout::uncompressed > TIBF; -typedef robin_hood::unordered_map< std::string, uint16_t > TMatches; - -typedef std::vector< std::tuple< uint64_t, std::string > > TBinMap; - -typedef robin_hood::unordered_map< std::string, std::vector< uint64_t > > TMap; +typedef robin_hood::unordered_map< std::string, size_t > TMatches; +typedef std::vector< std::tuple< 
size_t, std::string > > TBinMap; +typedef robin_hood::unordered_map< std::string, std::vector< size_t > > TMap; struct Node { @@ -93,7 +99,7 @@ struct ReadBatches struct ReadMatch { std::string target; - uint16_t kmer_count; + size_t kmer_count; }; struct ReadOut @@ -114,9 +120,9 @@ struct ReadOut struct Rep { // Report with counts of matches and reads assigned (unique or lca) for each target - uint64_t matches = 0; - uint64_t lca_reads = 0; - uint64_t unique_reads = 0; + size_t matches = 0; + size_t lca_reads = 0; + size_t unique_reads = 0; }; typedef robin_hood::unordered_map< std::string, Rep > TRep; @@ -124,18 +130,18 @@ typedef robin_hood::unordered_map< std::string, Node > TTax; struct Total { - uint64_t reads_processed = 0; - uint64_t length_processed = 0; - uint64_t reads_classified = 0; - uint64_t matches = 0; - uint64_t unique_matches = 0; + size_t reads_processed = 0; + size_t length_processed = 0; + size_t reads_classified = 0; + size_t matches = 0; + size_t unique_matches = 0; }; struct Stats { Total total; // number of reads in the input files - uint64_t input_reads = 0; + size_t input_reads = 0; // Total for each hierarchy std::map< std::string, Total > hierarchy_total; @@ -212,11 +218,11 @@ std::map< std::string, HierarchyConfig > parse_hierarchy( Config& config ) std::vector< std::string > sorted_hierarchy = config.hierarchy_labels; std::sort( sorted_hierarchy.begin(), sorted_hierarchy.end() ); // get unique hierarcy labels - uint16_t unique_hierarchy = + const size_t unique_hierarchy = std::unique( sorted_hierarchy.begin(), sorted_hierarchy.end() ) - sorted_hierarchy.begin(); - uint16_t hierarchy_count = 0; - for ( uint16_t h = 0; h < config.hierarchy_labels.size(); ++h ) + size_t hierarchy_count = 0; + for ( size_t h = 0; h < config.hierarchy_labels.size(); ++h ) { auto filter_cfg = FilterConfig{ config.ibf[h], config.rel_cutoff[h] }; @@ -254,7 +260,7 @@ std::map< std::string, HierarchyConfig > parse_hierarchy( Config& config ) return 
parsed_hierarchy; } -void print_hierarchy( Config const& config, std::map< std::string, HierarchyConfig > const& parsed_hierarchy ) +void print_hierarchy( Config const& config, auto const& parsed_hierarchy ) { constexpr auto newl{ "\n" }; @@ -300,7 +306,7 @@ inline TRep sum_reports( std::vector< TRep > const& reports ) return report_sum; } -inline uint16_t threshold_rel( uint16_t n_hashes, double p ) +inline size_t threshold_rel( size_t n_hashes, double p ) { return std::ceil( n_hashes * p ); } @@ -309,16 +315,16 @@ void select_matches( Filter< TIBF >& filter, TMatches& matches, std::vector< size_t >& hashes, auto& agent, - uint16_t threshold_cutoff, - uint16_t& max_kmer_count_read ) + size_t threshold_cutoff, + size_t& max_kmer_count_read ) { // Count every occurance on IBF - seqan3::counting_vector< uint16_t > counts = agent.bulk_count( hashes ); + seqan3::counting_vector< detail::TIntCount > counts = agent.bulk_count( hashes ); - // Sum counts among bins (split target (user bins) into several tecnical bins) for ( auto const& [target, bins] : filter.map ) { - uint16_t summed_count = 0; + // Sum counts among bins (split target (user bins) into several tecnical bins) + size_t summed_count = 0; for ( auto const& binno : bins ) { summed_count += counts[binno]; @@ -341,11 +347,11 @@ void select_matches( Filter< THIBF >& filter, TMatches& matches, std::vector< size_t >& hashes, auto& agent, - uint16_t threshold_cutoff, - uint16_t& max_kmer_count_read ) + size_t threshold_cutoff, + size_t& max_kmer_count_read ) { // Count only matches above threhsold - seqan3::counting_vector< uint16_t > counts = agent.bulk_count( hashes, threshold_cutoff ); + seqan3::counting_vector< detail::TIntCount > counts = agent.bulk_count( hashes, threshold_cutoff ); // Only one bin per target @@ -353,7 +359,7 @@ void select_matches( Filter< THIBF >& filter, { if ( counts[bins[0]] > 0 ) { - const uint16_t count = counts[bins[0]]; + const size_t count = counts[bins[0]]; // ensure that count was 
not already found for target with higher count // can happen in case of ambiguos targets in multiple filters if ( count > matches[target] ) @@ -366,7 +372,7 @@ void select_matches( Filter< THIBF >& filter, } } -uint32_t filter_matches( ReadOut& read_out, TMatches& matches, TRep& rep, uint16_t threshold_filter ) +size_t filter_matches( ReadOut& read_out, TMatches& matches, TRep& rep, size_t threshold_filter ) { for ( auto const& [target, kmer_count] : matches ) @@ -381,7 +387,7 @@ uint32_t filter_matches( ReadOut& read_out, TMatches& matches, TRep& rep, uint16 return read_out.matches.size(); } -void lca_matches( ReadOut& read_out, ReadOut& read_out_lca, uint16_t max_kmer_count_read, LCA& lca, TRep& rep ) +void lca_matches( ReadOut& read_out, ReadOut& read_out_lca, size_t max_kmer_count_read, LCA& lca, TRep& rep ) { std::vector< std::string > targets; @@ -413,22 +419,21 @@ void classify( std::vector< Filter< TFilter > >& filters, { // oner hash adaptor per thread - auto minimiser_hash = + const auto minimiser_hash = seqan3::views::minimiser_hash( seqan3::shape{ seqan3::ungapped{ hierarchy_config.kmer_size } }, seqan3::window_size{ hierarchy_config.window_size }, seqan3::seed{ raptor::adjust_seed( hierarchy_config.kmer_size ) } ); // one agent per thread per filter using TAgent = std::conditional_t< std::same_as< TFilter, THIBF >, - THIBF::counting_agent_type< uint16_t >, - TIBF::counting_agent_type< uint16_t > >; + THIBF::counting_agent_type< detail::TIntCount >, + TIBF::counting_agent_type< detail::TIntCount > >; std::vector< TAgent > agents; for ( Filter< TFilter >& filter : filters ) { - agents.push_back( filter.ibf.counting_agent() ); + agents.push_back( filter.ibf.template counting_agent< detail::TIntCount >() ); } - while ( true ) { // Wait here until reads are available or push is over and queue is empty @@ -441,49 +446,57 @@ void classify( std::vector< Filter< TFilter > >& filters, // store unclassified reads for next iteration ReadBatches left_over_reads{ 
rb.paired }; - for ( uint32_t readID = 0; readID < rb.ids.size(); ++readID ) + const size_t hashes_limit = std::numeric_limits< detail::TIntCount >::max(); + + for ( size_t readID = 0; readID < rb.ids.size(); ++readID ) { // read lenghts - uint16_t read1_len = rb.seqs[readID].size(); - uint16_t read2_len = rb.paired ? rb.seqs2[readID].size() : 0; + const size_t read1_len = rb.seqs[readID].size(); + const size_t read2_len = rb.paired ? rb.seqs2[readID].size() : 0; // Store matches for this read TMatches matches; - std::vector< size_t > hashes; - // Best scoring kmer count - uint16_t max_kmer_count_read = 0; + size_t max_kmer_count_read = 0; + + // if length is smaller than window, skip read if ( read1_len >= hierarchy_config.window_size ) { - hashes = rb.seqs[readID] | minimiser_hash | seqan3::views::to< std::vector >; + // Count hashes + std::vector< size_t > hashes = rb.seqs[readID] | minimiser_hash | seqan3::views::to< std::vector >; // Count hashes from both pairs if second is given if ( read2_len >= hierarchy_config.window_size ) { // Add hashes of second pair - auto h2 = rb.seqs2[readID] | minimiser_hash | std::views::common; + const auto h2 = rb.seqs2[readID] | minimiser_hash | std::views::common; hashes.insert( hashes.end(), h2.begin(), h2.end() ); } - // Sum sequence to totals - if ( hierarchy_first ) + const size_t n_hashes = hashes.size(); + // if n_hashes are bigger than int limit, skip read + if ( n_hashes <= hashes_limit ) { - total.reads_processed++; - total.length_processed += read1_len + read2_len; - } + // Sum sequence to totals + if ( hierarchy_first ) + { + total.reads_processed++; + total.length_processed += read1_len + read2_len; + } - // For each filter in the hierarchy - for ( uint8_t i = 0; i < filters.size(); ++i ) - { - // Calculate threshold for cutoff (keep matches above) - uint16_t threshold_cutoff = threshold_rel( hashes.size(), filters[i].filter_config.rel_cutoff ); + // For each filter in the hierarchy + for ( size_t i = 0; i < 
filters.size(); ++i ) + { + // Calculate threshold for cutoff (keep matches above) + size_t threshold_cutoff = threshold_rel( n_hashes, filters[i].filter_config.rel_cutoff ); - // reset low threshold_cutoff to just one kmer (0 would match everywhere) - if ( threshold_cutoff == 0 ) - threshold_cutoff = 1; + // reset low threshold_cutoff to just one kmer (0 would match everywhere) + if ( threshold_cutoff == 0 ) + threshold_cutoff = 1; - // count and select matches - select_matches( filters[i], matches, hashes, agents[i], threshold_cutoff, max_kmer_count_read ); + // count and select matches + select_matches( filters[i], matches, hashes, agents[i], threshold_cutoff, max_kmer_count_read ); + } } } @@ -496,11 +509,11 @@ void classify( std::vector< Filter< TFilter > >& filters, total.reads_classified++; // Calculate threshold for filtering (keep matches above) - uint16_t threshold_filter = + const size_t threshold_filter = max_kmer_count_read - threshold_rel( max_kmer_count_read, hierarchy_config.rel_filter ); // Filter matches - uint32_t count_filtered_matches = filter_matches( read_out, matches, rep, threshold_filter ); + const size_t count_filtered_matches = filter_matches( read_out, matches, rep, threshold_filter ); if ( !config.skip_lca ) { @@ -535,16 +548,13 @@ void classify( std::vector< Filter< TFilter > >& filters, // not classified if ( !hierarchy_last ) // if there is more levels, store read { - // seqan::appendValue( left_over_reads.ids, rb.ids[readID] ); - // seqan::appendValue( left_over_reads.seqs, rb.seqs[readID] ); - // MOVE? 
- left_over_reads.ids.push_back( rb.ids[readID] ); - left_over_reads.seqs.push_back( rb.seqs[readID] ); + left_over_reads.ids.push_back( std::move( rb.ids[readID] ) ); + left_over_reads.seqs.push_back( std::move( rb.seqs[readID] ) ); if ( rb.paired ) { // seqan::appendValue( left_over_reads.seqs2, rb.seqs2[readID] ); - left_over_reads.seqs2.push_back( rb.seqs2[readID] ); + left_over_reads.seqs2.push_back( std::move( rb.seqs2[readID] ) ); } } else if ( config.output_unclassified ) // no more levels and no classification, add to @@ -556,7 +566,7 @@ void classify( std::vector< Filter< TFilter > >& filters, // if there are more levels to classify and something was left, keep reads in memory if ( !hierarchy_last && left_over_reads.ids.size() > 0 ) - pointer_helper->push( left_over_reads ); + pointer_helper->push( std::move( left_over_reads ) ); } } @@ -671,7 +681,7 @@ TTax load_tax( std::string tax_file ) template < typename TFilter > bool load_files( std::vector< Filter< TFilter > >& filters, std::vector< FilterConfig >& fconf ) { - uint16_t filter_cnt = 0; + size_t filter_cnt = 0; for ( auto& filter_config : fconf ) { TTax tax; @@ -711,39 +721,37 @@ void print_time( const StopClock& timeGanon, const StopClock& timeLoadFilters, c std::cerr << std::endl; } -void print_stats( Stats& stats, const StopClock& timeClassPrint, auto& parsed_hierarchy ) +void print_stats( Stats& stats, const StopClock& timeClassPrint, auto const& parsed_hierarchy ) { const double elapsed_classification = timeClassPrint.elapsed(); + const double total_reads_processed = stats.total.reads_processed > 0 + ? 
static_cast< double >( stats.total.reads_processed ) + : 1; // to not report nan on divisions std::cerr << "ganon-classify processed " << stats.total.reads_processed << " sequences (" << stats.total.length_processed / 1000000.0 << " Mbp) in " << elapsed_classification << " seconds (" << ( stats.total.length_processed / 1000000.0 ) / ( elapsed_classification / 60.0 ) << " Mbp/m)" << std::endl; std::cerr << " - " << stats.total.reads_classified << " reads classified (" - << ( stats.total.reads_classified / static_cast< double >( stats.total.reads_processed ) ) * 100 << "%)" - << std::endl; + << ( stats.total.reads_classified / total_reads_processed ) * 100 << "%)" << std::endl; std::cerr << " - " << stats.total.unique_matches << " with unique matches (" - << ( stats.total.unique_matches / static_cast< double >( stats.total.reads_processed ) ) * 100 << "%)" - << std::endl; + << ( stats.total.unique_matches / total_reads_processed ) * 100 << "%)" << std::endl; std::cerr << " - " << stats.total.reads_classified - stats.total.unique_matches << " with multiple matches (" - << ( ( stats.total.reads_classified - stats.total.unique_matches ) - / static_cast< double >( stats.total.reads_processed ) ) - * 100 - << "%)" << std::endl; - - float avg_matches = stats.total.reads_classified - ? ( stats.total.matches / static_cast< double >( stats.total.reads_classified ) ) - : 0; + << ( ( stats.total.reads_classified - stats.total.unique_matches ) / total_reads_processed ) * 100 << "%)" + << std::endl; + + double avg_matches = stats.total.reads_classified + ? ( stats.total.matches / static_cast< double >( stats.total.reads_classified ) ) + : 0; std::cerr << " - " << stats.total.matches << " matches (avg. 
" << avg_matches << " match/read classified)" << std::endl; - uint64_t total_reads_unclassified = stats.total.reads_processed - stats.total.reads_classified; + const size_t total_reads_unclassified = stats.total.reads_processed - stats.total.reads_classified; std::cerr << " - " << total_reads_unclassified << " reads unclassified (" - << ( total_reads_unclassified / static_cast< double >( stats.total.reads_processed ) ) * 100 << "%)" - << std::endl; + << ( total_reads_unclassified / total_reads_processed ) * 100 << "%)" << std::endl; if ( stats.total.reads_processed < stats.input_reads ) { std::cerr << " - " << stats.input_reads - stats.total.reads_processed - << " reads skipped (shorther than k-mer size)" << std::endl; + << " reads skipped (too long or too short (< window size))" << std::endl; } if ( parsed_hierarchy.size() > 1 ) @@ -759,20 +767,16 @@ void print_stats( Stats& stats, const StopClock& timeClassPrint, auto& parsed_hi : 0; std::cerr << " - " << hierarchy_label << ": " << stats.hierarchy_total[hierarchy_label].reads_classified << " classified (" - << ( stats.hierarchy_total[hierarchy_label].reads_classified - / static_cast< double >( stats.total.reads_processed ) ) - * 100 + << ( stats.hierarchy_total[hierarchy_label].reads_classified / total_reads_processed ) * 100 << "%) " << stats.hierarchy_total[hierarchy_label].unique_matches << " unique (" - << ( stats.hierarchy_total[hierarchy_label].unique_matches - / static_cast< double >( stats.total.reads_processed ) ) - * 100 + << ( stats.hierarchy_total[hierarchy_label].unique_matches / total_reads_processed ) * 100 << "%) " << stats.hierarchy_total[hierarchy_label].reads_classified - stats.hierarchy_total[hierarchy_label].unique_matches << " multiple (" << ( ( stats.hierarchy_total[hierarchy_label].reads_classified - stats.hierarchy_total[hierarchy_label].unique_matches ) - / static_cast< double >( stats.total.reads_processed ) ) + / total_reads_processed ) * 100 << "%) " << 
stats.hierarchy_total[hierarchy_label].matches << " matches (avg. " << avg_matches << ")" << std::endl; @@ -784,46 +788,64 @@ void parse_reads( SafeQueue< ReadBatches >& queue1, Stats& stats, Config const& { for ( auto const& reads_file : config.single_reads ) { - seqan3::sequence_file_input< raptor::dna4_traits, seqan3::fields< seqan3::field::id, seqan3::field::seq > > fin1{ - reads_file - }; - for ( auto&& rec : fin1 | seqan3::views::chunk( config.n_reads ) ) - { - ReadBatches rb{ false }; - for ( auto& [id, seq] : rec ) - { - rb.ids.push_back( std::move( id ) ); - rb.seqs.push_back( std::move( seq ) ); - } - stats.input_reads += rb.ids.size(); - queue1.push( std::move( rb ) ); - } - } - if ( config.paired_reads.size() > 0 ) - { - for ( uint16_t pair_cnt = 0; pair_cnt < config.paired_reads.size(); pair_cnt += 2 ) + try { seqan3::sequence_file_input< raptor::dna4_traits, seqan3::fields< seqan3::field::id, seqan3::field::seq > > - fin1{ config.paired_reads[pair_cnt] }; - seqan3::sequence_file_input< raptor::dna4_traits, seqan3::fields< seqan3::field::id, seqan3::field::seq > > - fin2{ config.paired_reads[pair_cnt + 1] }; + fin1{ reads_file }; for ( auto&& rec : fin1 | seqan3::views::chunk( config.n_reads ) ) { - ReadBatches rb{ true }; + ReadBatches rb{ false }; for ( auto& [id, seq] : rec ) { rb.ids.push_back( std::move( id ) ); rb.seqs.push_back( std::move( seq ) ); } - // loop in the second file and get same amount of reads - for ( auto& [id, seq] : fin2 | std::views::take( config.n_reads ) ) - { - rb.seqs2.push_back( std::move( seq ) ); - } stats.input_reads += rb.ids.size(); queue1.push( std::move( rb ) ); } } + catch ( seqan3::parse_error const& e ) + { + std::cerr << "Error parsing file [" << reads_file << "]. 
" << e.what() << std::endl; + continue; + } + } + if ( config.paired_reads.size() > 0 ) + { + for ( size_t pair_cnt = 0; pair_cnt < config.paired_reads.size(); pair_cnt += 2 ) + { + try + { + seqan3::sequence_file_input< raptor::dna4_traits, + seqan3::fields< seqan3::field::id, seqan3::field::seq > > + fin1{ config.paired_reads[pair_cnt] }; + seqan3::sequence_file_input< raptor::dna4_traits, + seqan3::fields< seqan3::field::id, seqan3::field::seq > > + fin2{ config.paired_reads[pair_cnt + 1] }; + for ( auto&& rec : fin1 | seqan3::views::chunk( config.n_reads ) ) + { + ReadBatches rb{ true }; + for ( auto& [id, seq] : rec ) + { + rb.ids.push_back( std::move( id ) ); + rb.seqs.push_back( std::move( seq ) ); + } + // loop in the second file and get same amount of reads + for ( auto& [id, seq] : fin2 | std::views::take( config.n_reads ) ) + { + rb.seqs2.push_back( std::move( seq ) ); + } + stats.input_reads += rb.ids.size(); + queue1.push( std::move( rb ) ); + } + } + catch ( seqan3::parse_error const& ext ) + { + std::cerr << "Error parsing files [" << config.paired_reads[pair_cnt] << "/" + << config.paired_reads[pair_cnt + 1] << "]. 
" << ext.what() << std::endl; + continue; + } + } } queue1.notify_push_over(); } @@ -835,7 +857,7 @@ void write_classified( SafeQueue< ReadOut >& classified_queue, std::ofstream& ou ReadOut ro = classified_queue.pop(); if ( ro.readID != "" ) { - for ( uint32_t i = 0; i < ro.matches.size(); ++i ) + for ( size_t i = 0; i < ro.matches.size(); ++i ) { out << ro.readID << '\t' << ro.matches[i].target << '\t' << ro.matches[i].kmer_count << '\n'; } @@ -875,7 +897,7 @@ TTax merge_tax( std::vector< Filter< TFilter > > const& filters ) else { TTax merged_tax = filters[0].tax; - for ( uint16_t i = 1; i < filters.size(); ++i ) + for ( size_t i = 1; i < filters.size(); ++i ) { // merge taxonomies keeping the first one as a default merged_tax.insert( filters[i].tax.begin(), filters[i].tax.end() ); @@ -885,7 +907,10 @@ TTax merge_tax( std::vector< Filter< TFilter > > const& filters ) } template < typename TFilter > -void validate_targets_tax( std::vector< Filter< TFilter > > const& filters, TTax& tax, bool quiet ) +void validate_targets_tax( std::vector< Filter< TFilter > > const& filters, + TTax& tax, + bool quiet, + const std::string tax_root_node ) { for ( auto const& filter : filters ) { @@ -893,22 +918,22 @@ void validate_targets_tax( std::vector< Filter< TFilter > > const& filters, TTax { if ( tax.count( target ) == 0 ) { - tax[target] = Node{ "1", "no rank", target }; + tax[target] = Node{ tax_root_node, "no rank", target }; if ( !quiet ) - std::cerr << "WARNING: target [" << target << "] without tax entry, setting parent node to 1 (root)" - << std::endl; + std::cerr << "WARNING: target [" << target << "] without tax entry, setting parent as root node [" + << tax_root_node << "]" << std::endl; } } } } -void pre_process_lca( LCA& lca, TTax& tax ) +void pre_process_lca( LCA& lca, TTax& tax, std::string tax_root_node ) { for ( auto const& [target, node] : tax ) { lca.addEdge( node.parent, target ); } - lca.doEulerWalk(); + lca.doEulerWalk( tax_root_node ); } } // namespace 
detail @@ -974,8 +999,8 @@ bool ganon_classify( Config config ) // Classify reads iteractively for each hierarchy level - uint16_t hierarchy_id = 0; - uint16_t hierarchy_size = parsed_hierarchy.size(); + size_t hierarchy_id = 0; + const size_t hierarchy_size = parsed_hierarchy.size(); for ( auto& [hierarchy_label, hierarchy_config] : parsed_hierarchy ) { ++hierarchy_id; @@ -986,7 +1011,7 @@ bool ganon_classify( Config config ) LCA lca; timeLoadFilters.start(); - bool loaded = detail::load_files( filters, parsed_hierarchy[hierarchy_label].filters ); + const bool loaded = detail::load_files( filters, parsed_hierarchy[hierarchy_label].filters ); if ( !loaded ) { std::cerr << "ERROR: loading ibf or tax files" << std::endl; @@ -1017,14 +1042,19 @@ bool ganon_classify( Config config ) { // merge repeated elements from multiple filters tax = detail::merge_tax( filters ); - // if target not found in tax, add node target with parent = "1" (root) - detail::validate_targets_tax( filters, tax, config.quiet ); + // if target not found in tax, add node target with parent root + detail::validate_targets_tax( filters, tax, config.quiet, config.tax_root_node ); } if ( !config.skip_lca ) { + if ( tax.count( config.tax_root_node ) == 0 ) + { + std::cerr << "Root node [" << config.tax_root_node << "] not found (--tax-root-node)" << std::endl; + return false; + } // pre-processing of nodes to generate LCA - detail::pre_process_lca( lca, tax ); + detail::pre_process_lca( lca, tax, config.tax_root_node ); } @@ -1089,7 +1119,7 @@ bool ganon_classify( Config config ) std::vector< std::future< void > > tasks; // Threads for classification timeClassPrint.start(); - for ( uint16_t taskNo = 0; taskNo < config.threads; ++taskNo ) + for ( size_t taskNo = 0; taskNo < config.threads; ++taskNo ) { tasks.emplace_back( std::async( std::launch::async, diff --git a/src/ganon-classify/include/ganon-classify/Config.hpp b/src/ganon-classify/include/ganon-classify/Config.hpp index 2182ec87..a4ab2087 100644 
--- a/src/ganon-classify/include/ganon-classify/Config.hpp +++ b/src/ganon-classify/include/ganon-classify/Config.hpp @@ -36,13 +36,14 @@ struct Config bool output_unclassified = false; bool output_single = false; - bool hibf = false; - bool skip_lca = false; - uint16_t threads = 1; - uint32_t n_batches = 1000; - uint32_t n_reads = 400; - bool verbose = false; - bool quiet = false; + bool hibf = false; + bool skip_lca = false; + std::string tax_root_node = "1"; + uint16_t threads = 1; + size_t n_batches = 1000; + size_t n_reads = 400; + bool verbose = false; + bool quiet = false; bool check_files( std::vector< std::string > const& files ) diff --git a/src/utils/include/utils/LCA.hpp b/src/utils/include/utils/LCA.hpp index 7b17e867..d5a61439 100644 --- a/src/utils/include/utils/LCA.hpp +++ b/src/utils/include/utils/LCA.hpp @@ -8,14 +8,13 @@ #include #include -// required: root node == "1" and father == "0" class LCA { public: LCA() = default; void addEdge( const std::string& father, const std::string& son ); - void doEulerWalk(); + void doEulerWalk( const std::string& root_node ); int getLCA( unsigned int u, unsigned int v ) const; std::string getLCA( const std::vector< std::string >& taxIds ) const; @@ -85,10 +84,10 @@ inline void LCA::depthFirstSearch( const std::string& current, unsigned int dept } } -inline void LCA::doEulerWalk() +inline void LCA::doEulerWalk( const std::string& root_node ) { m_firstAppearance.resize( m_vertices, first_appearance_init ); - depthFirstSearch( "1", 0 ); + depthFirstSearch( root_node, 0 ); preProcessRMQ(); } @@ -165,13 +164,8 @@ inline int LCA::getLCA( unsigned int u, unsigned int v ) const inline std::string LCA::getLCA( const std::vector< std::string >& taxIds ) const { - assert( taxIds.size() > 1 ); // Ideally should return itself if size==1 - // check for valid entries - // if ( std::any_of( taxIds.begin(), taxIds.end(), [&]( const auto& id ) { return m_encode.count( id ) == 0; } ) ) - // return "1"; - int lca = getLCA( 
m_encode.at( taxIds[0] ), m_encode.at( taxIds[1] ) ); for ( unsigned int i = 2; i < taxIds.size(); ++i ) lca = getLCA( lca, m_encode.at( taxIds[i] ) ); diff --git a/tests/utils/LCA.test.cpp b/tests/utils/LCA.test.cpp old mode 100755 new mode 100644 index 6fdb6f1f..a2796eb8 --- a/tests/utils/LCA.test.cpp +++ b/tests/utils/LCA.test.cpp @@ -9,7 +9,7 @@ LCA pre_process_lca( std::string file ) LCA lca; for ( auto const& line : aux::parse_tsv( file ) ) lca.addEdge( line[1], line[0] ); - lca.doEulerWalk(); + lca.doEulerWalk( "1" ); return lca; } From 95e73f2f827efbead6a20bd092c90cf869fcbc34 Mon Sep 17 00:00:00 2001 From: pirovc <4673375+pirovc@users.noreply.github.com> Date: Wed, 26 Apr 2023 16:56:43 +0200 Subject: [PATCH 4/9] bugfix/report-no-tax (#243) --- src/ganon-classify/GanonClassify.cpp | 11 +++++++++-- src/ganon/report.py | 13 +++++++------ 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/ganon-classify/GanonClassify.cpp b/src/ganon-classify/GanonClassify.cpp index 2dd8b0bf..492ff91a 100644 --- a/src/ganon-classify/GanonClassify.cpp +++ b/src/ganon-classify/GanonClassify.cpp @@ -532,10 +532,17 @@ void classify( std::vector< Filter< TFilter > >& filters, if ( config.output_lca ) classified_lca_queue.push( read_out_lca ); } - else if ( count_filtered_matches == 1 ) + else { // Not running lca and has unique match - rep[read_out.matches[0].target].unique_reads++; + if ( count_filtered_matches == 1 ){ + rep[read_out.matches[0].target].unique_reads++; + }else{ + // without tax, no lca, count multi-matches to a root node + // to keep consistency among reports (no. 
of classified reads) + rep[config.tax_root_node].unique_reads++; + } + } if ( config.output_all ) diff --git a/src/ganon/report.py b/src/ganon/report.py index 1f31732c..51cac422 100644 --- a/src/ganon/report.py +++ b/src/ganon/report.py @@ -82,6 +82,7 @@ def report(cfg): for rep_file in rep_files: reports, counts = parse_rep(rep_file) + if not reports: print_log(" - nothing to report for " + rep_file, cfg.quiet) continue @@ -140,10 +141,11 @@ def parse_rep(rep_file): elif fields[0] == "#total_unclassified": unclassified_reads = int(fields[1]) else: - hierarchy_name, target, direct_matches, unique_reads, lca_reads, rank, name = fields - direct_matches = int(direct_matches) - unique_reads = int(unique_reads) - lca_reads = int(lca_reads) + hierarchy_name = fields[0] + target = fields[1] + direct_matches = int(fields[2]) + unique_reads = int(fields[3]) + lca_reads = int(fields[4]) if hierarchy_name not in reports: reports[hierarchy_name] = {} @@ -195,8 +197,7 @@ def build_report(reports, counts, full_tax, genome_sizes, output_file, fixed_ran # Add orphan nodes to taxonomy for node in merged_rep.keys(): if tax.latest(node) == tax.undefined_node: - tax._nodes[node] = tax.root_node - tax._ranks[node] = tax.undefined_rank + tax.add(node, tax.root_node) orphan_nodes.add(node) tax.check_consistency() # Pre-build lineages for performance From 2ed0724e127ff3607d2b8693eb9f107c0b4307be Mon Sep 17 00:00:00 2001 From: pirovc <4673375+pirovc@users.noreply.github.com> Date: Tue, 9 May 2023 10:23:01 +0200 Subject: [PATCH 5/9] Feature/add mkdocs (#247) * inital files * README into docs * update gu061 * more * v1.5.1 fix path script build-custom (#246) * v1.5.1 fix path script build-custom * gu v0.6.1 * more docs * more docs * docs * docs * docs * more docs * docs * README --- README.md | 972 +------------------------------------- docs/classification.md | 131 +++++ docs/custom_databases.md | 270 +++++++++++ docs/default_databases.md | 199 ++++++++ docs/index.md | 528 
+++++++++++++++++++++ docs/outputfiles.md | 138 ++++++ docs/reports.md | 18 + docs/tutorials.md | 3 + libs/genome_updater | 2 +- mkdocs.yml | 15 + src/ganon/config.py | 20 +- 11 files changed, 1327 insertions(+), 969 deletions(-) create mode 100644 docs/classification.md create mode 100644 docs/custom_databases.md create mode 100644 docs/default_databases.md create mode 100644 docs/index.md create mode 100644 docs/outputfiles.md create mode 100644 docs/reports.md create mode 100644 docs/tutorials.md create mode 100644 mkdocs.yml diff --git a/README.md b/README.md index ca8ec289..c1cc9692 100644 --- a/README.md +++ b/README.md @@ -1,26 +1,28 @@ # ganon [![Build Status](https://travis-ci.com/pirovc/ganon.svg?branch=master)](https://travis-ci.com/pirovc/ganon) [![codecov](https://codecov.io/gh/pirovc/ganon/branch/master/graph/badge.svg)](https://codecov.io/gh/pirovc/ganon) [![Anaconda-Server Badge](https://anaconda.org/bioconda/ganon/badges/downloads.svg)](https://anaconda.org/bioconda/ganon) [![Anaconda-Server Badge](https://anaconda.org/bioconda/ganon/badges/platforms.svg)](https://anaconda.org/bioconda/ganon) [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/ganon/README.html) [![Publication](https://img.shields.io/badge/DOI-10.1101%2F406017-blue)](https://dx.doi.org/10.1093/bioinformatics/btaa458) -ganon classifies short DNA sequences against large sets of genomic reference sequences efficiently. +ganon classifies DNA sequences against large sets of genomic reference sequences efficiently. It features: -- automatically downloads, builds and updates commonly used repositories (refseq/genbank). 
+- automatic download, build and update for commonly used repos (refseq/genbank) +- binning and taxonomic profiling +- multiple taxonomy integration (ncbi/gtdb) +- LCA algorithm + read reassignment EM algorithm +- hierarchical use of databases +- taxonomic and sequence abundance reports with genome size correction +- contingency tables and many more -- binning and taxonomic profiling are supported + multi taxonomy integration (ncbi/gtdb) + LCA algorithm + read reassignment EM algorithm + hierarchical use of databases. +User manual: https://pirovc.github.io/ganon/ -- ganon generates taxonomic and sequence abundance reports with genome size correction, contingency tables and has many other [features](#Features) - ---- - -## Quick install/usage guide - -### Install with conda +## Quick install ```sh conda install -c bioconda -c conda-forge ganon ``` -### Download and build +## Basic usage + +### Download and build (Archaea - complete genomes - NCBI RefSeq) + ```bash -# Archaeal complete genome sequences from NCBI RefSeq ganon build --db-prefix arc_cg_rs --source refseq --organism-group archaea --complete-genomes --threads 12 ``` @@ -29,948 +31,4 @@ ganon build --db-prefix arc_cg_rs --source refseq --organism-group archaea --com ganon classify --db-prefix arc_cg_rs --output-prefix classify_results --single-reads my_reads.fq.gz --threads 12 ``` -### Re-generate reports and create tables from multiple reports -```bash -ganon report --db-prefix arc_cg_rs --input classify_results.rep --output-prefix filtered_report --min-count 0.01 -ganon table --input classify_results.tre filtered_report.tre --output-file output_table.tsv --top-sample 10 -``` - -### Update the database at a later time point -```bash -ganon update --db-prefix arc_cg_rs --threads 12 -``` - ---- - -- [Details](#details) -- [Examples](#examples) -- [Output files](#output-files) -- [Building customized databases](#building-customized-databases) -- [Multiple and Hierarchical 
classification](#multiple-and-hierarchical-classification) -- [Choosing and explaining parameters](#choosing-and-explaining-parameters) -- [Parameters](#parameters) - -## Details - -ganon is designed to index large sets of genomic reference sequences and to classify short reads against them efficiently. The tool uses Interleaved Bloom Filters as indices based on k-mers/minimizers. It was mainly developed, but not limited, to the metagenomics classification problem: quickly assign short fragments to their closest reference among thousands of references. After classification, taxonomic abundance is estimated and reported. - -### Features - -- NCBI and GTDB native support for taxonomic classification (also runs without taxonomy) -- integrated download of commonly used reference sequences from RefSeq/Genbank (`ganon build`) -- update indices incrementally (`ganon update`) -- customizable build for pre-downloaded or non-standard sequence files (`ganon build-custom`) -- build and classify at different taxonomic levels, file, sequence, strain/assembly or custom specialization -- perform [hierarchical classification](#multiple-and-hierarchical-classification): use several databases in any order -- [report](#report) the lowest common ancestor (LCA) but also multiple and unique matches for every read -- [report](#report) sequence or taxonomic abundances as well as total number of matches -- reassignment of reads with multiple matches to a unique match with an EM algorithm -- generate reports and contingency tables for multi-sample studies with several filter options - -ganon achieved very good results in [our own evaluations](https://dx.doi.org/10.1093/bioinformatics/btaa458) but also in independent evaluations: [LEMMI](https://lemmi-v1.ezlab.org/), [LEMMI v2](https://lemmi.ezlab.org/) and [CAMI2](https://dx.doi.org/10.1038/s41592-022-01431-4) - -### Installation guide - -The easiest way to install ganon is via conda, using the bioconda and conda-forge channels: - -```bash 
-conda install -c bioconda -c conda-forge ganon -``` - -However, there are possible performance benefits compiling ganon from source in the target machine rather than using the conda version. To do so, please follow the instructions below: - -
- Instructions - -#### build dependencies - -System packages: -- gcc >=7 -- cmake >=3.10 -- zlib - -#### run dependencies - -System packages: -- python >=3.6 -- pandas >=1.1.0 -- [multitax](https://github.com/pirovc/multitax) >=1.2.1 - -```bash -python3 -V # >=3.6 -python3 -m pip install "pandas>=1.1.0" "multitax>=1.2.1" -``` - -#### Downloading and building ganon + submodules - -```bash -git clone --recurse-submodules https://github.com/pirovc/ganon.git -``` - -```bash -cd ganon -python3 setup.py install --record files.txt #optional -mkdir build_cpp -cd build_cpp -cmake -DCMAKE_BUILD_TYPE=Release -DVERBOSE_CONFIG=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCONDA=OFF .. -make -sudo make install #optional -``` - -- to change install location (e.g. `/myprefix/bin/`), set the installation prefix in the cmake command with `-DCMAKE_INSTALL_PREFIX=/myprefix/ ` - -- use `-DINCLUDE_DIRS` to set alternative paths to cxxopts and Catch2 libs. - -If everything was properly installed, the following commands should show the help pages without errors: - -```bash -ganon -h -``` - -#### Run tests - -```bash -python3 -m unittest discover -s tests/ganon/integration/ -python3 -m unittest discover -s tests/ganon/integration_online/ #optional - downloads large files -cd build_cpp/ -ctest -VV . -``` -
- -## Examples - -### Commonly used reference set for metagenomics analysis (archaea+bacteria+fungi+viral from NCBI RefSeq, complete genomes, one assembly per taxa) - -```bash -ganon build --db-prefix abfv_rs_cg_t1a --organism-group archaea bacteria fungi viral --source refseq --taxonomy ncbi --complete-genomes --threads 12 --top 1 -``` - -### GTDB database, one assembly per taxa -```bash -ganon build --db-prefix complete_gtdb --organism-group archaea bacteria --source refseq genbank --taxonomy gtdb --threads 12 --top 1 -``` - -### Database based on specific taxonomic identifiers (203492 - Fusobacteriaceae) -```bash -# NCBI -ganon build --db-prefix fuso_ncbi --taxid "203492" --source refseq genbank --taxonomy ncbi --threads 12 -# GTDB -ganon build --db-prefix fuso_gtdb --taxid "f__Fusobacteriaceae" --source refseq genbank --taxonomy gtdb --threads 12 -``` - -### Customized database at assembly level -```bash -ganon build-custom --db-prefix my_db --input my_big_fasta_file.fasta.gz --level assembly --threads 12 -``` - -### Customized database at species level build based on files previously downloaded with [genome_updater](https://github.com/pirovc/genome_updater) -```bash -ganon build-custom --db-prefix custom_db_gu --input myfiles/2022-06-28_10-02-14/files/ --level species --ncbi-file-info outfolder/2022-06-28_10-02-14/assembly_summary.txt --threads 12 -``` - -### Customized database with sequence as target (to classify reads at sequence level) -```bash -ganon build-custom --db-prefix seq_target --input myfiles/2022-06-28_10-02-14/files/ --ncbi-file-info outfolder/2022-06-28_10-02-14/assembly_summary.txt --input-target sequence --threads 12 -``` -## Output files - -### build/update - -Every run on `ganon build`, `ganon build-custom` or `ganon update` will generate the following database files: - - - {prefix}**.ibf**: main interleaved bloom filter index file - - {prefix}**.tax**: taxonomy tree, only generated if `--taxonomy` is used *(fields: target/node, parent, 
rank, name, genome size)*. - - {prefix}**_files/**: (`ganon build` only) folder containing downloaded reference sequence and auxiliary files. Not necessary for classification. Keep this folder if the database will be update later. Otherwise it can be deleted. - -*Obs: Database files generated with version 1.2.0 or higher are not compatible with older versions.* - -### classify - - - {prefix}**.tre**: full report file (see below) - - {prefix}**.rep**: plain report of the run with only targets that received a match. Can be used to re-generate full reports (.tre) with `ganon report`. At the end prints 2 extra lines with `#total_classified` and `#total_unclassified`. Fields: - - 1: hierarchy label - - 2: target - - 3: # total matches - - 4: # unique reads - - 5: # lca reads - - 6: rank - - 7: name - - {prefix}**.lca**: output with one match for each classified read after LCA. Only generated with `--output-lca` active. If multiple hierarchy levels are set, one file for each level will be created: {prefix}.{hierarchy}.lca *(fields: read identifier, target, (max) k-mer/minimizer count)* - - {prefix}**.all**: output with all matches for each read. Only generated with `--output-all` active **Warning: file can be very large**. If multiple hierarchy levels are set, one file for each level will be created: {prefix}.{hierarchy}.all *(fields: read identifier, target, k-mer/minimizer count)* - -### report - - - {prefix}**.tre**: tab-separated tree-like report with cumulative counts and taxonomic lineage. There are several possible `--report-type`. More information on the different types of reports can be found [here](#report-type---report-type): - - *abundance*: will attempt to estimate **taxonomic abundances** by re-disributing read counts from LCA matches and correcting sequence abundance by approximate genome sizes. - - *reads*: **sequence abundances**, reports the proportion of sequences assigned to a taxa, each read classified is counted once. 
- - *dist*: like *reads* with read count re-distribution. - - *corr*: like *reads* with correction by genome size. - - *matches*: every match is reported to their original target, including multiple and shared matches. - -Each line in this report is a taxonomic entry (including the root node), with the following fields: - -| col | field | obs | example | -|-----|--------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------| -| 1 | rank | | phylum | -| 2 | target | taxonomic id. or specialization (assembly id.) | 562 | -| 3 | lineage | | 1\|131567\|2\|1224\|28211\|766\|942\|768\|769 | -| 4 | name | | Chromobacterium rhizoryzae | -| 5 | # unique | number of reads that matched exclusively to this target | 5 | -| 6 | # shared | number of reads with non-unique matches directly assigned to this target. Represents the LCA matches (`--report-type reads`), re-assigned matches (`--report-type abundance/dist`) or shared matches (`--report-type matches`) | 10 | -| 7 | # children | number of unique and shared assignments to all children nodes of this target | 20 | -| 8 | # cumulative | the sum of the unique, shared and children assignments up-to this target | 35 | -| 9 | % cumulative | percentage of assignments or estimated relative abundance for `--report-type abundance` | 43.24 | - -- The first line of the report file will show the number of unclassified reads (not for `--report-type matches`) - -- The CAMI challenge [bioboxes profiling format](https://github.com/bioboxes/rfc/blob/master/data-format/profiling.mkd) is supported using `--output-format bioboxes`. In this format, only values for the percentage/abundance (col. 9) are reported. The root node and unclassified entries are ommited. 
- -- The sum of cumulative assignments for the unclassified and root lines is 100%. The final cumulative sum of reads/matches may be under 100% if any filter is successfully applied and/or hierarchical selection is used (keep/skip/split). - -- For all report types but `matches`, only taxa that received direct read matches, either unique or by LCA assignment, are considered. Some reads may have only shared matches and will not be reported directly but will be accounted for on some parent level. To visualize those matches, create a report with `--report-type matches` or use directly the file {prefix}**.rep**. - -### table - - - {output_file}: a tab-separated file with counts/percentages of taxa for multiple samples - -
- Examples of output files - -The main output file is the `{prefix}.tre` which will summarize the results: - -``` -unclassified unclassified 0 0 0 2 2.02020 -root 1 1 root 0 0 97 97 97.97980 -superkingdom 2 1|2 Bacteria 0 0 97 97 97.97980 -phylum 1239 1|2|1239 Firmicutes 0 0 57 57 57.57576 -phylum 1224 1|2|1224 Proteobacteria 0 0 40 40 40.40404 -class 91061 1|2|1239|91061 Bacilli 0 0 57 57 57.57576 -class 28211 1|2|1224|28211 Alphaproteobacteria 0 0 28 28 28.28283 -class 1236 1|2|1224|1236 Gammaproteobacteria 0 0 12 12 12.12121 -order 1385 1|2|1239|91061|1385 Bacillales 0 0 57 57 57.57576 -order 204458 1|2|1224|28211|204458 Caulobacterales 0 0 28 28 28.28283 -order 72274 1|2|1224|1236|72274 Pseudomonadales 0 0 12 12 12.12121 -family 186822 1|2|1239|91061|1385|186822 Paenibacillaceae 0 0 57 57 57.57576 -family 76892 1|2|1224|28211|204458|76892 Caulobacteraceae 0 0 28 28 28.28283 -family 468 1|2|1224|1236|72274|468 Moraxellaceae 0 0 12 12 12.12121 -genus 44249 1|2|1239|91061|1385|186822|44249 Paenibacillus 0 0 57 57 57.57576 -genus 75 1|2|1224|28211|204458|76892|75 Caulobacter 0 0 28 28 28.28283 -genus 469 1|2|1224|1236|72274|468|469 Acinetobacter 0 0 12 12 12.12121 -species 1406 1|2|1239|91061|1385|186822|44249|1406 Paenibacillus polymyxa 57 0 0 57 57.57576 -species 366602 1|2|1224|28211|204458|76892|75|366602 Caulobacter sp. 
K31 28 0 0 28 28.28283 -species 470 1|2|1224|1236|72274|468|469|470 Acinetobacter baumannii 12 0 0 12 12.12121 -``` - -running `ganon classify` or `ganon report` with `--ranks all`, the output will show all ranks used for classification and presented sorted by lineage (also available with `ganon report --sort lineage`): - -``` -unclassified unclassified 0 0 0 2 2.02020 -root 1 1 root 0 0 97 97 97.97980 -no rank 131567 1|131567 cellular organisms 0 0 97 97 97.97980 -superkingdom 2 1|131567|2 Bacteria 0 0 97 97 97.97980 -phylum 1224 1|131567|2|1224 Proteobacteria 0 0 40 40 40.40404 -class 1236 1|131567|2|1224|1236 Gammaproteobacteria 0 0 12 12 12.12121 -order 72274 1|131567|2|1224|1236|72274 Pseudomonadales 0 0 12 12 12.12121 -family 468 1|131567|2|1224|1236|72274|468 Moraxellaceae 0 0 12 12 12.12121 -genus 469 1|131567|2|1224|1236|72274|468|469 Acinetobacter 0 0 12 12 12.12121 -species group 909768 1|131567|2|1224|1236|72274|468|469|909768 Acinetobacter calcoaceticus/baumannii complex 0 0 12 12 12.12121 -species 470 1|131567|2|1224|1236|72274|468|469|909768|470 Acinetobacter baumannii 12 0 0 12 12.12121 -class 28211 1|131567|2|1224|28211 Alphaproteobacteria 0 0 28 28 28.28283 -order 204458 1|131567|2|1224|28211|204458 Caulobacterales 0 0 28 28 28.28283 -family 76892 1|131567|2|1224|28211|204458|76892 Caulobacteraceae 0 0 28 28 28.28283 -genus 75 1|131567|2|1224|28211|204458|76892|75 Caulobacter 0 0 28 28 28.28283 -species 366602 1|131567|2|1224|28211|204458|76892|75|366602 Caulobacter sp. 
K31 28 0 0 28 28.28283 -no rank 1783272 1|131567|2|1783272 Terrabacteria group 0 0 57 57 57.57576 -phylum 1239 1|131567|2|1783272|1239 Firmicutes 0 0 57 57 57.57576 -class 91061 1|131567|2|1783272|1239|91061 Bacilli 0 0 57 57 57.57576 -order 1385 1|131567|2|1783272|1239|91061|1385 Bacillales 0 0 57 57 57.57576 -family 186822 1|131567|2|1783272|1239|91061|1385|186822 Paenibacillaceae 0 0 57 57 57.57576 -genus 44249 1|131567|2|1783272|1239|91061|1385|186822|44249 Paenibacillus 0 0 57 57 57.57576 -species 1406 1|131567|2|1783272|1239|91061|1385|186822|44249|1406 Paenibacillus polymyxa 57 0 0 57 57.57576 -``` - -with `--output-format bioboxes` - -``` -@Version:0.10.0 -@SampleID:example.rep H1 -@Ranks:superkingdom|phylum|class|order|family|genus|species|assembly -@Taxonomy:db.tax -@@TAXID RANK TAXPATH TAXPATHSN PERCENTAGE -2 superkingdom 2 Bacteria 100.00000 -1224 phylum 2|1224 Bacteria|Proteobacteria 56.89782 -201174 phylum 2|201174 Bacteria|Actinobacteria 21.84869 -1239 phylum 2|1239 Bacteria|Firmicutes 9.75197 -976 phylum 2|976 Bacteria|Bacteroidota 6.15297 -1117 phylum 2|1117 Bacteria|Cyanobacteria 2.23146 -203682 phylum 2|203682 Bacteria|Planctomycetota 1.23353 -57723 phylum 2|57723 Bacteria|Acidobacteria 0.52549 -200795 phylum 2|200795 Bacteria|Chloroflexi 0.31118 -``` -
- -## Building customized databases - -Besides the automated download and build (`ganon build`) ganon provides a highly customizable build procedure (`ganon build-custom`) to create databases. - -To use custom sequences, just provide them with `--input`. ganon will try to retrieve all information necessary to build a database. - -*ganon expects assembly accessions if building by file (e.g. filename should be similar to `GCA_002211645.1_ASM221164v1_genomic.fna.gz`) or accession version if building by sequence (e.g. headers should look like `>CP022124.1 Fusobacterium nu...`).* More information about building by file or sequence can be found [here](#target-file-or-sequence---input-target). - -It is also possible to use non-standard accessions and headers to build databases with `--input-file`. This file should contain the following fields (tab-separated): file, [target, node, specialization, specialization_name]. - -
- Examples of --input-file - -Using `--input-target sequence`: - -``` -sequences.fasta HEADER1 -sequences.fasta HEADER2 -sequences.fasta HEADER3 -others.fasta HEADER4 -others.fasta HEADER5 -``` - -or using `--input-target file`: - -``` -sequences.fasta FILE_A -others.fasta FILE_B -``` - -Nodes can be provided to link the data with taxonomy. For example (using `--taxonomy ncbi`): - -``` -sequences.fasta HEADER1 562 -sequences.fasta HEADER2 562 -sequences.fasta HEADER3 562 -others.fasta HEADER4 623 -others.fasta HEADER5 623 -``` - -Specialization can be used to create an additional classification level after the taxonomic leaves. For example (using `--level custom`): - -``` -sequences.fasta HEADER1 562 ID443 Escherichia coli TW10119 -sequences.fasta HEADER2 562 ID297 Escherichia coli PCN079 -sequences.fasta HEADER3 562 ID8873 Escherichia coli P0301867.7 -others.fasta HEADER4 623 ID2241 Shigella flexneri 1a -others.fasta HEADER5 623 ID4422 Shigella flexneri 1b -``` -
- -## Multiple and Hierarchical classification - -`ganon classify` can be performed in multiple databases at the same time. The databases can also be provided in a hierarchical order. - -Multiple database classification can be performed providing several inputs for `--db-prefix`. They are required to be built with the same `--kmer-size` and `--window-size` values. Multiple databases are considered as one (as if built together) and redundancy in content (same reference in two or more databases) is allowed. - -To classify reads in a hierarchical order, `--hierarchy-labels` should be provided. When using multiple hierarchical levels, output files will be generated for each level (use `--output-single` to generate a single output from multiple hierarchical levels). Please note that some parameters are set for each database (e.g. `--rel-cutoff`) while others are set for each hierarchical level (e.g. `--rel-filter`) - -
- Examples - -### Classifying reads against multiple databases: - -```bash -ganon classify --db-prefix db1 db2 db3 \ - --rel-cutoff 0.75 \ - --single-reads reads.fq.gz -``` - -Classification against 3 databases (as if they were one) using the same cutoff. - -### Classifying reads against multiple databases with different cutoffs: - -```bash -ganon classify --db-prefix db1 db2 db3 \ - --rel-cutoff 0.2 0.3 0.1 \ - --single-reads reads.fq.gz -``` - -Classification against 3 databases (as if they were one) using a different cutoff for each. - -### Classifying reads against multiple databases hierarchically: - -```bash -ganon classify --db-prefix db1 db2 db3 \ - --hierarchy-labels 1_first 1_first 2_second \ - --single-reads reads.fq.gz -``` - -In this example, reads are going to be classified first against db1 and db2. Reads without a valid match will be further classified against db3. `--hierarchy-labels` are strings and are going to be sorted to define the hierarchy order, disregarding input order. - -### Classifying reads against multiple databases hierarchically with different cutoffs: - -```bash -ganon classify --db-prefix db1 db2 db3 \ - --hierarchy-labels 1_first 1_first 2_second \ - --rel-cutoff 1 0.5 0.25 \ - --rel-filter 0.1 0.5 \ - --single-reads reads.fq.gz -``` - -In this example, classification will be performed with different `--rel-cutoff` for each database. For each hierarchy level (`1_first` and `2_second`) a different `--rel-filter` will be used. - -
- -## Choosing and explaining parameters - -The most important parameters and trade-offs are: - -- `ganon build` `--window-size --kmer-size`: the *window* value should always be the same or larger than the *kmer* value. The larger the difference between them, the smaller the database will be. However, some sensitivity/precision loss in classification is expected with small *kmer* and/or large *window*. Larger *kmer* values (e.g. `31`) will improve classification, especially read binning, at the cost of much bigger databases. -- `ganon classify` `--rel-cutoff`: this value defines the threshold for matches between reads and database. Higher `--rel-cutoff` values will improve precision and decrease sensitivity, with fewer unique matches expected but an increase in overall matches. For taxonomic profiling, a higher value between `0.4` and `0.8` may provide better results. For read binning, lower values between `0.2` and `0.4` are recommended. -- `ganon classify` `--rel-filter`: further filter top matches after cutoff is applied. Usually set between `0` and `0.2`. -- `ganon classify` `--reassign`: runs an EM algorithm to reassign reads that received multiple matches. It provides a unique match for each read at the level the database was built (e.g. assembly or species). Mostly useful for read binning, with little overall impact on taxonomic profiling. Can be used independently with `ganon reassign`. -- `ganon report` `--report-type`: reports either taxonomic, sequence or matches abundances. Use `corr` or `abundance` for taxonomic profiling, `reads` or `dist` for sequence profiling and `matches` to report a summary of all matches. -- `ganon report` `--min-count`: cutoff to discard underrepresented taxa. Useful to remove the common long tail of spurious matches and false positives when performing classification. Values between `0.0001` (0.01%) and `0.001` (0.1%) improved sensitivity and precision in our evaluations. 
The higher the value, the more precise the outcome, with a sensitivity loss. Alternatively `--top-percentile` can be used to keep a relative amount of taxa instead of a hard cutoff. - -The numeric values above are averages from several experiments with different sample types and database contents. They may not work as expected for your data. If you are not sure which values to use or see something unexpected, please open an [issue](https://github.com/pirovc/ganon/issues). - -Below, a more in-depth explanation of important parameters: - -### ganon build - -#### filter false positive and size (--max-fp, --filter-size) - -ganon indices are based on bloom filters and can have false positive matches. This can be controlled with the `--max-fp` parameter. The lower the `--max-fp`, the lower the chance of false positives, but the larger the database size will be. For example, with `--max-fp 0.01` the database will be built so any target (e.g. assembly, specified with `--level`) will have a 1 in 100 chance of reporting a false match (between minimizer k-mers). - -Alternatively, one can set a specific size for the final index with `--filter-size`. When using this option, please observe the theoretical false positive rate of the index reported at the end of the building process. - -#### minimizers (--window-size, --kmer-size) - -in `ganon build`, when `--window-size` > `--kmer-size` minimizers are used. That means that for every window, a single k-mer will be selected. It produces smaller database files and requires substantially less memory overall. It may increase building times but will have a huge benefit for classification times. Sensitivity and precision can be reduced by small margins. If `--window-size` = `--kmer-size`, all k-mers are going to be used to build the database. - -### ganon classify - -#### reads (--single-reads, --paired-reads) - -ganon accepts single-end and paired-end reads. Both types can be used at the same time. 
In paired-end mode, reads are always reported with the header of the first pair. Paired-end reads are classified in a standard forward-reverse orientation. The max. read length accepted is 65535 (accounting both reads in paired mode). - -#### cutoff and filter (--rel-cutoff, --rel-filter) - -ganon has two parameters to control a match between reads and references: `--rel-cutoff` and `--rel-filter`. - -Every read can be classified against none, one or more references. What will be reported is the remaining matches after `cutoff` and `filter` thresholds are applied, based on the number of shared minimizers (or k-mers) between sequences. - -The `cutoff` is the first. It should be set as a minimal value to consider a match between a read and a reference. Next the `filter` is applied to the remaining matches. `filter` thresholds are relative to the best scoring match and control how far from the best match further matches are allowed. `cutoff` can be interpreted as the lower bound to discard spurious matches and `filter` as the fine tuning to control what to keep. - -
- Example - -Using `--kmer-size 19` (and `--window-size 19` to simplify the example), a certain read (100bp) has the following matches with the 5 references (`ref1..5`), sorted by shared k-mers: - -| reference | shared k-mers | -|-----------|---------------| -| ref1 | 82 | -| ref2 | 68 | -| ref3 | 44 | -| ref4 | 25 | -| ref5 | 20 | - -this read can have at most 82 shared k-mers (`100-19+1=82`). With `--rel-cutoff 0.25`, the following matches will be discarded: - -| reference | shared k-mers | --rel-cutoff 0.25 | -|-----------|---------------|-------------------| -| ref1 | 82 | | -| ref2 | 68 | | -| ref3 | 44 | | -| ref4 | 25 | | -| ~~ref5~~ | ~~20~~ | X | - -since the `--rel-cutoff` threshold is `82 * 0.25 = 21` (ceiling is applied). Further, with `--rel-filter 0.3`, the following matches will be discarded: - -| reference | shared k-mers | --rel-cutoff 0.25 | --rel-filter 0.3 | -|-----------|---------------|-------------------|------------------| -| ref1 | 82 | | | -| ref2 | 68 | | | -| ~~ref3~~ | ~~44~~ | | X | -| ~~ref4~~ | ~~25~~ | | X | -| ~~ref5~~ | ~~20~~ | X | | - - -since best match is 82, the filter parameter is removing any match below `0.3 * 82 = 57` (ceiling is applied) shared k-mers. `ref1` and `ref2` are reported as matches. - -
- -For databases built with `--window-size`, the relative values are not based on the maximum number of possible shared k-mers but on the actual number of unique minimizers extracted from the read. - -A different `cutoff` can be set for every database in a multiple or hierarchical database classification. A different `filter` can be set for every level of a hierarchical database classification. - -Note that reads that remain with only one reference match (after `cutoff` and `filter` are applied) are considered a unique match. - -### ganon build-custom - -#### Target file or sequence (--input-target) - -Customized builds can be done either by file or sequence. `--input-target file` will consider every file provided with `--input` a single unit. `--input-target sequence` will use every sequence as a unit. - -`--input-target file` is the default behavior and most efficient way to build databases. `--input-target sequence` should only be used when the input sequences are stored in a single file or when classification at sequence level is desired. - -#### Build level (--level) - -The `--level` parameter defines the max. depth of the database for classification. This parameter is relevant because the `--max-fp` is going to be guaranteed at the `--level` chosen. By default, the level will be the same as `--input-target`, meaning that classification will be done either at file or sequence level. - -Alternatively, `--level assembly` will link the file or sequence target information with assembly accessions retrieved from NCBI servers. `--level leaves` or `--level species` (or genus, family, ...) will link the targets with taxonomic information and prune the tree at the chosen level. `--level custom` will use specialization level define in the `--input-file`. - -#### Genome sizes (--genome-size-files) - -Ganon will automatically download auxiliary files to define an approximate genome size for each entry in the taxonomic tree. 
For `--taxonomy ncbi` the [species_genome_size.txt.gz](https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/) is used. For `--taxonomy gtdb` the [\*_metadata.tar.gz](https://data.gtdb.ecogenomic.org/releases/latest/) files are used. Those files can be directly provided with the `--genome-size-files` argument. - -Genome sizes of parent nodes are calculated as the average of the respective children nodes. Other nodes without direct assigned genome sizes will use the closest parent with a pre-calculated genome size. The genome sizes are stored in the [ganon database](#buildupdate). - -#### Retrieving info (--ncbi-sequence-info, --ncbi-file-info) - -Further taxonomy and assembly linking information has to be collected to properly build the database. `--ncbi-sequence-info` and `--ncbi-file-info` allow customizations on this step. - -When `--input-target sequence`, `--ncbi-sequence-info` argument allows the use of NCBI e-utils webservices (`eutils`) or downloads accession2taxid files to extract target information (options `nucl_gb` `nucl_wgs` `nucl_est` `nucl_gss` `pdb` `prot` `dead_nucl` `dead_wgs` `dead_prot`). By default, ganon uses `eutils` up-to 50000 input sequences, otherwise it downloads `nucl_gb` `nucl_wgs` from https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/. Previously downloaded files can be directly provided with this argument. - -When `--input-target file`, `--ncbi-file-info` uses `assembly_summary.txt` from https://ftp.ncbi.nlm.nih.gov/genomes/ to extract target information (options `refseq` `genbank` `refseq_historical` `genbank_historical`. Previously downloaded files can be directly provided with this argument. - -If you are using outdated, removed or inactive assembly or sequence files and accessions from NCBI, make sure to include `dead_nucl` `dead_wgs` for `--ncbi-sequence-info` or `refseq_historical` `genbank_historical` for `--ncbi-file-info`. `eutils` option does not work with outdated accessions. 
- -### ganon report - -#### report type (--report-type) - -Several reports are available with `--report-type`: `reads`, `abundance`, `dist`, `corr`, `matches`: - -`reads` reports **sequence abundances** which are the basic proportion of reads classified in the sample. - -`abundance` will convert sequence abundance into **taxonomic abundances** by re-distributing read counts among leaf nodes and correcting by genome size. The re-distribution applies for reads classified with an LCA assignment and it is proportional to the number of unique matches of leaf nodes available in the ganon database (relative to the LCA node). Genome size is estimated based on [NCBI or GTDB auxiliary files](#genome-sizes---genome-size-files). Genome size correction is applied by rank based on default ranks only (superkingdom phylum class order family genus species assembly). Read counts in intermediate ranks will be corrected based on the closest parent default rank and re-assigned to its original rank. - -`dist` is the same as `reads` with read count re-distribution - -`corr` is the same as `reads` with correction by genome size - -`matches` will report the total number of matches classified, either unique or shared. *This report will output the total number of matches instead of the total number of reads reported in all other reports.* - -## Parameters - -``` -usage: ganon [-h] [-v] - {build,build-custom,update,classify,reassign,report,table} ... - -- - - - - - - - - - - _ _ _ _ _ - (_|(_|| |(_)| | - _| v. 
1.6.0 -- - - - - - - - - - - -positional arguments: - {build,build-custom,update,classify,reassign,report,table} - build Download and build ganon default databases - (refseq/genbank) - build-custom Build custom ganon databases - update Update ganon default databases - classify Classify reads against built databases - reassign Reassign reads with multiple matches with an EM - algorithm - report Generate reports from classification results - table Generate table from reports - -options: - -h, --help show this help message and exit - -v, --version Show program's version number and exit. -``` - -
- ganon build - -``` -usage: ganon build [-h] [-g [...]] [-a [...]] [-b [...]] [-o] [-c] [-u] [-m [...]] [-z [...]] -d DB_PREFIX [-x] [-t] - [-p] [-f] [-k] [-w] [-s] [-j] [--hibf] [--restart] [--verbose] [--quiet] [--write-info-file] - -options: - -h, --help show this help message and exit - -required arguments: - -g [ ...], --organism-group [ ...] - One or more organism groups to download [archaea, bacteria, fungi, human, invertebrate, - metagenomes, other, plant, protozoa, vertebrate_mammalian, vertebrate_other, viral]. Mutually - exclusive --taxid (default: None) - -a [ ...], --taxid [ ...] - One or more taxonomic identifiers to download. e.g. 562 (-x ncbi) or 's__Escherichia coli' (-x - gtdb). Mutually exclusive --organism-group (default: None) - -d DB_PREFIX, --db-prefix DB_PREFIX - Database output prefix (default: None) - -download arguments: - -b [ ...], --source [ ...] - Source to download [refseq, genbank] (default: ['refseq']) - -o , --top Download limited assemblies for each taxa. 0 for all. (default: 0) - -c, --complete-genomes - Download only sub-set of complete genomes (default: False) - -u , --genome-updater - Additional genome_updater parameters (https://github.com/pirovc/genome_updater) (default: None) - -m [ ...], --taxonomy-files [ ...] - Specific files for taxonomy - otherwise files will be downloaded (default: None) - -z [ ...], --genome-size-files [ ...] - Specific files for genome size estimation - otherwise files will be downloaded (default: None) - -important arguments: - -x , --taxonomy Set taxonomy to enable taxonomic classification, lca and reports [ncbi, gtdb, skip] (default: - ncbi) - -t , --threads - -advanced arguments: - -p , --max-fp Max. false positive rate for bloom filters Mutually exclusive --filter-size. (default: 0.05) - -f , --filter-size Fixed size for filter in Megabytes (MB). Mutually exclusive --max-fp. (default: 0) - -k , --kmer-size The k-mer size to split sequences. 
(default: 19) - -w , --window-size The window-size to build filter with minimizers. (default: 31) - -s , --hash-functions - The number of hash functions for the interleaved bloom filter [0-5]. 0 to detect optimal value. - (default: 4) - -j , --mode Create smaller or faster filters at the cost of classification speed or database size, - respectively [avg, smaller, smallest, faster, fastest]. If --filter-size is used, - smaller/smallest refers to the false positive rate. By default, an average value is calculated - to balance classification speed and database size. (default: avg) - --hibf Builds an HIBF with raptor/chopper (v3). --mode and --filter-size will be ignored. (default: - False) - -optional arguments: - --restart Restart build/update from scratch, do not try to resume from the latest possible step. - {db_prefix}_files/ will be deleted if present. (default: False) - --verbose Verbose output mode (default: False) - --quiet Quiet output mode (default: False) - --write-info-file Save copy of target info generated to {db_prefix}.info.tsv. Can be re-used as --input-file for - further attempts. (default: False) -``` - -
- -
- ganon build-custom - -``` -usage: ganon build-custom [-h] [-i [...]] [-e] [-n] [-a] [-l] [-m [...]] [-z [...]] [-r [...]] [-q [...]] -d DB_PREFIX - [-x] [-t] [-p] [-f] [-k] [-w] [-s] [-j] [--hibf] [--restart] [--verbose] [--quiet] - [--write-info-file] - -options: - -h, --help show this help message and exit - -required arguments: - -i [ ...], --input [ ...] - Input file(s) and/or folder(s). Mutually exclusive --input-file. (default: None) - -e , --input-extension - Required if --input contains folder(s). Wildcards/Shell Expansions not supported (e.g. *). - (default: fna.gz) - -d DB_PREFIX, --db-prefix DB_PREFIX - Database output prefix (default: None) - -custom arguments: - -n , --input-file Manually set information for input files: file [target node specialization - specialization name]. target is the sequence identifier if --input-target sequence (file - can be repeated for multiple sequences). if --input-target file and target is not set, filename - is used. node is the taxonomic identifier. Mutually exclusive --input (default: None) - -a , --input-target Target to use [file, sequence]. By default: 'file' if multiple input files are provided or - --input-file is set, 'sequence' if a single file is provided. Using 'file' is recommended and - will speed-up the building process (default: None) - -l , --level Use a specialized target to build the database. By default, --level is the --input-target. - Options: any available taxonomic rank [species, genus, ...] or 'leaves' (requires --taxonomy). - Further specialization options [assembly, custom]. assembly will retrieve and use the assembly - accession and name. custom requires and uses the specialization field in the --input-file. - (default: None) - -m [ ...], --taxonomy-files [ ...] - Specific files for taxonomy - otherwise files will be downloaded (default: None) - -z [ ...], --genome-size-files [ ...] 
- Specific files for genome size estimation - otherwise files will be downloaded (default: None) - -ncbi arguments: - -r [ ...], --ncbi-sequence-info [ ...] - Uses NCBI e-utils webservices or downloads accession2taxid files to extract target information. - [eutils, nucl_gb, nucl_wgs, nucl_est, nucl_gss, pdb, prot, dead_nucl, dead_wgs, dead_prot or one - or more accession2taxid files from https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/]. - By default uses e-utils up-to 50000 sequences or downloads nucl_gb nucl_wgs otherwise. (default: - []) - -q [ ...], --ncbi-file-info [ ...] - Downloads assembly_summary files to extract target information. [refseq, genbank, - refseq_historical, genbank_historical or one or more assembly_summary files from - https://ftp.ncbi.nlm.nih.gov/genomes/] (default: ['refseq', 'genbank']) - -important arguments: - -x , --taxonomy Set taxonomy to enable taxonomic classification, lca and reports [ncbi, gtdb, skip] (default: - ncbi) - -t , --threads - -advanced arguments: - -p , --max-fp Max. false positive rate for bloom filters Mutually exclusive --filter-size. (default: 0.05) - -f , --filter-size Fixed size for filter in Megabytes (MB). Mutually exclusive --max-fp. (default: 0) - -k , --kmer-size The k-mer size to split sequences. (default: 19) - -w , --window-size The window-size to build filter with minimizers. (default: 31) - -s , --hash-functions - The number of hash functions for the interleaved bloom filter [0-5]. 0 to detect optimal value. - (default: 4) - -j , --mode Create smaller or faster filters at the cost of classification speed or database size, - respectively [avg, smaller, smallest, faster, fastest]. If --filter-size is used, - smaller/smallest refers to the false positive rate. By default, an average value is calculated - to balance classification speed and database size. (default: avg) - --hibf Builds an HIBF with raptor/chopper (v3). --mode and --filter-size will be ignored. 
(default: - False) - -optional arguments: - --restart Restart build/update from scratch, do not try to resume from the latest possible step. - {db_prefix}_files/ will be deleted if present. (default: False) - --verbose Verbose output mode (default: False) - --quiet Quiet output mode (default: False) - --write-info-file Save copy of target info generated to {db_prefix}.info.tsv. Can be re-used as --input-file for - further attempts. (default: False) -``` - -
- -
- ganon update - -``` -usage: ganon update [-h] -d DB_PREFIX [-o] [-t] [--restart] [--verbose] [--quiet] [--write-info-file] - -options: - -h, --help show this help message and exit - -required arguments: - -d DB_PREFIX, --db-prefix DB_PREFIX - Existing database input prefix (default: None) - -important arguments: - -o , --output-db-prefix - Output database prefix. By default will be the same as --db-prefix and overwrite files (default: - None) - -t , --threads - -optional arguments: - --restart Restart build/update from scratch, do not try to resume from the latest possible step. - {db_prefix}_files/ will be deleted if present. (default: False) - --verbose Verbose output mode (default: False) - --quiet Quiet output mode (default: False) - --write-info-file Save copy of target info generated to {db_prefix}.info.tsv. Can be re-used as --input-file for - further attempts. (default: False) -``` - -
- -
- ganon classify - -``` -usage: ganon classify [-h] -d [DB_PREFIX ...] [-s [reads.fq[.gz] ...]] [-p [reads.1.fq[.gz] reads.2.fq[.gz] ...]] - [-c [...]] [-e [...]] [-o] [--output-lca] [--output-all] [--output-unclassified] [--output-single] - [-t] [-l [...]] [-r [...]] [-a] [--verbose] [--quiet] - -options: - -h, --help show this help message and exit - -required arguments: - -d [DB_PREFIX ...], --db-prefix [DB_PREFIX ...] - Database input prefix[es] (default: None) - -s [reads.fq[.gz] ...], --single-reads [reads.fq[.gz] ...] - Multi-fastq[.gz] file[s] to classify (default: None) - -p [reads.1.fq[.gz] reads.2.fq[.gz] ...], --paired-reads [reads.1.fq[.gz] reads.2.fq[.gz] ...] - Multi-fastq[.gz] pairs of file[s] to classify (default: None) - -cutoff/filter arguments: - -c [ ...], --rel-cutoff [ ...] - Min. percentage of a read (set of minimizers) shared with the a reference necessary to consider - a match. Generally used to cutoff low similarity matches. Single value or one per database (e.g. - 0.7 1 0.25). 0 for no cutoff (default: [0.75]) - -e [ ...], --rel-filter [ ...] - Additional relative percentage of minimizers (relative to the best match) to keep a match. - Generally used to select best matches above cutoff. Single value or one per hierarchy (e.g. 0.1 - 0). 1 for no filter (default: [0.0]) - -output arguments: - -o , --output-prefix - Output prefix for output (.rep) and report (.tre). Empty to output to STDOUT (only .rep) - (default: None) - --output-lca Output an additional file with one lca match for each read (.lca) (default: False) - --output-all Output an additional file with all matches. 
File can be very large (.all) (default: False) - --output-unclassified - Output an additional file with unclassified read headers (.unc) (default: False) - --output-single When using multiple hierarchical levels, output everything in one file instead of one per - hierarchy (default: False) - -other arguments: - -t , --threads Number of sub-processes/threads to use (default: 1) - -l [ ...], --hierarchy-labels [ ...] - Hierarchy definition of --db-prefix files to be classified. Can also be a string, but input will - be sorted to define order (e.g. 1 1 2 3). The default value reported without hierarchy is 'H1' - (default: None) - -r [ ...], --ranks [ ...] - Ranks to report taxonomic abundances (.tre). empty will report default ranks [superkingdom, - phylum, class, order, family, genus, species, assembly]. This file can be re-generated with the - 'ganon report' command for other types of abundances (reads, matches) with further filtration - and output options (default: []) - -a, --reassign Reassign reads with multiple matches with an EM algorithm. Will enforce --output-all. This file - can be re-generated with the 'ganon reassign'. (default: False) - --verbose Verbose output mode (default: False) - --quiet Quiet output mode (default: False) -``` - -
- -
- ganon reassign - -``` -usage: ganon reassign [-h] -i -o OUTPUT_PREFIX [-e] [-s] [--verbose] [--quiet] - -options: - -h, --help show this help message and exit - -required arguments: - -i , --input-prefix Input prefix to find files from ganon classify (.all and optionally .rep) (default: None) - -o OUTPUT_PREFIX, --output-prefix OUTPUT_PREFIX - Output prefix for reassigned file (.all and optionally .rep). In case of multiple files, the - base input filename will be appended at the end of the output file 'output_prefix + - FILENAME.all' (default: None) - -EM arguments: - -e , --max-iter Max. number of iterations for the EM algorithm. If 0, will run until convergence (check - --threshold) (default: 10) - -s , --threshold Convergence threshold limit to stop the EM algorithm. (default: 0) - -other arguments: - --verbose Verbose output mode (default: False) - --quiet Quiet output mode (default: False) -``` - -
- -
- ganon report - -``` -usage: ganon report [-h] -i [...] [-e INPUT_EXTENSION] -o OUTPUT_PREFIX [-d [...]] [-x] [-m [...]] [-z [...]] [-f] [-t] - [-r [...]] [-s] [-a] [-y] [-p [...]] [-k [...]] [-c] [--verbose] [--quiet] [--min-count] - [--max-count] [--names [...]] [--names-with [...]] [--taxids [...]] - -options: - -h, --help show this help message and exit - -required arguments: - -i [ ...], --input [ ...] - Input file(s) and/or folder(s). '.rep' file(s) from ganon classify. (default: None) - -e INPUT_EXTENSION, --input-extension INPUT_EXTENSION - Required if --input contains folder(s). Wildcards/Shell Expansions not supported (e.g. *). - (default: rep) - -o OUTPUT_PREFIX, --output-prefix OUTPUT_PREFIX - Output prefix for report file 'output_prefix.tre'. In case of multiple files, the base input - filename will be appended at the end of the output file 'output_prefix + FILENAME.tre' (default: - None) - -db/tax arguments: - -d [ ...], --db-prefix [ ...] - Database prefix(es) used for classification. Only '.tax' file(s) are required. If not provided, - new taxonomy will be downloaded. Mutually exclusive with --taxonomy. (default: []) - -x , --taxonomy Taxonomy database to use [ncbi, gtdb, skip]. Mutually exclusive with --db-prefix. (default: - ncbi) - -m [ ...], --taxonomy-files [ ...] - Specific files for taxonomy - otherwise files will be downloaded (default: None) - -z [ ...], --genome-size-files [ ...] - Specific files for genome size estimation - otherwise files will be downloaded (default: None) - -output arguments: - -f , --output-format - Output format [text, tsv, csv, bioboxes]. text outputs a tabulated formatted text file for - better visualization. bioboxes is the the CAMI challenge profiling format (only - percentage/abundances are reported). (default: tsv) - -t , --report-type Type of report [abundance, reads, matches, dist, corr]. 'abundance' -> tax. 
abundance (re- - distribute read counts and correct by genome size), 'reads' -> sequence abundance, 'matches' -> - report all unique and shared matches, 'dist' -> like reads with re-distribution of shared read - counts only, 'corr' -> like abundance without re-distribution of shared read counts (default: - abundance) - -r [ ...], --ranks [ ...] - Ranks to report ['', 'all', custom list]. 'all' for all possible ranks. empty for default ranks - [superkingdom, phylum, class, order, family, genus, species, assembly]. (default: []) - -s , --sort Sort report by [rank, lineage, count, unique]. Default: rank (with custom --ranks) or lineage - (with --ranks all) (default: ) - -a, --no-orphan Omit orphan nodes from the final report. Otherwise, orphan nodes (= nodes not found in the - db/tax) are reported as 'na' with root as direct parent. (default: False) - -y, --split-hierarchy - Split output reports by hierarchy (from ganon classify --hierarchy-labels). If activated, the - output files will be named as '{output_prefix}.{hierarchy}.tre' (default: False) - -p [ ...], --skip-hierarchy [ ...] - One or more hierarchies to skip in the report (from ganon classify --hierarchy-labels) (default: - []) - -k [ ...], --keep-hierarchy [ ...] - One or more hierarchies to keep in the report (from ganon classify --hierarchy-labels) (default: - []) - -c , --top-percentile - Top percentile filter, based on percentage/relative abundance. Applied only at default ranks - [superkingdom, phylum, class, order, family, genus, species, assembly] (default: 0) - -optional arguments: - --verbose Verbose output mode (default: False) - --quiet Quiet output mode (default: False) - -filter arguments: - --min-count Minimum number/percentage of counts to keep an taxa [values between 0-1 for percentage, >1 - specific number] (default: 0) - --max-count Maximum number/percentage of counts to keep an taxa [values between 0-1 for percentage, >1 - specific number] (default: 0) - --names [ ...] 
Show only entries matching exact names of the provided list (default: []) - --names-with [ ...] Show entries containing full or partial names of the provided list (default: []) - --taxids [ ...] One or more taxids to report (including children taxa) (default: []) -``` - -
- -
- ganon table - -``` -usage: ganon table [-h] -i [...] [-e] -o OUTPUT_FILE [-l] [-f] [-t] [-a] [-m] [-r] [-n] [--header] - [--unclassified-label] [--filtered-label] [--skip-zeros] [--transpose] [--verbose] [--quiet] - [--min-count] [--max-count] [--names [...]] [--names-with [...]] [--taxids [...]] - -options: - -h, --help show this help message and exit - -required arguments: - -i [ ...], --input [ ...] - Input file(s) and/or folder(s). '.tre' file(s) from ganon report. (default: None) - -e , --input-extension - Required if --input contains folder(s). Wildcards/Shell Expansions not supported (e.g. *). - (default: tre) - -o OUTPUT_FILE, --output-file OUTPUT_FILE - Output filename for the table (default: None) - -output arguments: - -l , --output-value Output value on the table [percentage, counts]. percentage values are reported between [0-1] - (default: counts) - -f , --output-format - Output format [tsv, csv] (default: tsv) - -t , --top-sample Top hits of each sample individually (default: 0) - -a , --top-all Top hits of all samples (ranked by percentage) (default: 0) - -m , --min-frequency - Minimum number/percentage of files containing an taxa to keep the taxa [values between 0-1 for - percentage, >1 specific number] (default: 0) - -r , --rank Define specific rank to report. Empty will report all ranks. (default: None) - -n, --no-root Do not report root node entry and lineage. Direct and shared matches to root will be accounted - as unclassified (default: False) - --header Header information [name, taxid, lineage] (default: name) - --unclassified-label - Add column with unclassified count/percentage with the chosen label. May be the same as - --filtered-label (e.g. unassigned) (default: None) - --filtered-label Add column with filtered count/percentage with the chosen label. May be the same as - --unclassified-label (e.g. 
unassigned) (default: None) - --skip-zeros Do not print lines with only zero count/percentage (default: False) - --transpose Transpose output table (taxa as cols and files as rows) (default: False) - -optional arguments: - --verbose Verbose output mode (default: False) - --quiet Quiet output mode (default: False) - -filter arguments: - --min-count Minimum number/percentage of counts to keep an taxa [values between 0-1 for percentage, >1 - specific number] (default: 0) - --max-count Maximum number/percentage of counts to keep an taxa [values between 0-1 for percentage, >1 - specific number] (default: 0) - --names [ ...] Show only entries matching exact names of the provided list (default: []) - --names-with [ ...] Show entries containing full or partial names of the provided list (default: []) - --taxids [ ...] One or more taxids to report (including children taxa) (default: []) -``` - -
+For complete examples, databases, installation from source and information -> https://pirovc.github.io/ganon/ \ No newline at end of file diff --git a/docs/classification.md b/docs/classification.md new file mode 100644 index 00000000..5946b5d8 --- /dev/null +++ b/docs/classification.md @@ -0,0 +1,131 @@ +# Classification + +`ganon classify` will match single and/or paired-end reads against one or [more databases](#multiple-and-hierarchical-classification), for example: + +```bash +ganon classify --db-prefix my_db --paired-reads reads.1.fq.gz reads.2.fq.gz --output-prefix results --threads 32 +``` + +`ganon report` will be automatically executed after classification and a report will be created (`.tre`). + +ganon can generate both taxonomic profiling and binning results with `ganon classify` + `ganon report`. Please choose the parameters according to your application. + +### Profiling + +`ganon classify` is set-up by default to perform taxonomic profiling. It uses: + + - strict `--rel-cutoff` and `--rel-filter` values (`0.75` and `0`, respectively) + - `--min-count 0.0001` (0.01%) on `ganon report` to exclude low abundant groups + - `--report-type abundance` on `ganon report` to generate taxonomic abundances, re-distributing read counts and correcting for genome sizes + +### Binning + +To achieve better results for binning reads to specific references, ganon can be configured with: + + - `--output-all` and `--output-lca` to write `.all` `.lca` files for binning results + - less strict `--rel-cutoff` and `--rel-filter` values (e.g. `0.25` and `0.1`, respectively) + - activate the `--reassign` on `ganon classify` (or use the `ganon reassign` procedure) to apply a EM algorithm, re-assigning reads with LCA matches to most probable target (`--level` the database was built) + +!!! 
tip + Higher `--kmer-size` values on `ganon build` can also improve read binning sensitivity + +## Multiple and Hierarchical classification + +`ganon classify` can be performed in multiple databases at the same time. The databases can also be provided in a hierarchical order. + +Multiple database classification can be performed providing several inputs for `--db-prefix`. They are required to be built with the same `--kmer-size` and `--window-size` values. Multiple databases are considered as one (as if built together) and redundancy in content (same reference in two or more databases) is allowed. + +To classify reads in a hierarchical order, `--hierarchy-labels` should be provided. When using multiple hierarchical levels, output files will be generated for each level (use `--output-single` to generate a single output from multiple hierarchical levels). Please note that some parameters are set for each database (e.g. `--rel-cutoff`) while others are set for each hierarchical level (e.g. `--rel-filter`) + +
+ Examples +
+Classification against 3 databases (as if they were one) using the same cutoff: + +```bash +ganon classify --db-prefix db1 db2 db3 \ + --rel-cutoff 0.75 \ + --single-reads reads.fq.gz +``` + +Classification against 3 databases (as if they were one) using a different cutoff for each: + +```bash +ganon classify --db-prefix db1 db2 db3 \ + --rel-cutoff 0.2 0.3 0.1 \ + --single-reads reads.fq.gz +``` + +In this example, reads are going to be classified first against db1 and db2. Reads without a valid match will be further classified against db3. `--hierarchy-labels` are strings and are going to be sorted to define the hierarchy order, disregarding input order: + +```bash +ganon classify --db-prefix db1 db2 db3 \ + --hierarchy-labels 1_first 1_first 2_second \ + --single-reads reads.fq.gz +``` + +In this example, classification will be performed with different `--rel-cutoff` for each database. For each hierarchy level (`1_first` and `2_second`) a different `--rel-filter` will be used: + +```bash +ganon classify --db-prefix db1 db2 db3 \ + --hierarchy-labels 1_first 1_first 2_second \ + --rel-cutoff 1 0.5 0.25 \ + --rel-filter 0.1 0.5 \ + --single-reads reads.fq.gz +``` + +
+
+ +## Parameter details + +### reads (--single-reads, --paired-reads) + +ganon accepts single-end and paired-end reads. Both types can be used at the same time. In paired-end mode, reads are always reported with the header of the first pair. Paired-end reads are classified in a standard forward-reverse orientation. The max. read length accepted is 65535 (accounting both reads in paired mode). + +### cutoff and filter (--rel-cutoff, --rel-filter) + +ganon has two parameters to control a match between reads and references: `--rel-cutoff` and `--rel-filter`. + +Every read can be classified against none, one or more references. What will be reported is the remaining matches after `cutoff` and `filter` thresholds are applied, based on the number of shared minimizers (or k-mers) between sequences. + +The `cutoff` is the first. It should be set as a minimal value to consider a match between a read and a reference. Next the `filter` is applied to the remaining matches. `filter` thresholds are relative to the best scoring match and control how far from the best match further matches are allowed. `cutoff` can be interpreted as the lower bound to discard spurious matches and `filter` as the fine tuning to control what to keep. + +For example, using `--kmer-size 19` (and `--window-size 19` to simplify the example), a certain read (100bp) has the following matches with the 5 references (`ref1..5`), sorted by shared k-mers: + +| reference | shared k-mers | +|-----------|---------------| +| ref1 | 82 | +| ref2 | 68 | +| ref3 | 44 | +| ref4 | 25 | +| ref5 | 20 | + +this read can have at most 82 shared k-mers (`100-19+1=82`). With `--rel-cutoff 0.25`, the following matches will be discarded: + +| reference | shared k-mers | --rel-cutoff 0.25 | +|-----------|---------------|-------------------| +| ref1 | 82 | | +| ref2 | 68 | | +| ref3 | 44 | | +| ref4 | 25 | | +| ~~ref5~~ | ~~20~~ | X | + +since the `--rel-cutoff` threshold is `82 * 0.25 = 21` (ceiling is applied). 
Further, with `--rel-filter 0.3`, the following matches will be discarded: + +| reference | shared k-mers | --rel-cutoff 0.25 | --rel-filter 0.3 | +|-----------|---------------|-------------------|------------------| +| ref1 | 82 | | | +| ref2 | 68 | | | +| ~~ref3~~ | ~~44~~ | | X | +| ~~ref4~~ | ~~25~~ | | X | +| ~~ref5~~ | ~~20~~ | X | | + + +since best match is 82, the filter parameter is removing any match below `0.3 * 82 = 57` (ceiling is applied) shared k-mers. `ref1` and `ref2` are reported as matches. + +For databases built with `--window-size`, the relative values are not based on the maximum number of possible shared k-mers but on the actual number of unique minimizers extracted from the read. + +A different `cutoff` can be set for every database in a multiple or hierarchical database classification. A different `filter` can be set for every level of a hierarchical database classification. + +Note that reads that remain with only one reference match (after `cutoff` and `filter` are applied) are considered a unique match. \ No newline at end of file diff --git a/docs/custom_databases.md b/docs/custom_databases.md new file mode 100644 index 00000000..ca078847 --- /dev/null +++ b/docs/custom_databases.md @@ -0,0 +1,270 @@ +# Custom databases + +## Default NCBI assembly or sequence accession + +Besides the automated download and build (`ganon build`) ganon provides a highly customizable build procedure (`ganon build-custom`) to create databases from local sequence files. + +To use custom sequences, just provide them with `--input`. ganon will try to retrieve all necessary information necessary to build a database. + +!!! note + ganon expects assembly accessions in the filename like `GCA_002211645.1_ASM221164v1_genomic.fna.gz`. When using `--input-target sequence` filenames are not important but sequence headers should contain sequence accessions like `>CP022124.1 Fusobacterium nu...`. 
More information about building by file or sequence can be found [here](#target-file-or-sequence-input-target). + +## Non-standard/custom accessions + +It is also possible to use **non-standard accessions and headers** to build custom databases with `--input-file`. This file should contain the following fields (tab-separated): `file [ target node specialization specialization_name].` Note that file is mandatory and the additional fields are optional. + +!!! tip + If you just want to build a database without any taxonomic or target information, just send the files with `--input`, use `--taxonomy skip` and choose between `--input-target file` or `sequence`. + +
+ Examples of --input-file +
+ +With --input-target file (default), where my_target_1 and my_target_2 are just names to assign sequences from (unique) sequence files: + +``` +sequences.fasta my_target_1 +others.fasta my_target_2 +``` + +With --input-target sequence, second column should match sequence headers on provided sequence files (that should be repeated for each header): + +``` +sequences.fasta HEADER1 +sequences.fasta HEADER2 +sequences.fasta HEADER3 +others.fasta HEADER4 +others.fasta HEADER5 +``` + +A third column with taxonomic nodes can be provided to link the data with taxonomy. For example with --taxonomy ncbi: + +``` +sequences.fasta FILE_A 562 +others.fasta FILE_B 623 +``` + +``` +sequences.fasta HEADER1 562 +sequences.fasta HEADER2 562 +sequences.fasta HEADER3 562 +others.fasta HEADER4 623 +others.fasta HEADER5 623 +``` + +Further specializations can be used to create a additional classification level after the taxonomic leaves. For example (using --level custom): + +``` +sequences.fasta FILE_A 562 ID44444 Escherichia coli TW10119 +others.fasta FILE_B 623 ID55555 Shigella flexneri 1a +``` + +``` +sequences.fasta HEADER1 562 ID443 Escherichia coli TW10119 +sequences.fasta HEADER2 562 ID297 Escherichia coli PCN079 +sequences.fasta HEADER3 562 ID8873 Escherichia coli P0301867.7 +others.fasta HEADER4 623 ID2241 Shigella flexneri 1a +others.fasta HEADER5 623 ID4422 Shigella flexneri 1b +``` + +
+
+ +## Examples + +Some examples with download and build commands for custom ganon databases from useful and commonly used repositories and datasets for metagenomics analysis: + +### HumGut + +Collection of >30000 genomes from healthy human metagenomes. [Article](https://microbiomejournal.biomedcentral.com/articles/10.1186/s40168-021-01114-w)/[Website](https://arken.nmbu.no/~larssn/humgut/). + +```bash +# Download sequence files +wget --quiet --show-progress "http://arken.nmbu.no/~larssn/humgut/HumGut.tar.gz" +tar xf HumGut.tar.gz + +# Download taxonomy and metadata files +wget "https://arken.nmbu.no/~larssn/humgut/ncbi_nodes.dmp" +wget "https://arken.nmbu.no/~larssn/humgut/ncbi_names.dmp" +wget "https://arken.nmbu.no/~larssn/humgut/HumGut.tsv" +# Generate --input-file from metadata +tail -n+2 HumGut.tsv | awk -F"\t" '{print "fna/"$21"\t"$1"\t"$2}' > HumGut_ganon_input_file.tsv + +# Build ganon database +ganon build-custom --input-file HumGut_ganon_input_file.tsv --taxonomy-files ncbi_nodes.dmp ncbi_names.dmp --db-prefix HumGut --level strain --threads 32 +``` + +Similarly using GTDB taxonomy files: + +```bash +# Download taxonomy files +wget "https://arken.nmbu.no/~larssn/humgut/gtdb_nodes.dmp" +wget "https://arken.nmbu.no/~larssn/humgut/gtdb_names.dmp" + +# Build ganon database +ganon build-custom --input-file HumGut_ganon_input_file.tsv --taxonomy-files gtdb_nodes.dmp gtdb_names.dmp --db-prefix HumGut_gtdb --level strain --threads 32 +``` + +!!! note + There is no need to use ganon's gtdb integration here since GTDB files in NCBI format are available + +### Plasmid, Plastid and Mitochondrion from RefSeq + +Extra repositories from RefSeq release not included as default databases. [Website](https://www.ncbi.nlm.nih.gov/refseq/). 
+ +```bash +# Download sequence files +wget -A genomic.fna.gz -m -nd --quiet --show-progress "ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/plasmid/" +wget -A genomic.fna.gz -m -nd --quiet --show-progress "ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/plastid/" +wget -A genomic.fna.gz -m -nd --quiet --show-progress "ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/mitochondrion/" + +# Build ganon database +ganon build-custom --input plasmid.* plastid.* mitochondrion.* --db-prefix ppm --input-target sequence --level leaves --threads 32 +``` + +### UniVec, UniVec_core + +"UniVec is a non-redundant database of sequences commonly attached to cDNA or genomic DNA during the cloning process." [Website](https://ftp.ncbi.nlm.nih.gov/pub/UniVec/README.uv). Useful to screen for vector and linker/adapter contamination. UniVec_core is a sub-set of the UniVec selected to reduce the false positive hits from real biological sources. + +```bash +# UniVec +wget -O "UniVec.fasta" --quiet --show-progress "ftp://ftp.ncbi.nlm.nih.gov/pub/UniVec/UniVec" +grep -o '^>[^ ]*' UniVec.fasta | sed 's/^>//' | awk '{print "UniVec.fasta\t"$1"\t81077"}' > UniVec_ganon_input_file.tsv +ganon build-custom --input-file UniVec_ganon_input_file.tsv --db-prefix UniVec --input-target sequence --level leaves --threads 8 + +# UniVec_Core +wget -O "UniVec_Core.fasta" --quiet --show-progress "ftp://ftp.ncbi.nlm.nih.gov/pub/UniVec/UniVec_Core" +grep -o '^>[^ ]*' UniVec_Core.fasta | sed 's/^>//' | awk '{print "UniVec_Core.fasta\t"$1"\t81077"}' > UniVec_Core_ganon_input_file.tsv +ganon build-custom --input-file UniVec_Core_ganon_input_file.tsv --db-prefix UniVec_Core --input-target sequence --level leaves --threads 8 +``` + +!!! note + All UniVec entries in the examples are categorized as [Artificial Sequence](https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=81077) (NCBI txid:81077). Some are completely artificial but others may be derived from real biological sources. 
More information in this [link](https://ftp.ncbi.nlm.nih.gov/pub/UniVec/README.vector.origins). + +### MGnify genome catalogues (MAGs) + +"Genome catalogues are biome-specific collections of metagenomic-assembled and isolate genomes". [Article](https://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/)/[Website](https://www.ebi.ac.uk/metagenomics/browse/genomes)/[FTP](https://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/). + +There are currently (2023-05-04) 8 genome catalogues available: chicken-gut, human-gut, human-oral, marine, non-model-fish-gut, pig-gut and zebrafish-fecal. An example below how to download and build the human-oral catalog: + + +```bash +# Download metadata +wget "https://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/human-oral/v1.0/genomes-all_metadata.tsv" + +# Download sequence files with 12 threads +tail -n+2 genomes-all_metadata.tsv | cut -f 1,20 | xargs -P 12 -n2 sh -c 'curl --silent ${1}| gzip -d | sed -e "1,/##FASTA/ d" | gzip > ${0}.fna.gz' + +# Generate ganon input file +tail -n+2 genomes-all_metadata.tsv | cut -f 1,15 | tr ';' '\t' | awk -F"\t" '{tax="1";for(i=NF;i>1;i--){if(length($i)>3){tax=$i;break;}};print $1".fna.gz\t"$1"\t"tax}' > ganon_input_file.tsv + +# Build ganon database +ganon build-custom --input-file ganon_input_file.tsv --db-prefix mgnify_human_oral_v1 --taxonomy gtdb --level leaves --threads 32 +``` + +!!! note + MGnify genomes catalogues will be build with GTDB taxonomy. + +### Pathogen detection FDA-ARGOS + +A collection of >1400 "microbes that include biothreat microorganisms, common clinical pathogens and closely related species". [Article](https://www.ncbi.nlm.nih.gov/bioproject/231221)/[Website](https://www.ncbi.nlm.nih.gov/bioproject/231221)/[BioProject](https://www.ncbi.nlm.nih.gov/bioproject/231221). 
+ +```bash +# Download sequence files +wget https://ftp.ncbi.nlm.nih.gov/genomes/refseq/assembly_summary_refseq.txt +grep "strain=FDAARGOS" assembly_summary_refseq.txt > fdaargos_assembly_summary.txt +genome_updater.sh -e fdaargos_assembly_summary.txt -f "genomic.fna.gz" -o download -m -t 12 + +# Build ganon database +ganon build-custom --input download/ --input-recursive --db-prefix fdaargos --ncbi-file-info download/assembly_summary.txt --level assembly --threads 32 +``` + +!!! note + The example above uses [genome_updater](https://github.com/pirovc/genome_updater) to download files + +### BLAST databases (nt, env_nt, nt_prok, ...) + +BLAST databases. [Website](https://blast.ncbi.nlm.nih.gov/Blast.cgi)/[FTP](https://ftp.ncbi.nlm.nih.gov/blast/db/). + +Current available nucleotide databases (2023-05-04): `16S_ribosomal_RNA` `18S_fungal_sequences` `28S_fungal_sequences` `Betacoronavirus` `env_nt` `human_genome` `ITS_eukaryote_sequences` `ITS_RefSeq_Fungi` `LSU_eukaryote_rRNA` `LSU_prokaryote_rRNA` `mito` `mouse_genome` `nt` `nt_euk` `nt_others` `nt_prok` `nt_viruses` `patnt` `pdbnt` `ref_euk_rep_genomes` `ref_prok_rep_genomes` `refseq_rna` `refseq_select_rna` `ref_viroids_rep_genomes` `ref_viruses_rep_genomes` `SSU_eukaryote_rRNA` `tsa_nt` + +!!! note + List currently available nucleotide databases `curl --silent --list-only ftp://ftp.ncbi.nlm.nih.gov/blast/db/ | grep "nucl-metadata.json" | sed 's/-nucl-metadata.json/, /g' | sort` + +!!! warning + Some BLAST databases are very big and may require extreme computational resources to build. 
You may need to use some [reduction strategies](../default_databases/#reducing-database-size) + +The example below extracts sequences and information from a BLAST db to build a ganon database: + +```bash +# Define BLAST db +db="16S_ribosomal_RNA" + +# Download BLAST db +wget -nd --quiet --show-progress "ftp://ftp.ncbi.nlm.nih.gov/blast/db/${db}*.tar.gz" + +# Merge and extract BLAST db files +cat "${db}"*.tar.gz | tar xvfz - > ex_files.txt +ex_file=$(head -n 1 ex_files.txt) +dbprefix="${ex_file%.*}" + +# Generate sequences from BLAST db +blastdbcmd -entry all -db "${dbprefix}" -out "${db}.fna" + +# Generate ganon input file +blastdbcmd -entry all -db "${dbprefix}" -outfmt "%a %X" | awk -v file="${db}.fna" '{print file"\t"$1"\t"$2 }' > "${db}_ganon_input_file.tsv" + +# Build ganon database +ganon build-custom --input-file "${db}_ganon_input_file.tsv" --db-prefix "${db}" --input-target sequence --level leaves --threads 32 +``` + +!!! note + `blastdbcmd` is a command from BLAST software suite and should be installed separately + +### Files from genome_updater + +To create a ganon database from files previosly downloaded with [genome_updater](https://github.com/pirovc/genome_updater): + +```bash +ganon build-custom --input output_folder_genome_updater/version/ --input-recursive --db-prefix mydb --ncbi-file-info output_folder_genome_updater/assembly_summary.txt --level assembly --threads 32 +``` + +## Parameter details + +### False positive and size (--max-fp, --filter-size) + +ganon indices are based on bloom filters and can have false positive matches. This can be controlled with `--max-fp` parameter. The lower the `--max-fp`, the less chances of false positives matches on classification, but the larger the database size will be. For example, with `--max-fp 0.01` the database will be build so any target (defined by `--level`) will have 1 in a 100 change of reporting a false k-mer match. 
The false positive of the query (all k-mers of the reads) is higher but directly affected. + +Alternatively, one can set a specific size for the final index with `--filter-size`. When using this option, please observe the theoretic false positive of the index reported at the end of the building process. + +### minimizers (--window-size, --kmer-size) + +in `ganon build`, when `--window-size` > `--kmer-size` minimizers are used. That means that for a every window, a single k-mer will be selected. It produces smaller database files and requires substantially less memory overall. It may increase building times but will have a huge benefit for classification times. Sensitivity and precision can be reduced by small margins. If `--window-size` = `--kmer-size`, all k-mers are going to be used to build the database. + +### Target file or sequence (--input-target) + +Customized builds can be done either by file or sequence. `--input-target file` will consider every file provided with `--input` a single unit. `--input-target sequence` will use every sequence as a unit. + +`--input-target file` is the default behavior and most efficient way to build databases. `--input-target sequence` should only be used when the input sequences are stored in a single file or when classification at sequence level is desired. + +### Build level (--level) + +The `--level` parameter defines the max. depth of the database for classification. This parameter is relevant because the `--max-fp` is going to be guaranteed at the `--level` chosen. By default, the level will be the same as `--input-target`, meaning that classification will be done either at file or sequence level. + +Alternatively, `--level assembly` will link the file or sequence target information with assembly accessions retrieved from NCBI servers. `--level leaves` or `--level species` (or genus, family, ...) will link the targets with taxonomic information and prune the tree at the chosen level. 
`--level custom` will use the specialization level defined in the `--input-file`. + +### Genome sizes (--genome-size-files) + +Ganon will automatically download auxiliary files to define an approximate genome size for each entry in the taxonomic tree. For `--taxonomy ncbi` the [species_genome_size.txt.gz](https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/) is used. For `--taxonomy gtdb` the [\*_metadata.tar.gz](https://data.gtdb.ecogenomic.org/releases/latest/) files are used. Those files can be directly provided with the `--genome-size-files` argument. + +Genome sizes of parent nodes are calculated as the average of the respective children nodes. Other nodes without directly assigned genome sizes will use the closest parent with a pre-calculated genome size. The genome sizes are stored in the [ganon database](#buildupdate). + +### Retrieving info (--ncbi-sequence-info, --ncbi-file-info) + +Further taxonomy and assembly linking information has to be collected to properly build the database. `--ncbi-sequence-info` and `--ncbi-file-info` allow customizations on this step. + +When `--input-target sequence`, `--ncbi-sequence-info` argument allows the use of NCBI e-utils webservices (`eutils`) or downloads accession2taxid files to extract target information (options `nucl_gb` `nucl_wgs` `nucl_est` `nucl_gss` `pdb` `prot` `dead_nucl` `dead_wgs` `dead_prot`). By default, ganon uses `eutils` up to 50000 input sequences, otherwise it downloads `nucl_gb` `nucl_wgs` from https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/. Previously downloaded files can be directly provided with this argument. + +When `--input-target file`, `--ncbi-file-info` uses `assembly_summary.txt` from https://ftp.ncbi.nlm.nih.gov/genomes/ to extract target information (options `refseq` `genbank` `refseq_historical` `genbank_historical`). Previously downloaded files can be directly provided with this argument. 
+ +If you are using outdated, removed or inactive assembly or sequence files and accessions from NCBI, make sure to include `dead_nucl` `dead_wgs` for `--ncbi-sequence-info` or `refseq_historical` `genbank_historical` for `--ncbi-file-info`. `eutils` option does not work with outdated accessions. diff --git a/docs/default_databases.md b/docs/default_databases.md new file mode 100644 index 00000000..394aaca9 --- /dev/null +++ b/docs/default_databases.md @@ -0,0 +1,199 @@ +# Databases + +ganon automates the download, update and build of databases based on NCBI RefSeq and GenBank genomes repositories with `ganon build` and `update` commands, for example: + +```bash +ganon build -g archaea bacteria -d arc_bac -c -t 30 +``` + +will download archaeal and bacterial complete genomes from RefSeq and build a database with 30 threads. Some day later, the database can be updated to include newest genomes with: + + +```bash +ganon update -d arc_bac -t 30 +``` + +Additionally, [custom databases](../custom_databases) can be built with customized files and identifiers with the `ganon build-custom` command. + +!!! info + We DO NOT provide pre-built indices for download. ganon can build databases very efficiently. This way, you will always have up-to-date reference sequences and get the most out of your data. + + +!!! warning + Databases are one of the most important elements when analyzing metagenomics data and should be chosen carefully based on the study data + +## RefSeq and GenBank + +NCBI RefSeq and GenBank repositories are common resources to obtain reference sequences to analyze metagenomics data. They are mainly divided into domains/organism groups (e.g. archaea, bacteria, fungi, ...) but can be further filtered in many ways. The choice of those filters can drastically change the outcome of results. + + +### Commonly used sub-sets + +| RefSeq | # assemblies | Size (GB) * | | +|---|---|---|---| +| Complete | 295219 | 350 - 500 |
cmd`ganon build --source refseq --organism-group archaea bacteria fungi viral --threads 48 --db-prefix abfv_rs`
| +| One assembly per species | 52779 | 40 - 98 |
cmd`ganon build --source refseq --organism-group archaea bacteria fungi viral --threads 48 --genome-updater "-A 'species:1'" --db-prefix abfv_rs_t1s`
| +| Complete genomes (higher quality) | 44121 | 19 - 64 |
cmd`ganon build --source refseq --organism-group archaea bacteria fungi viral --threads 48 --complete-genomes --db-prefix abfv_rs_cg`
| +| One assembly per species of complete genomes | 19713 | 8 - 27 |
cmd`ganon build --source refseq --organism-group archaea bacteria fungi viral --threads 48 --complete-genomes "-A 'species:1'" --db-prefix abfv_rs_cg_t1s`
| +| One representative assembly per species | 18073 | 21 - 35 |
cmd`ganon build --source refseq --organism-group archaea bacteria fungi viral --threads 48 --genome-updater "-c 'representative genome'" --db-prefix abfv_rs_rg`
| + +| GenBank | # assemblies | Size (GB) * | | +|---|---|---|---| +| Complete | 1595845 | |
cmd`ganon build --source genbank --organism-group archaea bacteria fungi viral --threads 48 --db-prefix abfv_gb`
| +| One assembly per species | 99505 | 91 - 420 |
cmd`ganon build --source genbank --organism-group archaea bacteria fungi viral --threads 48 --genome-updater "-A 'species:1'" --db-prefix abfv_gb_t1s`
| +| Complete genomes (higher quality) | 92917 | |
cmd`ganon build --source genbank --organism-group archaea bacteria fungi viral --threads 48 --complete-genomes --db-prefix abfv_gb_cg`
| +| One assembly per species of complete genomes | 34497 | |
cmd`ganon build --source genbank --organism-group archaea bacteria fungi viral --threads 48 --complete-genomes "-A 'species:1'" --db-prefix abfv_gb_cg_t1s`
| + +\* Size (GB) is the final size of the database and the approximate amount of RAM necessary to build it (calculated with default parameters). The two values represent databases built with and without the `--hibf` parameter, respectively. The trade-offs between those two modes are explained [here](#hibf). + +!!! warning + The `# assemblies` were obtained on 2023-03-14 accounting for archaea, bacteria, fungi and viral groups only. By the time you are reading this, those numbers certainly grew a bit. + +- As a rule of thumb, the more the better, so choose the most comprehensive sub-set as possible given your computational resources +- Databases can have a fixed size/RAM usage with the `--filter-size` parameter. Beware that smaller filters will increase the false positive rates when classifying. Other approaches [can reduce the size/RAM requirements with some trade-offs](#reducing-database-size). +- Further examples of commonly used database can be found [here](../custom_databases/#examples). +- Alternatively, you can build one database for each organism group separately and use them in `ganon classify` in any order or even stack them hierarchically. This way combination of multiple databases are possible, extending use cases. + +!!! tip + assemblies usually represent strains and can be used to do "strain-level classification" + +### Specific organisms or taxonomic groups + +It is also possible to generate databases for specific organisms or taxonomic branches with `-a/--taxid`, for example: + +```bash +ganon build --source refseq --taxid 562 317 --threads 48 --db-prefix coli_syringae +``` + +will download and build a database for all *Escherichia coli* (taxid:562) and *Pseudomonas syringae* (taxid:317) assemblies from RefSeq. 
+ +### More filter options + +ganon uses [genome_updater](https://github.com/pirovc/genome_updater) to manage downloads and further specific options and filters can be provided with the parameter `-u/--genome-updater`, for example: + +```bash +ganon build -g bacteria -t 48 -d bac_refseq --genome-updater "-A 'genus:3' -E 20230101" +``` + +will download top 3 bacterial assemblies for each genus with date before 2023-01-01. For more information about genome_updater parameters, please check the [repository](https://github.com/pirovc/genome_updater). + +## GTDB + +By default, ganon will use the NCBI Taxonomy to build the database. However, [GTDB](https://gtdb.ecogenomic.org/) is fully supported and can be used with the parameter `--taxonomy gtdb`. + +| GTDB R214 | # assemblies | Size (GB) | | +|---|---|---|---| +| Complete | 402709 | |
cmd`ganon build --source refseq genbank --organism-group archaea bacteria --threads 48 --taxonomy gtdb --db-prefix ab_gtdb`
| +| One assembly for each species | 85205 | |
cmd`ganon build --source refseq genbank --organism-group archaea bacteria --threads 48 --taxonomy gtdb --top 1 --db-prefix ab_gtdb_t1s`
| + +Filtering by taxonomic entries also work with GTDB, for example: + +```bash +ganon build --db-prefix fuso_gtdb --taxid "f__Fusobacteriaceae" --source refseq genbank --taxonomy gtdb --threads 12 +``` + +!!! info + GTDB covers only bacteria and archaea groups and has assemblies from both RefSeq and GenBank. + +## Update (ganon update) + +Default ganon databases generated with the `ganon build` can be updated with `ganon update`. This procedure will download new files and re-generate the ganon database with the updated entries. + +For example, a database generated with the following command: + +```bash +ganon build --db-prefix arc_cg_rs --source refseq --organism-group archaea --complete-genomes --threads 12 +``` + +will contain all archaeal complete genomes from NCBI RefSeq at the time of running. Some days later, the database can be updated, fetching only new sequences added to the NCBI repository with the command: + +```bash +ganon update --db-prefix arc_cg_rs --threads 12 +``` + +!!! tip + To not overwrite the current database and create a new one with the updated files, use the `--output-db-prefix` parameter. + +## Reproducibility + +If you use ganon with default databases and want to re-generate it later or keep track of the content for reproducibility purposes, you can save the `assembly_summary.txt` file located inside the `{output_prefix}_files/` directory. To re-download the exact same snapshot of files used, one could use [genome_updater](https://github.com/pirovc/genome_updater), for example: + +```bash +genome_updater.sh -e assembly_summary.txt -f "genomic.fna.gz" -o recovered_files -m -t 12 +``` + +## Reducing database size + +### HIBF + +The Hierarchical Interleaved Bloom Filter (HIBF) is an improvement over the Interleaved Bloom Filter (IBF) and provides *smaller* databases with *faster* query times ([pre-print](https://www.biorxiv.org/content/10.1101/2022.08.01.502266v1)). 
However, the HIBF takes longer to build and has less flexibility regarding size control. The HIBF can be generated in `ganon build` and `ganon build-custom` with the `--hibf` parameter. + +**This is the best method to reduce large database sizes and to achieve faster classification speed.** + +!!! hint + - For larger reference sets, huge amount of reads to query or production level analysis -> HIBF + - For quick build and analysis with smaller data -> IBF (default) + +!!! warning + [raptor (v3.0.0)](https://github.com/seqan/raptor/releases/tag/raptor-v3.0.0) has to be installed to build databases with `--hibf` + +### Top assemblies + +RefSeq and GenBank are highly biased toward some few organisms. This means that some species are highly represented in number of assemblies compared to others. This can not only bias analysis but also brings redundancy to the database. Choosing a certain number of top assemblies can mitigate those issues. Database sizes can also be drastically reduced without this redundancy, but "strain-level" analysis are then not possible. We recommend using top assemblies for larger and comprehensive reference sets (like the ones listed [above](#refseq-and-genbank)) and use the full set of assemblies for specific clade analysis. + +!!! Example + - `ganon build --top 1` will select one assembly for each taxonomic leaf (NCBI taxonomy still has strain, sub-species, ...) + - `ganon build --genome-updater "-A 'species:1'"` will select one assembly for each species + - `ganon build --genome-updater "-A 'genus:3'"` will select three assemblies for each genus + +### False positive + +A higher `--max-fp` value will generate a smaller database but with a higher number of false positive matches on classification. [More details](../custom_databases/#false-positive-and-size-max-fp-filter-size). Values between `0.01` and `0.3` are generally used. + +### Fixed size + +A fixed size for the database filter can be defined with `--filter-size`. 
The smaller the filter size, the higher the false positive chances on classification. When using a fixed filter size, ganon will report the max. and avg. false positive rate at the end of the build. [More details](../custom_databases/#false-positive-and-size-max-fp-filter-size). + +### Mode + +`--mode` offers 5 different categories to build a database controlling the trade-off between size and classification speed. + +- `avg`: Balanced mode +- `smaller` or `smallest`: create smaller databases with slower classification speed +- `faster` or `fastest`: create bigger databases with faster classification speed + +!!! Warning + If `--filter-size` is used, `smaller` and `smallest` refer to the false positive rate and not to the database size which is fixed. + +### Split databases + +Ganon allows classification with multiple databases in one level or in a hierarchy ([More details](../classification/#multiple-and-hierarchical-classification)). This means that databases can be built separately and used in any combination as desired. There are usually some benefits of doing so: + +- Smaller databases when building by organism group, for example: one for bacteria, another for viruses, ... since average genome sizes are quite different. +- Easier to maintain and update. +- Extend use cases and avoid misclassification due to contaminated databases. +- Use databases as quality control, for example: remove reads matching one database of host or vectors (check out `ganon report --skip-hierarchy`). + +### k-mer and window size + +Define how much unique information is stored in the database. [More details](../custom_databases/#minimizers-window-size-kmer-size) + +- The smaller the `--kmer-size`, the less unique they will be, reducing database size but also sensitivity in classification. +- The bigger the `--window-size`, the less information needs to be stored resulting in smaller databases but with decreased classification accuracy.
+ +### Example + +Besides the huge benefits of using [HIBF](#hibf) and specific sub-sets of big repositories shown on the [default databases table](#commonly-used-sub-sets), examples of other reduction strategies (without `--hibf`) can be seen below: + +*RefSeq archaeal complete genomes from 20230505* + +| Strategy | Size (MB) | Smaller | Trade-off | | +|---|---|---|---|---| +| `default` | 318 | - | - |
cmd`ganon build --source refseq --organism-group archaea --threads 12 --complete-genomes --db-prefix arc_rs_cg`
| +| `--mode smallest` | 301 | 5% | Slower classification |
cmd`ganon build --source refseq --organism-group archaea --threads 12 --complete-genomes --mode smallest --db-prefix arc_rs_cg_smallest`
| +| `--filter-size 256` | 256 | 19% | Higher false positive on classification |
cmd`ganon build --source refseq --organism-group archaea --threads 12 --complete-genomes --filter-size 256 --db-prefix arc_rs_cg_fs256`
| +| `--window-size 35` | 249 | 21% | Less sensitive classification |
cmd`ganon build --source refseq --organism-group archaea --threads 12 --complete-genomes --window-size 35 --db-prefix arc_rs_cg_ws35`
| +| `--max-fp 0.2` | 190 | 40% | Higher false positive on classification |
cmd`ganon build --source refseq --organism-group archaea --threads 12 --complete-genomes --max-fp 0.2 --db-prefix arc_rs_cg_fp0.2`
| + +!!! note + This is an illustrative example and the reduction proportions for different configuration may be quite different diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 00000000..af1675d6 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,528 @@ +# ganon + +[![Build Status](https://travis-ci.com/pirovc/ganon.svg?branch=master)](https://travis-ci.com/pirovc/ganon) [![codecov](https://codecov.io/gh/pirovc/ganon/branch/master/graph/badge.svg)](https://codecov.io/gh/pirovc/ganon) [![Anaconda-Server Badge](https://anaconda.org/bioconda/ganon/badges/downloads.svg)](https://anaconda.org/bioconda/ganon) [![Anaconda-Server Badge](https://anaconda.org/bioconda/ganon/badges/platforms.svg)](https://anaconda.org/bioconda/ganon) [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/ganon/README.html) [![Publication](https://img.shields.io/badge/DOI-10.1101%2F406017-blue)](https://dx.doi.org/10.1093/bioinformatics/btaa458) + +ganon is designed to index large sets of genomic reference sequences and to classify reads against them efficiently. The tool uses Interleaved Bloom Filters as indices based on k-mers/minimizers. It was mainly developed, but not limited, to the metagenomics classification problem: quickly assign sequence fragments to their closest reference among thousands of references. After classification, taxonomic abundance is estimated and reported. 
+ +## Features + +- NCBI and GTDB native support for taxonomic classification (also runs without taxonomy) +- integrated download of commonly used reference sequences from RefSeq/Genbank (`ganon build`) +- update indices incrementally (`ganon update`) +- customizable build for pre-downloaded or non-standard sequence files (`ganon build-custom`) +- build and classify at different taxonomic levels, file, sequence, strain/assembly or custom specialization +- perform [hierarchical classification](#multiple-and-hierarchical-classification): use several databases in any order +- [report](#report) the lowest common ancestor (LCA) but also multiple and unique matches for every read +- [report](#report) sequence or taxonomic abundances as well as total number of matches +- reassignment of reads with multiple matches to a unique match with an EM algorithm +- generate reports and contingency tables for multi-sample studies with several filter options + +ganon achieved very good results in [our own evaluations](https://dx.doi.org/10.1093/bioinformatics/btaa458) but also in independent evaluations: [LEMMI](https://lemmi-v1.ezlab.org/), [LEMMI v2](https://lemmi.ezlab.org/) and [CAMI2](https://dx.doi.org/10.1038/s41592-022-01431-4) + +## Installation with conda + +The easiest way to install ganon is via conda, using the bioconda and conda-forge channels: + +```bash +conda install -c bioconda -c conda-forge ganon +``` + +However, there are possible performance benefits compiling ganon from source in the target machine rather than using the conda version. 
To do so, please follow the instructions below: + +## Installation from source + +### Dependencies + +- gcc >=7 +- cmake >=3.10 +- zlib +- python >=3.6 +- pandas >=1.1.0 +- [multitax](https://github.com/pirovc/multitax) >=1.3.1 + +```bash +python3 -V # >=3.6 +python3 -m pip install "pandas>=1.1.0" "multitax>=1.3.1" +``` + +### Downloading and building ganon + submodules + +```bash +git clone --recurse-submodules https://github.com/pirovc/ganon.git +``` + +```bash +cd ganon +python3 setup.py install --record files.txt # optional +mkdir build_cpp +cd build_cpp +cmake -DCMAKE_BUILD_TYPE=Release -DVERBOSE_CONFIG=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCONDA=OFF -DLONGREADS=OFF .. +make +sudo make install # optional +``` + +- to change install location (e.g. `/myprefix/bin/`), set the installation prefix in the cmake command with `-DCMAKE_INSTALL_PREFIX=/myprefix/ ` +- use `-DINCLUDE_DIRS` to set alternative paths to cxxopts and Catch2 libs. +- to classify extremely large reads (>65000bp) use `-DLONGREADS=ON` + +If everything was properly installed, the following commands should show the help pages without errors: + +```bash +ganon -h +``` + +### Running tests + +```bash +python3 -m unittest discover -s tests/ganon/integration/ +python3 -m unittest discover -s tests/ganon/integration_online/ # optional - downloads large files +cd build_cpp/ +ctest -VV . +``` + +## Important parameters + +The most important parameters and trade-offs are: + +- `ganon build` `--hibf`: build smaller databases that can be queried faster. Building will take longer. +- `ganon build` `--window-size --kmer-size`: the *window* value should always be the same or larger than the *kmer* value. The larger the difference between them, the smaller the database will be. However, some sensitivity/precision loss in classification is expected with small *kmer* and/or large *window*. Larger *kmer* values (e.g. `31`) will improve classification, especially read binning, at a cost of way bigger databases.
+--- +- `ganon classify` `--rel-cutoff`: this value defines the threshold for matches between reads and database. Higher `--rel-cutoff` values will improve precision and decrease sensitivity with expected less unique matches but an increase in overall matches. For taxonomic profiling, a higher value between `0.4` and `0.8` may provide better results. For read binning, lower values between `0.2` and `0.4` are recommended. +- `ganon classify` `--rel-filter`: further filter top matches after cutoff is applied. Usually set between `0` and `0.2`. +- `ganon classify` `--reassign`: runs an EM-algorithm to reassign reads that received multiple matches. It provides a unique match for each read at the level the database was built (e.g. assembly or species). Mostly useful for read binning, with little overall impact on taxonomic profiling. Can be used independently with `ganon reassign`. +--- +- `ganon report` `--report-type`: reports either taxonomic, sequence or matches abundances. Use `corr` or `abundance` for taxonomic profiling, `reads` or `dist` for sequence profiling and `matches` to report a summary of all matches. +- `ganon report` `--min-count`: cutoff to discard underrepresented taxa. Useful to remove the common long tail of spurious matches and false positives when performing classification. Values between `0.0001` (0.01%) and `0.001` (0.1%) improved sensitivity and precision in our evaluations. The higher the value, the more precise the outcome, with a sensitivity loss. Alternatively `--top-percentile` can be used to keep a relative amount of taxa instead a hard cutoff. + +The numeric values above are averages from several experiments with different sample types and database contents. They may not work as expected for your data. If you are not sure which values to use or see something unexpected, please open an [issue](https://github.com/pirovc/ganon/issues). 
+ +## Parameters + +``` +usage: ganon [-h] [-v] + {build,build-custom,update,classify,reassign,report,table} ... + +- - - - - - - - - - + _ _ _ _ _ + (_|(_|| |(_)| | + _| v. 1.6.0 +- - - - - - - - - - + +positional arguments: + {build,build-custom,update,classify,reassign,report,table} + build Download and build ganon default databases + (refseq/genbank) + build-custom Build custom ganon databases + update Update ganon default databases + classify Classify reads against built databases + reassign Reassign reads with multiple matches with an EM + algorithm + report Generate reports from classification results + table Generate table from reports + +options: + -h, --help show this help message and exit + -v, --version Show program's version number and exit. +``` + +
+ ganon build + +``` +usage: ganon build [-h] [-g [...]] [-a [...]] [-b [...]] [-o] [-c] [-u] [-m [...]] [-z [...]] -d DB_PREFIX [-x] [-t] + [-p] [-f] [-k] [-w] [-s] [-j] [--hibf] [--restart] [--verbose] [--quiet] [--write-info-file] + +options: + -h, --help show this help message and exit + +required arguments: + -g [ ...], --organism-group [ ...] + One or more organism groups to download [archaea, bacteria, fungi, human, invertebrate, + metagenomes, other, plant, protozoa, vertebrate_mammalian, vertebrate_other, viral]. Mutually + exclusive --taxid (default: None) + -a [ ...], --taxid [ ...] + One or more taxonomic identifiers to download. e.g. 562 (-x ncbi) or 's__Escherichia coli' (-x + gtdb). Mutually exclusive --organism-group (default: None) + -d DB_PREFIX, --db-prefix DB_PREFIX + Database output prefix (default: None) + +download arguments: + -b [ ...], --source [ ...] + Source to download [refseq, genbank] (default: ['refseq']) + -o , --top Download limited assemblies for each taxa. 0 for all. (default: 0) + -c, --complete-genomes + Download only sub-set of complete genomes (default: False) + -u , --genome-updater + Additional genome_updater parameters (https://github.com/pirovc/genome_updater) (default: None) + -m [ ...], --taxonomy-files [ ...] + Specific files for taxonomy - otherwise files will be downloaded (default: None) + -z [ ...], --genome-size-files [ ...] + Specific files for genome size estimation - otherwise files will be downloaded (default: None) + +important arguments: + -x , --taxonomy Set taxonomy to enable taxonomic classification, lca and reports [ncbi, gtdb, skip] (default: + ncbi) + -t , --threads + +advanced arguments: + -p , --max-fp Max. false positive rate for bloom filters Mutually exclusive --filter-size. (default: 0.05) + -f , --filter-size Fixed size for filter in Megabytes (MB). Mutually exclusive --max-fp. (default: 0) + -k , --kmer-size The k-mer size to split sequences. 
(default: 19) + -w , --window-size The window-size to build filter with minimizers. (default: 31) + -s , --hash-functions + The number of hash functions for the interleaved bloom filter [0-5]. 0 to detect optimal value. + (default: 4) + -j , --mode Create smaller or faster filters at the cost of classification speed or database size, + respectively [avg, smaller, smallest, faster, fastest]. If --filter-size is used, + smaller/smallest refers to the false positive rate. By default, an average value is calculated + to balance classification speed and database size. (default: avg) + --hibf Builds an HIBF with raptor/chopper (v3). --mode and --filter-size will be ignored. (default: + False) + +optional arguments: + --restart Restart build/update from scratch, do not try to resume from the latest possible step. + {db_prefix}_files/ will be deleted if present. (default: False) + --verbose Verbose output mode (default: False) + --quiet Quiet output mode (default: False) + --write-info-file Save copy of target info generated to {db_prefix}.info.tsv. Can be re-used as --input-file for + further attempts. (default: False) +``` + +
+ +
+ ganon build-custom + +``` +usage: ganon build-custom [-h] [-i [...]] [-e] [-c] [-n] [-a] [-l] [-m [...]] [-z [...]] [-r [...]] [-q [...]] -d + DB_PREFIX [-x] [-t] [-p] [-f] [-k] [-w] [-s] [-j] [--hibf] [--restart] [--verbose] [--quiet] + [--write-info-file] + +options: + -h, --help show this help message and exit + +required arguments: + -i [ ...], --input [ ...] + Input file(s) and/or folder(s). Mutually exclusive --input-file. (default: None) + -e , --input-extension + Required if --input contains folder(s). Wildcards/Shell Expansions not supported (e.g. *). + (default: fna.gz) + -c, --input-recursive + Look for files recursively in folder(s) provided with --input (default: False) + -d DB_PREFIX, --db-prefix DB_PREFIX + Database output prefix (default: None) + +custom arguments: + -n , --input-file Manually set information for input files: file [target node specialization + specialization name]. target is the sequence identifier if --input-target sequence (file + can be repeated for multiple sequences). if --input-target file and target is not set, filename + is used. node is the taxonomic identifier. Mutually exclusive --input (default: None) + -a , --input-target Target to use [file, sequence]. By default: 'file' if multiple input files are provided or + --input-file is set, 'sequence' if a single file is provided. Using 'file' is recommended and + will speed-up the building process (default: None) + -l , --level Use a specialized target to build the database. By default, --level is the --input-target. + Options: any available taxonomic rank [species, genus, ...] or 'leaves' (requires --taxonomy). + Further specialization options [assembly, custom]. assembly will retrieve and use the assembly + accession and name. custom requires and uses the specialization field in the --input-file. + (default: None) + -m [ ...], --taxonomy-files [ ...] + Specific files for taxonomy - otherwise files will be downloaded (default: None) + -z [ ...], --genome-size-files [ ...] 
+ Specific files for genome size estimation - otherwise files will be downloaded (default: None) + +ncbi arguments: + -r [ ...], --ncbi-sequence-info [ ...] + Uses NCBI e-utils webservices or downloads accession2taxid files to extract target information. + [eutils, nucl_gb, nucl_wgs, nucl_est, nucl_gss, pdb, prot, dead_nucl, dead_wgs, dead_prot or one + or more accession2taxid files from https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/]. + By default uses e-utils up-to 50000 sequences or downloads nucl_gb nucl_wgs otherwise. (default: + []) + -q [ ...], --ncbi-file-info [ ...] + Downloads assembly_summary files to extract target information. [refseq, genbank, + refseq_historical, genbank_historical or one or more assembly_summary files from + https://ftp.ncbi.nlm.nih.gov/genomes/] (default: ['refseq', 'genbank']) + +important arguments: + -x , --taxonomy Set taxonomy to enable taxonomic classification, lca and reports [ncbi, gtdb, skip] (default: + ncbi) + -t , --threads + +advanced arguments: + -p , --max-fp Max. false positive rate for bloom filters Mutually exclusive --filter-size. (default: 0.05) + -f , --filter-size Fixed size for filter in Megabytes (MB). Mutually exclusive --max-fp. (default: 0) + -k , --kmer-size The k-mer size to split sequences. (default: 19) + -w , --window-size The window-size to build filter with minimizers. (default: 31) + -s , --hash-functions + The number of hash functions for the interleaved bloom filter [0-5]. 0 to detect optimal value. + (default: 4) + -j , --mode Create smaller or faster filters at the cost of classification speed or database size, + respectively [avg, smaller, smallest, faster, fastest]. If --filter-size is used, + smaller/smallest refers to the false positive rate. By default, an average value is calculated + to balance classification speed and database size. (default: avg) + --hibf Builds an HIBF with raptor/chopper (v3). --mode and --filter-size will be ignored. 
(default: + False) + +optional arguments: + --restart Restart build/update from scratch, do not try to resume from the latest possible step. + {db_prefix}_files/ will be deleted if present. (default: False) + --verbose Verbose output mode (default: False) + --quiet Quiet output mode (default: False) + --write-info-file Save copy of target info generated to {db_prefix}.info.tsv. Can be re-used as --input-file for + further attempts. (default: False) +``` + +
+ +
+ ganon update + +``` +usage: ganon update [-h] -d DB_PREFIX [-o] [-t] [--restart] [--verbose] [--quiet] [--write-info-file] + +options: + -h, --help show this help message and exit + +required arguments: + -d DB_PREFIX, --db-prefix DB_PREFIX + Existing database input prefix (default: None) + +important arguments: + -o , --output-db-prefix + Output database prefix. By default will be the same as --db-prefix and overwrite files (default: + None) + -t , --threads + +optional arguments: + --restart Restart build/update from scratch, do not try to resume from the latest possible step. + {db_prefix}_files/ will be deleted if present. (default: False) + --verbose Verbose output mode (default: False) + --quiet Quiet output mode (default: False) + --write-info-file Save copy of target info generated to {db_prefix}.info.tsv. Can be re-used as --input-file for + further attempts. (default: False) +``` + +
+ +
+ ganon classify + +``` +usage: ganon classify [-h] -d [DB_PREFIX ...] [-s [reads.fq[.gz] ...]] [-p [reads.1.fq[.gz] reads.2.fq[.gz] ...]] + [-c [...]] [-e [...]] [-o] [--output-lca] [--output-all] [--output-unclassified] [--output-single] + [-t] [-l [...]] [-r [...]] [-a] [--verbose] [--quiet] + +options: + -h, --help show this help message and exit + +required arguments: + -d [DB_PREFIX ...], --db-prefix [DB_PREFIX ...] + Database input prefix[es] (default: None) + -s [reads.fq[.gz] ...], --single-reads [reads.fq[.gz] ...] + Multi-fastq[.gz] file[s] to classify (default: None) + -p [reads.1.fq[.gz] reads.2.fq[.gz] ...], --paired-reads [reads.1.fq[.gz] reads.2.fq[.gz] ...] + Multi-fastq[.gz] pairs of file[s] to classify (default: None) + +cutoff/filter arguments: + -c [ ...], --rel-cutoff [ ...] + Min. percentage of a read (set of minimizers) shared with the a reference necessary to consider + a match. Generally used to cutoff low similarity matches. Single value or one per database (e.g. + 0.7 1 0.25). 0 for no cutoff (default: [0.75]) + -e [ ...], --rel-filter [ ...] + Additional relative percentage of minimizers (relative to the best match) to keep a match. + Generally used to select best matches above cutoff. Single value or one per hierarchy (e.g. 0.1 + 0). 1 for no filter (default: [0.0]) + +output arguments: + -o , --output-prefix + Output prefix for output (.rep) and report (.tre). Empty to output to STDOUT (only .rep) + (default: None) + --output-lca Output an additional file with one lca match for each read (.lca) (default: False) + --output-all Output an additional file with all matches. 
File can be very large (.all) (default: False) + --output-unclassified + Output an additional file with unclassified read headers (.unc) (default: False) + --output-single When using multiple hierarchical levels, output everything in one file instead of one per + hierarchy (default: False) + +other arguments: + -t , --threads Number of sub-processes/threads to use (default: 1) + -l [ ...], --hierarchy-labels [ ...] + Hierarchy definition of --db-prefix files to be classified. Can also be a string, but input will + be sorted to define order (e.g. 1 1 2 3). The default value reported without hierarchy is 'H1' + (default: None) + -r [ ...], --ranks [ ...] + Ranks to report taxonomic abundances (.tre). empty will report default ranks [superkingdom, + phylum, class, order, family, genus, species, assembly]. This file can be re-generated with the + 'ganon report' command for other types of abundances (reads, matches) with further filtration + and output options (default: []) + -a, --reassign Reassign reads with multiple matches with an EM algorithm. Will enforce --output-all. This file + can be re-generated with the 'ganon reassign'. (default: False) + --verbose Verbose output mode (default: False) + --quiet Quiet output mode (default: False) +``` + +
+ +
+ ganon reassign + +``` +usage: ganon reassign [-h] -i -o OUTPUT_PREFIX [-e] [-s] [--verbose] [--quiet] + +options: + -h, --help show this help message and exit + +required arguments: + -i , --input-prefix Input prefix to find files from ganon classify (.all and optionally .rep) (default: None) + -o OUTPUT_PREFIX, --output-prefix OUTPUT_PREFIX + Output prefix for reassigned file (.all and optionally .rep). In case of multiple files, the + base input filename will be appended at the end of the output file 'output_prefix + + FILENAME.all' (default: None) + +EM arguments: + -e , --max-iter Max. number of iterations for the EM algorithm. If 0, will run until convergence (check + --threshold) (default: 10) + -s , --threshold Convergence threshold limit to stop the EM algorithm. (default: 0) + +other arguments: + --verbose Verbose output mode (default: False) + --quiet Quiet output mode (default: False) +``` + +
+ +
+ ganon report + +``` +usage: ganon report [-h] -i [...] [-e INPUT_EXTENSION] -o OUTPUT_PREFIX [-d [...]] [-x] [-m [...]] [-z [...]] [-f] [-t] + [-r [...]] [-s] [-a] [-y] [-p [...]] [-k [...]] [-c] [--verbose] [--quiet] [--min-count] + [--max-count] [--names [...]] [--names-with [...]] [--taxids [...]] + +options: + -h, --help show this help message and exit + +required arguments: + -i [ ...], --input [ ...] + Input file(s) and/or folder(s). '.rep' file(s) from ganon classify. (default: None) + -e INPUT_EXTENSION, --input-extension INPUT_EXTENSION + Required if --input contains folder(s). Wildcards/Shell Expansions not supported (e.g. *). + (default: rep) + -o OUTPUT_PREFIX, --output-prefix OUTPUT_PREFIX + Output prefix for report file 'output_prefix.tre'. In case of multiple files, the base input + filename will be appended at the end of the output file 'output_prefix + FILENAME.tre' (default: + None) + +db/tax arguments: + -d [ ...], --db-prefix [ ...] + Database prefix(es) used for classification. Only '.tax' file(s) are required. If not provided, + new taxonomy will be downloaded. Mutually exclusive with --taxonomy. (default: []) + -x , --taxonomy Taxonomy database to use [ncbi, gtdb, skip]. Mutually exclusive with --db-prefix. (default: + ncbi) + -m [ ...], --taxonomy-files [ ...] + Specific files for taxonomy - otherwise files will be downloaded (default: None) + -z [ ...], --genome-size-files [ ...] + Specific files for genome size estimation - otherwise files will be downloaded (default: None) + +output arguments: + -f , --output-format + Output format [text, tsv, csv, bioboxes]. text outputs a tabulated formatted text file for + better visualization. bioboxes is the the CAMI challenge profiling format (only + percentage/abundances are reported). (default: tsv) + -t , --report-type Type of report [abundance, reads, matches, dist, corr]. 'abundance' -> tax. 
abundance (re- + distribute read counts and correct by genome size), 'reads' -> sequence abundance, 'matches' -> + report all unique and shared matches, 'dist' -> like reads with re-distribution of shared read + counts only, 'corr' -> like abundance without re-distribution of shared read counts (default: + abundance) + -r [ ...], --ranks [ ...] + Ranks to report ['', 'all', custom list]. 'all' for all possible ranks. empty for default ranks + [superkingdom, phylum, class, order, family, genus, species, assembly]. (default: []) + -s , --sort Sort report by [rank, lineage, count, unique]. Default: rank (with custom --ranks) or lineage + (with --ranks all) (default: ) + -a, --no-orphan Omit orphan nodes from the final report. Otherwise, orphan nodes (= nodes not found in the + db/tax) are reported as 'na' with root as direct parent. (default: False) + -y, --split-hierarchy + Split output reports by hierarchy (from ganon classify --hierarchy-labels). If activated, the + output files will be named as '{output_prefix}.{hierarchy}.tre' (default: False) + -p [ ...], --skip-hierarchy [ ...] + One or more hierarchies to skip in the report (from ganon classify --hierarchy-labels) (default: + []) + -k [ ...], --keep-hierarchy [ ...] + One or more hierarchies to keep in the report (from ganon classify --hierarchy-labels) (default: + []) + -c , --top-percentile + Top percentile filter, based on percentage/relative abundance. Applied only at default ranks + [superkingdom, phylum, class, order, family, genus, species, assembly] (default: 0) + +optional arguments: + --verbose Verbose output mode (default: False) + --quiet Quiet output mode (default: False) + +filter arguments: + --min-count Minimum number/percentage of counts to keep an taxa [values between 0-1 for percentage, >1 + specific number] (default: 0) + --max-count Maximum number/percentage of counts to keep an taxa [values between 0-1 for percentage, >1 + specific number] (default: 0) + --names [ ...] 
Show only entries matching exact names of the provided list (default: []) + --names-with [ ...] Show entries containing full or partial names of the provided list (default: []) + --taxids [ ...] One or more taxids to report (including children taxa) (default: []) +``` + +
+ +
+ ganon table + +``` +usage: ganon table [-h] -i [...] [-e] -o OUTPUT_FILE [-l] [-f] [-t] [-a] [-m] [-r] [-n] [--header] + [--unclassified-label] [--filtered-label] [--skip-zeros] [--transpose] [--verbose] [--quiet] + [--min-count] [--max-count] [--names [...]] [--names-with [...]] [--taxids [...]] + +options: + -h, --help show this help message and exit + +required arguments: + -i [ ...], --input [ ...] + Input file(s) and/or folder(s). '.tre' file(s) from ganon report. (default: None) + -e , --input-extension + Required if --input contains folder(s). Wildcards/Shell Expansions not supported (e.g. *). + (default: tre) + -o OUTPUT_FILE, --output-file OUTPUT_FILE + Output filename for the table (default: None) + +output arguments: + -l , --output-value Output value on the table [percentage, counts]. percentage values are reported between [0-1] + (default: counts) + -f , --output-format + Output format [tsv, csv] (default: tsv) + -t , --top-sample Top hits of each sample individually (default: 0) + -a , --top-all Top hits of all samples (ranked by percentage) (default: 0) + -m , --min-frequency + Minimum number/percentage of files containing an taxa to keep the taxa [values between 0-1 for + percentage, >1 specific number] (default: 0) + -r , --rank Define specific rank to report. Empty will report all ranks. (default: None) + -n, --no-root Do not report root node entry and lineage. Direct and shared matches to root will be accounted + as unclassified (default: False) + --header Header information [name, taxid, lineage] (default: name) + --unclassified-label + Add column with unclassified count/percentage with the chosen label. May be the same as + --filtered-label (e.g. unassigned) (default: None) + --filtered-label Add column with filtered count/percentage with the chosen label. May be the same as + --unclassified-label (e.g. 
unassigned) (default: None) + --skip-zeros Do not print lines with only zero count/percentage (default: False) + --transpose Transpose output table (taxa as cols and files as rows) (default: False) + +optional arguments: + --verbose Verbose output mode (default: False) + --quiet Quiet output mode (default: False) + +filter arguments: + --min-count Minimum number/percentage of counts to keep an taxa [values between 0-1 for percentage, >1 + specific number] (default: 0) + --max-count Maximum number/percentage of counts to keep an taxa [values between 0-1 for percentage, >1 + specific number] (default: 0) + --names [ ...] Show only entries matching exact names of the provided list (default: []) + --names-with [ ...] Show entries containing full or partial names of the provided list (default: []) + --taxids [ ...] One or more taxids to report (including children taxa) (default: []) +``` + +
diff --git a/docs/outputfiles.md b/docs/outputfiles.md new file mode 100644 index 00000000..6b05b20a --- /dev/null +++ b/docs/outputfiles.md @@ -0,0 +1,138 @@ +# Output files + +## ganon build/build-custom/update + +Every run on `ganon build`, `ganon build-custom` or `ganon update` will generate the following database files: + + - {prefix}**.ibf/.hibf**: main interleaved bloom filter index file, **.hibf** is generated with `--hibf` option. + - {prefix}**.tax**: taxonomy tree, only generated if `--taxonomy` is used *(fields: target/node, parent, rank, name, genome size)*. + - {prefix}**_files/**: (`ganon build` only) folder containing downloaded reference sequence and auxiliary files. Not necessary for classification. Keep this folder if the database will be updated later. Otherwise it can be deleted. + +!!! warning + Database files generated with version 1.2.0 or higher are not compatible with older versions. + +## ganon classify + +- {prefix}**.tre**: full report file (see below) +- {prefix}**.rep**: plain report of the run with only targets that received a match. Can be used to re-generate full reports (.tre) with `ganon report`. At the end prints 2 extra lines with `#total_classified` and `#total_unclassified`. Fields + - 1: hierarchy label + - 2: target + - 3: # total matches + - 4: # unique reads + - 5: # lca reads + - 6: rank + - 7: name +- {prefix}**.lca**: output with one match for each classified read after LCA. Only generated with `--output-lca` active. If multiple hierarchy levels are set, one file for each level will be created: {prefix}.{hierarchy}.lca *(fields: read identifier, target, (max) k-mer/minimizer count)* +- {prefix}**.all**: output with all matches for each read. Only generated with `--output-all` active **Warning: file can be very large**.
If multiple hierarchy levels are set, one file for each level will be created: {prefix}.{hierarchy}.all *(fields: read identifier, target, k-mer/minimizer count)* + +## ganon report + +- {prefix}**.tre**: tab-separated tree-like report with cumulative counts and taxonomic lineage. There are several possible `--report-type`. More information on the different types of reports can be found [here](#report-type---report-type): + - *abundance*: will attempt to estimate **taxonomic abundances** by re-disributing read counts from LCA matches and correcting sequence abundance by approximate genome sizes. + - *reads*: **sequence abundances**, reports the proportion of sequences assigned to a taxa, each read classified is counted once. + - *dist*: like *reads* with read count re-distribution. + - *corr*: like *reads* with correction by genome size. + - *matches*: every match is reported to their original target, including multiple and shared matches. + +Each line in this report is a taxonomic entry (including the root node), with the following fields: + +| col | field | obs | example | +|-----|--------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------| +| 1 | rank | | phylum | +| 2 | target | taxonomic id. or specialization (assembly id.) | 562 | +| 3 | lineage | | 1\|131567\|2\|1224\|28211\|766\|942\|768\|769 | +| 4 | name | | Chromobacterium rhizoryzae | +| 5 | # unique | number of reads that matched exclusively to this target | 5 | +| 6 | # shared | number of reads with non-unique matches directly assigned to this target. 
Represents the LCA matches (`--report-type reads`), re-assigned matches (`--report-type abundance/dist`) or shared matches (`--report-type matches`) | 10 | + | 7 | # children | number of unique and shared assignments to all children nodes of this target | 20 | + | 8 | # cumulative | the sum of the unique, shared and children assignments up-to this target | 35 | + | 9 | % cumulative | percentage of assignments or estimated relative abundance for `--report-type abundance` | 43.24 | + +- The first line of the report file will show the number of unclassified reads (not for `--report-type matches`) + +- The CAMI challenge [bioboxes profiling format](https://github.com/bioboxes/rfc/blob/master/data-format/profiling.mkd) is supported using `--output-format bioboxes`. In this format, only values for the percentage/abundance (col. 9) are reported. The root node and unclassified entries are omitted. + +- The sum of cumulative assignments for the unclassified and root lines is 100%. The final cumulative sum of reads/matches may be under 100% if any filter is successfully applied and/or hierarchical selection is selected (keep/skip/split). + +- For all report types but `matches`, only taxa that received direct read matches, either unique or by LCA assignment, are considered. Some reads may have only shared matches and will not be reported directly but will be accounted for on some parent level. To visualize those matches, create a report with `--report-type matches` or use directly the file {prefix}**.rep**. + +## ganon table + + - {output_file}: a tab-separated file with counts/percentages of taxa for multiple samples + +
+ Examples of output files + +The main output file is the `{prefix}.tre` which will summarize the results: + +``` +unclassified unclassified 0 0 0 2 2.02020 +root 1 1 root 0 0 97 97 97.97980 +superkingdom 2 1|2 Bacteria 0 0 97 97 97.97980 +phylum 1239 1|2|1239 Firmicutes 0 0 57 57 57.57576 +phylum 1224 1|2|1224 Proteobacteria 0 0 40 40 40.40404 +class 91061 1|2|1239|91061 Bacilli 0 0 57 57 57.57576 +class 28211 1|2|1224|28211 Alphaproteobacteria 0 0 28 28 28.28283 +class 1236 1|2|1224|1236 Gammaproteobacteria 0 0 12 12 12.12121 +order 1385 1|2|1239|91061|1385 Bacillales 0 0 57 57 57.57576 +order 204458 1|2|1224|28211|204458 Caulobacterales 0 0 28 28 28.28283 +order 72274 1|2|1224|1236|72274 Pseudomonadales 0 0 12 12 12.12121 +family 186822 1|2|1239|91061|1385|186822 Paenibacillaceae 0 0 57 57 57.57576 +family 76892 1|2|1224|28211|204458|76892 Caulobacteraceae 0 0 28 28 28.28283 +family 468 1|2|1224|1236|72274|468 Moraxellaceae 0 0 12 12 12.12121 +genus 44249 1|2|1239|91061|1385|186822|44249 Paenibacillus 0 0 57 57 57.57576 +genus 75 1|2|1224|28211|204458|76892|75 Caulobacter 0 0 28 28 28.28283 +genus 469 1|2|1224|1236|72274|468|469 Acinetobacter 0 0 12 12 12.12121 +species 1406 1|2|1239|91061|1385|186822|44249|1406 Paenibacillus polymyxa 57 0 0 57 57.57576 +species 366602 1|2|1224|28211|204458|76892|75|366602 Caulobacter sp. 
K31 28 0 0 28 28.28283 +species 470 1|2|1224|1236|72274|468|469|470 Acinetobacter baumannii 12 0 0 12 12.12121 +``` + +running `ganon classify` or `ganon report` with `--ranks all`, the output will show all ranks used for classification and presented sorted by lineage (also available with `ganon report --sort lineage`): + +``` +unclassified unclassified 0 0 0 2 2.02020 +root 1 1 root 0 0 97 97 97.97980 +no rank 131567 1|131567 cellular organisms 0 0 97 97 97.97980 +superkingdom 2 1|131567|2 Bacteria 0 0 97 97 97.97980 +phylum 1224 1|131567|2|1224 Proteobacteria 0 0 40 40 40.40404 +class 1236 1|131567|2|1224|1236 Gammaproteobacteria 0 0 12 12 12.12121 +order 72274 1|131567|2|1224|1236|72274 Pseudomonadales 0 0 12 12 12.12121 +family 468 1|131567|2|1224|1236|72274|468 Moraxellaceae 0 0 12 12 12.12121 +genus 469 1|131567|2|1224|1236|72274|468|469 Acinetobacter 0 0 12 12 12.12121 +species group 909768 1|131567|2|1224|1236|72274|468|469|909768 Acinetobacter calcoaceticus/baumannii complex 0 0 12 12 12.12121 +species 470 1|131567|2|1224|1236|72274|468|469|909768|470 Acinetobacter baumannii 12 0 0 12 12.12121 +class 28211 1|131567|2|1224|28211 Alphaproteobacteria 0 0 28 28 28.28283 +order 204458 1|131567|2|1224|28211|204458 Caulobacterales 0 0 28 28 28.28283 +family 76892 1|131567|2|1224|28211|204458|76892 Caulobacteraceae 0 0 28 28 28.28283 +genus 75 1|131567|2|1224|28211|204458|76892|75 Caulobacter 0 0 28 28 28.28283 +species 366602 1|131567|2|1224|28211|204458|76892|75|366602 Caulobacter sp. 
K31 28 0 0 28 28.28283 +no rank 1783272 1|131567|2|1783272 Terrabacteria group 0 0 57 57 57.57576 +phylum 1239 1|131567|2|1783272|1239 Firmicutes 0 0 57 57 57.57576 +class 91061 1|131567|2|1783272|1239|91061 Bacilli 0 0 57 57 57.57576 +order 1385 1|131567|2|1783272|1239|91061|1385 Bacillales 0 0 57 57 57.57576 +family 186822 1|131567|2|1783272|1239|91061|1385|186822 Paenibacillaceae 0 0 57 57 57.57576 +genus 44249 1|131567|2|1783272|1239|91061|1385|186822|44249 Paenibacillus 0 0 57 57 57.57576 +species 1406 1|131567|2|1783272|1239|91061|1385|186822|44249|1406 Paenibacillus polymyxa 57 0 0 57 57.57576 +``` + +with `--output-format bioboxes` + +``` +@Version:0.10.0 +@SampleID:example.rep H1 +@Ranks:superkingdom|phylum|class|order|family|genus|species|assembly +@Taxonomy:db.tax +@@TAXID RANK TAXPATH TAXPATHSN PERCENTAGE +2 superkingdom 2 Bacteria 100.00000 +1224 phylum 2|1224 Bacteria|Proteobacteria 56.89782 +201174 phylum 2|201174 Bacteria|Actinobacteria 21.84869 +1239 phylum 2|1239 Bacteria|Firmicutes 9.75197 +976 phylum 2|976 Bacteria|Bacteroidota 6.15297 +1117 phylum 2|1117 Bacteria|Cyanobacteria 2.23146 +203682 phylum 2|203682 Bacteria|Planctomycetota 1.23353 +57723 phylum 2|57723 Bacteria|Acidobacteria 0.52549 +200795 phylum 2|200795 Bacteria|Chloroflexi 0.31118 +``` +
+
\ No newline at end of file diff --git a/docs/reports.md b/docs/reports.md new file mode 100644 index 00000000..6229f634 --- /dev/null +++ b/docs/reports.md @@ -0,0 +1,18 @@ +# Reports + + +## Parameter details + +### report type (--report-type) + +Several reports are available with `--report-type`: `reads`, `abundance`, `dist`, `corr`, `matches`: + +`reads` reports **sequence abundances** which are the basic proportion of reads classified in the sample. + +`abundance` will convert sequence abundance into **taxonomic abundances** by re-distributing read counts among leaf nodes and correcting by genome size. The re-distribution applies for reads classified with an LCA assignment and it is proportional to the number of unique matches of leaf nodes available in the ganon database (relative to the LCA node). Genome size is estimated based on [NCBI or GTDB auxiliary files](../custom_databases/#genome-sizes-genome-size-files). Genome size correction is applied by rank based on default ranks only (superkingdom phylum class order family genus species assembly). Read counts in intermediate ranks will be corrected based on the closest parent default rank and re-assigned to its original rank. + +`dist` is the same as `reads` with read count re-distribution + +`corr` is the same as `reads` with correction by genome size + +`matches` will report the total number of matches classified, either unique or shared. *This report will output the total number of matches instead of the total number of reads reported in all other reports.* diff --git a/docs/tutorials.md b/docs/tutorials.md new file mode 100644 index 00000000..fb4334b4 --- /dev/null +++ b/docs/tutorials.md @@ -0,0 +1,3 @@ +# Tutorials + +... soon ...
\ No newline at end of file diff --git a/libs/genome_updater b/libs/genome_updater index f75766cd..077e4aee 160000 --- a/libs/genome_updater +++ b/libs/genome_updater @@ -1 +1 @@ -Subproject commit f75766cd9d013d0b658315b81ff4525cb99a0ce5 +Subproject commit 077e4aee4dc9e2019477e4a8bbd41227d52f92db diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 00000000..043fd727 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,15 @@ +site_name: ganon +theme: readthedocs +nav: + - ganon: index.md + - Databases (ganon build): default_databases.md + - Custom databases (ganon build-custom): custom_databases.md + - Classification (ganon classify): classification.md + - Reports (ganon report): reports.md + - Output files: outputfiles.md + - Tutorials: tutorials.md +markdown_extensions: + - attr_list + - admonition + - toc: + permalink: ":)" \ No newline at end of file diff --git a/src/ganon/config.py b/src/ganon/config.py index d96a2f75..923ae49a 100644 --- a/src/ganon/config.py +++ b/src/ganon/config.py @@ -464,15 +464,8 @@ def validate(self): def set_paths(self): missing_path = False self.ganon_path = self.ganon_path + "/" if self.ganon_path else "" - if self.which in ["build", "update"]: - ganon_get_seq_info_paths = [self.ganon_path, self.ganon_path+"scripts/", self.ganon_path+"../scripts/"] if self.ganon_path else [None, "scripts/"] - for p in ganon_get_seq_info_paths: - self.path_exec["get_seq_info"] = shutil.which("ganon-get-seq-info.sh", path=p) - if self.path_exec["get_seq_info"] is not None: break - if self.path_exec["get_seq_info"] is None: - print_log("ganon-get-seq-info.sh script was not found. 
Please inform a specific path with --ganon-path") - missing_path = True + if self.which in ["build", "update", "build-custom"]: ganon_genome_updater_paths = [self.ganon_path, self.ganon_path+"libs/genome_updater/", self.ganon_path+"../libs/genome_updater/"] if self.ganon_path else [None, "libs/genome_updater/"] for p in ganon_genome_updater_paths: self.path_exec["genome_updater"] = shutil.which("genome_updater.sh", path=p) @@ -481,8 +474,14 @@ def set_paths(self): print_log("genome_updater.sh was not found. Please inform a specific path with --ganon-path") missing_path = True + ganon_get_seq_info_paths = [self.ganon_path, self.ganon_path+"scripts/", self.ganon_path+"../scripts/"] if self.ganon_path else [None, "scripts/"] + for p in ganon_get_seq_info_paths: + self.path_exec["get_seq_info"] = shutil.which("ganon-get-seq-info.sh", path=p) + if self.path_exec["get_seq_info"] is not None: break + if self.path_exec["get_seq_info"] is None: + print_log("ganon-get-seq-info.sh script was not found. Please inform a specific path with --ganon-path") + missing_path = True - if self.which in ["build-custom"]: # if path is given, look for binaries only there ganon_build_paths = [self.ganon_path, self.ganon_path+"build/"] if self.ganon_path else [None, "build/"] for p in ganon_build_paths: @@ -492,7 +491,7 @@ def set_paths(self): print_log("ganon-build binary was not found. Please inform a specific path with --ganon-path") missing_path = True - if self.hibf: + if hasattr(self, 'hibf') and self.hibf: self.raptor_path = self.raptor_path + "/" if self.raptor_path else "" raptor_paths = [self.raptor_path, self.raptor_path+"build/bin/"] if self.raptor_path else [None, "build/"] for p in raptor_paths: @@ -502,7 +501,6 @@ def set_paths(self): print_log("raptor binary was not found. 
Please inform a specific path with --raptor-path") missing_path = True - if self.which in ["classify"]: ganon_classify_paths = [self.ganon_path, self.ganon_path+"build/"] if self.ganon_path else [None, "build/"] for p in ganon_classify_paths: From 3e7ce61f84ea49b5fb6127e14ed19b045abbdf9f Mon Sep 17 00:00:00 2001 From: pirovc <4673375+pirovc@users.noreply.github.com> Date: Tue, 9 May 2023 15:00:17 +0200 Subject: [PATCH 6/9] Feature/performance build by sequence (#248) * v1.5.1 fix path script build-custom (#246) * v1.5.1 fix path script build-custom * gu v0.6.1 * improve parsing when counting hashes --- src/ganon-build/GanonBuild.cpp | 50 +++++++++++++++++++++------ src/ganon-classify/GanonClassify.cpp | 10 +++--- tests/aux/Aux.hpp | 10 +++--- tests/ganon-build/GanonBuild.test.cpp | 19 +++++++++- 4 files changed, 68 insertions(+), 21 deletions(-) diff --git a/src/ganon-build/GanonBuild.cpp b/src/ganon-build/GanonBuild.cpp index 518c443c..cc21b839 100644 --- a/src/ganon-build/GanonBuild.cpp +++ b/src/ganon-build/GanonBuild.cpp @@ -118,7 +118,7 @@ robin_hood::unordered_map< std::string, TFile > parse_input_file( const std::str if ( fields.size() == 1 ) { // target is the file itself (filename only wihtout path) - auto target = std::filesystem::path( file ).filename(); + const auto target = std::filesystem::path( file ).filename(); input_map[target][file] = {}; hashes_count[target] = 0; } @@ -130,10 +130,10 @@ robin_hood::unordered_map< std::string, TFile > parse_input_file( const std::str } else { - // sequence as target - std::string seqid = fields[2]; + // parse by sequence + const std::string seqid = fields[2]; input_map[fields[1]][file].insert( seqid ); - hashes_count[seqid] = 0; + hashes_count[fields[1]] = 0; } stats.total.files++; } @@ -199,6 +199,7 @@ void count_hashes( SafeQueue< InputFileMap >& ifm_queue, * it also keep the counts to be further use to define best filter size */ + // one view per thread auto minimiser_view = seqan3::views::minimiser_hash( 
seqan3::shape{ seqan3::ungapped{ config.kmer_size } }, seqan3::window_size{ config.window_size }, @@ -225,10 +226,10 @@ void count_hashes( SafeQueue< InputFileMap >& ifm_queue, if ( seqids.empty() ) { - robin_hood::unordered_set< uint64_t > hashes; - // File as target - generate all hashes from file with possible multiple sequences // before counting and storing + + robin_hood::unordered_set< uint64_t > hashes; for ( auto const& [header, seq] : fin ) { total.sequences++; @@ -239,24 +240,50 @@ void count_hashes( SafeQueue< InputFileMap >& ifm_queue, hashes_count[ifm.target] += hashes.size(); detail::store_hashes( ifm.target, hashes, config.tmp_output_folder ); } - else + else if ( seqids.size() == 1 ) { - // Sequence as target - count and store hashes for each sequence + // Single seqid for a target (or target is seqid itself) + // Directly count and store for ( auto const& [header, seq] : fin ) { const auto seqid = get_seqid( header ); - // If header is present for this target if ( seqids.count( seqid ) ) { + total.sequences++; + total.length_bp += seq.size(); robin_hood::unordered_set< uint64_t > hashes; + const auto mh = seq | minimiser_view | std::views::common; + hashes.insert( mh.begin(), mh.end() ); + hashes_count[ifm.target] += hashes.size(); + detail::store_hashes( ifm.target, hashes, config.tmp_output_folder ); + break; + } + } + } + else + { + // Multiple seqids for a target + // generate all hashes from file with possible multiple sequences before counting and storing + + // Keep track of total seqids to be parsed + size_t n_seqids = seqids.size(); + robin_hood::unordered_set< uint64_t > hashes; + for ( auto const& [header, seq] : fin ) + { + const auto seqid = get_seqid( header ); + if ( seqids.count( seqid ) ) + { total.sequences++; total.length_bp += seq.size(); const auto mh = seq | minimiser_view | std::views::common; hashes.insert( mh.begin(), mh.end() ); - hashes_count[seqid] = hashes.size(); - detail::store_hashes( seqid, hashes, 
config.tmp_output_folder ); + // In case all seqids were already processed + if ( !--n_seqids ) + break; } } + hashes_count[ifm.target] += hashes.size(); + detail::store_hashes( ifm.target, hashes, config.tmp_output_folder ); } } catch ( seqan3::parse_error const& e ) @@ -834,6 +861,7 @@ bool run( Config config ) // Initialize in parallel (by target) the hash counting and storing timeCountStoreHashes.start(); std::vector< std::future< void > > tasks_count; + for ( uint16_t taskn = 0; taskn < config.threads; ++taskn ) { tasks_count.emplace_back( std::async( std::launch::async, diff --git a/src/ganon-classify/GanonClassify.cpp b/src/ganon-classify/GanonClassify.cpp index 492ff91a..3132f295 100644 --- a/src/ganon-classify/GanonClassify.cpp +++ b/src/ganon-classify/GanonClassify.cpp @@ -532,17 +532,19 @@ void classify( std::vector< Filter< TFilter > >& filters, if ( config.output_lca ) classified_lca_queue.push( read_out_lca ); } - else + else { // Not running lca and has unique match - if ( count_filtered_matches == 1 ){ + if ( count_filtered_matches == 1 ) + { rep[read_out.matches[0].target].unique_reads++; - }else{ + } + else + { // without tax, no lca, count multi-matches to a root node // to keep consistency among reports (no. 
of classified reads) rep[config.tax_root_node].unique_reads++; } - } if ( config.output_all ) diff --git a/tests/aux/Aux.hpp b/tests/aux/Aux.hpp index 0da3af45..1c243189 100644 --- a/tests/aux/Aux.hpp +++ b/tests/aux/Aux.hpp @@ -209,13 +209,13 @@ struct SeqTarget for ( uint16_t i = 0; i < files.size(); ++i ) { output_file << files[i]; - if ( custom_targets || sequence_as_target ) + if ( sequence_as_target ) + { + output_file << '\t' << headers[i] << '\t' << headers[i]; + } + else if ( custom_targets ) { output_file << '\t' << targets[i]; - if ( sequence_as_target ) - { - output_file << '\t' << headers[i]; - } } output_file << '\n'; diff --git a/tests/ganon-build/GanonBuild.test.cpp b/tests/ganon-build/GanonBuild.test.cpp index df9eb520..6db8860d 100644 --- a/tests/ganon-build/GanonBuild.test.cpp +++ b/tests/ganon-build/GanonBuild.test.cpp @@ -202,11 +202,28 @@ SCENARIO( "building indices", "[ganon-build]" ) config_build::validate_elements( cfg, seqtarget ); } - SECTION( "three columns - sequence as target" ) + SECTION( "three columns - custom target" ) { std::string prefix{ folder_prefix + "input_file_three_cols" }; auto cfg = config_build::defaultConfig( prefix ); + std::vector< std::string > targets{ "T1", "T9", "T1", "T8", "T1", "T1", "T1", "T1", "T4", "T1" }; + + auto seqtarget = aux::SeqTarget( prefix, seqs, targets ); + seqtarget.sequence_as_target = false; + seqtarget.write_input_file( cfg.input_file ); + seqtarget.write_sequences_files(); + + REQUIRE( GanonBuild::run( cfg ) ); + config_build::validate_filter( cfg ); + config_build::validate_elements( cfg, seqtarget ); + } + + SECTION( "three columns - sequence as target" ) + { + std::string prefix{ folder_prefix + "input_file_three_cols_sequence_target" }; + auto cfg = config_build::defaultConfig( prefix ); + auto seqtarget = aux::SeqTarget( prefix, seqs ); seqtarget.sequence_as_target = true; seqtarget.write_input_file( cfg.input_file ); From 23734599889cf2db452f44d26421d4fc53ef1e88 Mon Sep 17 00:00:00 
2001 From: pirovc <4673375+pirovc@users.noreply.github.com> Date: Tue, 9 May 2023 17:11:46 +0200 Subject: [PATCH 7/9] Feature/filter seq len build (#249) * v1.5.1 fix path script build-custom (#246) * v1.5.1 fix path script build-custom * gu v0.6.1 * filter by size * fix call --- src/ganon-build/CommandLineParser.cpp | 5 +- src/ganon-build/GanonBuild.cpp | 65 +++++++++++++------ .../include/ganon-build/Config.hpp | 4 +- src/ganon/build_update.py | 3 + src/ganon/config.py | 3 +- tests/ganon-build/GanonBuild.test.cpp | 60 +++++++++++++++++ 6 files changed, 117 insertions(+), 23 deletions(-) diff --git a/src/ganon-build/CommandLineParser.cpp b/src/ganon-build/CommandLineParser.cpp index d753b947..bfeadd0e 100644 --- a/src/ganon-build/CommandLineParser.cpp +++ b/src/ganon-build/CommandLineParser.cpp @@ -16,11 +16,12 @@ std::optional< Config > CommandLineParser::parse( int argc, char** argv ) ( "i,input-file", "Define sequences to use. Tabular file with the fields: file [ target seqid]", cxxopts::value< std::string >() ) ( "o,output-file", "Filter output file", cxxopts::value< std::string >() ) ( "k,kmer-size", "k-mer size. Default: 19", cxxopts::value< uint8_t >() ) - ( "w,window-size", "window size. Default: 32", cxxopts::value< uint16_t >() ) + ( "w,window-size", "window size. Default: 31", cxxopts::value< uint16_t >() ) ( "s,hash-functions", "number of hash functions. 0 to auto-detect. Default: 0", cxxopts::value< uint8_t >() ) ( "p,max-fp", "Maximum false positive rate per target. Used to define filter size [mutually exclusive --filter-size]. Default: 0.05", cxxopts::value< double >() ) ( "f,filter-size", "Filter size (MB) [mutually exclusive --max-fp]", cxxopts::value< double >() ) ( "j,mode", "mode to build filter [avg, smaller, smallest, faster, fastest]. Default: avg", cxxopts::value< std::string >() ) + ( "y,min-length", "min. sequence length (bp) to keep. 0 to keep all. 
Default: 0", cxxopts::value< uint64_t >() ) ( "m,tmp-output-folder", "Folder to write temporary files", cxxopts::value< std::string >() ) ( "t,threads", "Number of threads", cxxopts::value< uint16_t >()) ( "verbose", "Verbose output mode", cxxopts::value()) @@ -67,6 +68,8 @@ std::optional< Config > CommandLineParser::parse( int argc, char** argv ) config.filter_size = args["filter-size"].as< double >(); if ( args.count( "mode" ) ) config.mode = args["mode"].as< std::string >(); + if ( args.count( "min-length" ) ) + config.min_length = args["min-length"].as< uint64_t >(); if ( args.count( "tmp-output-folder" ) ) config.tmp_output_folder = args["tmp-output-folder"].as< std::string >(); if ( args.count( "threads" ) ) diff --git a/src/ganon-build/GanonBuild.cpp b/src/ganon-build/GanonBuild.cpp index cc21b839..f26c1cae 100644 --- a/src/ganon-build/GanonBuild.cpp +++ b/src/ganon-build/GanonBuild.cpp @@ -55,7 +55,7 @@ struct Total uint64_t files = 0; uint64_t invalid_files = 0; uint64_t sequences = 0; - uint64_t invalid_sequences = 0; + uint64_t skipped_sequences = 0; uint64_t length_bp = 0; }; @@ -70,7 +70,7 @@ struct Stats total.files += t.files; total.invalid_files += t.invalid_files; total.sequences += t.sequences; - total.invalid_sequences += t.invalid_sequences; + total.skipped_sequences += t.skipped_sequences; total.length_bp += t.length_bp; } } @@ -97,6 +97,7 @@ robin_hood::unordered_map< std::string, TFile > parse_input_file( const std::str */ robin_hood::unordered_map< std::string, TFile > input_map; std::string line; + robin_hood::unordered_set< std::string > files; std::ifstream infile( input_file ); while ( std::getline( infile, line, '\n' ) ) { @@ -107,6 +108,7 @@ robin_hood::unordered_map< std::string, TFile > parse_input_file( const std::str fields.push_back( field ); // file [ target seqid] const std::string file = fields[0]; + files.insert( file ); if ( !std::filesystem::exists( file ) || std::filesystem::file_size( file ) == 0 ) { if ( !quiet ) @@ 
-135,8 +137,8 @@ robin_hood::unordered_map< std::string, TFile > parse_input_file( const std::str input_map[fields[1]][file].insert( seqid ); hashes_count[fields[1]] = 0; } - stats.total.files++; } + stats.total.files = files.size(); return input_map; } @@ -232,6 +234,12 @@ void count_hashes( SafeQueue< InputFileMap >& ifm_queue, robin_hood::unordered_set< uint64_t > hashes; for ( auto const& [header, seq] : fin ) { + if ( seq.size() < config.min_length ) + { + total.skipped_sequences++; + continue; + } + total.sequences++; total.length_bp += seq.size(); const auto mh = seq | minimiser_view | std::views::common; @@ -249,13 +257,20 @@ void count_hashes( SafeQueue< InputFileMap >& ifm_queue, const auto seqid = get_seqid( header ); if ( seqids.count( seqid ) ) { - total.sequences++; - total.length_bp += seq.size(); - robin_hood::unordered_set< uint64_t > hashes; - const auto mh = seq | minimiser_view | std::views::common; - hashes.insert( mh.begin(), mh.end() ); - hashes_count[ifm.target] += hashes.size(); - detail::store_hashes( ifm.target, hashes, config.tmp_output_folder ); + if ( seq.size() < config.min_length ) + { + total.skipped_sequences++; + } + else + { + total.sequences++; + total.length_bp += seq.size(); + robin_hood::unordered_set< uint64_t > hashes; + const auto mh = seq | minimiser_view | std::views::common; + hashes.insert( mh.begin(), mh.end() ); + hashes_count[ifm.target] += hashes.size(); + detail::store_hashes( ifm.target, hashes, config.tmp_output_folder ); + } break; } } @@ -273,10 +288,18 @@ void count_hashes( SafeQueue< InputFileMap >& ifm_queue, const auto seqid = get_seqid( header ); if ( seqids.count( seqid ) ) { - total.sequences++; - total.length_bp += seq.size(); - const auto mh = seq | minimiser_view | std::views::common; - hashes.insert( mh.begin(), mh.end() ); + if ( seq.size() < config.min_length ) + { + total.skipped_sequences++; + continue; + } + else + { + total.sequences++; + total.length_bp += seq.size(); + const auto mh = seq | 
minimiser_view | std::views::common; + hashes.insert( mh.begin(), mh.end() ); + } // In case all seqids were already processed if ( !--n_seqids ) break; @@ -756,8 +779,8 @@ void print_stats( Stats& stats, const IBFConfig& ibf_config, const StopClock& ti if ( stats.total.invalid_files > 0 ) std::cerr << " - " << stats.total.invalid_files << " invalid files skipped" << std::endl; - if ( stats.total.invalid_sequences > 0 ) - std::cerr << " - " << stats.total.invalid_sequences << " invalid sequences skipped" << std::endl; + if ( stats.total.skipped_sequences > 0 ) + std::cerr << " - " << stats.total.skipped_sequences << " sequences skipped" << std::endl; std::cerr << std::fixed << std::setprecision( 4 ) << " - max. false positive: " << ibf_config.true_max_fp; std::cerr << std::fixed << std::setprecision( 4 ) << " (avg.: " << ibf_config.true_avg_fp << ")" << std::endl; @@ -842,10 +865,6 @@ bool run( Config config ) std::cerr << "No valid input files" << std::endl; return false; } - else - { - stats.total.files = ifm_queue.size(); - } // Create temporary output folder if not existing to write minimizer hashes if ( config.tmp_output_folder != "" && !std::filesystem::exists( config.tmp_output_folder ) ) @@ -909,6 +928,12 @@ bool run( Config config ) << " Megabytes)" << std::endl; } + if ( ibf_config.n_bins == 0 ) + { + std::cerr << "No valid sequences to build" << std::endl; + return false; + } + // Split hashes into optimal size creating technical bins // {binno: (target, idx_hashes_start, idx_hashes_end)} const detail::TBinMapHash bin_map_hash = detail::create_bin_map_hash( ibf_config, hashes_count ); diff --git a/src/ganon-build/include/ganon-build/Config.hpp b/src/ganon-build/include/ganon-build/Config.hpp index 7c0f4e29..54f42c42 100644 --- a/src/ganon-build/include/ganon-build/Config.hpp +++ b/src/ganon-build/include/ganon-build/Config.hpp @@ -17,8 +17,9 @@ struct Config double max_fp = 0.05; double filter_size = 0; uint8_t kmer_size = 19; - uint16_t window_size = 
32; + uint16_t window_size = 31; uint8_t hash_functions = 0; + uint64_t min_length = 0; uint16_t threads = 1; bool verbose = false; bool quiet = false; @@ -122,6 +123,7 @@ inline std::ostream& operator<<( std::ostream& stream, const Config& config ) stream << "--window-size " << config.window_size << newl; stream << "--hash-functions " << unsigned( config.hash_functions ) << newl; stream << "--mode " << config.mode << newl; + stream << "--min-length " << config.min_length << newl; stream << "--threads " << config.threads << newl; stream << "--verbose " << config.verbose << newl; stream << "--quiet " << config.quiet << newl; diff --git a/src/ganon/build_update.py b/src/ganon/build_update.py index e0217c6c..b292a7a1 100644 --- a/src/ganon/build_update.py +++ b/src/ganon/build_update.py @@ -89,6 +89,7 @@ def build(cfg): "window_size": cfg.window_size, "hash_functions": cfg.hash_functions, "mode": cfg.mode, + "min_length": cfg.min_length, "verbose": cfg.verbose, "quiet": cfg.quiet, "ganon_path": cfg.ganon_path, @@ -173,6 +174,7 @@ def update(cfg): build_custom_params["window_size"] = loaded_params["window_size"] build_custom_params["hash_functions"] = loaded_params["hash_functions"] build_custom_params["mode"] = loaded_params["mode"] if "mode" in loaded_params else "avg" # mode introduce in v1.4.0 + build_custom_params["min_length"] = loaded_params["min_length"] if "min_length" in loaded_params else 0 # min_length introduced in v1.6.0 build_custom_params["hibf"] = loaded_params["hibf"] build_custom_config = Config("build-custom", **build_custom_params) @@ -353,6 +355,7 @@ def build_custom(cfg, which_call: str="build_custom"): "--window-size " + str(cfg.window_size), "--hash-functions " + str(cfg.hash_functions), "--mode " + cfg.mode, + "--min-length " + str(cfg.min_length), "--max-fp " + str(cfg.max_fp) if cfg.max_fp else "", "--filter-size " + str(cfg.filter_size) if cfg.filter_size else "", "--tmp-output-folder '" + build_output_folder + "'", diff --git 
a/src/ganon/config.py b/src/ganon/config.py index 923ae49a..822832d2 100644 --- a/src/ganon/config.py +++ b/src/ganon/config.py @@ -51,7 +51,8 @@ def __init__(self, which: str=None, **kwargs): build_default_advanced_args.add_argument("-w", "--window-size", type=unsigned_int(minval=1), metavar="", default=31, help="The window-size to build filter with minimizers.") build_default_advanced_args.add_argument("-s", "--hash-functions", type=unsigned_int(minval=0, maxval=5), metavar="", default=4, help="The number of hash functions for the interleaved bloom filter [0-5]. 0 to detect optimal value.", choices=range(6)) build_default_advanced_args.add_argument("-j", "--mode", type=str, metavar="", default="avg", help="Create smaller or faster filters at the cost of classification speed or database size, respectively [" + ", ".join(self.choices_mode) + "]. If --filter-size is used, smaller/smallest refers to the false positive rate. By default, an average value is calculated to balance classification speed and database size.", choices=self.choices_mode) - build_default_advanced_args.add_argument("--hibf", action="store_true", help="Builds an HIBF with raptor/chopper (v3). --mode and --filter-size will be ignored.") + build_default_advanced_args.add_argument("-y", "--min-length", type=unsigned_int(minval=0), metavar="", default=0, help="Skip sequences smaller than value defined. 0 to not skip any sequence.") + build_default_advanced_args.add_argument("--hibf", action="store_true", help="Builds an HIBF with raptor/chopper (v3). 
--mode, --filter-size and --min-length will be ignored.") #################################################################################################### diff --git a/tests/ganon-build/GanonBuild.test.cpp b/tests/ganon-build/GanonBuild.test.cpp index 6db8860d..bbf0b137 100644 --- a/tests/ganon-build/GanonBuild.test.cpp +++ b/tests/ganon-build/GanonBuild.test.cpp @@ -550,4 +550,64 @@ SCENARIO( "building indices", "[ganon-build]" ) REQUIRE_FALSE( GanonBuild::run( cfg ) ); } } + + + SECTION( "--min-length" ) + { + // 10 seqs2 lens: 80, 75, 70, ..., 35 + aux::sequences_type seqs2{ + "ACACTCTTTGAAAATGCATATAATATTGAACGTTATTTTGAAATAGATTAATTACTCATATCCATTTGCTAATCTTATCG"_dna4, + "TTTATTATATGTAATTATAAATTTATCGTTAAGCTTGACATAAGTGAGTGTATCTATGTTCTTAACAAATACATC"_dna4, + "TTTTATTTTTATTTCTTATGCACAAGAATAAATTATATGCATATGATAATTTCTCATTCAATGCGGATGT"_dna4, + "TATGGTAAGCTATTATGGCATGATAAAAAACCAGTCATATACCCATTGGCATCCTTATCTGATTA"_dna4, + "ATCCGACCCATTTGAAACGATTTATTATGTGGAGCAATACTATAAAATTAGCTTAAATGA"_dna4, + "AAAAGGACATTTACGCACACCTTCAATTAAAACATAATAAATCATTAATTACAGC"_dna4, + "AATAGTTCGTATTATGTTCATCGGATGAATTTACCAGCAAACATCCATGA"_dna4, + "TTTTTTAATCGTAACAAATAACATACGGTTAGATTATATAAGAAA"_dna4, + "CTGACTGGATAGAAATATCACCCGGAGAAAAACTCTCATA"_dna4, + "ATGCATCAATATGATATAGGAACTGTAGAGTTCAC"_dna4 + }; + + SECTION( "0" ) + { + std::string prefix{ folder_prefix + "min_length_0" }; + auto cfg = config_build::defaultConfig( prefix ); + cfg.min_length = 0; + + auto seqtarget = aux::SeqTarget( prefix, seqs2 ); + seqtarget.write_input_file( cfg.input_file ); + seqtarget.write_sequences_files(); + + REQUIRE( GanonBuild::run( cfg ) ); + config_build::validate_filter( cfg ); + config_build::validate_elements( cfg, seqtarget ); + } + + SECTION( "50" ) + { + std::string prefix{ folder_prefix + "min_length_50" }; + auto cfg = config_build::defaultConfig( prefix ); + cfg.min_length = 50; + + auto seqtarget = aux::SeqTarget( prefix, seqs2 ); + seqtarget.write_input_file( cfg.input_file ); + 
seqtarget.write_sequences_files(); + + REQUIRE( GanonBuild::run( cfg ) ); + config_build::validate_filter( cfg ); + + // check if only valid sequences were added + aux::sequences_type seqs3{ + "ACACTCTTTGAAAATGCATATAATATTGAACGTTATTTTGAAATAGATTAATTACTCATATCCATTTGCTAATCTTATCG"_dna4, + "TTTATTATATGTAATTATAAATTTATCGTTAAGCTTGACATAAGTGAGTGTATCTATGTTCTTAACAAATACATC"_dna4, + "TTTTATTTTTATTTCTTATGCACAAGAATAAATTATATGCATATGATAATTTCTCATTCAATGCGGATGT"_dna4, + "TATGGTAAGCTATTATGGCATGATAAAAAACCAGTCATATACCCATTGGCATCCTTATCTGATTA"_dna4, + "ATCCGACCCATTTGAAACGATTTATTATGTGGAGCAATACTATAAAATTAGCTTAAATGA"_dna4, + "AAAAGGACATTTACGCACACCTTCAATTAAAACATAATAAATCATTAATTACAGC"_dna4, + "AATAGTTCGTATTATGTTCATCGGATGAATTTACCAGCAAACATCCATGA"_dna4 + }; + auto seqtarget_valid = aux::SeqTarget( prefix, seqs3 ); + config_build::validate_elements( cfg, seqtarget_valid ); + } + } } From 15a05a7b1672bc20a15b86d2d735d3072458687c Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Tue, 9 May 2023 17:20:20 +0200 Subject: [PATCH 8/9] docs and readme --- .gitignore | 3 +++ README.md | 19 ++++++++++--------- docs/custom_databases.md | 1 + docs/index.md | 16 +++++++++------- 4 files changed, 23 insertions(+), 16 deletions(-) diff --git a/.gitignore b/.gitignore index 5534d742..e35701cb 100755 --- a/.gitignore +++ b/.gitignore @@ -43,3 +43,6 @@ # python setuptools *.egg-info/ dist/ + +# mkdocs +site/ diff --git a/README.md b/README.md index c1cc9692..2c6a04b8 100644 --- a/README.md +++ b/README.md @@ -2,15 +2,16 @@ ganon classifies DNA sequences against large sets of genomic reference sequences efficiently. 
It features: -- automatic download, build and update for commonly used repos (refseq/genbank) -- binning and taxonomic profiling -- multiple taxonomy integration (ncbi/gtdb) -- LCA algorithm + read reassignment EM algorithm -- hierarchical use of databases +- automatic download, build and update procedures for commonly used databases (RefSeq and GenBank) +- classification with binning and taxonomic profiling +- multiple taxonomy integration (NCBI and GTDB) with lowest common ancestor (LCA) +- read reassignment EM algorithm for multi-matching reads +- hierarchical use of multiple databases - taxonomic and sequence abundance reports with genome size correction -- contingency tables and many more +- advanced reporting and filtration of results +- contingency table generation -User manual: https://pirovc.github.io/ganon/ +Find out more in the user manual: https://pirovc.github.io/ganon/ ## Quick install @@ -28,7 +29,7 @@ ganon build --db-prefix arc_cg_rs --source refseq --organism-group archaea --com ### Classify ```bash -ganon classify --db-prefix arc_cg_rs --output-prefix classify_results --single-reads my_reads.fq.gz --threads 12 +ganon classify --db-prefix arc_cg_rs --output-prefix classify_results --paired-reads my_reads.1.fq.gz my_reads.2.fq.gz --threads 12 ``` -For complete examples, databases, installation from source and information -> https://pirovc.github.io/ganon/ \ No newline at end of file +For further examples, database guide, installation from source and information: https://pirovc.github.io/ganon/ \ No newline at end of file diff --git a/docs/custom_databases.md b/docs/custom_databases.md index ca078847..6d8aad94 100644 --- a/docs/custom_databases.md +++ b/docs/custom_databases.md @@ -202,6 +202,7 @@ db="16S_ribosomal_RNA" # Download BLAST db wget -nd --quiet --show-progress "ftp://ftp.ncbi.nlm.nih.gov/blast/db/${db}*.tar.gz" +# Using 12 threads: curl --silent --list-only ftp://ftp.ncbi.nlm.nih.gov/blast/db/ | grep "${db}.*.tar.gz$" | xargs -P 12 -I{} 
wget -nd --quiet --show-progress "ftp://ftp.ncbi.nlm.nih.gov/blast/db/{}" # Merge and extract BLAST db files cat "${db}"*.tar.gz | tar xvfz - > ex_files.txt diff --git a/docs/index.md b/docs/index.md index af1675d6..1d96049c 100644 --- a/docs/index.md +++ b/docs/index.md @@ -130,7 +130,7 @@ options: ``` usage: ganon build [-h] [-g [...]] [-a [...]] [-b [...]] [-o] [-c] [-u] [-m [...]] [-z [...]] -d DB_PREFIX [-x] [-t] - [-p] [-f] [-k] [-w] [-s] [-j] [--hibf] [--restart] [--verbose] [--quiet] [--write-info-file] + [-p] [-f] [-k] [-w] [-s] [-j] [-y] [--hibf] [--restart] [--verbose] [--quiet] [--write-info-file] options: -h, --help show this help message and exit @@ -176,8 +176,9 @@ advanced arguments: respectively [avg, smaller, smallest, faster, fastest]. If --filter-size is used, smaller/smallest refers to the false positive rate. By default, an average value is calculated to balance classification speed and database size. (default: avg) - --hibf Builds an HIBF with raptor/chopper (v3). --mode and --filter-size will be ignored. (default: - False) + -y , --min-length Skip sequences smaller then value defined. 0 to not skip any sequence. (default: 0) + --hibf Builds an HIBF with raptor/chopper (v3). --mode, --filter-size and --min-length will be ignored. + (default: False) optional arguments: --restart Restart build/update from scratch, do not try to resume from the latest possible step. @@ -195,8 +196,8 @@ optional arguments: ``` usage: ganon build-custom [-h] [-i [...]] [-e] [-c] [-n] [-a] [-l] [-m [...]] [-z [...]] [-r [...]] [-q [...]] -d - DB_PREFIX [-x] [-t] [-p] [-f] [-k] [-w] [-s] [-j] [--hibf] [--restart] [--verbose] [--quiet] - [--write-info-file] + DB_PREFIX [-x] [-t] [-p] [-f] [-k] [-w] [-s] [-j] [-y] [--hibf] [--restart] [--verbose] + [--quiet] [--write-info-file] options: -h, --help show this help message and exit @@ -259,8 +260,9 @@ advanced arguments: respectively [avg, smaller, smallest, faster, fastest]. 
If --filter-size is used, smaller/smallest refers to the false positive rate. By default, an average value is calculated to balance classification speed and database size. (default: avg) - --hibf Builds an HIBF with raptor/chopper (v3). --mode and --filter-size will be ignored. (default: - False) + -y , --min-length Skip sequences smaller then value defined. 0 to not skip any sequence. (default: 0) + --hibf Builds an HIBF with raptor/chopper (v3). --mode, --filter-size and --min-length will be ignored. + (default: False) optional arguments: --restart Restart build/update from scratch, do not try to resume from the latest possible step. From 9435c4f330bb73023c9e32ff5f7c34e44db14834 Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Wed, 10 May 2023 10:59:36 +0200 Subject: [PATCH 9/9] docs readme --- README.md | 14 +++++++------- docs/default_databases.md | 4 ++-- docs/index.md | 4 +++- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 2c6a04b8..c195188e 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# ganon [![Build Status](https://travis-ci.com/pirovc/ganon.svg?branch=master)](https://travis-ci.com/pirovc/ganon) [![codecov](https://codecov.io/gh/pirovc/ganon/branch/master/graph/badge.svg)](https://codecov.io/gh/pirovc/ganon) [![Anaconda-Server Badge](https://anaconda.org/bioconda/ganon/badges/downloads.svg)](https://anaconda.org/bioconda/ganon) [![Anaconda-Server Badge](https://anaconda.org/bioconda/ganon/badges/platforms.svg)](https://anaconda.org/bioconda/ganon) [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/ganon/README.html) [![Publication](https://img.shields.io/badge/DOI-10.1101%2F406017-blue)](https://dx.doi.org/10.1093/bioinformatics/btaa458) +# ganon [![GitHub release (latest by date)](https://img.shields.io/github/v/release/pirovc/ganon)](https://github.com/pirovc/ganon) [![Build 
Status](https://travis-ci.com/pirovc/ganon.svg?branch=master)](https://travis-ci.com/pirovc/ganon) [![codecov](https://codecov.io/gh/pirovc/ganon/branch/master/graph/badge.svg)](https://codecov.io/gh/pirovc/ganon) [![Anaconda-Server Badge](https://anaconda.org/bioconda/ganon/badges/downloads.svg)](https://anaconda.org/bioconda/ganon) [![Anaconda-Server Badge](https://anaconda.org/bioconda/ganon/badges/platforms.svg)](https://anaconda.org/bioconda/ganon) [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/ganon/README.html) [![Publication](https://img.shields.io/badge/DOI-10.1101%2F406017-blue)](https://dx.doi.org/10.1093/bioinformatics/btaa458) ganon classifies DNA sequences against large sets of genomic reference sequences efficiently. It features: @@ -11,9 +11,9 @@ ganon classifies DNA sequences against large sets of genomic reference sequences - advanced reporting and filtration of results - contingency table generation -Find out more in the user manual: https://pirovc.github.io/ganon/ +Find out more information in the user manual: https://pirovc.github.io/ganon/ -## Quick install +## Quick install with conda ```sh conda install -c bioconda -c conda-forge ganon @@ -21,15 +21,15 @@ conda install -c bioconda -c conda-forge ganon ## Basic usage -### Download and build (Archaea - complete genomes - NCBI RefSeq) +### Download and Build (Archaea - complete genomes - NCBI RefSeq) ```bash -ganon build --db-prefix arc_cg_rs --source refseq --organism-group archaea --complete-genomes --threads 12 +ganon build --db-prefix arc_cg_rs --source refseq --organism-group archaea --complete-genomes --threads 24 ``` ### Classify ```bash -ganon classify --db-prefix arc_cg_rs --output-prefix classify_results --paired-reads my_reads.1.fq.gz my_reads.2.fq.gz --threads 12 +ganon classify --db-prefix arc_cg_rs --output-prefix classify_results --paired-reads my_reads.1.fq.gz my_reads.2.fq.gz 
--threads 24 ``` -For further examples, database guide, installation from source and information: https://pirovc.github.io/ganon/ \ No newline at end of file +For further examples, database guide, installation from source and more: https://pirovc.github.io/ganon/ \ No newline at end of file diff --git a/docs/default_databases.md b/docs/default_databases.md index 394aaca9..a5b2b3e9 100644 --- a/docs/default_databases.md +++ b/docs/default_databases.md @@ -41,8 +41,8 @@ NCBI RefSeq and GenBank repositories are common resources to obtain reference se |---|---|---|---| | Complete | 1595845 | |
cmd`ganon build --source genbank --organism-group archaea bacteria fungi viral --threads 48 --db-prefix abfv_gb`
| | One assembly per species | 99505 | 91 - 420 |
cmd`ganon build --source genbank --organism-group archaea bacteria fungi viral --threads 48 --genome-updater "-A 'species:1'" --db-prefix abfv_gb_t1s`
| -| Complete genomes (higher quality) | 92917 | |
cmd`ganon build --source genbank --organism-group archaea bacteria fungi viral --threads 48 --complete-genomes --db-prefix abfv_gb_cg`
| -| One assembly per species of complete genomes | 34497 | |
cmd`ganon build --source genbank --organism-group archaea bacteria fungi viral --threads 48 --complete-genomes "-A 'species:1'" --db-prefix abfv_gb_cg_t1s`
| +| Complete genomes (higher quality) | 92917 | 24 - |
cmd`ganon build --source genbank --organism-group archaea bacteria fungi viral --threads 48 --complete-genomes --db-prefix abfv_gb_cg`
| +| One assembly per species of complete genomes | 34497 | 10 - |
cmd`ganon build --source genbank --organism-group archaea bacteria fungi viral --threads 48 --complete-genomes "-A 'species:1'" --db-prefix abfv_gb_cg_t1s`
| \* Size (GB) is the final size of the database and the approximate amount of RAM necessary to build it (calculated with default parameters). The two values represent databases built with and without the `--hibf` parameter, respectively. The trade-offs between those two modes are explained [here](#hibf). diff --git a/docs/index.md b/docs/index.md index 1d96049c..0d542e38 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,6 +1,8 @@ # ganon -[![Build Status](https://travis-ci.com/pirovc/ganon.svg?branch=master)](https://travis-ci.com/pirovc/ganon) [![codecov](https://codecov.io/gh/pirovc/ganon/branch/master/graph/badge.svg)](https://codecov.io/gh/pirovc/ganon) [![Anaconda-Server Badge](https://anaconda.org/bioconda/ganon/badges/downloads.svg)](https://anaconda.org/bioconda/ganon) [![Anaconda-Server Badge](https://anaconda.org/bioconda/ganon/badges/platforms.svg)](https://anaconda.org/bioconda/ganon) [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/ganon/README.html) [![Publication](https://img.shields.io/badge/DOI-10.1101%2F406017-blue)](https://dx.doi.org/10.1093/bioinformatics/btaa458) +[![GitHub release (latest by date)](https://img.shields.io/github/v/release/pirovc/ganon)](https://github.com/pirovc/ganon) [![Build Status](https://travis-ci.com/pirovc/ganon.svg?branch=master)](https://travis-ci.com/pirovc/ganon) [![codecov](https://codecov.io/gh/pirovc/ganon/branch/master/graph/badge.svg)](https://codecov.io/gh/pirovc/ganon) [![Anaconda-Server Badge](https://anaconda.org/bioconda/ganon/badges/downloads.svg)](https://anaconda.org/bioconda/ganon) [![Anaconda-Server Badge](https://anaconda.org/bioconda/ganon/badges/platforms.svg)](https://anaconda.org/bioconda/ganon) [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/ganon/README.html) 
[![Publication](https://img.shields.io/badge/DOI-10.1101%2F406017-blue)](https://dx.doi.org/10.1093/bioinformatics/btaa458) + +[GitHub repository](https://github.com/pirovc/ganon) ganon is designed to index large sets of genomic reference sequences and to classify reads against them efficiently. The tool uses Interleaved Bloom Filters as indices based on k-mers/minimizers. It was mainly developed, but not limited, to the metagenomics classification problem: quickly assign sequence fragments to their closest reference among thousands of references. After classification, taxonomic abundance is estimated and reported.