From 941874c30bf6a070da40eab139e2f290df2e29be Mon Sep 17 00:00:00 2001 From: pirovc <4673375+pirovc@users.noreply.github.com> Date: Wed, 10 May 2023 20:55:37 +0200 Subject: [PATCH] fixes ganon v1.6.0 (#251) * docs, fix test sets * genome_updater v0.6.2, small fixes --- .travis.yml | 2 +- docs/default_databases.md | 4 ++-- libs/genome_updater | 2 +- setup.py | 2 +- src/ganon/config.py | 2 +- tests/ganon/data/build/releases/latest/MD5SUM.txt | 2 ++ .../ar53_taxonomy.tsv.gz} | Bin .../bac120_taxonomy.tsv.gz} | Bin .../data/build/releases/release207/207.0/MD5SUM | 2 -- tests/ganon/data/download_test_set_build.sh | 8 ++++---- 10 files changed, 12 insertions(+), 12 deletions(-) create mode 100644 tests/ganon/data/build/releases/latest/MD5SUM.txt rename tests/ganon/data/build/releases/{release207/207.0/ar53_taxonomy_r207.tsv.gz => latest/ar53_taxonomy.tsv.gz} (100%) rename tests/ganon/data/build/releases/{release207/207.0/bac120_taxonomy_r207.tsv.gz => latest/bac120_taxonomy.tsv.gz} (100%) delete mode 100644 tests/ganon/data/build/releases/release207/207.0/MD5SUM diff --git a/.travis.yml b/.travis.yml index b1cff5be..85878890 100755 --- a/.travis.yml +++ b/.travis.yml @@ -31,7 +31,7 @@ before_install: - eval "${MATRIX_EVAL}" - python3 -m pip install --upgrade pip - python3 -m pip install "pandas>=1.1.0" - - python3 -m pip install "multitax>=1.2.1" + - python3 -m pip install "multitax>=1.3.1" - if [ "$BUILD_TYPE" == "Coverage" ]; then python3 -m pip install coverage; fi diff --git a/docs/default_databases.md b/docs/default_databases.md index a5b2b3e9..b2ec6518 100644 --- a/docs/default_databases.md +++ b/docs/default_databases.md @@ -41,8 +41,8 @@ NCBI RefSeq and GenBank repositories are common resources to obtain reference se |---|---|---|---| | Complete | 1595845 | |
cmd`ganon build --source genbank --organism-group archaea bacteria fungi viral --threads 48 --db-prefix abfv_gb`
| | One assembly per species | 99505 | 91 - 420 |
cmd`ganon build --source genbank --organism-group archaea bacteria fungi viral --threads 48 --genome-updater "-A 'species:1'" --db-prefix abfv_gb_t1s`
| -| Complete genomes (higher quality) | 92917 | 24 - |
cmd`ganon build --source genbank --organism-group archaea bacteria fungi viral --threads 48 --complete-genomes --db-prefix abfv_gb_cg`
| -| One assembly per species of complete genomes | 34497 | 10 - |
cmd`ganon build --source genbank --organism-group archaea bacteria fungi viral --threads 48 --complete-genomes "-A 'species:1'" --db-prefix abfv_gb_cg_t1s`
| +| Complete genomes (higher quality) | 92917 | 24 - 132 |
cmd`ganon build --source genbank --organism-group archaea bacteria fungi viral --threads 48 --complete-genomes --db-prefix abfv_gb_cg`
| +| One assembly per species of complete genomes | 34497 | 10 - 34 |
cmd`ganon build --source genbank --organism-group archaea bacteria fungi viral --threads 48 --complete-genomes --genome-updater "-A 'species:1'" --db-prefix abfv_gb_cg_t1s`
| \* Size (GB) is the final size of the database and the approximate amount of RAM necessary to build it (calculated with default parameters). The two values represent databases built with and without the `--hibf` parameter, respectively. The trade-offs between those two modes are explained [here](#hibf). diff --git a/libs/genome_updater b/libs/genome_updater index 077e4aee..56f610b3 160000 --- a/libs/genome_updater +++ b/libs/genome_updater @@ -1 +1 @@ -Subproject commit 077e4aee4dc9e2019477e4a8bbd41227d52f92db +Subproject commit 56f610b3b4d7a280e418809370212c605c8ff17f diff --git a/setup.py b/setup.py index c5d7d15e..e2b63570 100755 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ def read(filename): url="https://www.github.com/pirovc/ganon", license='MIT', author="Vitor C. Piro", - description="ganon classifies short DNA sequences against large sets of genomic reference sequences efficiently", + description="ganon classifies DNA sequences against large sets of genomic reference sequences efficiently", long_description=read("README.md"), package_dir={'': 'src'}, packages=["ganon"], diff --git a/src/ganon/config.py b/src/ganon/config.py index 822832d2..c0c974ab 100644 --- a/src/ganon/config.py +++ b/src/ganon/config.py @@ -397,7 +397,7 @@ def validate(self): elif check_file(db_prefix + ".ibf"): ibf = True else: - print_log("File not found: " + prefix + ".ibf/.hibf" ) + print_log("File not found: " + db_prefix + ".ibf/.hibf" ) return False if check_file(db_prefix + ".tax"): diff --git a/tests/ganon/data/build/releases/latest/MD5SUM.txt b/tests/ganon/data/build/releases/latest/MD5SUM.txt new file mode 100644 index 00000000..4d2dd789 --- /dev/null +++ b/tests/ganon/data/build/releases/latest/MD5SUM.txt @@ -0,0 +1,2 @@ +07b534765d6b7d3e4d8bf67f549a5d66 build/releases/latest/ar53_taxonomy.tsv.gz +70a673d332f60af1cf68e34d09a56816 build/releases/latest/bac120_taxonomy.tsv.gz diff --git a/tests/ganon/data/build/releases/release207/207.0/ar53_taxonomy_r207.tsv.gz 
b/tests/ganon/data/build/releases/latest/ar53_taxonomy.tsv.gz similarity index 100% rename from tests/ganon/data/build/releases/release207/207.0/ar53_taxonomy_r207.tsv.gz rename to tests/ganon/data/build/releases/latest/ar53_taxonomy.tsv.gz diff --git a/tests/ganon/data/build/releases/release207/207.0/bac120_taxonomy_r207.tsv.gz b/tests/ganon/data/build/releases/latest/bac120_taxonomy.tsv.gz similarity index 100% rename from tests/ganon/data/build/releases/release207/207.0/bac120_taxonomy_r207.tsv.gz rename to tests/ganon/data/build/releases/latest/bac120_taxonomy.tsv.gz diff --git a/tests/ganon/data/build/releases/release207/207.0/MD5SUM b/tests/ganon/data/build/releases/release207/207.0/MD5SUM deleted file mode 100644 index 7976640f..00000000 --- a/tests/ganon/data/build/releases/release207/207.0/MD5SUM +++ /dev/null @@ -1,2 +0,0 @@ -07b534765d6b7d3e4d8bf67f549a5d66 build/releases/release207/207.0/ar53_taxonomy_r207.tsv.gz -70a673d332f60af1cf68e34d09a56816 build/releases/release207/207.0/bac120_taxonomy_r207.tsv.gz diff --git a/tests/ganon/data/download_test_set_build.sh b/tests/ganon/data/download_test_set_build.sh index dcd3cd57..71c165d1 100755 --- a/tests/ganon/data/download_test_set_build.sh +++ b/tests/ganon/data/download_test_set_build.sh @@ -61,14 +61,14 @@ md5sum "${outfld}pub/taxonomy/new_taxdump/new_taxdump.tar.gz" > "${outfld}pub/ta rm "${outfld}new_taxdump.tar.gz" "${outfld}taxidlineage.dmp" "${outfld}rankedlineage.dmp" "${outfld}pub/taxonomy/new_taxdump/taxidlineage.dmp" "${outfld}pub/taxonomy/new_taxdump/rankedlineage.dmp" #gtdb -gtdb_out="${outfld}releases/release207/207.0/" +gtdb_out="${outfld}releases/latest/" mkdir -p "${gtdb_out}" -gtdb_tax=( "ar53_taxonomy_r207.tsv.gz" "bac120_taxonomy_r207.tsv.gz" ) +gtdb_tax=( "ar53_taxonomy.tsv.gz" "bac120_taxonomy.tsv.gz" ) for tax in "${gtdb_tax[@]}"; do - wget --quiet --show-progress --output-document "${outfld}${tax}" "https://data.gtdb.ecogenomic.org/releases/release207/207.0/${tax}" + wget --quiet 
--show-progress --output-document "${outfld}${tax}" "https://data.gtdb.ecogenomic.org/releases/latest/${tax}" join -1 1 -2 1 <(cut -f 1 "${outfld}accessions_taxids.txt" | sort) <(zcat "${outfld}${tax}" | awk 'BEGIN{FS=OFS="\t"}{print $1,$1,$2}' | sed -r 's/^.{3}//' | sort) -t$'\t' -o "2.2,2.3" | gzip > "${gtdb_out}${tax}" rm "${outfld}${tax}" done -md5sum ${gtdb_out}*.tsv.gz > "${gtdb_out}MD5SUM" +md5sum ${gtdb_out}*.tsv.gz > "${gtdb_out}MD5SUM.txt" rm ${outfld}accessions_taxids.txt