diff --git a/.conda/environment.yaml b/.conda/environment.yaml index 0e458b2026..26df1c59a7 100644 --- a/.conda/environment.yaml +++ b/.conda/environment.yaml @@ -31,3 +31,4 @@ dependencies: - r-magrittr - bioconductor-qvalue - fastani +- meme diff --git a/.github/workflows/daily-component-tests-and-migrations.yaml b/.github/workflows/daily-component-tests-and-migrations.yaml index d39362690e..61e780e8f0 100644 --- a/.github/workflows/daily-component-tests-and-migrations.yaml +++ b/.github/workflows/daily-component-tests-and-migrations.yaml @@ -37,10 +37,13 @@ jobs: anvi-self-test --suite metagenomics-full --no-interactive anvi-self-test --suite pangenomics --no-interactive anvi-self-test --suite inversions --no-interactive - anvi-self-test --suite metabolism --no-interactive # the following steps cause our actions to fail on GitHub runners # due to space limitations :/ please do not uncomment this until we # have a solution for this :/ + #- name: "Run component tests for metabolism framework" + # shell: bash -l {0} + # run: | + # anvi-self-test --suite metabolism --no-interactive #- name: "Migrate ancient anvi'o databases" # shell: bash -l {0} # run: | diff --git a/Dockerfiles/anvio-structure/Dockerfile b/Dockerfiles/anvio-structure/Dockerfile index cf0372f676..fce22f6b6e 100644 --- a/Dockerfiles/anvio-structure/Dockerfile +++ b/Dockerfiles/anvio-structure/Dockerfile @@ -72,7 +72,7 @@ RUN rm anvio-7.1.tar.gz # Setup anvi'o databases ############################################################## RUN anvi-setup-interacdome -RUN anvi-setup-kegg-kofams --kegg-snapshot v2020-12-23 +RUN anvi-setup-kegg-data --kegg-snapshot v2020-12-23 RUN anvi-setup-pfams --pfam-version 33.1 RUN anvi-setup-ncbi-cogs --cog-version COG20 diff --git a/anvio/__init__.py b/anvio/__init__.py index 312bc622c6..e478d64bca 100644 --- a/anvio/__init__.py +++ b/anvio/__init__.py @@ -1044,27 +1044,26 @@ def TABULATE(table, header, numalign="right", max_width=0): "you will not have the most 
up-to-date version of KEGG for your annotations, metabolism " "estimations, or any other downstream uses of this data. If that is going to be a problem for you, " "do not fear - you can provide this flag to tell anvi'o to download the latest, freshest data directly " - "from KEGG's REST API and set it up into an anvi'o-compatible database."} + "from KEGG's REST API and set it up into anvi'o-compatible files."} ), 'only-download': ( ['--only-download'], {'default': False, 'action': 'store_true', 'help': "You want this program to only download data from KEGG, and then stop. It will not " - "make a modules database. (It would be a *very* good idea for you to specify a " - "data directory using --kegg-data-dir in this case, so that you can find the resulting " - "data easily and avoid messing up any data in the default KEGG directory. But you are " - "of course free to do whatever you want.). Note that KOfam profiles will still be " - "processed with `hmmpress` if you choose this option."} + "process the data (ie, into organized HMMs or a modules database). (It would be a " + "*very* good idea for you to specify a data directory using --kegg-data-dir in this " + "case, so that you can find the resulting data easily and avoid messing up any data " + "in the default KEGG directory. But you are of course free to do whatever you want.)"} ), - 'only-database': ( - ['--only-database'], + 'only-processing': ( + ['--only-processing'], {'default': False, 'action': 'store_true', - 'help': "You already have all the KEGG data you need on your computer. Perhaps you even got it from " + 'help': "You already have all the KEGG data you need on your computer. Probably you even got it from " "this program, using the --only-download option. We don't know. What matters is that you don't " - "need anything downloaded, you just want this program to setup a modules database from that " - "existing data. Good. 
We can do that if you provide this flag (and probably also the --kegg-data-dir " + "need anything downloaded, you just want this program to process that " + "existing data. Good. We can do that if you provide this flag (and hopefully also the --kegg-data-dir " "in which said data is located)."} ), 'kegg-snapshot': ( @@ -1072,9 +1071,10 @@ def TABULATE(table, header, numalign="right", max_width=0): {'default': None, 'type': str, 'metavar': 'RELEASE_NUM', - 'help': "If you are particularly interested in an earlier snapshot of KEGG that anvi'o knows about, you can set it here. " - "Otherwise anvi'o will always use the latest snapshot it knows about, which is likely to be the one associated with " - "the current release of anvi'o."} + 'help': "The default behavior of this program is to download a pre-processed snapshot of data " + "from KEGG. If you are particularly interested in an earlier snapshot of KEGG that anvi'o " + "knows about, you can set it here. Otherwise anvi'o will always use the latest snapshot " + "it knows about, which is likely to be the one associated with the current release of anvi'o."} ), 'hide-outlier-SNVs': ( ['--hide-outlier-SNVs'], diff --git a/anvio/biochemistry/reactionnetwork.py b/anvio/biochemistry/reactionnetwork.py index 4b13409305..d4b810ea58 100644 --- a/anvio/biochemistry/reactionnetwork.py +++ b/anvio/biochemistry/reactionnetwork.py @@ -1076,7 +1076,8 @@ class KODatabase: Unless an alternative directory is provided, the database is downloaded and set up in a default anvi'o data directory, and loaded from this directory in network construction. 
""" - default_dir = os.path.join(os.path.dirname(ANVIO_PATH), 'data/MISC/REACTION_NETWORK/KO') + default_dir = os.path.join(os.path.dirname(ANVIO_PATH), 'data/MISC/KEGG/KO_REACTION_NETWORK') + expected_files = ['ko_info.txt', 'ko_data.tsv'] def __init__(self, ko_dir: str = None) -> None: """ @@ -1093,19 +1094,17 @@ def __init__(self, ko_dir: str = None) -> None: raise ConfigError(f"There is no such directory, '{ko_dir}'.") else: ko_dir = self.default_dir - info_path = os.path.join(ko_dir, 'ko_info.txt') - if not os.path.isfile(info_path): - raise ConfigError(f"No required file named 'ko_info.txt' was found in the KO directory, '{ko_dir}'.") - table_path = os.path.join(ko_dir, 'ko_data.tsv') - if not os.path.isfile(table_path): - raise ConfigError(f"No required file named 'ko_data.tsv' was found in the KO directory, '{ko_dir}'.") - f = open(info_path) + for expected_file in self.expected_files: + if not os.path.isfile(os.path.join(ko_dir, expected_file)): + raise ConfigError(f"No required file named '{expected_file}' was found in the KO directory, '{ko_dir}'.") + + f = open(os.path.join(ko_dir, 'ko_info.txt')) f.readline() self.release = ' '.join(f.readline().strip().split()[1:]) f.close() - self.ko_table = pd.read_csv(table_path, sep='\t', header=0, index_col=0, low_memory=False) + self.ko_table = pd.read_csv(os.path.join(ko_dir, 'ko_data.tsv'), sep='\t', header=0, index_col=0, low_memory=False) def set_up( num_threads: int = 1, @@ -1124,12 +1123,13 @@ def set_up( Number of threads to use in parallelizing the download of KO files. dir : str, None - Directory in which to create a new subdirectory called 'KO', in which files are - downloaded and set up. This argument overrides the default directory. + Directory in which to create a subdirectory called `KO_REACTION_NETWORK`, + in which files are downloaded and set up. This argument overrides + the default directory. reset : bool, False - If True, remove any existing 'KO' database directory and the files therein. 
If False, - an exception is raised if there are files in this directory. + If True, remove any existing 'KO_REACTION_NETWORK' database directory and the files + therein. If False, an exception is raised if there are files in this directory. run : anvio.terminal.Run, None @@ -1137,9 +1137,10 @@ def set_up( """ if dir: if os.path.isdir(dir): - ko_dir = os.path.join(dir, 'KO') + ko_dir = os.path.join(dir, 'KO_REACTION_NETWORK') else: - raise ConfigError(f"There is no such directory, '{dir}'.") + raise ConfigError(f"There is no such directory, '{dir}'. You should create it " + "first if you want to use it.") else: ko_dir = KODatabase.default_dir parent_dir = os.path.dirname(ko_dir) @@ -1242,7 +1243,7 @@ def set_up( "from the KO database. Anvi'o will now attempt to redownload all of the files. " ) run.info(f"Total number of KOs/entry files", total) - run.info("KEGG database version", release_after) + run.info("KEGG KO database version", release_after) run.info("KEGG KO list", list_path) run.info("KEGG KO info", info_path) @@ -1264,7 +1265,7 @@ def set_up( section = line.split()[0] if section == 'NAME': # The name value follows 'NAME' at the beginning of the line. - ko_data['name'] = line[4:].lstrip().rstrip() + ko_data['name'] = line[4:].strip() # EC numbers associated with the KO are recorded at the end of the name value. 
ec_string = re.search('\[EC:.*\]', line) if ec_string: diff --git a/anvio/biochemistry/refdbs.py b/anvio/biochemistry/refdbs.py index 81f033a0a0..b3c98254ed 100644 --- a/anvio/biochemistry/refdbs.py +++ b/anvio/biochemistry/refdbs.py @@ -91,6 +91,8 @@ def raise_missing_files(self, missing: List[str]) -> None: ) def _set_up_db_dir(self, reset: bool) -> None: + if os.path.split(self.db_dir)[0] == self.default_superdir and not os.path.exists(self.default_superdir): + os.mkdir(self.default_superdir) if os.path.exists(self.db_dir): if reset: rmtree(self.db_dir) diff --git a/anvio/data/misc/KEGG-SNAPSHOTS.yaml b/anvio/data/misc/KEGG-SNAPSHOTS.yaml index f0ce9f20ca..601d581b68 100644 --- a/anvio/data/misc/KEGG-SNAPSHOTS.yaml +++ b/anvio/data/misc/KEGG-SNAPSHOTS.yaml @@ -6,60 +6,77 @@ v2020-04-27: archive_name: KEGG_build_2020-04-27_b893b7b915cb.tar.gz hash: b893b7b915cb modules_db_version: 1 + no_modeling_data: True v2020-06-23: url: https://ndownloader.figshare.com/files/23701919 archive_name: KEGG_build_2020-06-23_4a75508b48aa.tar.gz hash: 4a75508b48aa modules_db_version: 2 + no_modeling_data: True v2020-08-06: url: https://ndownloader.figshare.com/files/25464530 archive_name: KEGG_build_2020-08-06_8f88ef165f4c.tar.gz hash: 8f88ef165f4c modules_db_version: 2 + no_modeling_data: True v2020-12-23: url: https://ndownloader.figshare.com/files/25878342 archive_name: KEGG_build_2020-12-23_45b7cc2e4fdc.tar.gz hash: 45b7cc2e4fdc modules_db_version: 2 + no_modeling_data: True v2021-12-18: url: https://figshare.com/ndownloader/files/31959416 archive_name: KEGG_build_2021-12-18_58937b64c44c.tar.gz hash: 58937b64c44c modules_db_version: 3 + no_modeling_data: True v2022-04-14: url: https://figshare.com/ndownloader/files/34817812 archive_name: KEGG_build_2022-04-14_666feeac5de2.tar.gz hash: 666feeac5de2 modules_db_version: 4 + no_modeling_data: True v2023-01-10: url: https://figshare.com/ndownloader/files/38799687 archive_name: KEGG_build_2023-01-10_d20a0dcd2128.tar.gz hash: 
d20a0dcd2128 modules_db_version: 4 + no_modeling_data: True v2023-09-18: url: https://figshare.com/ndownloader/files/42381873 archive_name: KEGG_build_2023-09-18_a2b5bde358bb.tar.gz hash: a2b5bde358bb modules_db_version: 4 + no_modeling_data: True + +v2023-09-22: + url: https://figshare.com/ndownloader/files/42428115 + archive_name: KEGG_build_2023-09-22_a2b5bde358bb.tar.gz + hash: a2b5bde358bb + modules_db_version: 4 # How to add a new KEGG snapshot to this file: # 1. download the latest data directly from KEGG by running -# `anvi-setup-kegg-kofams -D --kegg-data-dir ./KEGG` +# `anvi-setup-kegg-data -D --kegg-data-dir ./KEGG -T 5` # 2. get the hash value and version info from the MODULES.db: # `anvi-db-info ./KEGG/MODULES.db` # 3. archive that directory: # `tar -czvf KEGG_build_YYYY-MM-DD_HASH.tar.gz ./KEGG` -# Please remember to replace YYYY-MM-DD with the current date and replace HASH with the MODULES.db hash value obtained in step 2 +# Please remember to replace YYYY-MM-DD with the current date and replace HASH with the +# MODULES.db hash value obtained in step 2 # 4. Test that setup works with this archive by running -# `anvi-setup-kegg-kofams --kegg-archive KEGG_build_YYYY-MM-DD_HASH.tar.gz --kegg-data-dir TEST_NEW_KEGG_ARCHIVE` +# `anvi-setup-kegg-data --kegg-archive KEGG_build_YYYY-MM-DD_HASH.tar.gz --kegg-data-dir TEST_NEW_KEGG_ARCHIVE` # 5. Upload the .tar.gz archive to figshare and get the download url -# 6. Finally, add an entry to the bottom of this file with the url, archive name, and MODULES.db hash and version. You should also update the -# default self.target_snapshot variable in kegg.py to point to this latest version that you have added. -# 7. Test it by running `anvi-setup-kegg-kofams --kegg-data-dir TEST_NEW_KEGG`, and if it works you are done :) +# 6. Finally, add an entry to the bottom of this file with the url, archive name, and MODULES.db hash and version. 
+# You should also update the default self.target_snapshot variable in kegg.py to point to this +# latest version that you have added. +# 7. Test it by running `anvi-setup-kegg-data --kegg-data-dir TEST_NEW_KEGG` (you don't need to run the full thing, +# just long enough to see that the correct snapshot is being downloaded), and if it works you are done :) diff --git a/anvio/data/misc/PEOPLE/DEVELOPERS.yaml b/anvio/data/misc/PEOPLE/DEVELOPERS.yaml index d6c4edd8a1..c3a770ef61 100644 --- a/anvio/data/misc/PEOPLE/DEVELOPERS.yaml +++ b/anvio/data/misc/PEOPLE/DEVELOPERS.yaml @@ -7,7 +7,7 @@ linkedin: meren orcid: 0000-0001-9013-4827 skype: a.murat.eren - bio: "Computer scientist and microbial ecologist interested in undersatnding mechanisms by which microbes interact with their surroundings, evolve, disperse, and respond to environmental change." + bio: "Computer scientist and microbial ecologist interested in understanding mechanisms by which microbes interact with their surroundings, evolve, disperse, and respond to environmental change." affiliations: - title: Professor inst: Helmholtz Institute for Functional Marine Biodiversity at Oldenburg diff --git a/anvio/docs/artifacts/anvi-reaction-network.md b/anvio/docs/artifacts/anvi-reaction-network.md index d37ff0eddf..6d2dbc5d53 100644 --- a/anvio/docs/artifacts/anvi-reaction-network.md +++ b/anvio/docs/artifacts/anvi-reaction-network.md @@ -1,3 +1,3 @@ This program **generates a metabolic reaction network in a %(contigs-db)s.** Gene %(functions)s that have been annotated in the %(contigs-db)s are compared to reference databases, yielding predictions of the biochemical reactions that may be catalyzed by the gene products. Possible applications of anvi'o metabolic networks include the export of draft metabolic models (see %(anvi-get-metabolic-model-file)s) and the import and integration of metabolomic datasets. 
-A network can currently be generated from KEGG Orthology (KO) annotations of genes in conjunction with %(reaction-ref-data)s: KEGG ([KO](https://www.genome.jp/kegg/ko.html), [REACTION](https://www.genome.jp/kegg/reaction/), and [COMPOUND](https://www.genome.jp/kegg/compound/)) databases and the [ModelSEED Biochemistry](https://github.com/ModelSEED/ModelSEEDDatabase) database. The reference databases must have been downloaded and set up by %(anvi-setup-protein-reference-database)s. +A network can currently be generated from KEGG Orthology (KO) annotations of genes in conjunction with %(reaction-ref-data)s: KEGG ([KO](https://www.genome.jp/kegg/ko.html), [REACTION](https://www.genome.jp/kegg/reaction/), and [COMPOUND](https://www.genome.jp/kegg/compound/)) databases and the [ModelSEED Biochemistry](https://github.com/ModelSEED/ModelSEEDDatabase) database. The reference databases must have been downloaded and set up by %(anvi-setup-kegg-data)s and %(anvi-setup-modelseed-database)s. diff --git a/anvio/docs/artifacts/kegg-data.md b/anvio/docs/artifacts/kegg-data.md index b2ef0b5b67..b240340f58 100644 --- a/anvio/docs/artifacts/kegg-data.md +++ b/anvio/docs/artifacts/kegg-data.md @@ -1,16 +1,16 @@ A **directory of data** downloaded from the [KEGG database resource](https://www.kegg.jp/) for use in function annotation and metabolism estimation. -It is created by running the program %(anvi-setup-kegg-kofams)s. Not everything from KEGG is included in this directory, only the information relevant to downstream programs. The most critical components of this directory are KOfam HMM profiles and the %(modules-db)s which contains information on metabolic pathways as described in the [KEGG MODULES resource](https://www.genome.jp/kegg/module.html), as well as functional classification hierarchies from [KEGG BRITE](https://www.genome.jp/kegg/brite.html). +It is created by running the program %(anvi-setup-kegg-data)s. 
Not everything from KEGG is included in this directory, only the information relevant to downstream programs. The most critical components of this directory are KOfam HMM profiles and the %(modules-db)s which contains information on metabolic pathways as described in the [KEGG MODULES resource](https://www.genome.jp/kegg/module.html), as well as functional classification hierarchies from [KEGG BRITE](https://www.genome.jp/kegg/brite.html). Programs that rely on this data directory include %(anvi-run-kegg-kofams)s and %(anvi-estimate-metabolism)s. ## Directory Location The default location of this data is in the anvi'o folder, at `anvio/anvio/data/misc/KEGG/`. -You can change this location when you run %(anvi-setup-kegg-kofams)s by providing a different path to the `--kegg-data-dir` parameter: +You can change this location when you run %(anvi-setup-kegg-data)s by providing a different path to the `--kegg-data-dir` parameter: {{ codestart }} -anvi-setup-kegg-kofams --kegg-data-dir /path/to/directory/KEGG +anvi-setup-kegg-data --kegg-data-dir /path/to/directory/KEGG {{ codestop }} If you do this, you will need to provide this path to downstream programs that require this data as well. diff --git a/anvio/docs/artifacts/modules-db.md b/anvio/docs/artifacts/modules-db.md index d5654d90fd..8863e1b053 100644 --- a/anvio/docs/artifacts/modules-db.md +++ b/anvio/docs/artifacts/modules-db.md @@ -1,6 +1,6 @@ A type of database containing information from either A) the [KEGG MODULE database](https://www.genome.jp/kegg/module.html) and [KEGG BRITE database](https://www.genome.jp/kegg/brite.html), or B) user-defined metabolic modules, for use in metabolism estimation and/or functional annotation of KEGG Orthologs (KOs). -These databases are part of the %(kegg-data)s and %(user-modules-data)s directories. You can get one on your computer by running %(anvi-setup-kegg-kofams)s or %(anvi-setup-user-modules)s. 
Programs that rely on this type of database include %(anvi-run-kegg-kofams)s and %(anvi-estimate-metabolism)s. +These databases are part of the %(kegg-data)s and %(user-modules-data)s directories. You can get one on your computer by running %(anvi-setup-kegg-data)s or %(anvi-setup-user-modules)s. Programs that rely on this type of database include %(anvi-run-kegg-kofams)s and %(anvi-estimate-metabolism)s. Most users will never have to interact directly with this kind of database. However, for the brave few who want to try this (or who are figuring out how anvi'o works under the hood), there is some relevant information below. @@ -19,7 +19,7 @@ In the current implementation, data about each metabolic pathway (from the KEGG | M00001 | ORTHOLOGY | K12407 | hexokinase/glucokinase [EC:2.7.1.1 2.7.1.2] [RN:R01786] | 4 | | (...) | (...) | (...) | (...) | (...) | -For the MODULES.db that comes out of %(anvi-setup-kegg-kofams)s, these data correspond to the information that can be found on the KEGG website for each metabolic module - for an example, you can see the page for [M00001](https://www.genome.jp/dbget-bin/www_bget?md:M00001) (or, alternatively, its [flat text file version](http://rest.kegg.jp/get/M00001) from the KEGG REST API). +For the MODULES.db that comes out of %(anvi-setup-kegg-data)s, these data correspond to the information that can be found on the KEGG website for each metabolic module - for an example, you can see the page for [M00001](https://www.genome.jp/dbget-bin/www_bget?md:M00001) (or, alternatively, its [flat text file version](http://rest.kegg.jp/get/M00001) from the KEGG REST API). The USER_MODULES.db that comes out of %(anvi-setup-user-modules)s contains similar information, but defined by the user instead of downloaded from the KEGG website. 
@@ -31,7 +31,7 @@ Finally, some rows of data originate from the same line in the original KEGG MOD ### The BRITE hierarchies table -In database version 4 or later, there is the option to include KEGG BRITE data in the modules database when setting one up using %(anvi-setup-kegg-kofams)s. If this is done, the database will include a table called `brite_hierarchies` which stores the set of functional hierarchies that each KEGG Ortholog belongs to. It will look like this: +In database version 4 or later, there is the option to include KEGG BRITE data in the modules database when setting one up using %(anvi-setup-kegg-data)s. If this is done, the database will include a table called `brite_hierarchies` which stores the set of functional hierarchies that each KEGG Ortholog belongs to. It will look like this: |**hierarchy_accession**|**hierarchy_name**|**ortholog_accession**|**ortholog_name**|**categorization**| |:--|:--|:--|:--|:--| @@ -81,7 +81,7 @@ modules_db_hash ..............................: 45b7cc2e4fdc ### Other important values in the self table -The `data_source` key will tell you if the current database was generated from KEGG data using %(anvi-setup-kegg-kofams)s or from user-defined metabolic modules using %(anvi-setup-user-modules)s. +The `data_source` key will tell you if the current database was generated from KEGG data using %(anvi-setup-kegg-data)s or from user-defined metabolic modules using %(anvi-setup-user-modules)s. The `annotation_sources` key will list the functional annotation sources that are required to annotate all enzymes found in the module definitions. 
diff --git a/anvio/docs/artifacts/reaction-ref-data.md b/anvio/docs/artifacts/reaction-ref-data.md index 8f9fe48591..9b6ccf67fa 100644 --- a/anvio/docs/artifacts/reaction-ref-data.md +++ b/anvio/docs/artifacts/reaction-ref-data.md @@ -1,3 +1,5 @@ -Reference databases required for %(anvi-reaction-network)s are stored in **directories of downloaded files set up by %(anvi-setup-protein-reference-database)s**. +Reference databases required for %(anvi-reaction-network)s are stored in **directories of downloaded files set up by %(anvi-setup-modelseed-database)s and %(anvi-setup-kegg-data)s**. -%(anvi-reaction-network)s currently relies upon comparison of KEGG Orthology (KO) gene annotations (%(kegg-functions)s) stored in a %(contigs-db) to reference databases: KEGG [KO](https://www.genome.jp/kegg/ko.html), [REACTION](https://www.genome.jp/kegg/reaction/), and [COMPOUND](https://www.genome.jp/kegg/compound/) and [ModelSEED Biochemistry](https://github.com/ModelSEED/ModelSEEDDatabase). The ModelSEED Biochemistry database harmonizes and consolidates reference data from multiple sources, including KEGG, in two comprehensive tables of reactions and compounds. +%(anvi-reaction-network)s currently relies upon comparison of KEGG Orthology (KO) gene annotations (%(kegg-functions)s) stored in a %(contigs-db)s to reference databases: KEGG [KO](https://www.genome.jp/kegg/ko.html) and [ModelSEED Biochemistry](https://github.com/ModelSEED/ModelSEEDDatabase). The ModelSEED Biochemistry database harmonizes and consolidates reference data from multiple sources, including KEGG, in two comprehensive tables of reactions and compounds. + +The KEGG databases (%(kegg-data)s) can be obtained by running %(anvi-setup-kegg-data)s, and the ModelSEED database can be obtained by running %(anvi-setup-modelseed-database)s. 
diff --git a/anvio/docs/programs/anvi-estimate-metabolism.md b/anvio/docs/programs/anvi-estimate-metabolism.md index 81e13056c2..ac688191da 100644 --- a/anvio/docs/programs/anvi-estimate-metabolism.md +++ b/anvio/docs/programs/anvi-estimate-metabolism.md @@ -12,7 +12,7 @@ For a practical tutorial on how to use this program, visit [this link](https://m You have three options when it comes to estimating metabolism. -1. KEGG only (this is the default). In this case, estimation will be run on modules from the KEGG MODULES database, which you must set up on your computer using %(anvi-setup-kegg-kofams)s. If you have a default setup of KEGG, you need not provide any parameters to choose this option. However, if you have your KEGG data in a non-default location on your computer, you will have to use the `--kegg-data-dir` parameter to point out its location. +1. KEGG only (this is the default). In this case, estimation will be run on modules from the KEGG MODULES database, which you must set up on your computer using %(anvi-setup-kegg-data)s. If you have a default setup of KEGG, you need not provide any parameters to choose this option. However, if you have your KEGG data in a non-default location on your computer, you will have to use the `--kegg-data-dir` parameter to point out its location. 2. KEGG + USER data. In this case, we estimate on KEGG modules as in (1), but _also_ on user-defined metabolic modules that you set up with %(anvi-setup-user-modules)s and provide to this program with the `--user-modules` parameter. 3. USER data only. You can elect to skip estimation on KEGG modules and _only_ run on your own data by providing both the `--user-modules` and `--only-user-modules` parameters. @@ -20,9 +20,9 @@ You have three options when it comes to estimating metabolism. Metabolism estimation relies on gene annotations from the functional annotation source 'KOfam', also referred to as %(kegg-functions)s. 
Therefore, for this to work, you need to have annotated your %(contigs-db)s with hits to the KEGG KOfam database by running %(anvi-run-kegg-kofams)s prior to using this program, unless you are using the `--only-user-modules` option to ONLY estimate on user-defined metabolic modules. -Both %(anvi-run-kegg-kofams)s and %(anvi-estimate-metabolism)s rely on the %(kegg-data)s provided by %(anvi-setup-kegg-kofams)s, so if you do not already have that data on your computer, %(anvi-setup-kegg-kofams)s needs to be run first. To summarize, these are the steps that need to be done before you can use %(anvi-estimate-metabolism)s: +Both %(anvi-run-kegg-kofams)s and %(anvi-estimate-metabolism)s rely on the %(kegg-data)s provided by %(anvi-setup-kegg-data)s, so if you do not already have that data on your computer, %(anvi-setup-kegg-data)s needs to be run first. To summarize, these are the steps that need to be done before you can use %(anvi-estimate-metabolism)s: -1. Run %(anvi-setup-kegg-kofams)s to get data from KEGG onto your computer. This step only needs to be done once. +1. Run %(anvi-setup-kegg-data)s to get data from KEGG onto your computer. This step only needs to be done once. 2. [If not using `--only-user-modules`] Run %(anvi-run-kegg-kofams)s to annotate your %(contigs-db)s with %(kegg-functions)s. This program must be run on each contigs database that you want to estimate metabolism for. If you want to estimate for your own metabolism data, then you have a couple of extra steps to go through: @@ -483,7 +483,7 @@ Config Error: The contigs DB that you are working with has been annotated with a This means that the %(modules-db)s used by %(anvi-run-kegg-kofams)s has different contents (different KOs and/or different modules) than the one you are currently using to estimate metabolism, which would lead to mismatches if metabolism estimation were to continue. There are a few ways this can happen: 1. 
You upgraded to a new anvi'o version and downloaded the default %(kegg-data)s associated with that release, but are working with a %(contigs-db)s that was annotated with a previous anvi'o version (and therefore a different instance of %(kegg-data)s). -2. Without changing anvi'o versions, you annotated your %(contigs-db)s with default %(kegg-data)s, and subsequently replaced that data with a different instance by running %(anvi-setup-kegg-kofams)s again with the `--reset` flag (and likely also with the `--kegg-archive`, `--kegg-snapshot`, or `--download-from-kegg` options, all of which get you a non-default version of KEGG data). Then you tried to run %(anvi-estimate-metabolism)s with the new data. +2. Without changing anvi'o versions, you annotated your %(contigs-db)s with default %(kegg-data)s, and subsequently replaced that data with a different instance by running %(anvi-setup-kegg-data)s again with the `--reset` flag (and likely also with the `--kegg-archive`, `--kegg-snapshot`, or `--download-from-kegg` options, all of which get you a non-default version of KEGG data). Then you tried to run %(anvi-estimate-metabolism)s with the new data. 3. You have multiple instances of %(kegg-data)s on your computer in different locations, and you used different ones for %(anvi-run-kegg-kofams)s and %(anvi-estimate-metabolism)s. 4. Your collaborator gave you some databases that they annotated with a different version of %(kegg-data)s than you have on your computer. @@ -505,10 +505,10 @@ export ANVIO_KEGG_SNAPSHOTS=`python -c "import anvio; import os; print(os.path.j cat $ANVIO_KEGG_SNAPSHOTS`. {{ codestop }} -Take a look through the different versions. If you see one with a hash matching to the one used to annotate your %(contigs-db)s, then you can download that version by following [the directions for setting up a KEGG snapshot](https://anvio.org/help/main/programs/anvi-setup-kegg-kofams/#setting-up-an-earlier-kegg-snapshot). 
Provide the snapshot version name to the `--kegg-snapshot` parameter of %(anvi-setup-kegg-kofams)s. +Take a look through the different versions. If you see one with a hash matching to the one used to annotate your %(contigs-db)s, then you can download that version by following [the directions for setting up a KEGG snapshot](https://anvio.org/help/main/programs/anvi-setup-kegg-data/#setting-up-an-earlier-kegg-snapshot). Provide the snapshot version name to the `--kegg-snapshot` parameter of %(anvi-setup-kegg-data)s. **I can't find KEGG data with a matching hash!** -If you don't have a matching metabolism database on your computer, and none of the snapshots in the `KEGG-SNAPSHOTS.yaml` file have the hash that you need, your %(contigs-db)s was probably annotated with KO and module data [downloaded directly from KEGG](https://anvio.org/help/main/programs/anvi-setup-kegg-kofams/#getting-the-most-up-to-date-kegg-data-downloading-directly-from-kegg). If you have obtained the %(contigs-db)s from a collaborator (i.e., situation 4 from above), ask them to also share their %(kegg-data)s with you, following [these steps](https://anvio.org/help/main/programs/anvi-setup-kegg-kofams/#how-do-i-share-this-data). Otherwise, anvi'o cannot really help you get this data back, and you may have to resort to option 1 described above. +If you don't have a matching metabolism database on your computer, and none of the snapshots in the `KEGG-SNAPSHOTS.yaml` file have the hash that you need, your %(contigs-db)s was probably annotated with KO and module data [downloaded directly from KEGG](https://anvio.org/help/main/programs/anvi-setup-kegg-data/#getting-the-most-up-to-date-kegg-data-downloading-directly-from-kegg). If you have obtained the %(contigs-db)s from a collaborator (i.e., situation 4 from above), ask them to also share their %(kegg-data)s with you, following [these steps](https://anvio.org/help/main/programs/anvi-setup-kegg-data/#how-do-i-share-this-data). 
Otherwise, anvi'o cannot really help you get this data back, and you may have to resort to option 1 described above. If none of these solutions help you to get rid of the version incompatibility error, please feel free to reach out to the anvi'o developers for help. @@ -528,7 +528,7 @@ Regardless of which input type is provided to this program, the basic requiremen #### Module Definitions One set of metabolic pathway definitions that can be used by this program is the [KEGG MODULE resource](https://www.genome.jp/kegg/module.html). You can also define your own set of metabolic modules, but the definition format and estimation strategy will be the same. So for brevity's sake, the following discussion will cover the KEGG data case. -The program %(anvi-setup-kegg-kofams)s acquires the definitions of these modules using the KEGG API and puts them into the %(modules-db)s. The definitions are strings of KEGG Ortholog (KO) identifiers, representing the functions necessary to carry out each step of the metabolic pathway. Let's use module [M00018](https://www.genome.jp/kegg-bin/show_module?M00018), Threonine Biosynthesis, as an example. Here is the module definition, in picture form: +The program %(anvi-setup-kegg-data)s acquires the definitions of these modules using the KEGG API and puts them into the %(modules-db)s. The definitions are strings of KEGG Ortholog (KO) identifiers, representing the functions necessary to carry out each step of the metabolic pathway. Let's use module [M00018](https://www.genome.jp/kegg-bin/show_module?M00018), Threonine Biosynthesis, as an example. 
Here is the module definition, in picture form: ![Module M00018 Definition](../../images/M00018.png){:.center-img .width-50} diff --git a/anvio/docs/programs/anvi-run-kegg-kofams.md b/anvio/docs/programs/anvi-run-kegg-kofams.md index 24aa2d88f7..e5c765b790 100644 --- a/anvio/docs/programs/anvi-run-kegg-kofams.md +++ b/anvio/docs/programs/anvi-run-kegg-kofams.md @@ -1,4 +1,4 @@ -Essentially, this program uses the KEGG database to annotate functions and metabolic pathways in a %(contigs-db)s. More specifically, %(anvi-run-kegg-kofams)s annotates a %(contigs-db)s with HMM hits from KOfam, a database of KEGG Orthologs (KOs). You must set up these HMMs on your computer using %(anvi-setup-kegg-kofams)s before you can use this program. Membership of KOfam functions in KEGG metabolic MODULES and BRITE hierarchies is also stored in the %(contigs-db)s. +Essentially, this program uses the KEGG database to annotate functions and metabolic pathways in a %(contigs-db)s. More specifically, %(anvi-run-kegg-kofams)s annotates a %(contigs-db)s with HMM hits from KOfam, a database of KEGG Orthologs (KOs). You must set up these HMMs on your computer using %(anvi-setup-kegg-data)s before you can use this program. If a %(modules-db)s is available, membership of KOfam functions in KEGG metabolic MODULES and BRITE hierarchies is also stored in the %(contigs-db)s. Running this program is a pre-requisite for metabolism estimation with %(anvi-estimate-metabolism)s. Note that if you are planning to run metabolism estimation, it must be run with the same %(kegg-data)s that is used in this program to annotate KOfam hits. @@ -20,7 +20,7 @@ For every gene without a KOfam annotation, we examine all the hits with an e-val Please note that this strategy is just a heuristic. We have tried to pick default parameters that seemed reasonable but by no means have we comprehensively tested and optimized them. 
This is why X and Y are mutable so that you can explore different values and see how they work for your data. It is always a good idea to double-check your annotations to make sure they are reasonable and as stringent as you'd like them to be. In addition, if you do not feel comfortable using this heuristic at all, you can always turn this behavior off and rely solely on KEGG's bitscore thresholds. :) **3) Put annotations in the database** -In the %(contigs-db)s functions table, annotated KO hits (%(kegg-functions)s) will have the source `KOfam`. Metabolic Modules and BRITE functional classifications containing these functions also have entries in the table, with sources labeled `KEGG_Module` and `KEGG_BRITE`. BRITE classification will not occur if %(anvi-setup-kegg-kofams)s was not set up with BRITE data (see the artifact for that program to see how to include BRITE). +In the %(contigs-db)s functions table, annotated KO hits (%(kegg-functions)s) will have the source `KOfam`. If a %(modules-db)s is available, metabolic modules and BRITE functional classifications containing these functions also have entries in the table, with sources labeled `KEGG_Module` and `KEGG_BRITE`. BRITE classification will not occur if %(anvi-setup-kegg-data)s was not set up with BRITE data (see the artifact for that program to see how to include BRITE). ## Standard usage @@ -29,7 +29,7 @@ anvi-run-kegg-kofams -c %(contigs-db)s {{ codestop }} ## Use a specific non-default KEGG data directory -If you have previously setup your KEGG data directory using `--kegg-data-dir` (see %(anvi-setup-kegg-kofams)s), or have moved the KEGG data directory that you wish to use to a non-default location (maybe you like keeping the older versions around when you update, we don't know how you roll), then you may need to specify where to find the KEGG data so that this program can use the right one. 
In that case, this is how you do it: +If you have previously setup your KEGG data directory using `--kegg-data-dir` (see %(anvi-setup-kegg-data)s), or have moved the KEGG data directory that you wish to use to a non-default location (maybe you like keeping the older versions around when you update, we don't know how you roll), then you may need to specify where to find the KEGG data so that this program can use the right one. In that case, this is how you do it: {{ codestart }} anvi-run-kegg-kofams -c %(contigs-db)s \ diff --git a/anvio/docs/programs/anvi-setup-kegg-kofams.md b/anvio/docs/programs/anvi-setup-kegg-data.md similarity index 71% rename from anvio/docs/programs/anvi-setup-kegg-kofams.md rename to anvio/docs/programs/anvi-setup-kegg-data.md index be18100c2d..00b0423daf 100644 --- a/anvio/docs/programs/anvi-setup-kegg-kofams.md +++ b/anvio/docs/programs/anvi-setup-kegg-data.md @@ -1,11 +1,33 @@ -%(anvi-setup-kegg-kofams)s downloads and organizes data from KEGG for use by other programs, namely %(anvi-run-kegg-kofams)s and %(anvi-estimate-metabolism)s. It downloads HMM profiles from the [KOfam](https://academic.oup.com/bioinformatics/article/36/7/2251/5631907) database as well as the metabolism information of [KEGG MODULES](https://www.genome.jp/kegg/module.html) and the functional classification information of [KEGG BRITE](https://www.genome.jp/kegg/brite.html). The KOfam profiles are prepared for later use by the HMMER software, and the information from MODULES and BRITE is made accessible to other anvi'o programs as a %(modules-db)s. This program generates a directory with these files (%(kegg-data)s), which by default is located at `anvio/anvio/data/misc/KEGG/`. +%(anvi-setup-kegg-data)s downloads and organizes data from KEGG for use by other programs, namely %(anvi-run-kegg-kofams)s, %(anvi-estimate-metabolism)s and %(anvi-reaction-network)s. 
Depending on what download mode you choose, it can download and set up one or more of the following: + +- HMM profiles from the [KOfam](https://academic.oup.com/bioinformatics/article/36/7/2251/5631907) database +- metabolic pathway information from [KEGG MODULES](https://www.genome.jp/kegg/module.html) +- functional classification information from [KEGG BRITE](https://www.genome.jp/kegg/brite.html) +- protein family information from the [KEGG Orthology database](https://www.genome.jp/kegg/ko.html) + + Typically, some processing is done following the data download to make the data work with downstream anvi'o programs. The KOfam profiles are prepared for later use by the HMMER software, and the information from MODULES and BRITE is made accessible to other anvi'o programs as a %(modules-db)s. The Orthology data is converted into a nice table that can be utilized by %(anvi-reaction-network)s. This program generates a directory with these files (%(kegg-data)s). + +## Choosing a download mode + +You need to pick a mode to work with this program to control which data will be downloaded from KEGG. You can see the available modes by running the following command: + +{{ codestart }} +anvi-setup-kegg-data --list-modes +{{ codestop }} + +You use the `--mode` parameter to tell the program which mode you want, for example: + +{{ codestart }} +anvi-setup-kegg-data --mode modules +{{ codestop }} + ## Default usage: downloading a KEGG snapshot -If you do not provide any arguments to this program, the KOfam profiles and KEGG information will be set up in the default KEGG data directory. +If you do not provide any arguments to this program, all KEGG data (ie, `--mode all`) will be set up in the default KEGG data directory. {{ codestart }} -anvi-setup-kegg-kofams +anvi-setup-kegg-data {{ codestop }} ### How does it work? 
@@ -21,14 +43,17 @@ Doing it this way ensures that almost everyone uses the same version of KEGG dat But the trade-off to this is that the default KEGG data version is tied to an anvi'o release, and it will not always include the most up-to-date information from KEGG. Luckily, **for those who want the most updated version of KEGG, you can still use this program to generate the KEGG data directory by downloading directly from KEGG** (see 'Getting the most up-to-date KEGG data' section below). {:.warning} -BRITE hierarchy data is not included in the default KEGG snapshot for anvi'o `v7`. Starting from the `v7.1-dev` version of anvi'o, there is a new default KEGG snapshot including BRITE information. This data can also be set up by using the option to download directly from KEGG in `v7.1-dev` or later. +BRITE hierarchy data is not included in the default KEGG snapshot for anvi'o `v7`. Starting from the `v7.1-dev` version of anvi'o, there is a new default KEGG snapshot including BRITE information. If you are missing this data, it can be acquired by either installing a later snapshot or by independently downloading it with this program using `--mode modules`. + +{:.warning} +The data for metabolic modeling are not included in the KEGG snapshots created before anvi'o `v8`. If you are missing this data, it can be acquired by either installing a later snapshot or by independently downloading it with this program using `--mode modeling`. ### Set up KEGG data in a non-default location You can specify a different directory in which to put this data, if you wish: {{ codestart }} -anvi-setup-kegg-kofams --kegg-data-dir /path/to/directory/KEGG +anvi-setup-kegg-data --kegg-data-dir /path/to/directory/KEGG {{ codestop }} This is helpful if you don't have write access to the default directory location, or if you want to keep several different versions of the KEGG data on your computer. 
Just remember that when you want to use this specific KEGG data directory with later programs such as %(anvi-run-kegg-kofams)s, you will have to specify its location with the `--kegg-data-dir` flag. @@ -38,7 +63,7 @@ This is helpful if you don't have write access to the default directory location By default, the KEGG snapshot that will be installed is the latest one, which is up-to-date with your current version of anvi'o. If, however, you want a snapshot from an earlier version, you can run something like the following to get it: {{ codestart }} -anvi-setup-kegg-kofams --kegg-data-dir /path/to/directory/KEGG \ +anvi-setup-kegg-data --kegg-data-dir /path/to/directory/KEGG \ --kegg-snapshot v2020-04-27 {{ codestop }} @@ -46,7 +71,7 @@ Just keep in mind that you may need to migrate the MODULES.db from these earlier Not sure what KEGG snapshots are available for you to request? Well, you could check out the YAML file at `anvio/anvio/data/misc/KEGG-SNAPSHOTS.yaml` in your anvi'o directory, or you could just give something random to the `--kegg-snapshot` parameter and watch anvi'o freak out and tell you what is available: {{ codestart }} -anvi-setup-kegg-kofams --kegg-snapshot hahaha +anvi-setup-kegg-data --kegg-snapshot hahaha {{ codestop }} @@ -55,9 +80,11 @@ anvi-setup-kegg-kofams --kegg-snapshot hahaha This program is also capable of downloading data directly from KEGG and converting it into an anvi'o-compatible format. In fact, this is how we generate the default KEGG archive. If you want the latest KEGG data instead of the default snapshot of KEGG, try the following: {{ codestart }} -anvi-setup-kegg-kofams --download-from-kegg +anvi-setup-kegg-data --download-from-kegg {{ codestop }} +Please note that this will download all the KEGG data (ie, `--mode all` is the default). If you want to independently download individual KEGG datasets, you should pick one of the other modes (the `--download-from-kegg` flag is implicitly turned on in these modes). 
+ ### How does it work? KOfam profiles are downloadable from KEGG's [FTP site](ftp://ftp.genome.jp/pub/db/kofam/) and all other KEGG data is accessible as flat text files through their [API](https://www.kegg.jp/kegg/rest/keggapi.html). When you run this program it will first get all the files that it needs from these sources, and then it will process them by doing the following: @@ -66,54 +93,57 @@ KOfam profiles are downloadable from KEGG's [FTP site](ftp://ftp.genome.jp/pub/d - concatenate all remaining KOfam profiles into one file and run `hmmpress` on them - parse the flat text file for each KEGG module and the JSON file for each BRITE hierarchy - store the MODULE and BRITE information in the %(modules-db)s +- parse the flat text files from KEGG Orthology and organize these into a table for metabolic modeling An important thing to note about this option is that it has rigid expectations for the format of the KEGG data that it works with. Future updates to KEGG may break things such that the data can no longer be directly obtained from KEGG or properly processed. In the sad event that this happens, you will have to download KEGG from one of our archives instead. ### The --only-download option -Suppose you only want to download data from KEGG, but you don't need a %(modules-db)s - at least not right away. You can instruct this program to stop after downloading by providing the `--only-download` flag: +The `--only-download` flag works for `KOfam` mode and `modules` mode. + +Suppose you only want to download data from KEGG without processing it. For instance, perhaps you don't need a %(modules-db)s or you don't want `hmmpress` to be run on the KOfam profiles. 
You can instruct this program to stop after downloading by providing the `--only-download` flag: {{ codestart }} -anvi-setup-kegg-kofams --download-from-kegg \ +anvi-setup-kegg-data --mode modules \ --only-download \ --kegg-data-dir /path/to/directory/KEGG {{ codestop }} It's probably a good idea in this case to specify where you want this data to go using `--kegg-data-dir`, to make sure you can find it later. -Actually, in addition to downloading the data, the program will also do a bit of processing on the KOfam profiles: it will remove those without bitscore thresholds, concatenate the remaining profiles into one file, and run `hmmpress` on them. But no database will be created when this flag is used. - {:.notice} -This option is primarily useful for developers to test `anvi-setup-kegg-kofams` - for instance, so that you can download the data once and run the database setup option (`--only-database`) multiple times. However, if non-developers find another practical use-case for this flag, we'd be happy to add those ideas here. Send us a message, or feel free to edit this file and pull request your changes on the anvi'o Github repository. :) +This option is primarily useful for developers to test `anvi-setup-kegg-data` - for instance, so that you can download the data once and run the processing step (`--only-processing`) multiple times. However, if non-developers find another practical use-case for this flag, we'd be happy to add those ideas here. Send us a message, or feel free to edit this file and pull request your changes on the anvi'o GitHub repository. :) + +### The --only-processing option -### The --only-database option +The `--only-processing` flag works for `KOfam` mode and `modules` mode. -Let's say you already have KEGG data on your computer that you got by running this program with the `--only-download` flag. Now you want to turn this data into a %(modules-db)s. 
To do that, run this program using the `--only-database` flag and provide the location of the pre-downloaded KEGG data: +Let's say you already have KEGG data on your computer that you got by running this program with the `--only-download` flag. Now you want to process the HMM files, or turn the MODULES data into a %(modules-db)s. To do that, run this program using the `--only-processing` flag and provide the location of the pre-downloaded KEGG data: {{ codestart }} -anvi-setup-kegg-kofams --download-from-kegg \ - --only-database \ +anvi-setup-kegg-data --mode modules \ + --only-processing \ --kegg-data-dir /path/to/directory/KEGG {{ codestop }} {:.notice} The KEGG data that you already have on your computer has to be in the format expected by this program, or you'll run into errors. Pretty much the only reasonable way to get the data into the proper format is to run this program with the `--only-download` option. Otherwise you would have to go through a lot of manual file-changing shenanigans - possible, but not advisable. -One more note: since this flag is most often used for testing the database setup capabilities of this program, which entails running `anvi-setup-kegg-kofams -D --only-database` multiple times on the same KEGG data directory, there is an additional flag that may be useful in this context. To avoid having to manually delete the created modules database each time you run, you can use the `--overwrite-output-destinations` flag: +One more note: since this flag is most often used for testing the database setup capabilities of this program, which entails running `anvi-setup-kegg-data --mode modules --only-processing` multiple times on the same KEGG data directory, there is an additional flag that may be useful in this context. 
To avoid having to manually delete the created modules database each time you run, you can use the `--overwrite-output-destinations` flag: {{ codestart }} -anvi-setup-kegg-kofams --download-from-kegg \ - --only-database \ +anvi-setup-kegg-data --mode modules \ + --only-processing \ --kegg-data-dir /path/to/directory/KEGG \ --overwrite-output-destinations {{ codestop }} ### Avoiding BRITE setup -As of anvi'o `v7.1-dev` or later, KEGG BRITE hierarchies are added to the %(modules-db)s when running this program with the `-D` (`--download-from-kegg`) option. If you don't want this cool new feature - because you are a rebel, or adverse to change, or something is not working on your computer, whatever - then fine. You can use the `--skip-brite-hierarchies` flag: +As of anvi'o `v7.1-dev` or later, KEGG BRITE hierarchies are added to the %(modules-db)s when running this program with `--mode modules`. If you don't want this cool new feature - because you are a rebel, or averse to change, or something is not working on your computer, whatever - then fine. You can use the `--skip-brite-hierarchies` flag: {{ codestart }} -anvi-setup-kegg-kofams -D --skip-brite-hierarchies +anvi-setup-kegg-data --mode modules --skip-brite-hierarchies {{ codestop }} Hopefully it makes sense to you that this flag does not work when setting up from a KEGG snapshot that already includes BRITE data in it. @@ -121,9 +151,9 @@ Hopefully it makes sense to you that this flag does not work when setting up fro ### How do I share this data? Suppose you have been living on the edge and annotating your contigs databases with a non-default version of %(kegg-data)s, and you share these databases with a collaborator who wants to run downstream programs like %(anvi-estimate-metabolism)s on them. Your collaborator (who has a different version of %(kegg-data)s on their computer) will likely get version errors as detailed on the %(anvi-estimate-metabolism)s help page. 
-In order for your collaborator to be able to work with your dataset, they need to have the same %(kegg-data)s version as you did when you ran %(anvi-run-kegg-kofams)s. If you are very lucky and KEGG has not been updated since you set up your %(kegg-data)s, they may be able to run `anvi-setup-kegg-kofams -D` to get it. But if not, there are a few options for you to share your version of %(kegg-data)s: +In order for your collaborator to be able to work with your dataset, they need to have the same %(kegg-data)s version as you did when you ran %(anvi-run-kegg-kofams)s. If you are very lucky and KEGG has not been updated since you set up your %(kegg-data)s, they may be able to run `anvi-setup-kegg-data -D` to get it. But if not, there are a few options for you to share your version of %(kegg-data)s: -1. You could send them your KEGG data directory. First, run `tar -czvf kegg_archive.tar.gz ./KEGG` on the data directory to compress and archive it before sending it over (this command _must_ be run from its parent directory so that the archive has the expected directory structure when it is unpacked). Then your collaborator can just run `anvi-setup-kegg-kofams --kegg-archive kegg_archive.tar.gz --kegg-data-dir ./KEGG_ARCHIVE` and be good to go. They would just have to use `--kegg-data-dir ./KEGG_ARCHIVE` when running downstream programs. The problem here is that even the archived %(kegg-data)s is quite large, ~4-5GB, and may be unfeasible for you to send. +1. You could send them your KEGG data directory. First, run `tar -czvf kegg_archive.tar.gz ./KEGG` on the data directory to compress and archive it before sending it over (this command _must_ be run from its parent directory so that the archive has the expected directory structure when it is unpacked). Then your collaborator can just run `anvi-setup-kegg-data --kegg-archive kegg_archive.tar.gz --kegg-data-dir ./KEGG_ARCHIVE` and be good to go. 
They would just have to use `--kegg-data-dir ./KEGG_ARCHIVE` when running downstream programs. The problem here is that even the archived %(kegg-data)s is quite large, ~4-5GB, and may be unfeasible for you to send. 2. You could share with your collaborator just the %(modules-db)s. If all they want to do is to run %(anvi-estimate-metabolism)s on databases annotated by your version of the KEGG data directory, this should be all they need. They would need to pass the folder containing your %(modules-db)s to %(anvi-estimate-metabolism)s using the `--kegg-data-dir` parameter. 3. If your collaborator also wants to be able to annotate other databases with your version of %(kegg-data)s, then they need to have the KOfam profiles as well. You can send them your %(modules-db)s and have them download the KOfam profiles most similar to the ones you have from the [KOfam archives](https://www.genome.jp/ftp/db/kofam/archives/) (which are labeled by date). Then they would have to essentially construct their own KEGG data directory by copying the structure of the default one and putting the downloaded files (and the %(modules-db)s you sent them) into the correct locations. The KOfam profiles must be concatenated into a `Kofam.hmm` file and `hmmpress` must be run on that file to generate the required indices for `hmmsearch`. Your collaborator must also have the `ko_list.txt` file (which _should_ be downloaded with the profiles) in the right spot. Then they could pass their makeshift KEGG data directory to %(anvi-run-kegg-kofams)s using `--kegg-data-dir`, and they should be golden. (A word of warning: they may want to remove KOs without bitscore thresholds in the `ko_list.txt` before concatenating the profiles, otherwise they will likely get a lot of weak hits for these KOs.) 
@@ -132,7 +162,7 @@ In order for your collaborator to be able to work with your dataset, they need t If you have an archive (`.tar.gz`) of the KEGG data directory already on your computer (perhaps a colleague or Meren Lab developer gave you one), you can set up KEGG from this archive instead: {{ codestart }} -anvi-setup-kegg-kofams --kegg-archive KEGG_archive.tar.gz +anvi-setup-kegg-data --kegg-archive KEGG_archive.tar.gz {{ codestop }} This works the same way as the default, except that it bypasses the download step and instead uses the archive file you have provided with `--kegg-archive`. @@ -143,13 +173,13 @@ Periodically (especially before releasing a new version of anvi'o), we want to a Available KEGG snapshots are stored in the anvi'o code repository in `anvio/data/misc/KEGG-SNAPSHOTS.yaml`. To add a new snapshot, you first need to create one by downloading and processing the data from KEGG, testing to make sure it works, and then updating this file. Here are the steps: -1. Download the latest data directly from KEGG by running `anvi-setup-kegg-kofams -D --kegg-data-dir ./KEGG`. This will create the new KEGG data folder with its %(modules-db)s in your current working directory. Make sure you use the exact folder name of `./KEGG`, because that is what anvi'o expects to find when it unpacks a KEGG snapshot. +1. Download the latest data directly from KEGG by running `anvi-setup-kegg-data -D --kegg-data-dir ./KEGG -T 5`. This will create the new KEGG data folder with its %(modules-db)s in your current working directory. Make sure you use the exact folder name of `./KEGG`, because that is what anvi'o expects to find when it unpacks a KEGG snapshot. You may want to reduce or increase the number of threads (`-T`) according to your available compute resources. 2. Get the hash value and version info from the MODULES.db by running `anvi-db-info ./KEGG/MODULES.db`. 3. Archive the KEGG data directory by running `tar -czvf KEGG_build_YYYY-MM-DD_HASH.tar.gz ./KEGG`. 
Please remember to replace YYYY-MM-DD with the current date and replace HASH with the MODULES.db hash value obtained in step 2. This convention makes it easier to distinguish between KEGG snapshots by simply looking at the file name. -4. Test that setup works with this archive by running `anvi-setup-kegg-kofams --kegg-archive KEGG_build_YYYY-MM-DD_HASH.tar.gz --kegg-data-dir TEST_NEW_KEGG_ARCHIVE`. +4. Test that setup works with this archive by running `anvi-setup-kegg-data --kegg-archive KEGG_build_YYYY-MM-DD_HASH.tar.gz --kegg-data-dir TEST_NEW_KEGG_ARCHIVE`. 5. If setup worked in the last step without errors, upload the `.tar.gz` archive to [Figshare](https://figshare.com/). If you need inspiration for filling out the keywords, categories, and description fields for the archive, you can check the previous KEGG snapshots that have been uploaded - for instance, [this one](https://figshare.com/articles/dataset/KEGG_build_2023-01-10/21862494) or [this one](https://figshare.com/articles/dataset/KEGG_build_2022-04-14/19601761). At minimum, we typically indicate the database version and hash value, and an example setup command (ie, the one from step 4), in the description of the dataset. Once the archive is published on Figshare (warning: this usually takes a while due to the large file size), you can get the download url of the archive by right-clicking on the Download button and copying the address, which should be a URL with a format similar to this example (but different numbers): `https://figshare.com/ndownloader/files/34817812` 6. Add an entry to the bottom of the `anvio/data/misc/KEGG-SNAPSHOTS.yaml` file with the Figshare download URL, archive name, and MODULES.db hash and version. If you want this to become the default snapshot (which usually only changes before the next anvi'o release), you should also update the default `self.target_snapshot` variable in `anvio/kegg.py` to be this latest version that you have added. -7. 
Test it by running `anvi-setup-kegg-kofams --kegg-data-dir TEST_NEW_KEGG`, and if it works you are done, and can push your changes to the anvi'o repository. :) +7. Test it by running `anvi-setup-kegg-data --kegg-data-dir TEST_NEW_KEGG`, and if it works you are done, and can push your changes to the anvi'o repository. :) ## Downloading generic KEGG data in Python @@ -182,7 +212,7 @@ setup.download_kegg_files_from_hierarchy('br08001', download_dir='KEGG_COMPOUND' If you just want to get a KEGG `htext` file (with extension `.keg`), use the following function: ```python -etup.download_generic_htext('br08001', download_dir='KEGG_COMPOUND') +setup.download_generic_htext('br08001', download_dir='KEGG_COMPOUND') ``` ### Processing a hierarchical text file @@ -193,12 +223,13 @@ etup.download_generic_htext('br08001', download_dir='KEGG_COMPOUND') accession_list = setup.get_accessions_from_htext_file("br08001.keg") ``` - If you want to process the KEGG module `htext` file to get a dictionary of all modules and their names/classes/etc, use the following function. You will need to set the `kegg_module_file` attribute (of the KeggSetup class) to point to the location of the `modules.keg` file, and the function will store the module dictionary in the `module_dict` attribute. + If you want to process the KEGG module `htext` file to get a dictionary of all modules and their names/classes/etc, use the following code. You will need to set the `kegg_module_file` attribute (of the ModulesDownload class) to point to the location of the `modules.keg` file, and `process_module_file()` will store the module dictionary in the `module_dict` attribute. 
```python -setup.kegg_module_file = "modules.keg" -setup.process_module_file() -setup.module_dict # this attribute now stores the module dictionary +modules_setup = kegg.ModulesDownload(args) +modules_setup.kegg_module_file = "modules.keg" +modules_setup.process_module_file() +modules_setup.module_dict # this attribute now stores the module dictionary ``` ### Downloading a flat file using the KEGG API diff --git a/anvio/docs/programs/anvi-setup-user-modules.md b/anvio/docs/programs/anvi-setup-user-modules.md index ecf6971224..32b69edd90 100644 --- a/anvio/docs/programs/anvi-setup-user-modules.md +++ b/anvio/docs/programs/anvi-setup-user-modules.md @@ -74,7 +74,7 @@ Why must we format the module files this way, you ask? Well, to be honest, KEGG ### Specifying KEGG data to be used for sanity checking -If you haven't yet run %(anvi-setup-kegg-kofams)s on your computer, you will get an error when you try to run this program. This is because KEGG data can be used in addition to user-defined modules, and we need to be aware of which KEGG modules exist so we can make sure none of the user-defined modules have the same identifiers as these. +If you haven't yet run %(anvi-setup-kegg-data)s on your computer, you will get an error when you try to run this program. This is because KEGG data can be used in addition to user-defined modules, and we need to be aware of which KEGG modules exist so we can make sure none of the user-defined modules have the same identifiers as these. 
By default, this program looks for the KEGG data in the default location, so if you have set up KEGG data in a non-default directory, you should specify the path to that directory using the `--kegg-data-dir` parameter: diff --git a/anvio/drivers/hmmer.py b/anvio/drivers/hmmer.py index a34e835885..00cc2a9341 100644 --- a/anvio/drivers/hmmer.py +++ b/anvio/drivers/hmmer.py @@ -87,7 +87,7 @@ def verify_hmmpress_output(self, hmm_path): if not os.path.exists(base_path + ext): raise ConfigError("It appears that hmmpress was not properly run on the hmm profiles at %s. The " "file %s does not exist. It is likely that you will have to set up your profiles " - "again by running a program such as `anvi-setup-pfams` or `anvi-setup-kegg-kofams`. " + "again by running a program such as `anvi-setup-pfams` or `anvi-setup-kegg-data`. " "We are very sorry about this." % (hmm_path, base_path + ext)) diff --git a/anvio/kegg.py b/anvio/kegg.py index 21f639fadd..df9000a61b 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -23,6 +23,7 @@ import anvio.filesnpaths as filesnpaths import anvio.tables as t import anvio.ccollections as ccollections +import anvio.biochemistry.reactionnetwork as reactionnetwork from anvio.errors import ConfigError from anvio.drivers.hmmer import HMMer @@ -492,7 +493,7 @@ class KeggSetup(KeggContext): Parameters ========== args: Namespace object - All the arguments supplied by user to anvi-setup-kegg-kofams. If using this class through the API, please + All the arguments supplied by user to anvi-setup-kegg-data. If using this class through the API, please provide a Namespace object with the Boolean 'reset' parameter. 
skip_init: Boolean Developers can use this flag to skip the sanity checks and creation of directories when testing this class @@ -503,13 +504,13 @@ def __init__(self, args, run=run, progress=progress, skip_init=False): self.args = args self.run = run self.progress = progress + self.num_threads = 1 if not A('num_threads') else A('num_threads') self.kegg_archive_path = A('kegg_archive') + self.kegg_snapshot = A('kegg_snapshot') self.download_from_kegg = True if A('download_from_kegg') else False self.only_download = True if A('only_download') else False - self.only_database = True if A('only_database') else False - self.kegg_snapshot = A('kegg_snapshot') - self.skip_brite_hierarchies = A('skip_brite_hierarchies') - self.overwrite_modules_db = A('overwrite_output_destinations') + self.only_processing = True if A('only_processing') else False + self.skip_init = skip_init if self.kegg_archive_path and self.download_from_kegg: raise ConfigError("You provided two incompatible input options, --kegg-archive and --download-from-kegg. " @@ -517,19 +518,15 @@ def __init__(self, args, run=run, progress=progress, skip_init=False): if self.kegg_snapshot and self.download_from_kegg or self.kegg_snapshot and self.kegg_archive_path: raise ConfigError("You cannot request setup from an anvi'o KEGG snapshot at the same time as from KEGG directly or from one of your " "KEGG archives. Please pick just one setup option and try again.") - if (not self.download_from_kegg) and (self.only_download or self.only_database): - raise ConfigError("Erm. The --only-download and --only-database options are only valid if you are also using the --download-from-kegg " + + if (not self.download_from_kegg) and (self.only_download or self.only_processing): + raise ConfigError("Erm. The --only-download and --only-processing options are only valid if you are also using the --download-from-kegg " "option. 
Sorry.") - if self.only_download and self.only_database: - raise ConfigError("The --only-download and --only-database options are incompatible. Please choose only one. Or, if you want both " + if self.only_download and self.only_processing: + raise ConfigError("The --only-download and --only-processing options are incompatible. Please choose only one. Or, if you want both " "download AND database setup to happen, then use only the -D flag without providing either of these two options.") - if (not self.download_from_kegg) and self.skip_brite_hierarchies: - self.run.warning("Just so you know, the --skip-brite-hierarchies flag does not do anything (besides suppress some warning output) when used " - "without the -D option. You are setting up from an archived KEGG snapshot which may already include BRITE data, and if it " - "does, this data will not be removed. You can always check if the resulting modules database contains BRITE data by " - "running `anvi-db-info` on it and looking at the `is_brite_setup` value (which will be 1 if the database contains BRITE data).") - + # initializing these to None here so that it doesn't break things downstream self.pathway_dict = None self.brite_dict = None @@ -537,56 +534,37 @@ def __init__(self, args, run=run, progress=progress, skip_init=False): # init the base class KeggContext.__init__(self, self.args) + # get KEGG snapshot info for default setup + self.target_snapshot = self.kegg_snapshot or 'v2023-09-22' + self.target_snapshot_yaml = os.path.join(os.path.dirname(anvio.__file__), 'data/misc/KEGG-SNAPSHOTS.yaml') + self.snapshot_dict = utils.get_yaml_as_dict(self.target_snapshot_yaml) + + if self.target_snapshot not in self.snapshot_dict.keys(): + self.run.warning(None, header="AVAILABLE KEGG SNAPSHOTS", lc="yellow") + available_snapshots = sorted(list(self.snapshot_dict.keys())) + for snapshot_name in available_snapshots: + self.run.info_single(snapshot_name + (' (latest)' if snapshot_name == available_snapshots[-1] else '')) 
+ + raise ConfigError("Whoops. The KEGG snapshot you requested is not one that is known to anvi'o. Please try again, and " + "this time pick from the list shown above.") + + # default download path for KEGG snapshot + self.default_kegg_data_url = self.snapshot_dict[self.target_snapshot]['url'] + self.default_kegg_archive_file = self.snapshot_dict[self.target_snapshot]['archive_name'] + self.expect_modeling_files_in_archive = True if 'no_modeling_data' in self.snapshot_dict[self.target_snapshot].keys() and \ + (not self.snapshot_dict[self.target_snapshot]['no_modeling_data']) else False + if self.user_input_dir: self.run.warning(f"Just so you know, we will be setting up the metabolism data provided at the following " f"location: '{self.user_input_dir}'. The success of this will be determined by how well you " f"followed our formatting guidelines, so keep an eye out for errors below.") - filesnpaths.is_program_exists('hmmpress') if not self.user_input_dir: - if not args.reset and not anvio.DEBUG and not skip_init: - self.is_database_exists(fail_if_exists=(not self.only_database)) - - if self.download_from_kegg and not self.only_database and not self.kegg_archive_path and not skip_init: + # establish parent directory + if self.download_from_kegg and not self.only_processing and not self.kegg_archive_path and not skip_init: filesnpaths.gen_output_directory(self.kegg_data_dir, delete_if_exists=args.reset) - filesnpaths.gen_output_directory(self.kegg_hmm_data_dir, delete_if_exists=args.reset) - filesnpaths.gen_output_directory(self.orphan_data_dir, delete_if_exists=args.reset) - filesnpaths.gen_output_directory(self.kegg_module_data_dir, delete_if_exists=args.reset) - filesnpaths.gen_output_directory(self.brite_data_dir, delete_if_exists=args.reset) - - # get KEGG snapshot info for default setup - self.target_snapshot = self.kegg_snapshot or 'v2023-09-18' - self.target_snapshot_yaml = os.path.join(os.path.dirname(anvio.__file__), 'data/misc/KEGG-SNAPSHOTS.yaml') - 
self.snapshot_dict = utils.get_yaml_as_dict(self.target_snapshot_yaml) - - if self.target_snapshot not in self.snapshot_dict.keys(): - self.run.warning(None, header="AVAILABLE KEGG SNAPSHOTS", lc="yellow") - available_snapshots = sorted(list(self.snapshot_dict.keys())) - for snapshot_name in available_snapshots: - self.run.info_single(snapshot_name + (' (latest)' if snapshot_name == available_snapshots[-1] else '')) - - raise ConfigError("Whoops. The KEGG snapshot you requested is not one that is known to anvi'o. Please try again, and " - "this time pick from the list shown above.") - - # default download path for KEGG snapshot - self.default_kegg_data_url = self.snapshot_dict[self.target_snapshot]['url'] - self.default_kegg_archive_file = self.snapshot_dict[self.target_snapshot]['archive_name'] - - # download from KEGG option: ftp path for HMM profiles and KO list - # for ko list, add /ko_list.gz to end of url - # for profiles, add /profiles.tar.gz to end of url - self.database_url = "ftp://ftp.genome.jp/pub/db/kofam" - # dictionary mapping downloaded file name to final decompressed file name or folder location - self.files = {'ko_list.gz': self.ko_list_file_path, 'profiles.tar.gz': self.kegg_data_dir} - - # download from KEGG option: module/pathway map htext files and API link - self.kegg_module_download_path = "https://www.genome.jp/kegg-bin/download_htext?htext=ko00002.keg&format=htext&filedir=" - self.kegg_pathway_download_path = "https://www.genome.jp/kegg-bin/download_htext?htext=br08901.keg&format=htext&filedir=" - self.kegg_rest_api_get = "http://rest.kegg.jp/get" - # download a json file containing all BRITE hierarchies, which can then be downloaded themselves - self.kegg_brite_hierarchies_download_path = os.path.join(self.kegg_rest_api_get, "br:br08902/json") else: # user input setup filesnpaths.is_output_dir_writable(os.path.dirname(self.user_input_dir)) @@ -606,7 +584,7 @@ def __init__(self, args, run=run, progress=progress, skip_init=False): 
self.run.info("Successfully removed", path) - def is_database_exists(self, fail_if_exists=True): + def is_database_exists(self, files_to_check, fail_if_exists=True): """This function determines whether the user has already downloaded all required KEGG data. More specifically, it looks for the KEGG files that we use to learn what to download (as in @@ -615,22 +593,22 @@ def is_database_exists(self, fail_if_exists=True): PARAMETERS ========== + files_to_check : list of file paths + this list should contain the paths to all required KEGG data or directories. what those + files are depends on the download mode. fail_if_exists : Boolean if this is True, this function will fail if the KEGG data already exists on the user's computer. If it is False, AND the user has already downloaded all required KEGG data, - then this function will not fail. This is to enable the --only-database option. + then this function will not fail. This is to enable the --only-processing option. Note that in this case we require all KEGG data to be pre-downloaded to avoid mixing older and newer KEGG data - so if this data is only partially downloaded, the function will raise an error even if this parameter is False. """ - files_to_check = [self.kofam_hmm_file_path, - self.kegg_module_file, - self.kegg_module_data_dir, - ] - if not self.skip_brite_hierarchies: - files_to_check.append(self.kegg_brite_hierarchies_file) - files_to_check.append(self.brite_data_dir) + if anvio.DEBUG: + file_str = ", ".join(files_to_check) + self.run.warning(f"We are looking for the following files to see if the KEGG data already " + f"exists on you computer: {file_str}") files_that_exist = [] for f in files_to_check: @@ -638,7 +616,7 @@ def is_database_exists(self, fail_if_exists=True): if fail_if_exists: raise ConfigError(f"It seems you already have data at {f}, please use the `--reset` flag " "or delete the KEGG data directory manually if you want to re-download KEGG data. 
" - "See also the --only-database option, which you can use if you already " + "See also the --only-processing option, which you can use if you already " "have all required KEGG data in that folder. (API users: skip this sanity " "check by initializing this class with `skip_init=True`)") else: @@ -651,7 +629,7 @@ def is_database_exists(self, fail_if_exists=True): raise ConfigError(f"We found some, but not all, required KEGG data on your computer in the KEGG " f"data directory. Since you don't have everything you need, we need you to re-download " f"everything from scratch. Please re-run this program using the --reset flag, and if " - f"you were using the --only-database option, remove that flag. :) HOWEVER, if you notice that " + f"you were using the --only-processing option, remove that flag. :) HOWEVER, if you notice that " "KEGG BRITE data does not appear to be in the upcoming list, but you don't actually want " "to download BRITE data, then you can just add the --skip-brite-hierarchies to your previous " f"command and be on your way (ie, no --reset needed). Here is the KEGG data we found:\n{exist_str}") @@ -662,12 +640,227 @@ def is_database_exists(self, fail_if_exists=True): f"need to check it to make sure we are not using something that is too old:\n" f"{exist_str}") - if self.only_database and not files_that_exist: + if self.only_processing and not files_that_exist: raise ConfigError(f"We noticed that there is no KEGG data on your computer at {self.kegg_data_dir} even " - f"though you used the --only-database option. If you don't actually have KEGG data already " - f"downloaded, you should get rid of the --only-database flag and re-run this program. If you " + f"though you used the --only-processing option. If you don't actually have KEGG data already " + f"downloaded, you should get rid of the --only-processing flag and re-run this program. 
If you " f"know that you DO have KEGG data, perhaps you gave us the wrong data directory?") + + def setup_from_archive(self): + """This function sets up the KEGG data directory from an archive of a previously-setup KEGG data directory. + + To do so, it unpacks the archive and checks its structure and that all required components are there. + """ + + self.run.info("KEGG archive", self.kegg_archive_path) + self.progress.new('Unzipping KEGG archive file...') + if not self.kegg_archive_path.endswith("tar.gz"): + self.progress.reset() + raise ConfigError("The provided archive file %s does not appear to be an archive at all. Perhaps you passed " + "the wrong file to anvi'o?" % (self.kegg_archive_path)) + unpacked_archive_name = "KEGG_archive_unpacked" + utils.tar_extract_file(self.kegg_archive_path, output_file_path=unpacked_archive_name, keep_original=True) + + self.progress.update('Checking KEGG archive structure and contents...') + archive_is_ok = self.kegg_archive_is_ok(unpacked_archive_name, no_modeling_is_ok = (not self.expect_modeling_files_in_archive)) + archive_contains_brite = self.check_archive_for_brite(unpacked_archive_name) + self.progress.end() + if archive_is_ok: + if os.path.exists(self.kegg_data_dir): + shutil.rmtree(self.kegg_data_dir) + path_to_kegg_in_archive = os.path.join(unpacked_archive_name, "KEGG") + shutil.move(path_to_kegg_in_archive, self.kegg_data_dir) + shutil.rmtree(unpacked_archive_name) + + if not archive_contains_brite and not self.skip_brite_hierarchies: + self.run.warning("The KEGG data archive does not contain the necessary files to set up BRITE hierarchy classification. " + "This is not a problem, and KEGG set up proceeded without it. BRITE is guaranteed to be set up when " + "downloading the latest version of KEGG with `anvi-setup-kegg-data`.") + + # if necessary, warn user about migrating the modules db + self.check_modules_db_version() + + else: + debug_output = f"We kept the unpacked archive for you to take a look at it. 
It is at " \ + f"{os.path.abspath(unpacked_archive_name)} and you may want " \ + f"to delete it after you are done checking its contents." + if not anvio.DEBUG: + shutil.rmtree(unpacked_archive_name) + debug_output = "The unpacked archive has been deleted, but you can re-run the script with the --debug " \ + "flag to keep it if you want to see its contents." + else: + self.run.warning(f"The unpacked archive file {os.path.abspath(unpacked_archive_name)} was kept for " + f"debugging purposes. You may want to clean it up after you are done looking through it.") + + raise ConfigError(f"SETUP FAILED. The provided archive file is missing some critical files, " + f"so anvi'o is unable to use it. {debug_output}") + + + def check_modules_db_version(self): + """This function checks if the MODULES.db is out of date and if so warns the user to migrate it""" + + # get current version of db + db_conn = db.DB(self.kegg_modules_db_path, None, ignore_version=True) + current_db_version = int(db_conn.get_meta_value('version')) + db_conn.disconnect() + + # if modules.db is out of date, give warning + target_version = int(anvio.tables.versions_for_db_types['modules']) + if current_db_version != target_version: + self.run.warning(f"Just so you know, the KEGG archive that was just set up contains an outdated MODULES.db (version: " + f"{current_db_version}). You may want to run `anvi-migrate` on this database before you do anything else. " + f"Here is the path to the database: {self.kegg_modules_db_path}") + + + def check_archive_for_brite(self, unpacked_archive_path): + """Check the archive for the BRITE directory and 'hierarchy of hierarchies' json file. + + It is ok for archives not to have these present, but let the user know. 
+ """ + + is_brite_included = True + + path_to_kegg_in_archive = os.path.join(unpacked_archive_path, "KEGG") + brite_directories_and_files = [self.brite_data_dir, + self.kegg_brite_hierarchies_file] + for f in brite_directories_and_files: + path_to_f_in_archive = os.path.join(path_to_kegg_in_archive, os.path.basename(f)) + if not os.path.exists(path_to_f_in_archive) and not self.skip_brite_hierarchies: + is_brite_included = False + if anvio.DEBUG: + self.run.warning(f"The KEGG archive does not contain the following optional BRITE file or directory: {path_to_f_in_archive}") + + return is_brite_included + + + def setup_kegg_snapshot(self): + """This is the default setup strategy in which we unpack a specific KEGG archive. + + We do this so that everyone who uses the same release of anvi'o will also have the same default KEGG + data, which facilitates sharing and also means they do not have to continuously re-annotate their datasets + when KEGG is updated. + + It is essentially a special case of setting up from an archive. + """ + + if anvio.DEBUG: + self.run.info("Downloading from: ", self.default_kegg_data_url) + self.run.info("Downloading to: ", self.default_kegg_archive_file) + utils.download_file(self.default_kegg_data_url, self.default_kegg_archive_file, progress=self.progress, run=self.run) + + # a hack so we can use the archive setup function + self.kegg_archive_path = self.default_kegg_archive_file + self.setup_from_archive() + + # if all went well, let's get rid of the archive we used and the log file + if not anvio.DEBUG: + os.remove(self.default_kegg_archive_file) + else: + self.run.warning(f"Because you used the --debug flag, the KEGG archive file at {self.default_kegg_archive_file} " + "has been kept. You may want to remove it later.") + + + def kegg_archive_is_ok(self, unpacked_archive_path, no_modeling_is_ok = False): + """This function checks the structure and contents of an unpacked KEGG archive and returns True if it is as expected. 
+ + Please note that we check for existence of the files that are necessary to run KEGG scripts, but we don't check the file + formats. This means that people could technically trick this function into returning True by putting a bunch of crappy files + with the right names/paths into the archive file. But what would be the point of that? + + We also don't care about the contents of certain folders (ie modules) because they are not being directly used + when running KEGG scripts. In the case of modules, all the information should already be in the MODULES.db so we don't + waste our time checking that all the module files are there. We only check that the directory is there. If later changes + to the implementation require the direct use of the files in these folders, then this function should be updated + to check for those. + + Parameters + ========== + unpacked_archive_path : str + Path to the unpacked archive directory + no_modeling_is_ok : boolean + Whether or not we care if modeling data is not found in the archive. This is added for backwards compatibility to the + previous versions of KEGG archives that do not include this data. 
+ """ + + is_ok = True + + # check top-level files and folders + path_to_kegg_in_archive = os.path.join(unpacked_archive_path, "KEGG") + expected_directories_and_files = [self.orphan_data_dir, + self.kegg_module_data_dir, + self.kegg_hmm_data_dir, + self.ko_list_file_path, + self.kegg_module_file, + self.kegg_modules_db_path] + for f in expected_directories_and_files: + path_to_f_in_archive = os.path.join(path_to_kegg_in_archive, os.path.basename(f)) + if not os.path.exists(path_to_f_in_archive): + is_ok = False + if anvio.DEBUG: + self.run.warning(f"The KEGG archive does not contain the following expected file or directory: " + f"{path_to_f_in_archive}") + + # check hmm files + path_to_hmms_in_archive = os.path.join(path_to_kegg_in_archive, os.path.basename(self.kegg_hmm_data_dir)) + kofam_hmm_basename = os.path.basename(self.kofam_hmm_file_path) + expected_hmm_files = [kofam_hmm_basename] + for h in expected_hmm_files: + path_to_h_in_archive = os.path.join(path_to_hmms_in_archive, h) + if not os.path.exists(path_to_h_in_archive): + is_ok = False + if anvio.DEBUG: + self.run.warning(f"The KEGG archive does not contain the following expected hmm file: " + f"{path_to_h_in_archive}") + expected_extensions = ['.h3f', '.h3i', '.h3m', '.h3p'] + for ext in expected_extensions: + path_to_expected_hmmpress_file = path_to_h_in_archive + ext + if not os.path.exists(path_to_expected_hmmpress_file): + is_ok = False + if anvio.DEBUG: + self.run.warning(f"The KEGG archive does not contain the following expected `hmmpress` output: " + f"{path_to_expected_hmmpress_file}") + + # check modeling files + # this section needs to be kept up to date with any changes to requirements in reactionnetwork.py + # which is a bit silly, but since these two classes don't know about each other it is the workaround we need :( + path_to_modeling_files_in_archive = os.path.join(path_to_kegg_in_archive, "KO_REACTION_NETWORK") + expected_modeling_files = reactionnetwork.KODatabase.expected_files + 
missing_modeling_files = [] + for f in expected_modeling_files: + path_to_f_in_archive = os.path.join(path_to_modeling_files_in_archive, f) + if not os.path.exists(path_to_f_in_archive): + is_ok = False or no_modeling_is_ok + missing_modeling_files.append(f) + if anvio.DEBUG: + self.run.warning(f"The KEGG archive does not contain the following expected modeling file: " + f"{path_to_f_in_archive}") + + if no_modeling_is_ok and missing_modeling_files: + self.run.warning("Modeling files are missing from the KEGG archive you have set up. However, somebody " + "upstream thinks this is okay. Likely you are setting up an early KEGG snapshot version " + "that doesn't contain this data. That's fine. But please keep in mind that you won't be " + "able to run metabolic modeling. If this is a problem, you should either pick a later " + "KEGG snapshot, or download the modeling data independently using the command " + "`anvi-setup-kegg-data --mode modeling`.") + + return is_ok + + + def setup_all_data_from_archive_or_snapshot(self): + """This driver function controls whether we download one of our KEGG snapshots and set that up, or + set up directly from an archive file already on the user's computer. + """ + + if os.path.exists(self.kegg_data_dir) and not self.args.reset: + raise ConfigError(f"The directory {self.kegg_data_dir} already exists. Are you sure you want to " + f"overwrite it? 
If yes, feel free to restart this program with the --reset flag.") + + if self.kegg_archive_path: + self.setup_from_archive() + else: + self.setup_kegg_snapshot() + def check_user_input_dir_format(self): """This function checks whether the user input directory exists and contains the required subfolders @@ -698,97 +891,6 @@ def is_user_database_exists(self): f"please use the --reset flag or delete this file manually if you want to re-generate it.") - def download_profiles(self): - """This function downloads the Kofam profiles.""" - - self.run.info("Kofam Profile Database URL", self.database_url) - - try: - for file_name in self.files.keys(): - utils.download_file(self.database_url + '/' + file_name, - os.path.join(self.kegg_data_dir, file_name), progress=self.progress, run=self.run) - except Exception as e: - print(e) - raise ConfigError("Anvi'o failed to download KEGG KOfam profiles from the KEGG website. Something " - "likely changed on the KEGG end. Please contact the developers to see if this is " - "a fixable issue. If it isn't, we may be able to provide you with a legacy KEGG " - "data archive that you can use to setup KEGG with the --kegg-archive flag.") - - - def process_module_file(self): - """This function reads the kegg module file into a dictionary. It should be called during setup to get the KEGG module numbers so that KEGG modules can be downloaded. - - The structure of this file is like this: - - +D Module - #