diff --git a/.conda/environment.yaml b/.conda/environment.yaml index 0e458b2026..26df1c59a7 100644 --- a/.conda/environment.yaml +++ b/.conda/environment.yaml @@ -31,3 +31,4 @@ dependencies: - r-magrittr - bioconductor-qvalue - fastani +- meme diff --git a/.github/workflows/daily-component-tests-and-migrations.yaml b/.github/workflows/daily-component-tests-and-migrations.yaml index d39362690e..61e780e8f0 100644 --- a/.github/workflows/daily-component-tests-and-migrations.yaml +++ b/.github/workflows/daily-component-tests-and-migrations.yaml @@ -37,10 +37,13 @@ jobs: anvi-self-test --suite metagenomics-full --no-interactive anvi-self-test --suite pangenomics --no-interactive anvi-self-test --suite inversions --no-interactive - anvi-self-test --suite metabolism --no-interactive # the following steps cause our actions to fail on GitHub runners # due to space limitations :/ please do not uncomment this until we # have a solution for this :/ + #- name: "Run component tests for metabolism framework" + # shell: bash -l {0} + # run: | + # anvi-self-test --suite metabolism --no-interactive #- name: "Migrate ancient anvi'o databases" # shell: bash -l {0} # run: | diff --git a/Dockerfiles/anvio-structure/Dockerfile b/Dockerfiles/anvio-structure/Dockerfile index cf0372f676..fce22f6b6e 100644 --- a/Dockerfiles/anvio-structure/Dockerfile +++ b/Dockerfiles/anvio-structure/Dockerfile @@ -72,7 +72,7 @@ RUN rm anvio-7.1.tar.gz # Setup anvi'o databases ############################################################## RUN anvi-setup-interacdome -RUN anvi-setup-kegg-kofams --kegg-snapshot v2020-12-23 +RUN anvi-setup-kegg-data --kegg-snapshot v2020-12-23 RUN anvi-setup-pfams --pfam-version 33.1 RUN anvi-setup-ncbi-cogs --cog-version COG20 diff --git a/anvio/__init__.py b/anvio/__init__.py index 312bc622c6..e478d64bca 100644 --- a/anvio/__init__.py +++ b/anvio/__init__.py @@ -1044,27 +1044,26 @@ def TABULATE(table, header, numalign="right", max_width=0): "you will not have the most 
up-to-date version of KEGG for your annotations, metabolism " "estimations, or any other downstream uses of this data. If that is going to be a problem for you, " "do not fear - you can provide this flag to tell anvi'o to download the latest, freshest data directly " - "from KEGG's REST API and set it up into an anvi'o-compatible database."} + "from KEGG's REST API and set it up into anvi'o-compatible files."} ), 'only-download': ( ['--only-download'], {'default': False, 'action': 'store_true', 'help': "You want this program to only download data from KEGG, and then stop. It will not " - "make a modules database. (It would be a *very* good idea for you to specify a " - "data directory using --kegg-data-dir in this case, so that you can find the resulting " - "data easily and avoid messing up any data in the default KEGG directory. But you are " - "of course free to do whatever you want.). Note that KOfam profiles will still be " - "processed with `hmmpress` if you choose this option."} + "process the data (ie, into organized HMMs or a modules database). (It would be a " + "*very* good idea for you to specify a data directory using --kegg-data-dir in this " + "case, so that you can find the resulting data easily and avoid messing up any data " + "in the default KEGG directory. But you are of course free to do whatever you want.)"} ), - 'only-database': ( - ['--only-database'], + 'only-processing': ( + ['--only-processing'], {'default': False, 'action': 'store_true', - 'help': "You already have all the KEGG data you need on your computer. Perhaps you even got it from " + 'help': "You already have all the KEGG data you need on your computer. Probably you even got it from " "this program, using the --only-download option. We don't know. What matters is that you don't " - "need anything downloaded, you just want this program to setup a modules database from that " - "existing data. Good. 
We can do that if you provide this flag (and probably also the --kegg-data-dir " + "need anything downloaded, you just want this program to process that " + "existing data. Good. We can do that if you provide this flag (and hopefully also the --kegg-data-dir " "in which said data is located)."} ), 'kegg-snapshot': ( @@ -1072,9 +1071,10 @@ def TABULATE(table, header, numalign="right", max_width=0): {'default': None, 'type': str, 'metavar': 'RELEASE_NUM', - 'help': "If you are particularly interested in an earlier snapshot of KEGG that anvi'o knows about, you can set it here. " - "Otherwise anvi'o will always use the latest snapshot it knows about, which is likely to be the one associated with " - "the current release of anvi'o."} + 'help': "The default behavior of this program is to download a pre-processed snapshot of data " + "from KEGG. If you are particularly interested in an earlier snapshot of KEGG that anvi'o " + "knows about, you can set it here. Otherwise anvi'o will always use the latest snapshot " + "it knows about, which is likely to be the one associated with the current release of anvi'o."} ), 'hide-outlier-SNVs': ( ['--hide-outlier-SNVs'], diff --git a/anvio/biochemistry/reactionnetwork.py b/anvio/biochemistry/reactionnetwork.py index 4b13409305..d4b810ea58 100644 --- a/anvio/biochemistry/reactionnetwork.py +++ b/anvio/biochemistry/reactionnetwork.py @@ -1076,7 +1076,8 @@ class KODatabase: Unless an alternative directory is provided, the database is downloaded and set up in a default anvi'o data directory, and loaded from this directory in network construction. 
""" - default_dir = os.path.join(os.path.dirname(ANVIO_PATH), 'data/MISC/REACTION_NETWORK/KO') + default_dir = os.path.join(os.path.dirname(ANVIO_PATH), 'data/MISC/KEGG/KO_REACTION_NETWORK') + expected_files = ['ko_info.txt', 'ko_data.tsv'] def __init__(self, ko_dir: str = None) -> None: """ @@ -1093,19 +1094,17 @@ def __init__(self, ko_dir: str = None) -> None: raise ConfigError(f"There is no such directory, '{ko_dir}'.") else: ko_dir = self.default_dir - info_path = os.path.join(ko_dir, 'ko_info.txt') - if not os.path.isfile(info_path): - raise ConfigError(f"No required file named 'ko_info.txt' was found in the KO directory, '{ko_dir}'.") - table_path = os.path.join(ko_dir, 'ko_data.tsv') - if not os.path.isfile(table_path): - raise ConfigError(f"No required file named 'ko_data.tsv' was found in the KO directory, '{ko_dir}'.") - f = open(info_path) + for expected_file in self.expected_files: + if not os.path.isfile(os.path.join(ko_dir, expected_file)): + raise ConfigError(f"No required file named '{expected_file}' was found in the KO directory, '{ko_dir}'.") + + f = open(os.path.join(ko_dir, 'ko_info.txt')) f.readline() self.release = ' '.join(f.readline().strip().split()[1:]) f.close() - self.ko_table = pd.read_csv(table_path, sep='\t', header=0, index_col=0, low_memory=False) + self.ko_table = pd.read_csv(os.path.join(ko_dir, 'ko_data.tsv'), sep='\t', header=0, index_col=0, low_memory=False) def set_up( num_threads: int = 1, @@ -1124,12 +1123,13 @@ def set_up( Number of threads to use in parallelizing the download of KO files. dir : str, None - Directory in which to create a new subdirectory called 'KO', in which files are - downloaded and set up. This argument overrides the default directory. + Directory in which to create a subdirectory called `KO_REACTION_NETWORK`, + in which files are downloaded and set up. This argument overrides + the default directory. reset : bool, False - If True, remove any existing 'KO' database directory and the files therein. 
If False, - an exception is raised if there are files in this directory. + If True, remove any existing 'KO_REACTION_NETWORK' database directory and the files + therein. If False, an exception is raised if there are files in this directory. run : anvio.terminal.Run, None @@ -1137,9 +1137,10 @@ def set_up( """ if dir: if os.path.isdir(dir): - ko_dir = os.path.join(dir, 'KO') + ko_dir = os.path.join(dir, 'KO_REACTION_NETWORK') else: - raise ConfigError(f"There is no such directory, '{dir}'.") + raise ConfigError(f"There is no such directory, '{dir}'. You should create it " + "first if you want to use it.") else: ko_dir = KODatabase.default_dir parent_dir = os.path.dirname(ko_dir) @@ -1242,7 +1243,7 @@ def set_up( "from the KO database. Anvi'o will now attempt to redownload all of the files. " ) run.info(f"Total number of KOs/entry files", total) - run.info("KEGG database version", release_after) + run.info("KEGG KO database version", release_after) run.info("KEGG KO list", list_path) run.info("KEGG KO info", info_path) @@ -1264,7 +1265,7 @@ def set_up( section = line.split()[0] if section == 'NAME': # The name value follows 'NAME' at the beginning of the line. - ko_data['name'] = line[4:].lstrip().rstrip() + ko_data['name'] = line[4:].strip() # EC numbers associated with the KO are recorded at the end of the name value. 
ec_string = re.search('\[EC:.*\]', line) if ec_string: diff --git a/anvio/biochemistry/refdbs.py b/anvio/biochemistry/refdbs.py index 81f033a0a0..b3c98254ed 100644 --- a/anvio/biochemistry/refdbs.py +++ b/anvio/biochemistry/refdbs.py @@ -91,6 +91,8 @@ def raise_missing_files(self, missing: List[str]) -> None: ) def _set_up_db_dir(self, reset: bool) -> None: + if os.path.split(self.db_dir)[0] == self.default_superdir and not os.path.exists(self.default_superdir): + os.mkdir(self.default_superdir) if os.path.exists(self.db_dir): if reset: rmtree(self.db_dir) diff --git a/anvio/data/misc/KEGG-SNAPSHOTS.yaml b/anvio/data/misc/KEGG-SNAPSHOTS.yaml index f0ce9f20ca..601d581b68 100644 --- a/anvio/data/misc/KEGG-SNAPSHOTS.yaml +++ b/anvio/data/misc/KEGG-SNAPSHOTS.yaml @@ -6,60 +6,77 @@ v2020-04-27: archive_name: KEGG_build_2020-04-27_b893b7b915cb.tar.gz hash: b893b7b915cb modules_db_version: 1 + no_modeling_data: True v2020-06-23: url: https://ndownloader.figshare.com/files/23701919 archive_name: KEGG_build_2020-06-23_4a75508b48aa.tar.gz hash: 4a75508b48aa modules_db_version: 2 + no_modeling_data: True v2020-08-06: url: https://ndownloader.figshare.com/files/25464530 archive_name: KEGG_build_2020-08-06_8f88ef165f4c.tar.gz hash: 8f88ef165f4c modules_db_version: 2 + no_modeling_data: True v2020-12-23: url: https://ndownloader.figshare.com/files/25878342 archive_name: KEGG_build_2020-12-23_45b7cc2e4fdc.tar.gz hash: 45b7cc2e4fdc modules_db_version: 2 + no_modeling_data: True v2021-12-18: url: https://figshare.com/ndownloader/files/31959416 archive_name: KEGG_build_2021-12-18_58937b64c44c.tar.gz hash: 58937b64c44c modules_db_version: 3 + no_modeling_data: True v2022-04-14: url: https://figshare.com/ndownloader/files/34817812 archive_name: KEGG_build_2022-04-14_666feeac5de2.tar.gz hash: 666feeac5de2 modules_db_version: 4 + no_modeling_data: True v2023-01-10: url: https://figshare.com/ndownloader/files/38799687 archive_name: KEGG_build_2023-01-10_d20a0dcd2128.tar.gz hash: 
d20a0dcd2128 modules_db_version: 4 + no_modeling_data: True v2023-09-18: url: https://figshare.com/ndownloader/files/42381873 archive_name: KEGG_build_2023-09-18_a2b5bde358bb.tar.gz hash: a2b5bde358bb modules_db_version: 4 + no_modeling_data: True + +v2023-09-22: + url: https://figshare.com/ndownloader/files/42428115 + archive_name: KEGG_build_2023-09-22_a2b5bde358bb.tar.gz + hash: a2b5bde358bb + modules_db_version: 4 # How to add a new KEGG snapshot to this file: # 1. download the latest data directly from KEGG by running -# `anvi-setup-kegg-kofams -D --kegg-data-dir ./KEGG` +# `anvi-setup-kegg-data -D --kegg-data-dir ./KEGG -T 5` # 2. get the hash value and version info from the MODULES.db: # `anvi-db-info ./KEGG/MODULES.db` # 3. archive that directory: # `tar -czvf KEGG_build_YYYY-MM-DD_HASH.tar.gz ./KEGG` -# Please remember to replace YYYY-MM-DD with the current date and replace HASH with the MODULES.db hash value obtained in step 2 +# Please remember to replace YYYY-MM-DD with the current date and replace HASH with the +# MODULES.db hash value obtained in step 2 # 4. Test that setup works with this archive by running -# `anvi-setup-kegg-kofams --kegg-archive KEGG_build_YYYY-MM-DD_HASH.tar.gz --kegg-data-dir TEST_NEW_KEGG_ARCHIVE` +# `anvi-setup-kegg-data --kegg-archive KEGG_build_YYYY-MM-DD_HASH.tar.gz --kegg-data-dir TEST_NEW_KEGG_ARCHIVE` # 5. Upload the .tar.gz archive to figshare and get the download url -# 6. Finally, add an entry to the bottom of this file with the url, archive name, and MODULES.db hash and version. You should also update the -# default self.target_snapshot variable in kegg.py to point to this latest version that you have added. -# 7. Test it by running `anvi-setup-kegg-kofams --kegg-data-dir TEST_NEW_KEGG`, and if it works you are done :) +# 6. Finally, add an entry to the bottom of this file with the url, archive name, and MODULES.db hash and version. 
+# You should also update the default self.target_snapshot variable in kegg.py to point to this +# latest version that you have added. +# 7. Test it by running `anvi-setup-kegg-data --kegg-data-dir TEST_NEW_KEGG` (you don't need to run the full thing, +# just long enough to see that the correct snapshot is being downloaded), and if it works you are done :) diff --git a/anvio/data/misc/PEOPLE/DEVELOPERS.yaml b/anvio/data/misc/PEOPLE/DEVELOPERS.yaml index d6c4edd8a1..c3a770ef61 100644 --- a/anvio/data/misc/PEOPLE/DEVELOPERS.yaml +++ b/anvio/data/misc/PEOPLE/DEVELOPERS.yaml @@ -7,7 +7,7 @@ linkedin: meren orcid: 0000-0001-9013-4827 skype: a.murat.eren - bio: "Computer scientist and microbial ecologist interested in undersatnding mechanisms by which microbes interact with their surroundings, evolve, disperse, and respond to environmental change." + bio: "Computer scientist and microbial ecologist interested in understanding mechanisms by which microbes interact with their surroundings, evolve, disperse, and respond to environmental change." affiliations: - title: Professor inst: Helmholtz Institute for Functional Marine Biodiversity at Oldenburg diff --git a/anvio/docs/artifacts/anvi-reaction-network.md b/anvio/docs/artifacts/anvi-reaction-network.md index d37ff0eddf..6d2dbc5d53 100644 --- a/anvio/docs/artifacts/anvi-reaction-network.md +++ b/anvio/docs/artifacts/anvi-reaction-network.md @@ -1,3 +1,3 @@ This program **generates a metabolic reaction network in a %(contigs-db)s.** Gene %(functions)s that have been annotated in the %(contigs-db)s are compared to reference databases, yielding predictions of the biochemical reactions that may be catalyzed by the gene products. Possible applications of anvi'o metabolic networks include the export of draft metabolic models (see %(anvi-get-metabolic-model-file)s) and the import and integration of metabolomic datasets. 
-A network can currently be generated from KEGG Orthology (KO) annotations of genes in conjunction with %(reaction-ref-data)s: KEGG ([KO](https://www.genome.jp/kegg/ko.html), [REACTION](https://www.genome.jp/kegg/reaction/), and [COMPOUND](https://www.genome.jp/kegg/compound/)) databases and the [ModelSEED Biochemistry](https://github.com/ModelSEED/ModelSEEDDatabase) database. The reference databases must have been downloaded and set up by %(anvi-setup-protein-reference-database)s. +A network can currently be generated from KEGG Orthology (KO) annotations of genes in conjunction with %(reaction-ref-data)s: KEGG ([KO](https://www.genome.jp/kegg/ko.html), [REACTION](https://www.genome.jp/kegg/reaction/), and [COMPOUND](https://www.genome.jp/kegg/compound/)) databases and the [ModelSEED Biochemistry](https://github.com/ModelSEED/ModelSEEDDatabase) database. The reference databases must have been downloaded and set up by %(anvi-setup-modelseed-database)s. diff --git a/anvio/docs/artifacts/kegg-data.md b/anvio/docs/artifacts/kegg-data.md index b2ef0b5b67..b240340f58 100644 --- a/anvio/docs/artifacts/kegg-data.md +++ b/anvio/docs/artifacts/kegg-data.md @@ -1,16 +1,16 @@ A **directory of data** downloaded from the [KEGG database resource](https://www.kegg.jp/) for use in function annotation and metabolism estimation. -It is created by running the program %(anvi-setup-kegg-kofams)s. Not everything from KEGG is included in this directory, only the information relevant to downstream programs. The most critical components of this directory are KOfam HMM profiles and the %(modules-db)s which contains information on metabolic pathways as described in the [KEGG MODULES resource](https://www.genome.jp/kegg/module.html), as well as functional classification hierarchies from [KEGG BRITE](https://www.genome.jp/kegg/brite.html). +It is created by running the program %(anvi-setup-kegg-data)s. 
Not everything from KEGG is included in this directory, only the information relevant to downstream programs. The most critical components of this directory are KOfam HMM profiles and the %(modules-db)s which contains information on metabolic pathways as described in the [KEGG MODULES resource](https://www.genome.jp/kegg/module.html), as well as functional classification hierarchies from [KEGG BRITE](https://www.genome.jp/kegg/brite.html). Programs that rely on this data directory include %(anvi-run-kegg-kofams)s and %(anvi-estimate-metabolism)s. ## Directory Location The default location of this data is in the anvi'o folder, at `anvio/anvio/data/misc/KEGG/`. -You can change this location when you run %(anvi-setup-kegg-kofams)s by providing a different path to the `--kegg-data-dir` parameter: +You can change this location when you run %(anvi-setup-kegg-data)s by providing a different path to the `--kegg-data-dir` parameter: {{ codestart }} -anvi-setup-kegg-kofams --kegg-data-dir /path/to/directory/KEGG +anvi-setup-kegg-data --kegg-data-dir /path/to/directory/KEGG {{ codestop }} If you do this, you will need to provide this path to downstream programs that require this data as well. diff --git a/anvio/docs/artifacts/modules-db.md b/anvio/docs/artifacts/modules-db.md index d5654d90fd..8863e1b053 100644 --- a/anvio/docs/artifacts/modules-db.md +++ b/anvio/docs/artifacts/modules-db.md @@ -1,6 +1,6 @@ A type of database containing information from either A) the [KEGG MODULE database](https://www.genome.jp/kegg/module.html) and [KEGG BRITE database](https://www.genome.jp/kegg/brite.html), or B) user-defined metabolic modules, for use in metabolism estimation and/or functional annotation of KEGG Orthologs (KOs). -These databases are part of the %(kegg-data)s and %(user-modules-data)s directories. You can get one on your computer by running %(anvi-setup-kegg-kofams)s or %(anvi-setup-user-modules)s. 
Programs that rely on this type of database include %(anvi-run-kegg-kofams)s and %(anvi-estimate-metabolism)s. +These databases are part of the %(kegg-data)s and %(user-modules-data)s directories. You can get one on your computer by running %(anvi-setup-kegg-data)s or %(anvi-setup-user-modules)s. Programs that rely on this type of database include %(anvi-run-kegg-kofams)s and %(anvi-estimate-metabolism)s. Most users will never have to interact directly with this kind of database. However, for the brave few who want to try this (or who are figuring out how anvi'o works under the hood), there is some relevant information below. @@ -19,7 +19,7 @@ In the current implementation, data about each metabolic pathway (from the KEGG | M00001 | ORTHOLOGY | K12407 | hexokinase/glucokinase [EC:2.7.1.1 2.7.1.2] [RN:R01786] | 4 | | (...) | (...) | (...) | (...) | (...) | -For the MODULES.db that comes out of %(anvi-setup-kegg-kofams)s, these data correspond to the information that can be found on the KEGG website for each metabolic module - for an example, you can see the page for [M00001](https://www.genome.jp/dbget-bin/www_bget?md:M00001) (or, alternatively, its [flat text file version](http://rest.kegg.jp/get/M00001) from the KEGG REST API). +For the MODULES.db that comes out of %(anvi-setup-kegg-data)s, these data correspond to the information that can be found on the KEGG website for each metabolic module - for an example, you can see the page for [M00001](https://www.genome.jp/dbget-bin/www_bget?md:M00001) (or, alternatively, its [flat text file version](http://rest.kegg.jp/get/M00001) from the KEGG REST API). The USER_MODULES.db that comes out of %(anvi-setup-user-modules)s contains similar information, but defined by the user instead of downloaded from the KEGG website. 
@@ -31,7 +31,7 @@ Finally, some rows of data originate from the same line in the original KEGG MOD ### The BRITE hierarchies table -In database version 4 or later, there is the option to include KEGG BRITE data in the modules database when setting one up using %(anvi-setup-kegg-kofams)s. If this is done, the database will include a table called `brite_hierarchies` which stores the set of functional hierarchies that each KEGG Ortholog belongs to. It will look like this: +In database version 4 or later, there is the option to include KEGG BRITE data in the modules database when setting one up using %(anvi-setup-kegg-data)s. If this is done, the database will include a table called `brite_hierarchies` which stores the set of functional hierarchies that each KEGG Ortholog belongs to. It will look like this: |**hierarchy_accession**|**hierarchy_name**|**ortholog_accession**|**ortholog_name**|**categorization**| |:--|:--|:--|:--|:--| @@ -81,7 +81,7 @@ modules_db_hash ..............................: 45b7cc2e4fdc ### Other important values in the self table -The `data_source` key will tell you if the current database was generated from KEGG data using %(anvi-setup-kegg-kofams)s or from user-defined metabolic modules using %(anvi-setup-user-modules)s. +The `data_source` key will tell you if the current database was generated from KEGG data using %(anvi-setup-kegg-data)s or from user-defined metabolic modules using %(anvi-setup-user-modules)s. The `annotation_sources` key will list the functional annotation sources that are required to annotate all enzymes found in the module definitions. 
diff --git a/anvio/docs/artifacts/reaction-ref-data.md b/anvio/docs/artifacts/reaction-ref-data.md index 8f9fe48591..9b6ccf67fa 100644 --- a/anvio/docs/artifacts/reaction-ref-data.md +++ b/anvio/docs/artifacts/reaction-ref-data.md @@ -1,3 +1,5 @@ -Reference databases required for %(anvi-reaction-network)s are stored in **directories of downloaded files set up by %(anvi-setup-protein-reference-database)s**. +Reference databases required for %(anvi-reaction-network)s are stored in **directories of downloaded files set up by %(anvi-setup-modelseed-database)s and %(anvi-setup-kegg-data)s**. -%(anvi-reaction-network)s currently relies upon comparison of KEGG Orthology (KO) gene annotations (%(kegg-functions)s) stored in a %(contigs-db) to reference databases: KEGG [KO](https://www.genome.jp/kegg/ko.html), [REACTION](https://www.genome.jp/kegg/reaction/), and [COMPOUND](https://www.genome.jp/kegg/compound/) and [ModelSEED Biochemistry](https://github.com/ModelSEED/ModelSEEDDatabase). The ModelSEED Biochemistry database harmonizes and consolidates reference data from multiple sources, including KEGG, in two comprehensive tables of reactions and compounds. +%(anvi-reaction-network)s currently relies upon comparison of KEGG Orthology (KO) gene annotations (%(kegg-functions)s) stored in a %(contigs-db)s to reference databases: KEGG [KO](https://www.genome.jp/kegg/ko.html) and [ModelSEED Biochemistry](https://github.com/ModelSEED/ModelSEEDDatabase). The ModelSEED Biochemistry database harmonizes and consolidates reference data from multiple sources, including KEGG, in two comprehensive tables of reactions and compounds. + +The KEGG databases (%(kegg-data)s) can be obtained by running %(anvi-setup-kegg-data)s, and the ModelSEED database can be obtained by running %(anvi-setup-modelseed-database)s. 
diff --git a/anvio/docs/programs/anvi-estimate-metabolism.md b/anvio/docs/programs/anvi-estimate-metabolism.md index 81e13056c2..ac688191da 100644 --- a/anvio/docs/programs/anvi-estimate-metabolism.md +++ b/anvio/docs/programs/anvi-estimate-metabolism.md @@ -12,7 +12,7 @@ For a practical tutorial on how to use this program, visit [this link](https://m You have three options when it comes to estimating metabolism. -1. KEGG only (this is the default). In this case, estimation will be run on modules from the KEGG MODULES database, which you must set up on your computer using %(anvi-setup-kegg-kofams)s. If you have a default setup of KEGG, you need not provide any parameters to choose this option. However, if you have your KEGG data in a non-default location on your computer, you will have to use the `--kegg-data-dir` parameter to point out its location. +1. KEGG only (this is the default). In this case, estimation will be run on modules from the KEGG MODULES database, which you must set up on your computer using %(anvi-setup-kegg-data)s. If you have a default setup of KEGG, you need not provide any parameters to choose this option. However, if you have your KEGG data in a non-default location on your computer, you will have to use the `--kegg-data-dir` parameter to point out its location. 2. KEGG + USER data. In this case, we estimate on KEGG modules as in (1), but _also_ on user-defined metabolic modules that you set up with %(anvi-setup-user-modules)s and provide to this program with the `--user-modules` parameter. 3. USER data only. You can elect to skip estimation on KEGG modules and _only_ run on your own data by providing both the `--user-modules` and `--only-user-modules` parameters. @@ -20,9 +20,9 @@ You have three options when it comes to estimating metabolism. Metabolism estimation relies on gene annotations from the functional annotation source 'KOfam', also referred to as %(kegg-functions)s. 
Therefore, for this to work, you need to have annotated your %(contigs-db)s with hits to the KEGG KOfam database by running %(anvi-run-kegg-kofams)s prior to using this program, unless you are using the `--only-user-modules` option to ONLY estimate on user-defined metabolic modules. -Both %(anvi-run-kegg-kofams)s and %(anvi-estimate-metabolism)s rely on the %(kegg-data)s provided by %(anvi-setup-kegg-kofams)s, so if you do not already have that data on your computer, %(anvi-setup-kegg-kofams)s needs to be run first. To summarize, these are the steps that need to be done before you can use %(anvi-estimate-metabolism)s: +Both %(anvi-run-kegg-kofams)s and %(anvi-estimate-metabolism)s rely on the %(kegg-data)s provided by %(anvi-setup-kegg-data)s, so if you do not already have that data on your computer, %(anvi-setup-kegg-data)s needs to be run first. To summarize, these are the steps that need to be done before you can use %(anvi-estimate-metabolism)s: -1. Run %(anvi-setup-kegg-kofams)s to get data from KEGG onto your computer. This step only needs to be done once. +1. Run %(anvi-setup-kegg-data)s to get data from KEGG onto your computer. This step only needs to be done once. 2. [If not using `--only-user-modules`] Run %(anvi-run-kegg-kofams)s to annotate your %(contigs-db)s with %(kegg-functions)s. This program must be run on each contigs database that you want to estimate metabolism for. If you want to estimate for your own metabolism data, then you have a couple of extra steps to go through: @@ -483,7 +483,7 @@ Config Error: The contigs DB that you are working with has been annotated with a This means that the %(modules-db)s used by %(anvi-run-kegg-kofams)s has different contents (different KOs and/or different modules) than the one you are currently using to estimate metabolism, which would lead to mismatches if metabolism estimation were to continue. There are a few ways this can happen: 1. 
You upgraded to a new anvi'o version and downloaded the default %(kegg-data)s associated with that release, but are working with a %(contigs-db)s that was annotated with a previous anvi'o version (and therefore a different instance of %(kegg-data)s). -2. Without changing anvi'o versions, you annotated your %(contigs-db)s with default %(kegg-data)s, and subsequently replaced that data with a different instance by running %(anvi-setup-kegg-kofams)s again with the `--reset` flag (and likely also with the `--kegg-archive`, `--kegg-snapshot`, or `--download-from-kegg` options, all of which get you a non-default version of KEGG data). Then you tried to run %(anvi-estimate-metabolism)s with the new data. +2. Without changing anvi'o versions, you annotated your %(contigs-db)s with default %(kegg-data)s, and subsequently replaced that data with a different instance by running %(anvi-setup-kegg-data)s again with the `--reset` flag (and likely also with the `--kegg-archive`, `--kegg-snapshot`, or `--download-from-kegg` options, all of which get you a non-default version of KEGG data). Then you tried to run %(anvi-estimate-metabolism)s with the new data. 3. You have multiple instances of %(kegg-data)s on your computer in different locations, and you used different ones for %(anvi-run-kegg-kofams)s and %(anvi-estimate-metabolism)s. 4. Your collaborator gave you some databases that they annotated with a different version of %(kegg-data)s than you have on your computer. @@ -505,10 +505,10 @@ export ANVIO_KEGG_SNAPSHOTS=`python -c "import anvio; import os; print(os.path.j cat $ANVIO_KEGG_SNAPSHOTS`. {{ codestop }} -Take a look through the different versions. If you see one with a hash matching to the one used to annotate your %(contigs-db)s, then you can download that version by following [the directions for setting up a KEGG snapshot](https://anvio.org/help/main/programs/anvi-setup-kegg-kofams/#setting-up-an-earlier-kegg-snapshot). 
Provide the snapshot version name to the `--kegg-snapshot` parameter of %(anvi-setup-kegg-kofams)s. +Take a look through the different versions. If you see one with a hash matching to the one used to annotate your %(contigs-db)s, then you can download that version by following [the directions for setting up a KEGG snapshot](https://anvio.org/help/main/programs/anvi-setup-kegg-data/#setting-up-an-earlier-kegg-snapshot). Provide the snapshot version name to the `--kegg-snapshot` parameter of %(anvi-setup-kegg-data)s. **I can't find KEGG data with a matching hash!** -If you don't have a matching metabolism database on your computer, and none of the snapshots in the `KEGG-SNAPSHOTS.yaml` file have the hash that you need, your %(contigs-db)s was probably annotated with KO and module data [downloaded directly from KEGG](https://anvio.org/help/main/programs/anvi-setup-kegg-kofams/#getting-the-most-up-to-date-kegg-data-downloading-directly-from-kegg). If you have obtained the %(contigs-db)s from a collaborator (i.e., situation 4 from above), ask them to also share their %(kegg-data)s with you, following [these steps](https://anvio.org/help/main/programs/anvi-setup-kegg-kofams/#how-do-i-share-this-data). Otherwise, anvi'o cannot really help you get this data back, and you may have to resort to option 1 described above. +If you don't have a matching metabolism database on your computer, and none of the snapshots in the `KEGG-SNAPSHOTS.yaml` file have the hash that you need, your %(contigs-db)s was probably annotated with KO and module data [downloaded directly from KEGG](https://anvio.org/help/main/programs/anvi-setup-kegg-data/#getting-the-most-up-to-date-kegg-data-downloading-directly-from-kegg). If you have obtained the %(contigs-db)s from a collaborator (i.e., situation 4 from above), ask them to also share their %(kegg-data)s with you, following [these steps](https://anvio.org/help/main/programs/anvi-setup-kegg-data/#how-do-i-share-this-data). 
Otherwise, anvi'o cannot really help you get this data back, and you may have to resort to option 1 described above. If none of these solutions help you to get rid of the version incompatibility error, please feel free to reach out to the anvi'o developers for help. @@ -528,7 +528,7 @@ Regardless of which input type is provided to this program, the basic requiremen #### Module Definitions One set of metabolic pathway definitions that can be used by this program is the [KEGG MODULE resource](https://www.genome.jp/kegg/module.html). You can also define your own set of metabolic modules, but the definition format and estimation strategy will be the same. So for brevity's sake, the following discussion will cover the KEGG data case. -The program %(anvi-setup-kegg-kofams)s acquires the definitions of these modules using the KEGG API and puts them into the %(modules-db)s. The definitions are strings of KEGG Ortholog (KO) identifiers, representing the functions necessary to carry out each step of the metabolic pathway. Let's use module [M00018](https://www.genome.jp/kegg-bin/show_module?M00018), Threonine Biosynthesis, as an example. Here is the module definition, in picture form: +The program %(anvi-setup-kegg-data)s acquires the definitions of these modules using the KEGG API and puts them into the %(modules-db)s. The definitions are strings of KEGG Ortholog (KO) identifiers, representing the functions necessary to carry out each step of the metabolic pathway. Let's use module [M00018](https://www.genome.jp/kegg-bin/show_module?M00018), Threonine Biosynthesis, as an example. 
Here is the module definition, in picture form: ![Module M00018 Definition](../../images/M00018.png){:.center-img .width-50} diff --git a/anvio/docs/programs/anvi-run-kegg-kofams.md b/anvio/docs/programs/anvi-run-kegg-kofams.md index 24aa2d88f7..e5c765b790 100644 --- a/anvio/docs/programs/anvi-run-kegg-kofams.md +++ b/anvio/docs/programs/anvi-run-kegg-kofams.md @@ -1,4 +1,4 @@ -Essentially, this program uses the KEGG database to annotate functions and metabolic pathways in a %(contigs-db)s. More specifically, %(anvi-run-kegg-kofams)s annotates a %(contigs-db)s with HMM hits from KOfam, a database of KEGG Orthologs (KOs). You must set up these HMMs on your computer using %(anvi-setup-kegg-kofams)s before you can use this program. Membership of KOfam functions in KEGG metabolic MODULES and BRITE hierarchies is also stored in the %(contigs-db)s. +Essentially, this program uses the KEGG database to annotate functions and metabolic pathways in a %(contigs-db)s. More specifically, %(anvi-run-kegg-kofams)s annotates a %(contigs-db)s with HMM hits from KOfam, a database of KEGG Orthologs (KOs). You must set up these HMMs on your computer using %(anvi-setup-kegg-data)s before you can use this program. If a %(modules-db)s is available, membership of KOfam functions in KEGG metabolic MODULES and BRITE hierarchies is also stored in the %(contigs-db)s. Running this program is a pre-requisite for metabolism estimation with %(anvi-estimate-metabolism)s. Note that if you are planning to run metabolism estimation, it must be run with the same %(kegg-data)s that is used in this program to annotate KOfam hits. @@ -20,7 +20,7 @@ For every gene without a KOfam annotation, we examine all the hits with an e-val Please note that this strategy is just a heuristic. We have tried to pick default parameters that seemed reasonable but by no means have we comprehensively tested and optimized them. 
This is why X and Y are mutable so that you can explore different values and see how they work for your data. It is always a good idea to double-check your annotations to make sure they are reasonable and as stringent as you'd like them to be. In addition, if you do not feel comfortable using this heuristic at all, you can always turn this behavior off and rely solely on KEGG's bitscore thresholds. :) **3) Put annotations in the database** -In the %(contigs-db)s functions table, annotated KO hits (%(kegg-functions)s) will have the source `KOfam`. Metabolic Modules and BRITE functional classifications containing these functions also have entries in the table, with sources labeled `KEGG_Module` and `KEGG_BRITE`. BRITE classification will not occur if %(anvi-setup-kegg-kofams)s was not set up with BRITE data (see the artifact for that program to see how to include BRITE). +In the %(contigs-db)s functions table, annotated KO hits (%(kegg-functions)s) will have the source `KOfam`. If a %(modules-db)s is available, metabolic modules and BRITE functional classifications containing these functions also have entries in the table, with sources labeled `KEGG_Module` and `KEGG_BRITE`. BRITE classification will not occur if %(anvi-setup-kegg-data)s was not set up with BRITE data (see the artifact for that program to see how to include BRITE). ## Standard usage @@ -29,7 +29,7 @@ anvi-run-kegg-kofams -c %(contigs-db)s {{ codestop }} ## Use a specific non-default KEGG data directory -If you have previously setup your KEGG data directory using `--kegg-data-dir` (see %(anvi-setup-kegg-kofams)s), or have moved the KEGG data directory that you wish to use to a non-default location (maybe you like keeping the older versions around when you update, we don't know how you roll), then you may need to specify where to find the KEGG data so that this program can use the right one. 
In that case, this is how you do it: +If you have previously set up your KEGG data directory using `--kegg-data-dir` (see %(anvi-setup-kegg-data)s), or have moved the KEGG data directory that you wish to use to a non-default location (maybe you like keeping the older versions around when you update, we don't know how you roll), then you may need to specify where to find the KEGG data so that this program can use the right one. In that case, this is how you do it: {{ codestart }} anvi-run-kegg-kofams -c %(contigs-db)s \ diff --git a/anvio/docs/programs/anvi-setup-kegg-kofams.md b/anvio/docs/programs/anvi-setup-kegg-data.md similarity index 71% rename from anvio/docs/programs/anvi-setup-kegg-kofams.md rename to anvio/docs/programs/anvi-setup-kegg-data.md index be18100c2d..00b0423daf 100644 --- a/anvio/docs/programs/anvi-setup-kegg-kofams.md +++ b/anvio/docs/programs/anvi-setup-kegg-data.md @@ -1,11 +1,33 @@ -%(anvi-setup-kegg-kofams)s downloads and organizes data from KEGG for use by other programs, namely %(anvi-run-kegg-kofams)s and %(anvi-estimate-metabolism)s. It downloads HMM profiles from the [KOfam](https://academic.oup.com/bioinformatics/article/36/7/2251/5631907) database as well as the metabolism information of [KEGG MODULES](https://www.genome.jp/kegg/module.html) and the functional classification information of [KEGG BRITE](https://www.genome.jp/kegg/brite.html). The KOfam profiles are prepared for later use by the HMMER software, and the information from MODULES and BRITE is made accessible to other anvi'o programs as a %(modules-db)s. This program generates a directory with these files (%(kegg-data)s), which by default is located at `anvio/anvio/data/misc/KEGG/`. +%(anvi-setup-kegg-data)s downloads and organizes data from KEGG for use by other programs, namely %(anvi-run-kegg-kofams)s, %(anvi-estimate-metabolism)s and %(anvi-reaction-network)s.
Depending on what download mode you choose, it can download and set up one or more of the following: + +- HMM profiles from the [KOfam](https://academic.oup.com/bioinformatics/article/36/7/2251/5631907) database +- metabolic pathway information from [KEGG MODULES](https://www.genome.jp/kegg/module.html) +- functional classification information from [KEGG BRITE](https://www.genome.jp/kegg/brite.html) +- protein family information from the [KEGG Orthology database](https://www.genome.jp/kegg/ko.html) + + Typically, some processing is done following the data download to make the data work with downstream anvi'o programs. The KOfam profiles are prepared for later use by the HMMER software, and the information from MODULES and BRITE is made accessible to other anvi'o programs as a %(modules-db)s. The Orthology data is converted into a nice table that can be utilized by %(anvi-reaction-network)s. This program generates a directory with these files (%(kegg-data)s). + +## Choosing a download mode + +You need to pick a mode to work with this program to control which data will be downloaded from KEGG. You can see the available modes by running the following command: + +{{ codestart }} +anvi-setup-kegg-data --list-modes +{{ codestop }} + +You use the `--mode` parameter to tell the program which mode you want, for example: + +{{ codestart }} +anvi-setup-kegg-data --mode modules +{{ codestop }} + ## Default usage: downloading a KEGG snapshot -If you do not provide any arguments to this program, the KOfam profiles and KEGG information will be set up in the default KEGG data directory. +If you do not provide any arguments to this program, all KEGG data (ie, `--mode all`) will be set up in the default KEGG data directory. {{ codestart }} -anvi-setup-kegg-kofams +anvi-setup-kegg-data {{ codestop }} ### How does it work?
@@ -21,14 +43,17 @@ Doing it this way ensures that almost everyone uses the same version of KEGG dat But the trade-off to this is that the default KEGG data version is tied to an anvi'o release, and it will not always include the most up-to-date information from KEGG. Luckily, **for those who want the most updated version of KEGG, you can still use this program to generate the KEGG data directory by downloading directly from KEGG** (see 'Getting the most up-to-date KEGG data' section below). {:.warning} -BRITE hierarchy data is not included in the default KEGG snapshot for anvi'o `v7`. Starting from the `v7.1-dev` version of anvi'o, there is a new default KEGG snapshot including BRITE information. This data can also be set up by using the option to download directly from KEGG in `v7.1-dev` or later. +BRITE hierarchy data is not included in the default KEGG snapshot for anvi'o `v7`. Starting from the `v7.1-dev` version of anvi'o, there is a new default KEGG snapshot including BRITE information. If you are missing this data, it can be acquired by either installing a later snapshot or by independently downloading it with this program using `--mode modules`. + +{:.warning} +The data for metabolic modeling are not included in the KEGG snapshots created before anvi'o `v8`. If you are missing this data, it can be acquired by either installing a later snapshot or by independently downloading it with this program using `--mode modeling`. ### Set up KEGG data in a non-default location You can specify a different directory in which to put this data, if you wish: {{ codestart }} -anvi-setup-kegg-kofams --kegg-data-dir /path/to/directory/KEGG +anvi-setup-kegg-data --kegg-data-dir /path/to/directory/KEGG {{ codestop }} This is helpful if you don't have write access to the default directory location, or if you want to keep several different versions of the KEGG data on your computer. 
Just remember that when you want to use this specific KEGG data directory with later programs such as %(anvi-run-kegg-kofams)s, you will have to specify its location with the `--kegg-data-dir` flag. @@ -38,7 +63,7 @@ This is helpful if you don't have write access to the default directory location By default, the KEGG snapshot that will be installed is the latest one, which is up-to-date with your current version of anvi'o. If, however, you want a snapshot from an earlier version, you can run something like the following to get it: {{ codestart }} -anvi-setup-kegg-kofams --kegg-data-dir /path/to/directory/KEGG \ +anvi-setup-kegg-data --kegg-data-dir /path/to/directory/KEGG \ --kegg-snapshot v2020-04-27 {{ codestop }} @@ -46,7 +71,7 @@ Just keep in mind that you may need to migrate the MODULES.db from these earlier Not sure what KEGG snapshots are available for you to request? Well, you could check out the YAML file at `anvio/anvio/data/misc/KEGG-SNAPSHOTS.yaml` in your anvi'o directory, or you could just give something random to the `--kegg-snapshot` parameter and watch anvi'o freak out and tell you what is available: {{ codestart }} -anvi-setup-kegg-kofams --kegg-snapshot hahaha +anvi-setup-kegg-data --kegg-snapshot hahaha {{ codestop }} @@ -55,9 +80,11 @@ anvi-setup-kegg-kofams --kegg-snapshot hahaha This program is also capable of downloading data directly from KEGG and converting it into an anvi'o-compatible format. In fact, this is how we generate the default KEGG archive. If you want the latest KEGG data instead of the default snapshot of KEGG, try the following: {{ codestart }} -anvi-setup-kegg-kofams --download-from-kegg +anvi-setup-kegg-data --download-from-kegg {{ codestop }} +Please note that this will download all the KEGG data (ie, `--mode all` is the default). If you want to independently download individual KEGG datasets, you should pick one of the other modes (the `--download-from-kegg` flag is implicitly turned on in these modes). 
+ ### How does it work? KOfam profiles are downloadable from KEGG's [FTP site](ftp://ftp.genome.jp/pub/db/kofam/) and all other KEGG data is accessible as flat text files through their [API](https://www.kegg.jp/kegg/rest/keggapi.html). When you run this program it will first get all the files that it needs from these sources, and then it will process them by doing the following: @@ -66,54 +93,57 @@ KOfam profiles are downloadable from KEGG's [FTP site](ftp://ftp.genome.jp/pub/d - concatenate all remaining KOfam profiles into one file and run `hmmpress` on them - parse the flat text file for each KEGG module and the JSON file for each BRITE hierarchy - store the MODULE and BRITE information in the %(modules-db)s +- parse the flat text files from KEGG Orthology and organize these into a table for metabolic modeling An important thing to note about this option is that it has rigid expectations for the format of the KEGG data that it works with. Future updates to KEGG may break things such that the data can no longer be directly obtained from KEGG or properly processed. In the sad event that this happens, you will have to download KEGG from one of our archives instead. ### The --only-download option -Suppose you only want to download data from KEGG, but you don't need a %(modules-db)s - at least not right away. You can instruct this program to stop after downloading by providing the `--only-download` flag: +The `--only-download` flag works for `KOfam` mode and `modules` mode. + +Suppose you only want to download data from KEGG without processing it. For instance, perhaps you don't need a %(modules-db)s or you don't want `hmmpress` to be run on the KOfam profiles. 
You can instruct this program to stop after downloading by providing the `--only-download` flag: {{ codestart }} -anvi-setup-kegg-kofams --download-from-kegg \ +anvi-setup-kegg-data --mode modules \ --only-download \ --kegg-data-dir /path/to/directory/KEGG {{ codestop }} It's probably a good idea in this case to specify where you want this data to go using `--kegg-data-dir`, to make sure you can find it later. -Actually, in addition to downloading the data, the program will also do a bit of processing on the KOfam profiles: it will remove those without bitscore thresholds, concatenate the remaining profiles into one file, and run `hmmpress` on them. But no database will be created when this flag is used. - {:.notice} -This option is primarily useful for developers to test `anvi-setup-kegg-kofams` - for instance, so that you can download the data once and run the database setup option (`--only-database`) multiple times. However, if non-developers find another practical use-case for this flag, we'd be happy to add those ideas here. Send us a message, or feel free to edit this file and pull request your changes on the anvi'o Github repository. :) +This option is primarily useful for developers to test `anvi-setup-kegg-data` - for instance, so that you can download the data once and run the database setup option (`--only-processing`) multiple times. However, if non-developers find another practical use-case for this flag, we'd be happy to add those ideas here. Send us a message, or feel free to edit this file and pull request your changes on the anvi'o Github repository. :) + +### The --only-processing option -### The --only-database option +The `--only-processing` flag works for `KOfam` mode and `modules` mode. -Let's say you already have KEGG data on your computer that you got by running this program with the `--only-download` flag. Now you want to turn this data into a %(modules-db)s. 
To do that, run this program using the `--only-database` flag and provide the location of the pre-downloaded KEGG data: +Let's say you already have KEGG data on your computer that you got by running this program with the `--only-download` flag. Now you want to process the HMM files, or turn the MODULES data into a %(modules-db)s. To do that, run this program using the `--only-processing` flag and provide the location of the pre-downloaded KEGG data: {{ codestart }} -anvi-setup-kegg-kofams --download-from-kegg \ - --only-database \ +anvi-setup-kegg-data --mode modules \ + --only-processing \ --kegg-data-dir /path/to/directory/KEGG {{ codestop }} {:.notice} The KEGG data that you already have on your computer has to be in the format expected by this program, or you'll run into errors. Pretty much the only reasonable way to get the data into the proper format is to run this program with the `--only-download` option. Otherwise you would have to go through a lot of manual file-changing shenanigans - possible, but not advisable. -One more note: since this flag is most often used for testing the database setup capabilities of this program, which entails running `anvi-setup-kegg-kofams -D --only-database` multiple times on the same KEGG data directory, there is an additional flag that may be useful in this context. To avoid having to manually delete the created modules database each time you run, you can use the `--overwrite-output-destinations` flag: +One more note: since this flag is most often used for testing the database setup capabilities of this program, which entails running `anvi-setup-kegg-data --mode modules --only-processing` multiple times on the same KEGG data directory, there is an additional flag that may be useful in this context. 
To avoid having to manually delete the created modules database each time you run, you can use the `--overwrite-output-destinations` flag: {{ codestart }} -anvi-setup-kegg-kofams --download-from-kegg \ - --only-database \ +anvi-setup-kegg-data --mode modules \ + --only-processing \ --kegg-data-dir /path/to/directory/KEGG \ --overwrite-output-destinations {{ codestop }} ### Avoiding BRITE setup -As of anvi'o `v7.1-dev` or later, KEGG BRITE hierarchies are added to the %(modules-db)s when running this program with the `-D` (`--download-from-kegg`) option. If you don't want this cool new feature - because you are a rebel, or adverse to change, or something is not working on your computer, whatever - then fine. You can use the `--skip-brite-hierarchies` flag: +As of anvi'o `v7.1-dev` or later, KEGG BRITE hierarchies are added to the %(modules-db)s when running this program with `--mode modules`. If you don't want this cool new feature - because you are a rebel, or averse to change, or something is not working on your computer, whatever - then fine. You can use the `--skip-brite-hierarchies` flag: {{ codestart }} -anvi-setup-kegg-kofams -D --skip-brite-hierarchies +anvi-setup-kegg-data --mode modules --skip-brite-hierarchies {{ codestop }} Hopefully it makes sense to you that this flag does not work when setting up from a KEGG snapshot that already includes BRITE data in it. @@ -121,9 +151,9 @@ Hopefully it makes sense to you that this flag does not work when setting up fro ### How do I share this data? Suppose you have been living on the edge and annotating your contigs databases with a non-default version of %(kegg-data)s, and you share these databases with a collaborator who wants to run downstream programs like %(anvi-estimate-metabolism)s on them. Your collaborator (who has a different version of %(kegg-data)s on their computer) will likely get version errors as detailed on the %(anvi-estimate-metabolism)s help page.
-In order for your collaborator to be able to work with your dataset, they need to have the same %(kegg-data)s version as you did when you ran %(anvi-run-kegg-kofams)s. If you are very lucky and KEGG has not been updated since you set up your %(kegg-data)s, they may be able to run `anvi-setup-kegg-kofams -D` to get it. But if not, there are a few options for you to share your version of %(kegg-data)s: +In order for your collaborator to be able to work with your dataset, they need to have the same %(kegg-data)s version as you did when you ran %(anvi-run-kegg-kofams)s. If you are very lucky and KEGG has not been updated since you set up your %(kegg-data)s, they may be able to run `anvi-setup-kegg-data -D` to get it. But if not, there are a few options for you to share your version of %(kegg-data)s: -1. You could send them your KEGG data directory. First, run `tar -czvf kegg_archive.tar.gz ./KEGG` on the data directory to compress and archive it before sending it over (this command _must_ be run from its parent directory so that the archive has the expected directory structure when it is unpacked). Then your collaborator can just run `anvi-setup-kegg-kofams --kegg-archive kegg_archive.tar.gz --kegg-data-dir ./KEGG_ARCHIVE` and be good to go. They would just have to use `--kegg-data-dir ./KEGG_ARCHIVE` when running downstream programs. The problem here is that even the archived %(kegg-data)s is quite large, ~4-5GB, and may be unfeasible for you to send. +1. You could send them your KEGG data directory. First, run `tar -czvf kegg_archive.tar.gz ./KEGG` on the data directory to compress and archive it before sending it over (this command _must_ be run from its parent directory so that the archive has the expected directory structure when it is unpacked). Then your collaborator can just run `anvi-setup-kegg-data --kegg-archive kegg_archive.tar.gz --kegg-data-dir ./KEGG_ARCHIVE` and be good to go. 
They would just have to use `--kegg-data-dir ./KEGG_ARCHIVE` when running downstream programs. The problem here is that even the archived %(kegg-data)s is quite large, ~4-5GB, and may be unfeasible for you to send. 2. You could share with your collaborator just the %(modules-db)s. If all they want to do is to run %(anvi-estimate-metabolism)s on databases annotated by your version of the KEGG data directory, this should be all they need. They would need to pass the folder containing your %(modules-db)s to %(anvi-estimate-metabolism)s using the `--kegg-data-dir` parameter. 3. If your collaborator also wants to be able to annotate other databases with your version of %(kegg-data)s, then they need to have the KOfam profiles as well. You can send them your %(modules-db)s and have them download the KOfam profiles most similar to the ones you have from the [KOfam archives](https://www.genome.jp/ftp/db/kofam/archives/) (which are labeled by date). Then they would have to essentially construct their own KEGG data directory by copying the structure of the default one and putting the downloaded files (and the %(modules-db)s you sent them) into the correct locations. The KOfam profiles must be concatenated into a `Kofam.hmm` file and `hmmpress` must be run on that file to generate the required indices for `hmmsearch`. Your collaborator must also have the `ko_list.txt` file (which _should_ be downloaded with the profiles) in the right spot. Then they could pass their makeshift KEGG data directory to %(anvi-run-kegg-kofams)s using `--kegg-data-dir`, and they should be golden. (A word of warning: they may want to remove KOs without bitscore thresholds in the `ko_list.txt` before concatenating the profiles, otherwise they will likely get a lot of weak hits for these KOs.) 
@@ -132,7 +162,7 @@ In order for your collaborator to be able to work with your dataset, they need t If you have an archive (`.tar.gz`) of the KEGG data directory already on your computer (perhaps a colleague or Meren Lab developer gave you one), you can set up KEGG from this archive instead: {{ codestart }} -anvi-setup-kegg-kofams --kegg-archive KEGG_archive.tar.gz +anvi-setup-kegg-data --kegg-archive KEGG_archive.tar.gz {{ codestop }} This works the same way as the default, except that it bypasses the download step and instead uses the archive file you have provided with `--kegg-archive`. @@ -143,13 +173,13 @@ Periodically (especially before releasing a new version of anvi'o), we want to a Available KEGG snapshots are stored in the anvi'o code repository in `anvio/data/misc/KEGG-SNAPSHOTS.yaml`. To add a new snapshot, you first need to create one by downloading and processing the data from KEGG, testing to make sure it works, and then updating this file. Here are the steps: -1. Download the latest data directly from KEGG by running `anvi-setup-kegg-kofams -D --kegg-data-dir ./KEGG`. This will create the new KEGG data folder with its %(modules-db)s in your current working directory. Make sure you use the exact folder name of `./KEGG`, because that is what anvi'o expects to find when it unpacks a KEGG snapshot. +1. Download the latest data directly from KEGG by running `anvi-setup-kegg-data -D --kegg-data-dir ./KEGG -T 5`. This will create the new KEGG data folder with its %(modules-db)s in your current working directory. Make sure you use the exact folder name of `./KEGG`, because that is what anvi'o expects to find when it unpacks a KEGG snapshot. You may want to reduce or increase the number of threads (`-T`) according to your available compute resources. 2. Get the hash value and version info from the MODULES.db by running `anvi-db-info ./KEGG/MODULES.db`. 3. Archive the KEGG data directory by running `tar -czvf KEGG_build_YYYY-MM-DD_HASH.tar.gz ./KEGG`. 
Please remember to replace YYYY-MM-DD with the current date and replace HASH with the MODULES.db hash value obtained in step 2. This convention makes it easier to distinguish between KEGG snapshots by simply looking at the file name. -4. Test that setup works with this archive by running `anvi-setup-kegg-kofams --kegg-archive KEGG_build_YYYY-MM-DD_HASH.tar.gz --kegg-data-dir TEST_NEW_KEGG_ARCHIVE`. +4. Test that setup works with this archive by running `anvi-setup-kegg-data --kegg-archive KEGG_build_YYYY-MM-DD_HASH.tar.gz --kegg-data-dir TEST_NEW_KEGG_ARCHIVE`. 5. If setup worked in the last step without errors, upload the `.tar.gz` archive to [Figshare](https://figshare.com/). If you need inspiration for filling out the keywords, categories, and description fields for the archive, you can check the previous KEGG snapshots that have been uploaded - for instance, [this one](https://figshare.com/articles/dataset/KEGG_build_2023-01-10/21862494) or [this one](https://figshare.com/articles/dataset/KEGG_build_2022-04-14/19601761). At minimum, we typically indicate the database version and hash value, and an example setup command (ie, the one from step 4), in the description of the dataset. Once the archive is published on Figshare (warning: this usually takes a while due to the large file size), you can get the download url of the archive by right-clicking on the Download button and copying the address, which should be a URL with a format similar to this example (but different numbers): `https://figshare.com/ndownloader/files/34817812` 6. Add an entry to the bottom of the `anvio/data/misc/KEGG-SNAPSHOTS.yaml` file with the Figshare download URL, archive name, and MODULES.db hash and version. If you want this to become the default snapshot (which usually only changes before the next anvi'o release), you should also update the default `self.target_snapshot` variable in `anvio/kegg.py` to be this latest version that you have added. -7. 
Test it by running `anvi-setup-kegg-kofams --kegg-data-dir TEST_NEW_KEGG`, and if it works you are done, and can push your changes to the anvi'o repository. :) +7. Test it by running `anvi-setup-kegg-data --kegg-data-dir TEST_NEW_KEGG`, and if it works you are done, and can push your changes to the anvi'o repository. :) ## Downloading generic KEGG data in Python @@ -182,7 +212,7 @@ setup.download_kegg_files_from_hierarchy('br08001', download_dir='KEGG_COMPOUND' If you just want to get a KEGG `htext` file (with extension `.keg`), use the following function: ```python -etup.download_generic_htext('br08001', download_dir='KEGG_COMPOUND') +setup.download_generic_htext('br08001', download_dir='KEGG_COMPOUND') ``` ### Processing a hierarchical text file @@ -193,12 +223,13 @@ etup.download_generic_htext('br08001', download_dir='KEGG_COMPOUND') accession_list = setup.get_accessions_from_htext_file("br08001.keg") ``` - If you want to process the KEGG module `htext` file to get a dictionary of all modules and their names/classes/etc, use the following function. You will need to set the `kegg_module_file` attribute (of the KeggSetup class) to point to the location of the `modules.keg` file, and the function will store the module dictionary in the `module_dict` attribute. + If you want to process the KEGG module `htext` file to get a dictionary of all modules and their names/classes/etc, use the following code. You will need to set the `kegg_module_file` attribute (of the ModulesDownload class) to point to the location of the `modules.keg` file, and the function will store the module dictionary in the `module_dict` attribute. 
```python -setup.kegg_module_file = "modules.keg" -setup.process_module_file() -setup.module_dict # this attribute now stores the module dictionary +modules_setup = kegg.ModulesDownload(args) +modules_setup.kegg_module_file = "modules.keg" +modules_setup.process_module_file() +modules_setup.module_dict # this attribute now stores the module dictionary ``` ### Downloading a flat file using the KEGG API diff --git a/anvio/docs/programs/anvi-setup-user-modules.md b/anvio/docs/programs/anvi-setup-user-modules.md index ecf6971224..32b69edd90 100644 --- a/anvio/docs/programs/anvi-setup-user-modules.md +++ b/anvio/docs/programs/anvi-setup-user-modules.md @@ -74,7 +74,7 @@ Why must we format the module files this way, you ask? Well, to be honest, KEGG ### Specifying KEGG data to be used for sanity checking -If you haven't yet run %(anvi-setup-kegg-kofams)s on your computer, you will get an error when you try to run this program. This is because KEGG data can be used in addition to user-defined modules, and we need to be aware of which KEGG modules exist so we can make sure none of the user-defined modules have the same identifiers as these. +If you haven't yet run %(anvi-setup-kegg-data)s on your computer, you will get an error when you try to run this program. This is because KEGG data can be used in addition to user-defined modules, and we need to be aware of which KEGG modules exist so we can make sure none of the user-defined modules have the same identifiers as these. 
By default, this program looks for the KEGG data in the default location, so if you have set up KEGG data in a non-default directory, you should specify the path to that directory using the `--kegg-data-dir` parameter: diff --git a/anvio/drivers/hmmer.py b/anvio/drivers/hmmer.py index a34e835885..00cc2a9341 100644 --- a/anvio/drivers/hmmer.py +++ b/anvio/drivers/hmmer.py @@ -87,7 +87,7 @@ def verify_hmmpress_output(self, hmm_path): if not os.path.exists(base_path + ext): raise ConfigError("It appears that hmmpress was not properly run on the hmm profiles at %s. The " "file %s does not exist. It is likely that you will have to set up your profiles " - "again by running a program such as `anvi-setup-pfams` or `anvi-setup-kegg-kofams`. " + "again by running a program such as `anvi-setup-pfams` or `anvi-setup-kegg-data`. " "We are very sorry about this." % (hmm_path, base_path + ext)) diff --git a/anvio/kegg.py b/anvio/kegg.py index 21f639fadd..df9000a61b 100644 --- a/anvio/kegg.py +++ b/anvio/kegg.py @@ -23,6 +23,7 @@ import anvio.filesnpaths as filesnpaths import anvio.tables as t import anvio.ccollections as ccollections +import anvio.biochemistry.reactionnetwork as reactionnetwork from anvio.errors import ConfigError from anvio.drivers.hmmer import HMMer @@ -492,7 +493,7 @@ class KeggSetup(KeggContext): Parameters ========== args: Namespace object - All the arguments supplied by user to anvi-setup-kegg-kofams. If using this class through the API, please + All the arguments supplied by user to anvi-setup-kegg-data. If using this class through the API, please provide a Namespace object with the Boolean 'reset' parameter. 
skip_init: Boolean Developers can use this flag to skip the sanity checks and creation of directories when testing this class @@ -503,13 +504,13 @@ def __init__(self, args, run=run, progress=progress, skip_init=False): self.args = args self.run = run self.progress = progress + self.num_threads = 1 if not A('num_threads') else A('num_threads') self.kegg_archive_path = A('kegg_archive') + self.kegg_snapshot = A('kegg_snapshot') self.download_from_kegg = True if A('download_from_kegg') else False self.only_download = True if A('only_download') else False - self.only_database = True if A('only_database') else False - self.kegg_snapshot = A('kegg_snapshot') - self.skip_brite_hierarchies = A('skip_brite_hierarchies') - self.overwrite_modules_db = A('overwrite_output_destinations') + self.only_processing = True if A('only_processing') else False + self.skip_init = skip_init if self.kegg_archive_path and self.download_from_kegg: raise ConfigError("You provided two incompatible input options, --kegg-archive and --download-from-kegg. " @@ -517,19 +518,15 @@ def __init__(self, args, run=run, progress=progress, skip_init=False): if self.kegg_snapshot and self.download_from_kegg or self.kegg_snapshot and self.kegg_archive_path: raise ConfigError("You cannot request setup from an anvi'o KEGG snapshot at the same time as from KEGG directly or from one of your " "KEGG archives. Please pick just one setup option and try again.") - if (not self.download_from_kegg) and (self.only_download or self.only_database): - raise ConfigError("Erm. The --only-download and --only-database options are only valid if you are also using the --download-from-kegg " + + if (not self.download_from_kegg) and (self.only_download or self.only_processing): + raise ConfigError("Erm. The --only-download and --only-processing options are only valid if you are also using the --download-from-kegg " "option. 
Sorry.") - if self.only_download and self.only_database: - raise ConfigError("The --only-download and --only-database options are incompatible. Please choose only one. Or, if you want both " + if self.only_download and self.only_processing: + raise ConfigError("The --only-download and --only-processing options are incompatible. Please choose only one. Or, if you want both " "download AND database setup to happen, then use only the -D flag without providing either of these two options.") - if (not self.download_from_kegg) and self.skip_brite_hierarchies: - self.run.warning("Just so you know, the --skip-brite-hierarchies flag does not do anything (besides suppress some warning output) when used " - "without the -D option. You are setting up from an archived KEGG snapshot which may already include BRITE data, and if it " - "does, this data will not be removed. You can always check if the resulting modules database contains BRITE data by " - "running `anvi-db-info` on it and looking at the `is_brite_setup` value (which will be 1 if the database contains BRITE data).") - + # initializing these to None here so that it doesn't break things downstream self.pathway_dict = None self.brite_dict = None @@ -537,56 +534,37 @@ def __init__(self, args, run=run, progress=progress, skip_init=False): # init the base class KeggContext.__init__(self, self.args) + # get KEGG snapshot info for default setup + self.target_snapshot = self.kegg_snapshot or 'v2023-09-22' + self.target_snapshot_yaml = os.path.join(os.path.dirname(anvio.__file__), 'data/misc/KEGG-SNAPSHOTS.yaml') + self.snapshot_dict = utils.get_yaml_as_dict(self.target_snapshot_yaml) + + if self.target_snapshot not in self.snapshot_dict.keys(): + self.run.warning(None, header="AVAILABLE KEGG SNAPSHOTS", lc="yellow") + available_snapshots = sorted(list(self.snapshot_dict.keys())) + for snapshot_name in available_snapshots: + self.run.info_single(snapshot_name + (' (latest)' if snapshot_name == available_snapshots[-1] else '')) 
+ + raise ConfigError("Whoops. The KEGG snapshot you requested is not one that is known to anvi'o. Please try again, and " + "this time pick from the list shown above.") + + # default download path for KEGG snapshot + self.default_kegg_data_url = self.snapshot_dict[self.target_snapshot]['url'] + self.default_kegg_archive_file = self.snapshot_dict[self.target_snapshot]['archive_name'] + self.expect_modeling_files_in_archive = True if 'no_modeling_data' in self.snapshot_dict[self.target_snapshot].keys() and \ + (not self.snapshot_dict[self.target_snapshot]['no_modeling_data']) else False + if self.user_input_dir: self.run.warning(f"Just so you know, we will be setting up the metabolism data provided at the following " f"location: '{self.user_input_dir}'. The success of this will be determined by how well you " f"followed our formatting guidelines, so keep an eye out for errors below.") - filesnpaths.is_program_exists('hmmpress') if not self.user_input_dir: - if not args.reset and not anvio.DEBUG and not skip_init: - self.is_database_exists(fail_if_exists=(not self.only_database)) - - if self.download_from_kegg and not self.only_database and not self.kegg_archive_path and not skip_init: + # establish parent directory + if self.download_from_kegg and not self.only_processing and not self.kegg_archive_path and not skip_init: filesnpaths.gen_output_directory(self.kegg_data_dir, delete_if_exists=args.reset) - filesnpaths.gen_output_directory(self.kegg_hmm_data_dir, delete_if_exists=args.reset) - filesnpaths.gen_output_directory(self.orphan_data_dir, delete_if_exists=args.reset) - filesnpaths.gen_output_directory(self.kegg_module_data_dir, delete_if_exists=args.reset) - filesnpaths.gen_output_directory(self.brite_data_dir, delete_if_exists=args.reset) - - # get KEGG snapshot info for default setup - self.target_snapshot = self.kegg_snapshot or 'v2023-09-18' - self.target_snapshot_yaml = os.path.join(os.path.dirname(anvio.__file__), 'data/misc/KEGG-SNAPSHOTS.yaml') - 
self.snapshot_dict = utils.get_yaml_as_dict(self.target_snapshot_yaml) - - if self.target_snapshot not in self.snapshot_dict.keys(): - self.run.warning(None, header="AVAILABLE KEGG SNAPSHOTS", lc="yellow") - available_snapshots = sorted(list(self.snapshot_dict.keys())) - for snapshot_name in available_snapshots: - self.run.info_single(snapshot_name + (' (latest)' if snapshot_name == available_snapshots[-1] else '')) - - raise ConfigError("Whoops. The KEGG snapshot you requested is not one that is known to anvi'o. Please try again, and " - "this time pick from the list shown above.") - - # default download path for KEGG snapshot - self.default_kegg_data_url = self.snapshot_dict[self.target_snapshot]['url'] - self.default_kegg_archive_file = self.snapshot_dict[self.target_snapshot]['archive_name'] - - # download from KEGG option: ftp path for HMM profiles and KO list - # for ko list, add /ko_list.gz to end of url - # for profiles, add /profiles.tar.gz to end of url - self.database_url = "ftp://ftp.genome.jp/pub/db/kofam" - # dictionary mapping downloaded file name to final decompressed file name or folder location - self.files = {'ko_list.gz': self.ko_list_file_path, 'profiles.tar.gz': self.kegg_data_dir} - - # download from KEGG option: module/pathway map htext files and API link - self.kegg_module_download_path = "https://www.genome.jp/kegg-bin/download_htext?htext=ko00002.keg&format=htext&filedir=" - self.kegg_pathway_download_path = "https://www.genome.jp/kegg-bin/download_htext?htext=br08901.keg&format=htext&filedir=" - self.kegg_rest_api_get = "http://rest.kegg.jp/get" - # download a json file containing all BRITE hierarchies, which can then be downloaded themselves - self.kegg_brite_hierarchies_download_path = os.path.join(self.kegg_rest_api_get, "br:br08902/json") else: # user input setup filesnpaths.is_output_dir_writable(os.path.dirname(self.user_input_dir)) @@ -606,7 +584,7 @@ def __init__(self, args, run=run, progress=progress, skip_init=False): 
self.run.info("Successfully removed", path) - def is_database_exists(self, fail_if_exists=True): + def is_database_exists(self, files_to_check, fail_if_exists=True): """This function determines whether the user has already downloaded all required KEGG data. More specifically, it looks for the KEGG files that we use to learn what to download (as in @@ -615,22 +593,22 @@ def is_database_exists(self, fail_if_exists=True): PARAMETERS ========== + files_to_check : list of file paths + this list should contain the paths to all required KEGG data or directories. what those + files are depends on the download mode. fail_if_exists : Boolean if this is True, this function will fail if the KEGG data already exists on the user's computer. If it is False, AND the user has already downloaded all required KEGG data, - then this function will not fail. This is to enable the --only-database option. + then this function will not fail. This is to enable the --only-processing option. Note that in this case we require all KEGG data to be pre-downloaded to avoid mixing older and newer KEGG data - so if this data is only partially downloaded, the function will raise an error even if this parameter is False. """ - files_to_check = [self.kofam_hmm_file_path, - self.kegg_module_file, - self.kegg_module_data_dir, - ] - if not self.skip_brite_hierarchies: - files_to_check.append(self.kegg_brite_hierarchies_file) - files_to_check.append(self.brite_data_dir) + if anvio.DEBUG: + file_str = ", ".join(files_to_check) + self.run.warning(f"We are looking for the following files to see if the KEGG data already " + f"exists on you computer: {file_str}") files_that_exist = [] for f in files_to_check: @@ -638,7 +616,7 @@ def is_database_exists(self, fail_if_exists=True): if fail_if_exists: raise ConfigError(f"It seems you already have data at {f}, please use the `--reset` flag " "or delete the KEGG data directory manually if you want to re-download KEGG data. 
" - "See also the --only-database option, which you can use if you already " + "See also the --only-processing option, which you can use if you already " "have all required KEGG data in that folder. (API users: skip this sanity " "check by initializing this class with `skip_init=True`)") else: @@ -651,7 +629,7 @@ def is_database_exists(self, fail_if_exists=True): raise ConfigError(f"We found some, but not all, required KEGG data on your computer in the KEGG " f"data directory. Since you don't have everything you need, we need you to re-download " f"everything from scratch. Please re-run this program using the --reset flag, and if " - f"you were using the --only-database option, remove that flag. :) HOWEVER, if you notice that " + f"you were using the --only-processing option, remove that flag. :) HOWEVER, if you notice that " "KEGG BRITE data does not appear to be in the upcoming list, but you don't actually want " "to download BRITE data, then you can just add the --skip-brite-hierarchies to your previous " f"command and be on your way (ie, no --reset needed). Here is the KEGG data we found:\n{exist_str}") @@ -662,12 +640,227 @@ def is_database_exists(self, fail_if_exists=True): f"need to check it to make sure we are not using something that is too old:\n" f"{exist_str}") - if self.only_database and not files_that_exist: + if self.only_processing and not files_that_exist: raise ConfigError(f"We noticed that there is no KEGG data on your computer at {self.kegg_data_dir} even " - f"though you used the --only-database option. If you don't actually have KEGG data already " - f"downloaded, you should get rid of the --only-database flag and re-run this program. If you " + f"though you used the --only-processing option. If you don't actually have KEGG data already " + f"downloaded, you should get rid of the --only-processing flag and re-run this program. 
If you " f"know that you DO have KEGG data, perhaps you gave us the wrong data directory?") + + def setup_from_archive(self): + """This function sets up the KEGG data directory from an archive of a previously-setup KEGG data directory. + + To do so, it unpacks the archive and checks its structure and that all required components are there. + """ + + self.run.info("KEGG archive", self.kegg_archive_path) + self.progress.new('Unzipping KEGG archive file...') + if not self.kegg_archive_path.endswith("tar.gz"): + self.progress.reset() + raise ConfigError("The provided archive file %s does not appear to be an archive at all. Perhaps you passed " + "the wrong file to anvi'o?" % (self.kegg_archive_path)) + unpacked_archive_name = "KEGG_archive_unpacked" + utils.tar_extract_file(self.kegg_archive_path, output_file_path=unpacked_archive_name, keep_original=True) + + self.progress.update('Checking KEGG archive structure and contents...') + archive_is_ok = self.kegg_archive_is_ok(unpacked_archive_name, no_modeling_is_ok = (not self.expect_modeling_files_in_archive)) + archive_contains_brite = self.check_archive_for_brite(unpacked_archive_name) + self.progress.end() + if archive_is_ok: + if os.path.exists(self.kegg_data_dir): + shutil.rmtree(self.kegg_data_dir) + path_to_kegg_in_archive = os.path.join(unpacked_archive_name, "KEGG") + shutil.move(path_to_kegg_in_archive, self.kegg_data_dir) + shutil.rmtree(unpacked_archive_name) + + if not archive_contains_brite and not self.skip_brite_hierarchies: + self.run.warning("The KEGG data archive does not contain the necessary files to set up BRITE hierarchy classification. " + "This is not a problem, and KEGG set up proceeded without it. BRITE is guaranteed to be set up when " + "downloading the latest version of KEGG with `anvi-setup-kegg-data`.") + + # if necessary, warn user about migrating the modules db + self.check_modules_db_version() + + else: + debug_output = f"We kept the unpacked archive for you to take a look at it. 
It is at " \ + f"{os.path.abspath(unpacked_archive_name)} and you may want " \ + f"to delete it after you are done checking its contents." + if not anvio.DEBUG: + shutil.rmtree(unpacked_archive_name) + debug_output = "The unpacked archive has been deleted, but you can re-run the script with the --debug " \ + "flag to keep it if you want to see its contents." + else: + self.run.warning(f"The unpacked archive file {os.path.abspath(unpacked_archive_name)} was kept for " + f"debugging purposes. You may want to clean it up after you are done looking through it.") + + raise ConfigError(f"SETUP FAILED. The provided archive file is missing some critical files, " + f"so anvi'o is unable to use it. {debug_output}") + + + def check_modules_db_version(self): + """This function checks if the MODULES.db is out of date and if so warns the user to migrate it""" + + # get current version of db + db_conn = db.DB(self.kegg_modules_db_path, None, ignore_version=True) + current_db_version = int(db_conn.get_meta_value('version')) + db_conn.disconnect() + + # if modules.db is out of date, give warning + target_version = int(anvio.tables.versions_for_db_types['modules']) + if current_db_version != target_version: + self.run.warning(f"Just so you know, the KEGG archive that was just set up contains an outdated MODULES.db (version: " + f"{current_db_version}). You may want to run `anvi-migrate` on this database before you do anything else. " + f"Here is the path to the database: {self.kegg_modules_db_path}") + + + def check_archive_for_brite(self, unpacked_archive_path): + """Check the archive for the BRITE directory and 'hierarchy of hierarchies' json file. + + It is ok for archives not to have these present, but let the user know. 
+ """ + + is_brite_included = True + + path_to_kegg_in_archive = os.path.join(unpacked_archive_path, "KEGG") + brite_directories_and_files = [self.brite_data_dir, + self.kegg_brite_hierarchies_file] + for f in brite_directories_and_files: + path_to_f_in_archive = os.path.join(path_to_kegg_in_archive, os.path.basename(f)) + if not os.path.exists(path_to_f_in_archive) and not self.skip_brite_hierarchies: + is_brite_included = False + if anvio.DEBUG: + self.run.warning(f"The KEGG archive does not contain the following optional BRITE file or directory: {path_to_f_in_archive}") + + return is_brite_included + + + def setup_kegg_snapshot(self): + """This is the default setup strategy in which we unpack a specific KEGG archive. + + We do this so that everyone who uses the same release of anvi'o will also have the same default KEGG + data, which facilitates sharing and also means they do not have to continuously re-annotate their datasets + when KEGG is updated. + + It is essentially a special case of setting up from an archive. + """ + + if anvio.DEBUG: + self.run.info("Downloading from: ", self.default_kegg_data_url) + self.run.info("Downloading to: ", self.default_kegg_archive_file) + utils.download_file(self.default_kegg_data_url, self.default_kegg_archive_file, progress=self.progress, run=self.run) + + # a hack so we can use the archive setup function + self.kegg_archive_path = self.default_kegg_archive_file + self.setup_from_archive() + + # if all went well, let's get rid of the archive we used and the log file + if not anvio.DEBUG: + os.remove(self.default_kegg_archive_file) + else: + self.run.warning(f"Because you used the --debug flag, the KEGG archive file at {self.default_kegg_archive_file} " + "has been kept. You may want to remove it later.") + + + def kegg_archive_is_ok(self, unpacked_archive_path, no_modeling_is_ok = False): + """This function checks the structure and contents of an unpacked KEGG archive and returns True if it is as expected. 
+ + Please note that we check for existence of the files that are necessary to run KEGG scripts, but we don't check the file + formats. This means that people could technically trick this function into returning True by putting a bunch of crappy files + with the right names/paths into the archive file. But what would be the point of that? + + We also don't care about the contents of certain folders (ie modules) because they are not being directly used + when running KEGG scripts. In the case of modules, all the information should already be in the MODULES.db so we don't + waste our time checking that all the module files are there. We only check that the directory is there. If later changes + to the implementation require the direct use of the files in these folders, then this function should be updated + to check for those. + + Parameters + ========== + unpacked_archive_path : str + Path to the unpacked archive directory + no_modeling_is_ok : boolean + Whether or not we care if modeling data is not found in the archive. This is added for backwards compatibility to the + previous versions of KEGG archives that do not include this data. 
+ """ + + is_ok = True + + # check top-level files and folders + path_to_kegg_in_archive = os.path.join(unpacked_archive_path, "KEGG") + expected_directories_and_files = [self.orphan_data_dir, + self.kegg_module_data_dir, + self.kegg_hmm_data_dir, + self.ko_list_file_path, + self.kegg_module_file, + self.kegg_modules_db_path] + for f in expected_directories_and_files: + path_to_f_in_archive = os.path.join(path_to_kegg_in_archive, os.path.basename(f)) + if not os.path.exists(path_to_f_in_archive): + is_ok = False + if anvio.DEBUG: + self.run.warning(f"The KEGG archive does not contain the following expected file or directory: " + f"{path_to_f_in_archive}") + + # check hmm files + path_to_hmms_in_archive = os.path.join(path_to_kegg_in_archive, os.path.basename(self.kegg_hmm_data_dir)) + kofam_hmm_basename = os.path.basename(self.kofam_hmm_file_path) + expected_hmm_files = [kofam_hmm_basename] + for h in expected_hmm_files: + path_to_h_in_archive = os.path.join(path_to_hmms_in_archive, h) + if not os.path.exists(path_to_h_in_archive): + is_ok = False + if anvio.DEBUG: + self.run.warning(f"The KEGG archive does not contain the following expected hmm file: " + f"{path_to_h_in_archive}") + expected_extensions = ['.h3f', '.h3i', '.h3m', '.h3p'] + for ext in expected_extensions: + path_to_expected_hmmpress_file = path_to_h_in_archive + ext + if not os.path.exists(path_to_expected_hmmpress_file): + is_ok = False + if anvio.DEBUG: + self.run.warning(f"The KEGG archive does not contain the following expected `hmmpress` output: " + f"{path_to_expected_hmmpress_file}") + + # check modeling files + # this section needs to be kept up to date with any changes to requirements in reactionnetwork.py + # which is a bit silly, but since these two classes don't know about each other it is the workaround we need :( + path_to_modeling_files_in_archive = os.path.join(path_to_kegg_in_archive, "KO_REACTION_NETWORK") + expected_modeling_files = reactionnetwork.KODatabase.expected_files + 
missing_modeling_files = [] + for f in expected_modeling_files: + path_to_f_in_archive = os.path.join(path_to_modeling_files_in_archive, f) + if not os.path.exists(path_to_f_in_archive): + is_ok = False or no_modeling_is_ok + missing_modeling_files.append(f) + if anvio.DEBUG: + self.run.warning(f"The KEGG archive does not contain the following expected modeling file: " + f"{path_to_f_in_archive}") + + if no_modeling_is_ok and missing_modeling_files: + self.run.warning("Modeling files are missing from the KEGG archive you have set up. However, somebody " + "upstream thinks this is okay. Likely you are setting up an early KEGG snapshot version " + "that doesn't contain this data. That's fine. But please keep in mind that you won't be " + "able to run metabolic modeling. If this is a problem, you should either pick a later " + "KEGG snapshot, or download the modeling data independently using the command " + "`anvi-setup-kegg-data --mode modeling`.") + + return is_ok + + + def setup_all_data_from_archive_or_snapshot(self): + """This driver function controls whether we download one of our KEGG snapshots and set that up, or + set up directly from an archive file already on the user's computer. + """ + + if os.path.exists(self.kegg_data_dir) and not self.args.reset: + raise ConfigError(f"The directory {self.kegg_data_dir} already exists. Are you sure you want to " + f"overwrite it? 
If yes, feel free to restart this program with the --reset flag.") + + if self.kegg_archive_path: + self.setup_from_archive() + else: + self.setup_kegg_snapshot() + def check_user_input_dir_format(self): """This function checks whether the user input directory exists and contains the required subfolders @@ -698,97 +891,6 @@ def is_user_database_exists(self): f"please use the --reset flag or delete this file manually if you want to re-generate it.") - def download_profiles(self): - """This function downloads the Kofam profiles.""" - - self.run.info("Kofam Profile Database URL", self.database_url) - - try: - for file_name in self.files.keys(): - utils.download_file(self.database_url + '/' + file_name, - os.path.join(self.kegg_data_dir, file_name), progress=self.progress, run=self.run) - except Exception as e: - print(e) - raise ConfigError("Anvi'o failed to download KEGG KOfam profiles from the KEGG website. Something " - "likely changed on the KEGG end. Please contact the developers to see if this is " - "a fixable issue. If it isn't, we may be able to provide you with a legacy KEGG " - "data archive that you can use to setup KEGG with the --kegg-archive flag.") - - - def process_module_file(self): - """This function reads the kegg module file into a dictionary. It should be called during setup to get the KEGG module numbers so that KEGG modules can be downloaded. - - The structure of this file is like this: - - +D Module - #

  KEGG Modules

- ! - APathway modules - B - B Carbohydrate metabolism - C Central carbohydrate metabolism - D M00001 Glycolysis (Embden-Meyerhof pathway), glucose => pyruvate [PATH:map00010 map01200 map01100] - D M00002 Glycolysis, core module involving three-carbon compounds [PATH:map00010 map01200 map01230 map01100] - D M00003 Gluconeogenesis, oxaloacetate => fructose-6P [PATH:map00010 map00020 map01100] - - In other words, a bunch of initial lines to be ignored, and thereafter the line's information can be determined by the one-letter code at the start. - A = Pathway modules (metabolic pathways) or signature modules (gene sets that indicate a phenotypic trait, ie toxins). - B = Category of module (a type of metabolism for pathway modules. For signature modules, either Gene Set or Module Set) - C = Sub-category of module - D = Module - - """ - self.module_dict = {} - - filesnpaths.is_file_exists(self.kegg_module_file) - filesnpaths.is_file_plain_text(self.kegg_module_file) - - f = open(self.kegg_module_file, 'rU') - self.progress.new("Parsing KEGG Module file") - - current_module_type = None - current_category = None - current_subcategory = None - - for line in f.readlines(): - line = line.strip('\n') - first_char = line[0] - - # garbage lines - if first_char in ["+", "#", "!"]: - continue - else: - # module type - if first_char == "A": - fields = re.split('<[^>]*>', line) # we split by the html tag here - current_module_type = fields[1] - # Category - elif first_char == "B": - fields = re.split('<[^>]*>', line) # we split by the html tag here - if len(fields) == 1: # sometimes this level has lines with only a B - continue - current_category = fields[1] - # Sub-category - elif first_char == "C": - fields = re.split('\s{2,}', line) # don't want to split the subcategory name, so we have to split at least 2 spaces - current_subcategory = fields[1] - # module - elif first_char == "D": - fields = re.split('\s{2,}', line) - mnum = fields[1] - self.module_dict[mnum] = {"name" : 
fields[2], "type" : current_module_type, "category" : current_category, "subcategory" : current_subcategory} - # unknown code - else: - raise ConfigError("While parsing the KEGG file %s, we found an unknown line code %s. This has " - "made the file unparseable. It is likely that an update to KEGG has broken " - "things such that anvi'o doesn't know what is going on anymore. Sad, we know. :( " - "Please contact the developers to see if this is a fixable issue, and in the " - "meantime use an older version of the KEGG data directory (if you have one). " - "If we cannot fix it, we may be able to provide you with a legacy KEGG " - "data archive that you can use to setup KEGG with the --kegg-archive flag." % (self.kegg_module_file, first_char)) - self.progress.end() - - def process_pathway_file(self): """This function reads the kegg pathway map file into a dictionary. It should be called during setup to get the KEGG pathway ids so the pathways can be downloaded. @@ -861,6 +963,7 @@ def process_pathway_file(self): "data archive that you can use to setup KEGG with the --kegg-archive flag." % (self.kegg_pathway_file, first_char)) self.progress.end() + def get_accessions_from_htext_file(self, htext_file): """This function can read generic KEGG htext files to get a list of accessions. @@ -1013,138 +1116,6 @@ def download_kegg_files_from_hierarchy(self, h_accession, download_dir="./"): self.download_generic_flat_file(acc, download_dir_name) - def process_brite_hierarchy_of_hierarchies(self): - """Read the KEGG BRITE 'br08902' 'hierarchy of hierarchies' json file into a dictionary. - - This method is called during setup to find all BRITE hierarchies to be downloaded. - Hierarchies of interest have accessions starting with 'ko' and classify genes/proteins. - Excluded hierarchies include those for modules, pathways, and other systems for reactions, - compounds, taxa, etc. 
- - The dictionary that is filled out, `self.brite_dict`, is keyed by the 'ko' hierarchy name - exactly as given in the 'br08902' json file. The values are the categorizations of the - hierarchy in 'br08902', going from most general to most specific category. - - Here is an example of an entry produced in self.brite_dict: - 'ko01000 Enzymes': - ['Genes and Proteins', 'Protein families: metabolism'] - """ - - filesnpaths.is_file_exists(self.kegg_brite_hierarchies_file) - filesnpaths.is_file_json_formatted(self.kegg_brite_hierarchies_file) - - self.progress.new("Parsing KEGG BRITE Hierarchies file") - - brite_hierarchies_dict = json.load(open(self.kegg_brite_hierarchies_file)) - # store the names of all of the 'ko' hierarchies for genes/proteins - self.brite_dict = {} - hierarchies_appearing_multiple_times = [] - hierarchies_with_unrecognized_accession = [] - for hierarchy, categorizations in self.invert_brite_json_dict(brite_hierarchies_dict).items(): - # we have observed the hierarchy label to have an accession followed by two spaces followed by the hierarchy name, - # but accommodate the possibility that the accession is separated from the name by a variable number of spaces - split_hierarchy = hierarchy.split(' ') - hierarchy_accession = split_hierarchy[0] - hierarchy_name = ' '.join(split_hierarchy[1: ]).lstrip() - if hierarchy_accession[: 2] == 'br': - # hierarchy accessions beginning with 'br' are for reactions, compounds, taxa, etc., not genes/proteins - continue - elif hierarchy_accession == 'ko00002' and hierarchy_name == 'KEGG modules': - # this hierarchy is for modules, not genes/proteins - continue - elif hierarchy_accession == 'ko00003' and hierarchy_name == 'KEGG reaction modules': - # this hierarchy is also for modules - continue - - if len(categorizations) > 1: - hierarchies_appearing_multiple_times.append((hierarchy, len(categorizations))) - - if hierarchy_accession[: 2] != 'ko': - hierarchies_with_unrecognized_accession.append(hierarchy) - continue 
- try: - int(hierarchy_accession[2: 7]) - except ValueError: - hierarchies_with_unrecognized_accession.append(hierarchy) - continue - self.brite_dict[hierarchy] = categorizations[0][1: ] - - error_first_part = "" - if hierarchies_appearing_multiple_times: - error_first_part = ("Each BRITE hierarchy should only appear once in the hierarchy of hierarchies, " - "but the following hierarchies appeared the given number of times: " - f"{', '.join([f'{hier}: {num_times}' for hier, num_times in hierarchies_appearing_multiple_times])}.") - error_second_part = "" - if hierarchies_with_unrecognized_accession: - error_second_part = ("Each BRITE hierarchy accession is expected to have an accession formatted 'koXXXXX', where 'XXXXX' are five digits, " - f"but the following hierarchies did not have this format: {', '.join(hierarchies_with_unrecognized_accession)}.") - if hierarchies_appearing_multiple_times or hierarchies_with_unrecognized_accession: - raise ConfigError("Please contact the developers to look into the following error. " - f"{error_first_part}{' ' if error_first_part and error_second_part else ''}{error_second_part}") - - self.progress.end() - - - def download_kegg_module_file(self): - """This function downloads the KEGG module file, which tells us which module files to download.""" - - # download the kegg module file, which lists all modules - try: - utils.download_file(self.kegg_module_download_path, self.kegg_module_file, progress=self.progress, run=self.run) - except Exception as e: - print(e) - raise ConfigError("Anvi'o failed to download the KEGG Module htext file from the KEGG website. Something " - "likely changed on the KEGG end. Please contact the developers to see if this is " - "a fixable issue. If it isn't, we may be able to provide you with a legacy KEGG " - "data archive that you can use to setup KEGG with the --kegg-archive flag.") - - - def download_modules(self): - """This function downloads the KEGG modules. 
- - To verify that each file has been downloaded properly, we check that the last line is '///'. - """ - - self.run.info("KEGG Module Database URL", self.kegg_rest_api_get) - self.run.info("Number of KEGG Modules to download", len(self.module_dict.keys())) - - # download all modules - for mnum in self.module_dict.keys(): - file_path = os.path.join(self.kegg_module_data_dir, mnum) - utils.download_file(self.kegg_rest_api_get + '/' + mnum, - file_path, progress=self.progress, run=self.run) - # verify entire file has been downloaded - f = open(file_path, 'rU') - f.seek(0, os.SEEK_END) - f.seek(f.tell() - 4, os.SEEK_SET) - last_line = f.readline().strip('\n') - if not last_line == '///': - raise ConfigError("The KEGG module file %s was not downloaded properly. We were expecting the last line in the file " - "to be '///', but instead it was %s. Formatting of these files may have changed on the KEGG website. " - "Please contact the developers to see if this is a fixable issue. If it isn't, we may be able to " - "provide you with a legacy KEGG data archive that you can use to setup KEGG with the --kegg-archive flag." - % (file_path, last_line)) - - - def confirm_downloaded_modules(self): - """This function verifies that all module files have been downloaded. - - It checks that there is a module file for every module in the self.module_dict dictionary; - for that reason, it must be called after the function that creates that attribute, - process_module_file(), has already been called. - """ - - for mnum in self.module_dict.keys(): - file_path = os.path.join(self.kegg_module_data_dir, mnum) - if not os.path.exists(file_path): - raise ConfigError(f"The module file for {mnum} does not exist at its expected location, {file_path}. " - f"This probably means that something is wrong with your downloaded data, since this " - f"module is present in the KEGG MODULE file that lists all modules you *should* have " - f"on your computer. 
Very sorry to tell you this, but you need to re-download the KEGG " - f"data. We recommend the --reset flag.") - self.run.info("Number of module files found", len(self.module_dict)) - - def download_pathways(self): """This function downloads the KEGG Pathways. @@ -1187,82 +1158,148 @@ def download_pathways(self): % (file_path, last_line)) - def download_brite_hierarchy_of_hierarchies(self): - """Download a json file of 'br08902', a "hierarchy of BRITE hierarchies." + def create_user_modules_dict(self): + """This function establishes the self.module_dict parameter for user modules. - This hierarchy contains the names of other hierarchies which are subsequently used for - downloading those hierarchy json files. + It is essentially a replacement for the process_module_file() function. + Since users will not have a modules file to process, we simply create the dictionary from the + file names they provide for their module definitions. We don't add any dictionary values, + but we won't need them (we hope). """ - # note that this is the same as the REST API for modules and pathways - perhaps at some point this should be printed elsewhere so we don't repeat ourselves. - self.run.info("KEGG BRITE Database URL", self.kegg_rest_api_get) + user_module_list = [os.path.basename(k) for k in glob.glob(os.path.join(self.user_module_data_dir, '*'))] + self.module_dict = {key: {} for key in user_module_list} + + # sanity check that they also have KEGG data since we need to compare module names + if not os.path.exists(self.kegg_modules_db_path): + raise ConfigError(f"Wait a second. We understand that you are setting up user-defined metabolism data, but " + f"unfortunately you need to FIRST have KEGG data set up on your computer. Why, you ask? " + f"Well, we need to make sure none of your module names overlap with those " + f"in the KEGG MODULES database. Long story short, we looked for KEGG data at " + f"{self.kegg_modules_db_path} but we couldn't find it. 
If this is the wrong place for us to be " + f"looking, please run this program again and use the --kegg-data-dir parameter to tell us where " + f"to find it.") + + # sanity check that user module names are distinct + kegg_modules_db = ModulesDatabase(self.kegg_modules_db_path, args=self.args, quiet=True) + kegg_mods = set(kegg_modules_db.get_all_modules_as_list()) + user_mods = set(user_module_list) + bad_user_mods = kegg_mods.intersection(user_mods) + if bad_user_mods: + bad_mods_str = ", ".join(bad_user_mods) + n = len(bad_user_mods) + raise ConfigError(f"Hol'up a minute. You see, there {P('is a module', n, alt='are some modules')} " + f"in your user-defined modules data (at {self.user_module_data_dir}) which {P('has', n, alt='have')} " + f"the same name as an existing KEGG module. This is not allowed, for reasons. Please name {P('that module', n, alt='those modules')} " + f"differently. Append an underscore and your best friend's name to {P('it', n, alt='them')} or something. Just make sure it's " + f"unique. OK? ok. Here is the list of module names you should change: {bad_mods_str}") + + def setup_modules_db(self, db_path, module_data_directory, brite_data_directory=None, source='KEGG', skip_brite_hierarchies=False): + """This function creates a Modules DB at the specified path.""" + + if filesnpaths.is_file_exists(db_path, dont_raise=True): + if self.overwrite_modules_db: + os.remove(db_path) + else: + raise ConfigError(f"Woah there. There is already a modules database at {db_path}. If you really want to make a new modules database " + f"in this folder, you should either delete the existing database yourself, or re-run this program with the " + f"--overwrite-output-destinations flag. But the old database will go away forever in that case. 
Just making " + f"sure you are aware of that, so that you have no regrets.") try: - utils.download_file(self.kegg_brite_hierarchies_download_path, self.kegg_brite_hierarchies_file, progress=self.progress, run=self.run) + mod_db = ModulesDatabase(db_path, module_data_directory=module_data_directory, brite_data_directory=brite_data_directory, data_source=source, args=self.args, module_dictionary=self.module_dict, pathway_dictionary=self.pathway_dict, brite_dictionary=self.brite_dict, skip_brite_hierarchies=skip_brite_hierarchies, run=run, progress=progress) + mod_db.create() except Exception as e: print(e) - raise ConfigError("Anvi'o failed to download the KEGG BRITE hierarchies json file from the KEGG website. " - "Something likely changed on the KEGG end. Please contact the developers to see if this is " - "a fixable issue. If it isn't, we may be able to provide you with a legacy KEGG " - "data archive that you can use to setup KEGG with the --kegg-archive flag.") + raise ConfigError("While attempting to build the MODULES.db, anvi'o encountered an error, which should be printed above. " + "If you look at that error and it seems like something you cannot handle, please contact the developers " + "for assistance. :) ") - def download_brite_hierarchies(self): - """This function downloads a json file for every BRITE hierarchy of interest. + def setup_user_data(self): + """This function sets up user metabolism data from the provided input directory. - Hierarchies of interest classify genes/proteins and have accessions starting with 'ko'. + It processes the user's module files into the USER_MODULES.db. 
""" - self.run.info("Number of BRITE hierarchies to download", len(self.brite_dict)) - unexpected_hierarchies = [] - for hierarchy in self.brite_dict: - hierarchy_accession = hierarchy[: 7] - brite_system = hierarchy_accession[: 2] - if brite_system != 'ko': - unexpected_hierarchies.append(hierarchy) - if not unexpected_hierarchies: - file_path = os.path.join(self.brite_data_dir, hierarchy_accession) - utils.download_file(self.kegg_rest_api_get + '/br:' + hierarchy_accession + '/json', - file_path, progress=self.progress, run=self.run) - # verify that the whole json file was downloaded - filesnpaths.is_file_json_formatted(file_path) - if unexpected_hierarchies: - raise ConfigError("Accessions for BRITE hierarchies of genes/proteins should begin with 'ko'. " - f"Hierarchies were found that defy our assumptions; please contact a developer to investigate this: '{', '.join(unexpected_hierarchies)}'.") + self.create_user_modules_dict() + self.setup_modules_db(db_path=self.user_modules_db_path, module_data_directory=self.user_module_data_dir, source='USER', skip_brite_hierarchies=True) + + +class KOfamDownload(KeggSetup): + """Class for setting up KOfam HMM profiles. + + Parameters + ========== + args: Namespace object + All the arguments supplied by user to command-line programs relying on this + class, such as `anvi-setup-kegg-data`. If using this class through the API, please + provide a Namespace object with the Boolean 'reset' parameter. + skip_init: Boolean + Developers can use this flag to skip the sanity checks and creation of directories + when testing this class. 
+ """ + + def __init__(self, args, run=run, progress=progress, skip_init=False): + self.args = args + self.run = run + self.progress = progress + self.skip_init = skip_init + + KeggSetup.__init__(self, self.args, skip_init=self.skip_init) + + filesnpaths.is_program_exists('hmmpress') + + # ftp path for HMM profiles and KO list + # for ko list, add /ko_list.gz to end of url + # for profiles, add /profiles.tar.gz to end of url + self.database_url = "ftp://ftp.genome.jp/pub/db/kofam" + # dictionary mapping downloaded file name to final decompressed file name or folder location + self.kofam_files = {'ko_list.gz': self.ko_list_file_path, 'profiles.tar.gz': self.kegg_data_dir} + + expected_files_for_kofams = [self.ko_list_file_path] + if self.only_processing: + expected_files_for_kofams.append(os.path.join(self.kegg_data_dir, 'profiles.tar.gz')) + else: + expected_files_for_kofams.append(self.kofam_hmm_file_path) + + if not args.reset and not anvio.DEBUG and not self.skip_init: + self.is_database_exists(expected_files_for_kofams, fail_if_exists=(not self.only_processing)) + if self.download_from_kegg and not self.only_processing and not self.kegg_archive_path and not self.skip_init: + filesnpaths.gen_output_directory(self.kegg_hmm_data_dir, delete_if_exists=args.reset) + filesnpaths.gen_output_directory(self.orphan_data_dir, delete_if_exists=args.reset) - def confirm_downloaded_brite_hierarchies(self): - """This function verifies that all BRITE hierarchy files have been downloaded. + + def download_profiles(self): + """This function downloads the Kofam profiles.""" - It checks that there is a hierarchy file for every hierarchy in the self.brite_dict dictionary; - for that reason, it must be called after the function that creates that attribute, - process_brite_hierarchy_of_hierarchies(), has already been called. 
- """ + self.run.info("Kofam Profile Database URL", self.database_url) - for hierarchy in self.brite_dict.keys(): - hierarchy_accession = hierarchy[: 7] - file_path = os.path.join(self.brite_data_dir, hierarchy_accession) - if not os.path.exists(file_path): - raise ConfigError(f"The BRITE hierarchy file for {hierarchy} does not exist at its expected location, {file_path}. " - f"This probably means that something is wrong with your downloaded data, since this " - f"hierarchy is present in the file that lists all BRITE hierarchies you *should* have " - f"on your computer. Very sorry to tell you this, but you need to re-download the KEGG " - f"data. We recommend the --reset flag.") - self.run.info("Number of BRITE hierarchy files found", len(self.brite_dict)) + try: + for file_name in self.kofam_files.keys(): + utils.download_file(self.database_url + '/' + file_name, + os.path.join(self.kegg_data_dir, file_name), progress=self.progress, run=self.run) + except Exception as e: + print(e) + raise ConfigError("Anvi'o failed to download KEGG KOfam profiles from the KEGG website. Something " + "likely changed on the KEGG end. Please contact the developers to see if this is " + "a fixable issue. 
If it isn't, we may be able to provide you with a legacy KEGG " + "data archive that you can use to setup KEGG with the --kegg-archive flag.") - def decompress_files(self): + def decompress_profiles(self): """This function decompresses the Kofam profiles.""" self.progress.new('Decompressing files') - for file_name in self.files.keys(): + for file_name in self.kofam_files.keys(): self.progress.update('Decompressing file %s' % file_name) full_path = os.path.join(self.kegg_data_dir, file_name) if full_path.endswith("tar.gz"): - utils.tar_extract_file(full_path, output_file_path=self.files[file_name], keep_original=False) + utils.tar_extract_file(full_path, output_file_path=self.kofam_files[file_name], keep_original=False) else: - utils.gzip_decompress_file(full_path, output_file_path=self.files[file_name], keep_original=False) + utils.gzip_decompress_file(full_path, output_file_path=self.kofam_files[file_name], keep_original=False) self.progress.update("File decompressed. Yay.") self.progress.end() @@ -1283,7 +1320,7 @@ def confirm_downloaded_profiles(self): hmm_path = os.path.join(self.kegg_data_dir, "profiles/%s.hmm" % k) if not os.path.exists(hmm_path): raise ConfigError("The KOfam HMM profile at %s does not exist. This probably means that something went wrong " - "while downloading the KOfam database. Please run `anvi-setup-kegg-kofams` with the --reset " + "while downloading the KOfam database. Please run `anvi-setup-kegg-data` with the --reset " "flag. If that still doesn't work, please contact the developers to see if the issue is fixable. " "If it isn't, we may be able to provide you with a legacy KEGG data archive that you can use to " "setup KEGG with the --kegg-archive flag." % (hmm_path)) @@ -1346,7 +1383,7 @@ def move_orphan_files(self): "We have removed those HMM profiles from the final database. You can find them under the directory '%s'." 
% (len(no_data_file_list), self.orphan_data_dir)) - + def run_hmmpress(self): """This function concatenates the Kofam profiles and runs hmmpress on them.""" @@ -1381,263 +1418,262 @@ def run_hmmpress(self): self.progress.end() - def create_user_modules_dict(self): - """This function establishes the self.module_dict parameter for user modules. - - It is essentially a replacement for the process_module_file() function. - Since users will not have a modules file to process, we simply create the dictionary from the - file names they provide for their module definitions. We don't add any dictionary values, - but we won't need them (we hope). - """ - - user_module_list = [os.path.basename(k) for k in glob.glob(os.path.join(self.user_module_data_dir, '*'))] - self.module_dict = {key: {} for key in user_module_list} - - # sanity check that they also have KEGG data since we need to compare module names - if not os.path.exists(self.kegg_modules_db_path): - raise ConfigError(f"Wait a second. We understand that you are setting up user-defined metabolism data, but " - f"unfortunately you need to FIRST have KEGG data set up on your computer. Why, you ask? " - f"Well, we need to make sure none of your module names overlap with those " - f"in the KEGG MODULES database. Long story short, we looked for KEGG data at " - f"{self.kegg_modules_db_path} but we couldn't find it. If this is the wrong place for us to be " - f"looking, please run this program again and use the --kegg-data-dir parameter to tell us where " - f"to find it.") - - # sanity check that user module names are distinct - kegg_modules_db = ModulesDatabase(self.kegg_modules_db_path, args=self.args, quiet=True) - kegg_mods = set(kegg_modules_db.get_all_modules_as_list()) - user_mods = set(user_module_list) - bad_user_mods = kegg_mods.intersection(user_mods) - if bad_user_mods: - bad_mods_str = ", ".join(bad_user_mods) - n = len(bad_user_mods) - raise ConfigError(f"Hol'up a minute. 
You see, there {P('is a module', n, alt='are some modules')} " - f"in your user-defined modules data (at {self.user_module_data_dir}) which {P('has', n, alt='have')} " - f"the same name as an existing KEGG module. This is not allowed, for reasons. Please name {P('that module', n, alt='those modules')} " - f"differently. Append an underscore and your best friend's name to {P('it', n, alt='them')} or something. Just make sure it's " - f"unique. OK? ok. Here is the list of module names you should change: {bad_mods_str}") - - - def setup_modules_db(self, db_path, module_data_directory, brite_data_directory=None, source='KEGG', skip_brite_hierarchies=False): - """This function creates a Modules DB at the specified path.""" - - if filesnpaths.is_file_exists(db_path, dont_raise=True): - if self.overwrite_modules_db: - os.remove(db_path) - else: - raise ConfigError(f"Woah there. There is already a modules database at {db_path}. If you really want to make a new modules database " - f"in this folder, you should either delete the existing database yourself, or re-run this program with the " - f"--overwrite-output-destinations flag. But the old database will go away forever in that case. Just making " - f"sure you are aware of that, so that you have no regrets.") - try: - mod_db = ModulesDatabase(db_path, module_data_directory=module_data_directory, brite_data_directory=brite_data_directory, data_source=source, args=self.args, module_dictionary=self.module_dict, pathway_dictionary=self.pathway_dict, brite_dictionary=self.brite_dict, skip_brite_hierarchies=skip_brite_hierarchies, run=run, progress=progress) - mod_db.create() - except Exception as e: - print(e) - raise ConfigError("While attempting to build the MODULES.db, anvi'o encountered an error, which should be printed above. " - "If you look at that error and it seems like something you cannot handle, please contact the developers " - "for assistance. 
:) ") - - - def kegg_archive_is_ok(self, unpacked_archive_path): - """This function checks the structure and contents of an unpacked KEGG archive and returns True if it is as expected. - - Please note that we check for existence of the files that are necessary to run KEGG scripts, but we don't check the file - formats. This means that people could technically trick this function into returning True by putting a bunch of crappy files - with the right names/paths into the archive file. But what would be the point of that? + def setup_kofams(self): + """This function downloads, decompresses, and runs `hmmpress` on KOfam profiles.""" - We also don't care about the contents of certain folders (ie modules) because they are not being directly used - when running KEGG scripts. In the case of modules, all the information should already be in the MODULES.db so we don't - waste our time checking that all the module files are there. We only check that the directory is there. If later changes - to the implementation require the direct use of the files in these folders, then this function should be updated - to check for those. 
- """ + if not self.only_processing: + self.download_profiles() - is_ok = True + if not self.only_download: + self.decompress_profiles() + self.setup_ko_dict() # get ko dict attribute + self.run_hmmpress() - # check top-level files and folders - path_to_kegg_in_archive = os.path.join(unpacked_archive_path, "KEGG") - expected_directories_and_files = [self.orphan_data_dir, - self.kegg_module_data_dir, - self.kegg_hmm_data_dir, - self.ko_list_file_path, - self.kegg_module_file, - self.kegg_modules_db_path] - for f in expected_directories_and_files: - path_to_f_in_archive = os.path.join(path_to_kegg_in_archive, os.path.basename(f)) - if not os.path.exists(path_to_f_in_archive): - is_ok = False - if anvio.DEBUG: - self.run.warning("The KEGG archive does not contain the following expected file or directory: %s" - % (path_to_f_in_archive)) - # check hmm files - path_to_hmms_in_archive = os.path.join(path_to_kegg_in_archive, os.path.basename(self.kegg_hmm_data_dir)) - kofam_hmm_basename = os.path.basename(self.kofam_hmm_file_path) - expected_hmm_files = [kofam_hmm_basename] - for h in expected_hmm_files: - path_to_h_in_archive = os.path.join(path_to_hmms_in_archive, h) - if not os.path.exists(path_to_h_in_archive): - is_ok = False - if anvio.DEBUG: - self.run.warning("The KEGG archive does not contain the folllowing expected hmm file: %s" - % (path_to_h_in_archive)) - expected_extensions = ['.h3f', '.h3i', '.h3m', '.h3p'] - for ext in expected_extensions: - path_to_expected_hmmpress_file = path_to_h_in_archive + ext - if not os.path.exists(path_to_expected_hmmpress_file): - is_ok = False - if anvio.DEBUG: - self.run.warning("The KEGG archive does not contain the folllowing expected `hmmpress` output: %s" - % (path_to_expected_hmmpress_file)) +class ModulesDownload(KeggSetup): + """Class for setting up all KEGG data related to pathway prediction, namely KOfam profiles and KEGG MODULES. 
+ + Parameters + ========== + args: Namespace object + All the arguments supplied by user to command-line programs relying on this + class, such as `anvi-setup-kegg-data`. If using this class through the API, please + provide a Namespace object with the Boolean 'reset' parameter. + skip_init: Boolean + Developers can use this flag to skip the sanity checks and creation of directories + when testing this class. + """ - return is_ok + def __init__(self, args, run=run, progress=progress, skip_init=False): + A = lambda x: args.__dict__[x] if x in args.__dict__ else None + self.args = args + self.run = run + self.progress = progress + self.skip_init = skip_init + self.skip_brite_hierarchies = A('skip_brite_hierarchies') + self.overwrite_modules_db = A('overwrite_output_destinations') + # we also need the init of the superclass + KeggSetup.__init__(self, self.args, skip_init=self.skip_init) - def check_archive_for_brite(self, unpacked_archive_path): - """Check the archive for the BRITE directory and 'hierarchy of hierarchies' json file. + if (not self.download_from_kegg) and self.skip_brite_hierarchies: + self.run.warning("Just so you know, the --skip-brite-hierarchies flag does not do anything (besides suppress some warning output) when used " + "without the -D option. You are setting up from an archived KEGG snapshot which may already include BRITE data, and if it " + "does, this data will not be removed. You can always check if the resulting modules database contains BRITE data by " + "running `anvi-db-info` on it and looking at the `is_brite_setup` value (which will be 1 if the database contains BRITE data).") - It is ok for archives not to have these present, but let the user know. 
- """ + # download from KEGG option: module/pathway map htext files and API link + self.kegg_module_download_path = "https://www.genome.jp/kegg-bin/download_htext?htext=ko00002.keg&format=htext&filedir=" + self.kegg_pathway_download_path = "https://www.genome.jp/kegg-bin/download_htext?htext=br08901.keg&format=htext&filedir=" + self.kegg_rest_api_get = "http://rest.kegg.jp/get" + # download a json file containing all BRITE hierarchies, which can then be downloaded themselves + self.kegg_brite_hierarchies_download_path = os.path.join(self.kegg_rest_api_get, "br:br08902/json") + + # check if the data is already downloaded + expected_files_for_modules = [self.kegg_module_file, + self.kegg_module_data_dir] + if not self.skip_brite_hierarchies: + expected_files_for_modules.append(self.kegg_brite_hierarchies_file) + expected_files_for_modules.append(self.brite_data_dir) - is_brite_included = True + if not args.reset and not anvio.DEBUG and not self.skip_init: + self.is_database_exists(expected_files_for_modules, fail_if_exists=(not self.only_processing)) - path_to_kegg_in_archive = os.path.join(unpacked_archive_path, "KEGG") - brite_directories_and_files = [self.brite_data_dir, - self.kegg_brite_hierarchies_file] - for f in brite_directories_and_files: - path_to_f_in_archive = os.path.join(path_to_kegg_in_archive, os.path.basename(f)) - if not os.path.exists(path_to_f_in_archive) and not self.skip_brite_hierarchies: - is_brite_included = False - if anvio.DEBUG: - self.run.warning(f"The KEGG archive does not contain the following optional BRITE file or directory: {path_to_f_in_archive}") + # generate subfolders if necessary + if self.download_from_kegg and not self.only_processing and not self.kegg_archive_path and not self.skip_init: + filesnpaths.gen_output_directory(self.kegg_module_data_dir, delete_if_exists=args.reset) + if not self.skip_brite_hierarchies: + filesnpaths.gen_output_directory(self.brite_data_dir, delete_if_exists=args.reset) - return is_brite_included 
+ + def download_kegg_module_file(self): + """This function downloads the KEGG module file, which tells us which module files to download.""" + # download the kegg module file, which lists all modules + try: + utils.download_file(self.kegg_module_download_path, self.kegg_module_file, progress=self.progress, run=self.run) + except Exception as e: + print(e) + raise ConfigError("Anvi'o failed to download the KEGG Module htext file from the KEGG website. Something " + "likely changed on the KEGG end. Please contact the developers to see if this is " + "a fixable issue. If it isn't, we may be able to provide you with a legacy KEGG " + "data archive that you can use to setup KEGG with the --kegg-archive flag.") - def check_modules_db_version(self): - """This function checks if the MODULES.db is out of date and if so warns the user to migrate it""" - # get current version of db - db_conn = db.DB(self.kegg_modules_db_path, None, ignore_version=True) - current_db_version = int(db_conn.get_meta_value('version')) - db_conn.disconnect() + def process_module_file(self): + """This function reads the kegg module file into a dictionary. It should be called during setup to get the KEGG module numbers so that KEGG modules can be downloaded. - # if modules.db is out of date, give warning - target_version = int(anvio.tables.versions_for_db_types['modules']) - if current_db_version != target_version: - self.run.warning(f"Just so you know, the KEGG archive that was just set up contains an outdated MODULES.db (version: " - f"{current_db_version}). You may want to run `anvi-migrate` on this database before you do anything else. " - f"Here is the path to the database: {self.kegg_modules_db_path}") + The structure of this file is like this: + +D Module + #

  KEGG Modules

+ ! + APathway modules + B + B Carbohydrate metabolism + C Central carbohydrate metabolism + D M00001 Glycolysis (Embden-Meyerhof pathway), glucose => pyruvate [PATH:map00010 map01200 map01100] + D M00002 Glycolysis, core module involving three-carbon compounds [PATH:map00010 map01200 map01230 map01100] + D M00003 Gluconeogenesis, oxaloacetate => fructose-6P [PATH:map00010 map00020 map01100] - def setup_from_archive(self): - """This function sets up the KEGG data directory from an archive of a previously-setup KEGG data directory. + In other words, a bunch of initial lines to be ignored, and thereafter the line's information can be determined by the one-letter code at the start. + A = Pathway modules (metabolic pathways) or signature modules (gene sets that indicate a phenotypic trait, ie toxins). + B = Category of module (a type of metabolism for pathway modules. For signature modules, either Gene Set or Module Set) + C = Sub-category of module + D = Module - To do so, it unpacks the archive and checks its structure and that all required components are there. """ + self.module_dict = {} - self.run.info("KEGG archive", self.kegg_archive_path) - self.progress.new('Unzipping KEGG archive file...') - if not self.kegg_archive_path.endswith("tar.gz"): - self.progress.reset() - raise ConfigError("The provided archive file %s does not appear to be an archive at all. Perhaps you passed " - "the wrong file to anvi'o?" 
% (self.kegg_archive_path)) - unpacked_archive_name = "KEGG_archive_unpacked" - utils.tar_extract_file(self.kegg_archive_path, output_file_path=unpacked_archive_name, keep_original=True) - - self.progress.update('Checking KEGG archive structure and contents...') - archive_is_ok = self.kegg_archive_is_ok(unpacked_archive_name) - archive_contains_brite = self.check_archive_for_brite(unpacked_archive_name) - self.progress.end() - if archive_is_ok: - if os.path.exists(self.kegg_data_dir): - shutil.rmtree(self.kegg_data_dir) - path_to_kegg_in_archive = os.path.join(unpacked_archive_name, "KEGG") - shutil.move(path_to_kegg_in_archive, self.kegg_data_dir) - shutil.rmtree(unpacked_archive_name) - - if not archive_contains_brite and not self.skip_brite_hierarchies: - self.run.warning("The KEGG data archive does not contain the necessary files to set up BRITE hierarchy classification. " - "This is not a problem, and KEGG set up proceeded without it. BRITE is guaranteed to be set up when " - "downloading the latest version of KEGG with `anvi-setup-kegg-kofams -D`.") - - # if necessary, warn user about migrating the modules db - self.check_modules_db_version() - - else: - debug_output = "We kept the unpacked archive for you to take a look at it. It is at %s and you may want " \ - "to delete it after you are done checking its contents." % os.path.abspath(unpacked_archive_name) - if not anvio.DEBUG: - shutil.rmtree(unpacked_archive_name) - debug_output = "The unpacked archive has been deleted, but you can re-run the script with the --debug " \ - "flag to keep it if you want to see its contents." - else: - self.run.warning("The unpacked archive file %s was kept for debugging purposes. You may want to " - "clean it up after you are done looking through it." % (os.path.abspath(unpacked_archive_name))) - raise ConfigError("The provided archive file %s does not appear to be a KEGG data directory, so anvi'o is unable " - "to use it. 
%s" % (self.kegg_archive_path, debug_output)) - + filesnpaths.is_file_exists(self.kegg_module_file) + filesnpaths.is_file_plain_text(self.kegg_module_file) - def setup_kegg_snapshot(self): - """This is the default setup strategy in which we unpack a specific KEGG archive. + f = open(self.kegg_module_file, 'rU') + self.progress.new("Parsing KEGG Module file") - We do this so that everyone who uses the same release of anvi'o will also have the same default KEGG - data, which facilitates sharing and also means they do not have to continuously re-annotate their datasets - when KEGG is updated. + current_module_type = None + current_category = None + current_subcategory = None - It is essentially a special case of setting up from an archive. - """ + for line in f.readlines(): + line = line.strip('\n') + first_char = line[0] - if anvio.DEBUG: - self.run.info("Downloading from: ", self.default_kegg_data_url) - self.run.info("Downloading to: ", self.default_kegg_archive_file) - utils.download_file(self.default_kegg_data_url, self.default_kegg_archive_file, progress=self.progress, run=self.run) + # garbage lines + if first_char in ["+", "#", "!"]: + continue + else: + # module type + if first_char == "A": + fields = re.split('<[^>]*>', line) # we split by the html tag here + current_module_type = fields[1] + # Category + elif first_char == "B": + fields = re.split('<[^>]*>', line) # we split by the html tag here + if len(fields) == 1: # sometimes this level has lines with only a B + continue + current_category = fields[1] + # Sub-category + elif first_char == "C": + fields = re.split('\s{2,}', line) # don't want to split the subcategory name, so we have to split at least 2 spaces + current_subcategory = fields[1] + # module + elif first_char == "D": + fields = re.split('\s{2,}', line) + mnum = fields[1] + self.module_dict[mnum] = {"name" : fields[2], "type" : current_module_type, "category" : current_category, "subcategory" : current_subcategory} + # unknown code + else: + 
raise ConfigError("While parsing the KEGG file %s, we found an unknown line code %s. This has " + "made the file unparseable. It is likely that an update to KEGG has broken " + "things such that anvi'o doesn't know what is going on anymore. Sad, we know. :( " + "Please contact the developers to see if this is a fixable issue, and in the " + "meantime use an older version of the KEGG data directory (if you have one). " + "If we cannot fix it, we may be able to provide you with a legacy KEGG " + "data archive that you can use to setup KEGG with the --kegg-archive flag." % (self.kegg_module_file, first_char)) + self.progress.end() - # a hack so we can use the archive setup function - self.kegg_archive_path = self.default_kegg_archive_file - self.setup_from_archive() - # if all went well, let's get rid of the archive we used and the log file - if not anvio.DEBUG: - os.remove(self.default_kegg_archive_file) - else: - self.run.warning(f"Because you used the --debug flag, the KEGG archive file at {self.default_kegg_archive_file} " - "has been kept. You may want to remove it later.") + def download_modules(self): + """This function downloads the KEGG modules.""" + # import the function for multithreaded download + import multiprocessing as mp + from anvio.biochemistry.reactionnetwork import _download_worker + + total = len(self.module_dict.keys()) + self.run.info("KEGG Module Database URL", self.kegg_rest_api_get) + self.run.info("Number of KEGG Modules to download", total) + self.run.info("Number of threads used for download", self.num_threads) - def setup_user_data(self): - """This function sets up user metabolism data from the provided input directory. 
+ self.progress.new("Downloading KEGG Module files") + manager = mp.Manager() + input_queue = manager.Queue() + output_queue = manager.Queue() + for mnum in self.module_dict.keys(): + file_path = os.path.join(self.kegg_module_data_dir, mnum) + url = self.kegg_rest_api_get + '/' + mnum + input_queue.put((url, file_path)) + workers: List[mp.Process] = [] + for _ in range(self.num_threads): + worker = mp.Process(target=_download_worker, args=(input_queue, output_queue)) + workers.append(worker) + worker.start() + + downloaded_count = 0 + undownloaded_count = 0 + undownloaded = [] + while downloaded_count + undownloaded_count < total: + output = output_queue.get() + if output is True: + downloaded_count += 1 + self.progress.update(f"{downloaded_count} / {total} module files downloaded") + else: + undownloaded_count += 1 + undownloaded.append(os.path.splitext(os.path.basename(output))[0]) + + for worker in workers: + worker.terminate() + if undownloaded: + raise ConfigError( + "Unfortunately, files for the following modules failed to download despite multiple attempts, " + f"and so the database needs to be set up again: {', '.join(undownloaded)}" + ) + self.progress.end() - It processes the user's module files into the USER_MODULES.db. - """ - self.create_user_modules_dict() - self.setup_modules_db(db_path=self.user_modules_db_path, module_data_directory=self.user_module_data_dir, source='USER', skip_brite_hierarchies=True) + def confirm_downloaded_modules(self): + """This function verifies that all module files have been downloaded. + + It checks that there is a module file for every module in the self.module_dict dictionary; + for that reason, it must be called after the function that creates that attribute, + process_module_file(), has already been called. To verify that each file has been downloaded + properly, we check that the last line is '///'. 
+ """ + for mnum in self.module_dict.keys(): + file_path = os.path.join(self.kegg_module_data_dir, mnum) + if not os.path.exists(file_path): + raise ConfigError(f"The module file for {mnum} does not exist at its expected location, {file_path}. " + f"This probably means that something is wrong with your downloaded data, since this " + f"module is present in the KEGG MODULE file that lists all modules you *should* have " + f"on your computer. Very sorry to tell you this, but you need to re-download the KEGG " + f"data. We recommend the --reset flag.") + # verify entire file has been downloaded + f = open(file_path, 'rU') + f.seek(0, os.SEEK_END) + f.seek(f.tell() - 4, os.SEEK_SET) + last_line = f.readline().strip('\n') + if not last_line == '///': + raise ConfigError("The KEGG module file %s was not downloaded properly. We were expecting the last line in the file " + "to be '///', but instead it was %s. Formatting of these files may have changed on the KEGG website. " + "Please contact the developers to see if this is a fixable issue. If it isn't, we may be able to " + "provide you with a legacy KEGG data archive that you can use to setup KEGG with the --kegg-archive flag." 
+ % (file_path, last_line)) + self.run.info("Number of module files found", len(self.module_dict)) - def setup_data(self): - """This is a driver function which executes the KEGG setup process.""" - if self.kegg_archive_path: - self.setup_from_archive() + def setup_modules_data(self): + """This is a driver function which executes the setup process for pathway prediction data from KEGG.""" - elif self.download_from_kegg: - # mostly for developers and the adventurous - if not self.only_database: - # this downloads, decompresses, and hmmpresses the KOfam profiles - self.download_profiles() - self.decompress_files() - self.setup_ko_dict() # get ko dict attribute - self.run_hmmpress() - # it also downloads and processes the KEGG Module files into the MODULES.db + # FIXME: we will have to move user setup to a completely separate program at some point + # PS. user setup related functions belong to the superclass for now + if self.user_input_dir: + self.setup_user_data() + else: + # download the data first + # unless user requested only processing (mostly for developers and the adventurous) + if not self.only_processing: self.download_kegg_module_file() self.process_module_file() # get module dict attribute self.download_modules() + self.confirm_downloaded_modules() + if not self.skip_brite_hierarchies: self.download_brite_hierarchy_of_hierarchies() self.process_brite_hierarchy_of_hierarchies() # get brite dict attribute self.download_brite_hierarchies() + self.confirm_downloaded_brite_hierarchies() else: # get required attributes for database setup and make sure all expected files were downloaded self.process_module_file() @@ -1646,15 +1682,182 @@ def setup_data(self): self.process_brite_hierarchy_of_hierarchies() self.confirm_downloaded_brite_hierarchies() + # process the modules file into a database if not self.only_download: self.setup_modules_db(db_path=self.kegg_modules_db_path, module_data_directory=self.kegg_module_data_dir, brite_data_directory=self.brite_data_dir, 
skip_brite_hierarchies=self.skip_brite_hierarchies) - elif self.user_input_dir: - self.setup_user_data() + + ###### BRITE-related functions below ###### + def download_brite_hierarchy_of_hierarchies(self): + """Download a json file of 'br08902', a "hierarchy of BRITE hierarchies." - else: - # the default, set up from frozen KEGG release - self.setup_kegg_snapshot() + This hierarchy contains the names of other hierarchies which are subsequently used for + downloading those hierarchy json files. + """ + + # note that this is the same as the REST API for modules and pathways - perhaps at some point this should be printed elsewhere so we don't repeat ourselves. + self.run.info("KEGG BRITE Database URL", self.kegg_rest_api_get) + + try: + utils.download_file(self.kegg_brite_hierarchies_download_path, self.kegg_brite_hierarchies_file, progress=self.progress, run=self.run) + except Exception as e: + print(e) + raise ConfigError("Anvi'o failed to download the KEGG BRITE hierarchies json file from the KEGG website. " + "Something likely changed on the KEGG end. Please contact the developers to see if this is " + "a fixable issue. If it isn't, we may be able to provide you with a legacy KEGG " + "data archive that you can use to setup KEGG with the --kegg-archive flag.") + + + def process_brite_hierarchy_of_hierarchies(self): + """Read the KEGG BRITE 'br08902' 'hierarchy of hierarchies' json file into a dictionary. + + This method is called during setup to find all BRITE hierarchies to be downloaded. + Hierarchies of interest have accessions starting with 'ko' and classify genes/proteins. + Excluded hierarchies include those for modules, pathways, and other systems for reactions, + compounds, taxa, etc. + + The dictionary that is filled out, `self.brite_dict`, is keyed by the 'ko' hierarchy name + exactly as given in the 'br08902' json file. The values are the categorizations of the + hierarchy in 'br08902', going from most general to most specific category. 
+ + Here is an example of an entry produced in self.brite_dict: + 'ko01000 Enzymes': + ['Genes and Proteins', 'Protein families: metabolism'] + """ + + filesnpaths.is_file_exists(self.kegg_brite_hierarchies_file) + filesnpaths.is_file_json_formatted(self.kegg_brite_hierarchies_file) + + self.progress.new("Parsing KEGG BRITE Hierarchies file") + + brite_hierarchies_dict = json.load(open(self.kegg_brite_hierarchies_file)) + # store the names of all of the 'ko' hierarchies for genes/proteins + self.brite_dict = {} + hierarchies_appearing_multiple_times = [] + hierarchies_with_unrecognized_accession = [] + for hierarchy, categorizations in self.invert_brite_json_dict(brite_hierarchies_dict).items(): + # we have observed the hierarchy label to have an accession followed by two spaces followed by the hierarchy name, + # but accommodate the possibility that the accession is separated from the name by a variable number of spaces + split_hierarchy = hierarchy.split(' ') + hierarchy_accession = split_hierarchy[0] + hierarchy_name = ' '.join(split_hierarchy[1: ]).lstrip() + if hierarchy_accession[: 2] == 'br': + # hierarchy accessions beginning with 'br' are for reactions, compounds, taxa, etc., not genes/proteins + continue + elif hierarchy_accession == 'ko00002' and hierarchy_name == 'KEGG modules': + # this hierarchy is for modules, not genes/proteins + continue + elif hierarchy_accession == 'ko00003' and hierarchy_name == 'KEGG reaction modules': + # this hierarchy is also for modules + continue + + if len(categorizations) > 1: + hierarchies_appearing_multiple_times.append((hierarchy, len(categorizations))) + + if hierarchy_accession[: 2] != 'ko': + hierarchies_with_unrecognized_accession.append(hierarchy) + continue + try: + int(hierarchy_accession[2: 7]) + except ValueError: + hierarchies_with_unrecognized_accession.append(hierarchy) + continue + self.brite_dict[hierarchy] = categorizations[0][1: ] + + error_first_part = "" + if hierarchies_appearing_multiple_times: + 
error_first_part = ("Each BRITE hierarchy should only appear once in the hierarchy of hierarchies, " + "but the following hierarchies appeared the given number of times: " + f"{', '.join([f'{hier}: {num_times}' for hier, num_times in hierarchies_appearing_multiple_times])}.") + error_second_part = "" + if hierarchies_with_unrecognized_accession: + error_second_part = ("Each BRITE hierarchy accession is expected to have an accession formatted 'koXXXXX', where 'XXXXX' are five digits, " + f"but the following hierarchies did not have this format: {', '.join(hierarchies_with_unrecognized_accession)}.") + if hierarchies_appearing_multiple_times or hierarchies_with_unrecognized_accession: + raise ConfigError("Please contact the developers to look into the following error. " + f"{error_first_part}{' ' if error_first_part and error_second_part else ''}{error_second_part}") + + self.progress.end() + + + def download_brite_hierarchies(self): + """This function downloads a json file for every BRITE hierarchy of interest. + + Hierarchies of interest classify genes/proteins and have accessions starting with 'ko'. 
+ """ + + # import the function for multithreaded download + import multiprocessing as mp + from anvio.biochemistry.reactionnetwork import _download_worker + + total = len(self.brite_dict) + self.run.info("Number of BRITE hierarchies to download", total) + self.progress.new("Downloading BRITE files") + manager = mp.Manager() + input_queue = manager.Queue() + output_queue = manager.Queue() + unexpected_hierarchies = [] + for hierarchy in self.brite_dict: + hierarchy_accession = hierarchy[: 7] + brite_system = hierarchy_accession[: 2] + if brite_system != 'ko': + unexpected_hierarchies.append(hierarchy) + if not unexpected_hierarchies: + file_path = os.path.join(self.brite_data_dir, hierarchy_accession) + url = self.kegg_rest_api_get + '/br:' + hierarchy_accession + '/json' + input_queue.put((url, file_path)) + workers: List[mp.Process] = [] + for _ in range(self.num_threads): + worker = mp.Process(target=_download_worker, args=(input_queue, output_queue)) + workers.append(worker) + worker.start() + + downloaded_count = 0 + undownloaded_count = 0 + undownloaded = [] + while downloaded_count + undownloaded_count < total: + output = output_queue.get() + if output is True: + downloaded_count += 1 + self.progress.update(f"{downloaded_count} / {total} files downloaded") + else: + undownloaded_count += 1 + undownloaded.append(os.path.splitext(os.path.basename(output))[0]) + + for worker in workers: + worker.terminate() + if undownloaded: + raise ConfigError( + "Unfortunately, files for the following BRITE hierarchies failed to download despite multiple attempts, " + f"and so the database needs to be set up again: {', '.join(undownloaded)}" + ) + self.progress.end() + + if unexpected_hierarchies: + raise ConfigError("Accessions for BRITE hierarchies of genes/proteins should begin with 'ko'. 
" + f"Hierarchies were found that defy our assumptions; please contact a developer to investigate this: '{', '.join(unexpected_hierarchies)}'.") + + + def confirm_downloaded_brite_hierarchies(self): + """This function verifies that all BRITE hierarchy files have been downloaded. + + It checks that there is a hierarchy file for every hierarchy in the self.brite_dict dictionary; + for that reason, it must be called after the function that creates that attribute, + process_brite_hierarchy_of_hierarchies(), has already been called. + """ + + for hierarchy in self.brite_dict.keys(): + hierarchy_accession = hierarchy[: 7] + file_path = os.path.join(self.brite_data_dir, hierarchy_accession) + if not os.path.exists(file_path): + raise ConfigError(f"The BRITE hierarchy file for {hierarchy} does not exist at its expected location, {file_path}. " + f"This probably means that something is wrong with your downloaded data, since this " + f"hierarchy is present in the file that lists all BRITE hierarchies you *should* have " + f"on your computer. Very sorry to tell you this, but you need to re-download the KEGG " + f"data. We recommend the --reset flag.") + # verify that the whole json file was downloaded + filesnpaths.is_file_json_formatted(file_path) + self.run.info("Number of BRITE hierarchy files found", len(self.brite_dict)) class RunKOfams(KeggContext): @@ -1691,8 +1894,8 @@ def __init__(self, args, run=run, progress=progress): # verify that Kofam HMM profiles have been set up if not os.path.exists(self.kofam_hmm_file_path): raise ConfigError(f"Anvi'o is unable to find any KEGG files around :/ It is likely you need to first run the program " - f"`anvi-setup-kegg-kofams` to set things up. If you already have run it, but instructed anvi'o to " - f"store the output to a specific directory, then instead of running `anvi-setup-kegg-kofams` again, " + f"`anvi-setup-kegg-data` to set things up. 
If you already have run it, but instructed anvi'o to " + f"store the output to a specific directory, then instead of running `anvi-setup-kegg-data` again, " f"you simply need to specify the location of the KEGG data using the flag `--kegg-data-dir`. Just for " f"your information, anvi'o was looking for the KEGG data here: {self.kegg_data_dir}") @@ -1700,13 +1903,21 @@ def __init__(self, args, run=run, progress=progress): self.setup_ko_dict() # read the ko_list file into self.ko_dict - # load existing kegg modules db - self.kegg_modules_db = ModulesDatabase(self.kegg_modules_db_path, module_data_directory=self.kegg_module_data_dir, brite_data_directory=self.brite_data_dir, skip_brite_hierarchies=self.skip_brite_hierarchies, args=self.args) + # load existing kegg modules db, if one exists + if os.path.exists(self.kegg_modules_db_path): + self.kegg_modules_db = ModulesDatabase(self.kegg_modules_db_path, module_data_directory=self.kegg_module_data_dir, brite_data_directory=self.brite_data_dir, skip_brite_hierarchies=self.skip_brite_hierarchies, args=self.args) - if not self.skip_brite_hierarchies and not self.kegg_modules_db.db.get_meta_value('is_brite_setup'): - self.run.warning("The KEGG Modules database does not contain BRITE hierarchy data, " + if not self.skip_brite_hierarchies and not self.kegg_modules_db.db.get_meta_value('is_brite_setup'): + self.run.warning("The KEGG Modules database does not contain BRITE hierarchy data, " "which could very well be useful to you. BRITE is guaranteed to be set up " - "when downloading the latest version of KEGG with `anvi-setup-kegg-kofams -D`.") + "when downloading the latest version of KEGG with `anvi-setup-kegg-data`.") + else: + self.run.warning("No modules database was found in the KEGG data directory you specified. This is fine, but " + "you will not get functional annotations related to KEGG MODULES or BRITE hierarchies in your " + "contigs database. 
If you want to include these annotations later, you will have to rerun this " + "program with a data directory including a modules database (which you can obtain by running " + "`anvi-setup-kegg-data` again with the right mode(s).") + self.kegg_modules_db = None # reminder to be a good citizen self.run.warning("Anvi'o will annotate your database with the KEGG KOfam database, as described in " @@ -1737,8 +1948,12 @@ def set_hash_in_contigs_db(self): A = lambda x: self.args.__dict__[x] if x in self.args.__dict__ else None self.contigs_db_path = A('contigs_db') + hash_to_add = "only_KOfams_were_annotated" + if self.kegg_modules_db: + hash_to_add = self.kegg_modules_db.db.get_meta_value('hash') + contigs_db = ContigsDatabase(self.contigs_db_path) - contigs_db.db.set_meta_value('modules_db_hash', self.kegg_modules_db.db.get_meta_value('hash')) + contigs_db.db.set_meta_value('modules_db_hash', hash_to_add) contigs_db.disconnect() @@ -1856,9 +2071,11 @@ def parse_kofam_hits(self, hits_dict): self.gcids_to_functions_dict[gcid].append(counter) # add associated KEGG module information to database - mods = self.kegg_modules_db.get_modules_for_knum(knum) - names = self.kegg_modules_db.get_module_names_for_knum(knum) - classes = self.kegg_modules_db.get_module_classes_for_knum_as_list(knum) + mods = None + if self.kegg_modules_db: + mods = self.kegg_modules_db.get_modules_for_knum(knum) + names = self.kegg_modules_db.get_module_names_for_knum(knum) + classes = self.kegg_modules_db.get_module_classes_for_knum_as_list(knum) if mods: mod_annotation = "!!!".join(mods) @@ -1886,7 +2103,7 @@ def parse_kofam_hits(self, hits_dict): 'e_value': None, } - if not self.skip_brite_hierarchies: + if self.kegg_modules_db and not self.skip_brite_hierarchies: # get BRITE categorization information in the form to be added to the contigs database ortholog_categorizations_dict = self.get_ortholog_categorizations_dict(knum, gcid) if ortholog_categorizations_dict: @@ -1986,9 +2203,11 @@ def 
update_dict_for_genes_with_missing_annotations(self, gcids_list, hits_dict, self.gcids_to_functions_dict[gcid] = [next_key] # add associated KEGG module information to database - mods = self.kegg_modules_db.get_modules_for_knum(best_knum) - names = self.kegg_modules_db.get_module_names_for_knum(best_knum) - classes = self.kegg_modules_db.get_module_classes_for_knum_as_list(best_knum) + mods = None + if self.kegg_modules_db: + mods = self.kegg_modules_db.get_modules_for_knum(best_knum) + names = self.kegg_modules_db.get_module_names_for_knum(best_knum) + classes = self.kegg_modules_db.get_module_classes_for_knum_as_list(best_knum) if mods: mod_annotation = "!!!".join(mods) @@ -2016,11 +2235,11 @@ def update_dict_for_genes_with_missing_annotations(self, gcids_list, hits_dict, 'e_value': None, } - if not self.skip_brite_hierarchies: - # get BRITE categorization information in the form to be added to the contigs database - ortholog_categorizations_dict = self.get_ortholog_categorizations_dict(knum, gcid) - if ortholog_categorizations_dict: - self.kegg_brite_categorizations_dict[next_key] = ortholog_categorizations_dict + if self.kegg_modules_db and not self.skip_brite_hierarchies: + # get BRITE categorization information in the form to be added to the contigs database + ortholog_categorizations_dict = self.get_ortholog_categorizations_dict(knum, gcid) + if ortholog_categorizations_dict: + self.kegg_brite_categorizations_dict[next_key] = ortholog_categorizations_dict next_key += 1 num_annotations_added += 1 @@ -2809,7 +3028,7 @@ def __init__(self, args, run=run, progress=progress): if not os.path.exists(self.kegg_modules_db_path): raise ConfigError(f"It appears that a KEGG modules database ({self.kegg_modules_db_path}) does not exist in the provided data directory. " f"Perhaps you need to specify a different data directory using --kegg-data-dir. Or perhaps you didn't run " - f"`anvi-setup-kegg-kofams`, though we are not sure how you got to this point in that case." 
+ f"`anvi-setup-kegg-data`, though we are not sure how you got to this point in that case." f"But fine. Hopefully you now know what you need to do to make this message go away.") if not self.estimate_from_json and not self.enzymes_txt: @@ -2824,7 +3043,25 @@ def __init__(self, args, run=run, progress=progress): mod_db_hash = kegg_modules_db.db.get_meta_value('hash') kegg_modules_db.disconnect() - if contigs_db_mod_hash != mod_db_hash: + if contigs_db_mod_hash == "only_KOfams_were_annotated": + if not self.just_do_it: + raise ConfigError("The contigs DB that you are working with has only been annotated with KOfams, and not with a modules database. " + "Since the KEGG data directory used for that annotation did not contain the modules database, we have no way of " + "knowing if the set of KOfams used for annotation matches to the set of KOfams associated with your current " + "modules database. Theoretically, we can still estimate metabolism even if there is a mismatch, but you risk " + "getting erroneous results since 1) KOs used to define the pathways could be missing from your collection, " + "and 2) KO functions could have been changed such that your KOs don't correspond to the real enzymes required " + "for the pathways. If you are willing to take this risk, you can restart this program with the --just-do-it " + "flag and move on with your life. But if you really want to do things properly, you should re-annotate your " + "contigs database with `anvi-run-kegg-kofams`, using a KEGG data directory that includes a modules database.") + else: + self.run.warning("ALERT. ALERT. The contigs DB does not include a modules database hash, which means we can't " + "tell if it was annotated with set of KOfams that match to the current modules database. Since you " + "have used the --just-do-it flag, we will assume you know what you are doing. 
But please keep in " + "mind that the metabolism estimation results could be wrong due to mismatches between the modules " + "database and your set of KOfams.") + + elif contigs_db_mod_hash != mod_db_hash: raise ConfigError(f"The contigs DB that you are working with has been annotated with a different version of the MODULES.db " f"than you are working with now. Basically, this means that the annotations are not compatible with the " f"metabolism data to be used for estimation. There are several ways this can happen. Please visit the " @@ -5511,7 +5748,7 @@ def __init__(self, args, run=run, progress=progress): if not os.path.exists(self.kegg_modules_db_path): raise ConfigError(f"It appears that a KEGG modules database ({self.kegg_modules_db_path}) does not exist in the provided data directory. " f"Perhaps you need to specify a different data directory using --kegg-data-dir. Or perhaps you didn't run " - f"`anvi-setup-kegg-kofams`, though we are not sure how you got to this point in that case." + f"`anvi-setup-kegg-data`, though we are not sure how you got to this point in that case." f"But fine. Hopefully you now know what you need to do to make this message go away.") else: # USER data only @@ -6141,11 +6378,11 @@ def __init__(self, db_path, args, module_data_directory=None, brite_data_directo # if self.module_dict is None, then we tried to initialize the DB outside of setup if not self.module_dict: raise ConfigError("ERROR - a new ModulesDatabase() cannot be initialized without providing a modules dictionary. This " - "usually happens when you try to access a Modules DB before one has been setup. Running `anvi-setup-kegg-kofams` may fix this.") + "usually happens when you try to access a Modules DB before one has been setup. Running `anvi-setup-kegg-data` may fix this.") if not self.skip_brite_hierarchies and not self.brite_dict: raise ConfigError("ERROR - a new ModulesDatabase() cannot be initialized without providing a BRITE dictionary. 
This " - "usually happens when you try to access a Modules DB before one has been setup. Running `anvi-setup-kegg-kofams` may fix this.") + "usually happens when you try to access a Modules DB before one has been setup. Running `anvi-setup-kegg-data` may fix this.") ######### DB GENERATION FUNCTIONS ######### diff --git a/anvio/migrations/modules/v3_to_v4.py b/anvio/migrations/modules/v3_to_v4.py index 9dd3fb42f7..6740404d51 100644 --- a/anvio/migrations/modules/v3_to_v4.py +++ b/anvio/migrations/modules/v3_to_v4.py @@ -65,7 +65,7 @@ def migrate(db_path): run.info_single(f"The modules database is now {next_version}. An empty table of KEGG BRITE hierarchy " "categorizations of all orthologs was created, and related self table attributes were added. " - "We suggest generating a new modules database from `anvi-setup-kegg-kofams` and re-running " + "We suggest generating a new modules database from `anvi-setup-kegg-data` and re-running " "`anvi-run-kegg-kofams` on your contigs databases to benefit from these useful annotations :)", nl_after=1, nl_before=1, mc='green') diff --git a/anvio/tests/run_component_tests_for_metabolism.sh b/anvio/tests/run_component_tests_for_metabolism.sh index 5b99c0807b..0c88a4a1b6 100755 --- a/anvio/tests/run_component_tests_for_metabolism.sh +++ b/anvio/tests/run_component_tests_for_metabolism.sh @@ -16,13 +16,14 @@ cd $output_dir/metabolism_test INFO "Migrating all databases" anvi-migrate *db --migrate-quickly -# generate a temporary directory to store anvi-setup-kegg-kofams output, +# generate a temporary directory to store anvi-setup-kegg-data output, # and remove it immediately to make sure it doesn't exist: kegg_data_dir=`mktemp -d` rm -rf $kegg_data_dir INFO "Setting up KEGG data" -anvi-setup-kegg-kofams --kegg-data-dir $kegg_data_dir \ +anvi-setup-kegg-data --mode all + --kegg-data-dir $kegg_data_dir \ --no-progress ## BASIC TESTS diff --git a/bin/anvi-estimate-metabolism b/bin/anvi-estimate-metabolism index 6cb60a892a..dd14473c25 
100755 --- a/bin/anvi-estimate-metabolism +++ b/bin/anvi-estimate-metabolism @@ -115,6 +115,7 @@ if __name__ == '__main__': groupD.add_argument(*anvio.A('get-raw-data-as-json'), **anvio.K('get-raw-data-as-json')) groupD.add_argument(*anvio.A('store-json-without-estimation'), **anvio.K('store-json-without-estimation')) groupD.add_argument(*anvio.A('estimate-from-json'), **anvio.K('estimate-from-json')) + groupD.add_argument(*anvio.A('just-do-it'), **anvio.K('just-do-it')) args = parser.get_args(parser) diff --git a/bin/anvi-reaction-network b/bin/anvi-reaction-network index f54868a399..064dfddd5b 100755 --- a/bin/anvi-reaction-network +++ b/bin/anvi-reaction-network @@ -14,7 +14,7 @@ __copyright__ = "Copyleft 2015-2023, the Meren Lab (http://merenlab.org/)" __license__ = "GPL 3.0" __version__ = VERSION __authors__ = ["semiller10"] -__requires__ = ["contigs-db", "kegg-functions", "reaction-ref-data"] +__requires__ = ["contigs-db", "kegg-functions", "reaction-ref-data", "kegg-data"] __provides__ = ["reaction-network"] __description__ = "Generate a metabolic reaction network in an anvi'o contigs database" @@ -26,14 +26,14 @@ def main(): '--ko-dir', type=str, metavar='PATH', help=( "Path to KEGG KO database directory. If this option is not used, the program expects a " - "database set up in the default location used by 'anvi-setup-protein-reference-database'." + "database set up in the default location used by 'anvi-setup-kegg-data'." ) ) parser.add_argument( '--modelseed-dir', type=str, metavar='PATH', help=( "Path to ModelSEED Biochemistry database directory. If this option is not used, the program " - "expects a database set up in the default location used by 'anvi-setup-protein-reference-database'." + "expects a database set up in the default location used by 'anvi-setup-modelseed-database'." 
) ) parser.add_argument( diff --git a/bin/anvi-setup-kegg-data b/bin/anvi-setup-kegg-data new file mode 100755 index 0000000000..56e59be271 --- /dev/null +++ b/bin/anvi-setup-kegg-data @@ -0,0 +1,181 @@ +#!/usr/bin/env python +# -*- coding: utf-8 + +import sys +import argparse + +import anvio +import anvio.kegg as kegg + +from anvio.biochemistry.reactionnetwork import KODatabase +from anvio.errors import ConfigError, FilesNPathsError +from anvio.terminal import time_program +from anvio.ttycolors import color_text as c + +__author__ = "Developers of anvi'o (see AUTHORS.txt)" +__copyright__ = "Copyleft 2015-2023, the Meren Lab (http://merenlab.org/)" +__license__ = "GPL 3.0" +__version__ = anvio.__version__ +__authors__ = ['ivagljiva', 'semiller10'] +__provides__ = ["kegg-data", "modules-db"] +__description__ = "Download and setup various databases from KEGG" + +## AVAILABLE DOWNLOAD MODES (and their parameters) +DOWNLOAD_MODES = {'KOfam': {'description': 'only KOfam annotation models (HMMs). Use this mode if ' + 'you only want to run `anvi-run-kegg-kofams`.', + 'arguments': {'only-download': {'flags': anvio.A('only-download'), + 'definition': anvio.K('only-download')}, + 'only-processing': {'flags': anvio.A('only-processing'), + 'definition': anvio.K('only-processing')} + } + }, + 'modules': {'description': 'metabolic pathways from the KEGG MODULES database and BRITE hierarchies. Use this mode AND "KOfam" ' + 'mode if you want to run pathway prediction with `anvi-estimate-metabolism`.', + 'arguments': {'only-download': {'flags': anvio.A('only-download'), + 'definition': anvio.K('only-download')}, + 'only-processing': {'flags': anvio.A('only-processing'), + 'definition': anvio.K('only-processing')}, + 'overwrite-output-destinations': {'flags': anvio.A('overwrite-output-destinations'), + 'definition': anvio.K('overwrite-output-destinations', + {'help': "Overwrite any existing modules database " + "in the KEGG data directory " + "[USE WITH CAUTION]. 
Only relevant if you " + "are using the --only-processing flag"})}, + 'skip-brite-hierarchies': {'flags': anvio.A('skip-brite-hierarchies'), + 'definition': anvio.K('skip-brite-hierarchies')}, + } + }, + 'modeling': {'description': 'KEGG orthologs and reactions. Use this mode if ' + 'you want to run metabolic modeling with `anvi-reaction-network`.', + 'arguments': {'dir': {'flags': ['--dir'], + 'definition': {'default': None, + 'type': str, + 'help': "You have the option to store the modeling data in a different location " + "on your computer than other KEGG stuff (ie, NOT --kegg-data-dir). " + "Use this argument to select a custom directory in which to store " + "the modeling data." + f"(default: {KODatabase.default_dir})"}} + }, + }, + 'all': {'description': 'Download ALL KEGG data. This is the default mode.', + 'arguments': {'kegg-snapshot': {'flags': anvio.A('kegg-snapshot'), + 'definition': anvio.K('kegg-snapshot')}, + 'download-from-kegg': {'flags': anvio.A('download-from-kegg'), + 'definition': anvio.K('download-from-kegg')}, + 'kegg-archive': {'flags': anvio.A('kegg-archive'), + 'definition': anvio.K('kegg-archive')} + }, + } + +} + + +@time_program +def main(args, unknown_args): + + if args.list_modes: + import anvio.terminal as terminal + run = terminal.Run() + run.warning(None, header="AVAILABLE DOWNLOAD MODES", lc="green") + for mode, info_dict in DOWNLOAD_MODES.items(): + run.info(mode, info_dict['description']) + sys.exit(0) + + # Here we parse mode-specific parameters that aren't recognized by the parent parser + mode = args.mode + mode_args, mode_unknown = subparsers[mode].parse_known_args(unknown_args) + args = argparse.Namespace(**vars(args), **vars(mode_args)) + # this flag is already handled by anvi'o and shouldn't be in the unknown list + if '--debug' in mode_unknown: + mode_unknown.remove('--debug') + if len(mode_unknown): + raise ConfigError(f"Unrecognized parameters: {' '.join(mode_unknown)}. 
Did you perhaps fail to specify the right mode?") + + if mode == "all" and not args.download_from_kegg: + setup = kegg.KeggSetup(args) + setup.setup_all_data_from_archive_or_snapshot() + else: + if mode == "KOfam" or mode == "all": + args.download_from_kegg = True + setup = kegg.KOfamDownload(args) + setup.setup_kofams() + if mode == "modules" or mode == "all": + # do not reset the directory if it already happened + if mode == "all" and args.reset: + args.reset = False + args.skip_init = True + args.download_from_kegg = True + setup = kegg.ModulesDownload(args) + setup.setup_modules_data() + if mode == "modeling" or mode == "all": + # we ignore the dir parameter for all mode + if mode == "all": + setup_directory = args.kegg_data_dir + else: + # the --dir parameter overrides --kegg-data-dir + if args.dir: + setup_directory = args.dir + elif not args.dir and args.kegg_data_dir: + setup_directory = args.kegg_data_dir + + KODatabase.set_up(num_threads = args.num_threads, dir = setup_directory, reset = args.reset) + + +if __name__ == '__main__': + from anvio.argparse import ArgumentParser + + parser = ArgumentParser(description=__description__) + + show_help = ('--help' in sys.argv) or ('-h' in sys.argv) + + groupM = parser.add_argument_group('MODE', "Select which data you want to download.") + mode_help = "Depending on your choice here, this program will download and set up " + \ + "certain subsets of the data available from KEGG. Use --list-modes to see " + \ + f"a description of the options. 
Available modes: {', '.join(DOWNLOAD_MODES.keys())}" + groupM.add_argument('--mode', choices=DOWNLOAD_MODES.keys(), help=mode_help, default='all') + groupM.add_argument('--list-modes', **{'default': False, 'action': 'store_true', + 'help': "List the available modes and their descriptions."}) + + # common arguments + groupE = parser.add_argument_group('COMMON PARAMETERS', "These parameters apply to any mode.") + groupE.add_argument(*anvio.A('kegg-data-dir'), **anvio.K('kegg-data-dir')) + groupE.add_argument(*anvio.A('num-threads'), **anvio.K('num-threads')) + groupE.add_argument(*anvio.A('reset'), **anvio.K('reset')) + groupE.add_argument(*anvio.A('just-do-it'), **anvio.K('just-do-it')) + + groupA = parser.add_argument_group('MODE-SPECIFIC PARAMS', "Each section (underneath the program details) " + "below lists the parameters for one mode.") + + + if show_help: + parser.print_help() + + + subparsers = {} + for mode, info_dict in DOWNLOAD_MODES.items(): + subparser = argparse.ArgumentParser(usage=argparse.SUPPRESS, add_help=False) + + subparser._optionals.title = " \n%s\n%s" % (c(mode.upper(), "green"), ':' * 79) + for arg_name, arg_dict in info_dict['arguments'].items(): + subparser.add_argument(*arg_dict['flags'], **arg_dict['definition']) + + if show_help: + subparser.print_help() + + subparsers[mode] = subparser + + + if show_help: + sys.exit() + + args, unknown_args = parser.parse_known_args() + + try: + main(args, unknown_args) + + except ConfigError as e: + print(e) + sys.exit(-1) + except FilesNPathsError as e: + print(e) + sys.exit(-1) diff --git a/bin/anvi-setup-kegg-kofams b/bin/anvi-setup-kegg-kofams deleted file mode 100755 index bb9b6b26cc..0000000000 --- a/bin/anvi-setup-kegg-kofams +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 - -import sys - -import anvio -import anvio.kegg as kegg - -from anvio.errors import ConfigError, FilesNPathsError -from anvio.terminal import time_program - -__author__ = "Developers of anvi'o (see 
AUTHORS.txt)" -__copyright__ = "Copyleft 2015-2020, the Meren Lab (http://merenlab.org/)" -__license__ = "GPL 3.0" -__version__ = anvio.__version__ -__authors__ = ['ivagljiva', 'semiller10'] -__provides__ = ["kegg-data", "modules-db"] -__description__ = "Download and setup KEGG KOfam HMM profiles plus KEGG MODULE and KEGG BRITE data" - -@time_program -def main(args): - setup = kegg.KeggSetup(args) - setup.setup_data() - -if __name__ == '__main__': - from anvio.argparse import ArgumentParser - - parser = ArgumentParser(description=__description__) - groupS = parser.add_argument_group('DEFAULT SETUP - KEGG SNAPSHOT', "This program will set up metabolism " - "data from the KEGG resource of databases. By default, it will download " - "the snapshot of KEGG that is associated with the latest release of anvi'o. " - "But if you want a different snapshot, you can specify that. :) ") - groupS.add_argument(*anvio.A('kegg-snapshot'), **anvio.K('kegg-snapshot')) - - groupD = parser.add_argument_group('DOWNLOAD FROM KEGG WEBSITE', "Choose this option for the most up-to-date " - "version of KEGG data, downloaded using the KEGG API. May fail if KEGG " - "updates its formatting (but if that happens, you can always try the default " - "option).") - groupD.add_argument(*anvio.A('download-from-kegg'), **anvio.K('download-from-kegg')) - groupD.add_argument(*anvio.A('only-download'), **anvio.K('only-download')) - groupD.add_argument(*anvio.A('only-database'), **anvio.K('only-database')) - groupD.add_argument(*anvio.A('overwrite-output-destinations'), **anvio.K('overwrite-output-destinations', - {'help': "Overwrite any existing modules database " - "in the KEGG data directory " - "[USE WITH CAUTION]. Only relevant if you " - "are using the --only-database flag"})) - - groupA = parser.add_argument_group('SETUP FROM KEGG ARCHIVE', "Choose this option if you already have a " - ".tar.gz archive of anvi'o-formatted KEGG data on your computer. 
We'll " - "setup from this archive instead of downloading one.") - groupA.add_argument(*anvio.A('kegg-archive'), **anvio.K('kegg-archive')) - - groupE = parser.add_argument_group('EXTRAS', "Extras for the extra.") - groupE.add_argument(*anvio.A('kegg-data-dir'), **anvio.K('kegg-data-dir')) - groupE.add_argument(*anvio.A('skip-brite-hierarchies'), **anvio.K('skip-brite-hierarchies')) - groupE.add_argument(*anvio.A('reset'), **anvio.K('reset')) - groupE.add_argument(*anvio.A('just-do-it'), **anvio.K('just-do-it')) - - args = parser.get_args(parser) - - try: - main(args) - - except ConfigError as e: - print(e) - sys.exit(-1) - except FilesNPathsError as e: - print(e) - sys.exit(-1) diff --git a/bin/anvi-setup-protein-reference-database b/bin/anvi-setup-modelseed-database similarity index 82% rename from bin/anvi-setup-protein-reference-database rename to bin/anvi-setup-modelseed-database index ff9c12a63c..4c92c3346f 100755 --- a/bin/anvi-setup-protein-reference-database +++ b/bin/anvi-setup-modelseed-database @@ -20,7 +20,7 @@ __license__ = "GPL 3.0" __version__ = VERSION __authors__ = ['semiller10'] __requires__ = ['functions'] -__provides__ = [] +__provides__ = ["reaction-ref-data"] __description__ = DESCRIPTION @@ -28,8 +28,7 @@ def main() -> None: args = get_args() if args.db == 'ModelSEED': db = refdbs.ModelSEEDDatabase(args.dir) - elif args.db == 'KEGG': - db = refdbs.KEGGDatabase(args.dir, args.num_threads) + db.download(reset=args.reset) def get_args() -> Namespace: @@ -37,13 +36,11 @@ def get_args() -> Namespace: parser.add_argument( '--db', - choices=('KEGG', 'ModelSEED'), + choices=('ModelSEED',), type=str, help=( "The protein reference database of interest. " - "1) KEGG: All items from the Ortholog and Reaction databases are downloaded and " - "processed. 
" - "2) ModelSEED: The ModelSEED Biochemistry database harmonizes multiple reference " + "1) ModelSEED: The ModelSEED Biochemistry database harmonizes multiple reference " "databases, including KEGG, MetaCyc, and BiGG. The reactions and compounds tables are " "downloaded and processed." ) @@ -51,7 +48,7 @@ def get_args() -> Namespace: parser.add_argument( '--dir', default=refdbs.ProteinReferenceDatabase.default_superdir, type=str, help=( - "Directory in which a new subdirectory with the name of the database (e.g., 'kegg', " + "Directory in which a new subdirectory with the name of the database (e.g., " "'modelseed') is created containing database files." ) ) diff --git a/sandbox/anvi-script-estimate-metabolic-independence b/sandbox/anvi-script-estimate-metabolic-independence index 1999371a86..5d7b933947 100755 --- a/sandbox/anvi-script-estimate-metabolic-independence +++ b/sandbox/anvi-script-estimate-metabolic-independence @@ -83,7 +83,7 @@ def main(args): raise ConfigError("Anvi'o is unable to find the MODULES.db in the KEGG data directory you have specified :/") else: raise ConfigError("Anvi'o is unable to find the MODULES.db in the default KEGG data directory. You either " - "need to run `anvi-setup-kegg-kofams` to install the KEGG data in your environment, or " + "need to run `anvi-setup-kegg-data` to install the KEGG data in your environment, or " "you need to use the `--kegg-data-dir` parameter to specify where should anvi'o find it.") run.info("KEGG data directory", kegg_ctx.kegg_data_dir)