Merge pull request #587 from monarch-initiative/update-koza

Update Koza, imports, and tests. Fix lint/format make targets
monarch-initiative · May 16, 2024 · e137505
2 parents 0d51e68 + 4544837
commit e137505
Show file tree

Hide file tree

Showing 90 changed files with 1,549 additions and 1,427 deletions.
diff --git a/Makefile b/Makefile
@@ -12,6 +12,7 @@ WGET = /usr/bin/env wget --timestamping --no-verbose
 .DEFAULT_GOAL := all
 SHELL := bash
 
+RUN = poetry run
 
 .PHONY: all
 all: install format test clean
@@ -29,12 +30,12 @@ install-full:
 
 .PHONY: test
 test: install
-	poetry run python -m pytest tests
+	$(RUN) python -m pytest tests
 
 
 .PHONY: docs
 docs: install-full
-	poetry run typer src/monarch_ingest/main.py utils docs --name ingest --output docs/CLI.md
+	$(RUN) typer src/monarch_ingest/main.py utils docs --name ingest --output docs/CLI.md
 
 
 .PHONY: clean
@@ -47,18 +48,11 @@ clean:
 
 .PHONY: lint
 lint: install-full
-	poetry run flake8 --exit-zero --max-line-length 120 src/monarch_ingest/ tests/
-	poetry run black --check --diff monarch_ingest tests
-	poetry run isort --check-only --diff monarch_ingest tests
+	$(RUN) ruff check --diff --exit-zero src/ tests/
+	$(RUN) black --check --diff -l 120 src/ tests/
 
 
 .PHONY: format
 format: install-full
-	poetry run autoflake \
-		--recursive \
-		--remove-all-unused-imports \
-		--remove-unused-variables \
-		--ignore-init-module-imports \
-		--in-place monarch_ingest tests
-	poetry run isort monarch_ingest tests
-	poetry run black monarch_ingest tests
+	$(RUN) ruff check --fix --exit-zero src/ tests/
+	$(RUN) black -l 120 src/ tests/
diff --git a/docs/Create-an-Ingest/4. Implement.md b/docs/Create-an-Ingest/4. Implement.md
@@ -7,7 +7,7 @@ Most Koza scripts can run in flat mode, which means that the transform code itse
 Start with the imports, and make sure to set the source_name, which will be used for communicating with the reader and writer.
 
 ```python
-from koza.cli_runner import koza_app
+from koza.cli_utils import koza_app
 from biolink.pydanticmodel_v2 import Gene
 
 # The source name is used for reading and writing

diff --git a/docs/Create-an-Ingest/5. Test.md b/docs/Create-an-Ingest/5. Test.md
@@ -8,7 +8,7 @@ First, set up your basic fixtures, taking care to set the correct source name an
 
 ```python
 import pytest
-from koza.cli_runner import get_translation_table
+from koza.cli_utils import get_translation_table
 
 @pytest.fixture
 def tt():

diff --git a/ingest_template/example_test.py b/ingest_template/example_test.py
@@ -1,5 +1,5 @@
 import pytest
-from koza.cli_runner import get_translation_table
+from koza.cli_utils import get_translation_table
 
 
 @pytest.fixture

diff --git a/ingest_template/source-file-template-csv.py b/ingest_template/source-file-template-csv.py
@@ -1,7 +1,7 @@
 from typing import List
 
 from biolink.pydanticmodel_v2 import Gene
-from koza.cli_runner import koza_app
+from koza.cli_utils import koza_app
 
 # You've got 'NCBI_Gene:' and you want 'NCBIGene:'? clean it up.
 curie_cleaner = koza_app.curie_cleaner

diff --git a/ingest_template/source-file-template-json.py b/ingest_template/source-file-template-json.py
@@ -7,7 +7,7 @@
     GeneToPhenotypicFeatureAssociation,
     PhenotypicFeature,
 )
-from koza.cli_runner import koza_app
+from koza.cli_utils import koza_app
 
 # include logging if necessary
 from loguru import logger

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -15,37 +15,36 @@ packages = [
 ]
 
 [tool.poetry.dependencies]
-python = ">=3.10,<3.12"
-# biolink-model = "^4.2.0"
-# When 4.2.0 (or any release after 4.1.6) is released, we can remove the git dependency
-biolink-model = { git = "https://github.com/biolink/biolink-model", branch = "master" }
+python = "^3.10"
+biolink-model = "^4.2.0"
 bmt = "^1.0.15"
 cat-merge = "0.2.1"
 closurizer = "0.5.1"
 kghub-downloader = "^0.3.2"
-kgx = { git = "https://github.com/biolink/kgx", branch = "master" } # ">=2.1"
-koza = ">=0.5.2"
-linkml = "1.6.3"
-linkml-solr = "0.1.5" # "^0.1.3"
+kgx = ">=2.4.0"
+koza = ">=0.6.0"
+linkml = "^1.7.8"
+linkml-solr = ">=0.1.5"
 multi-indexer = "0.0.5"
 # Other Dependencies
 botocore = "^1.31"
 importlib-metadata = ">=4.6.1"
 loguru = "*"
 pydantic = "^2.5"
 sh = "^1.14.3"
-typer = "^0.7"
-typer-cli = "^0.0.13"
+typer = "^0.12"
 yamllint = "^1.35.1"
-linkml-runtime = "1.6.3"
+linkml-runtime = "^1.7.5"
+# Remove this once cat-merge fixes its pandas dependency
+pandas = "2.0.3"
 
 [tool.poetry.group.dev]
 optional = true
 
 [tool.poetry.group.dev.dependencies]
-pytest = "^7.1.1"
-mkdocs = "^1.3.0"
-mkdocs-material = "^8.2.9"
+pytest = "^8.1.1"
+mkdocs = "^1.4"
+mkdocs-material = ">=9.5"
 black = "^24.3"
 ruff = "*"
 
@@ -57,11 +56,14 @@ requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
 
 [tool.black]
-line_length = 120
+line-length = 120
 skip-string-normalization = true
 
 [tool.ruff]
 line-length = 120
 ignore = [
     "F541", # f-strings with no placeholders
-]
+
+]
+[tool.ruff.lint.per-file-ignores]
+"tests/**.py" = ["F811"] # redefinition of unused imports (mock_koza)
diff --git a/src/monarch_ingest/__init__.py b/src/monarch_ingest/__init__.py
@@ -1,2 +1,3 @@
 from importlib import metadata
-__version__ = metadata.version("monarch_ingest")
+
+__version__ = metadata.version("monarch_ingest")
diff --git a/src/monarch_ingest/cli_utils.py b/src/monarch_ingest/cli_utils.py
@@ -1,7 +1,6 @@
 import csv
 import gc
 import os
-import pkgutil
 import sys
 import tarfile
 import yaml
@@ -18,7 +17,7 @@
 from cat_merge.merge import merge
 from closurizer.closurizer import add_closure
 from kgx.cli.cli_utils import transform as kgx_transform
-from koza.cli_runner import transform_source
+from koza.cli_utils import transform_source
 from koza.model.config.source_config import OutputFormat
 from linkml_runtime.utils.formatutils import camelcase
 
@@ -192,7 +191,7 @@ def transform_phenio(
                 "primary_knowledge_source",
                 "aggregator_knowledge_source",
                 "knowledge_level",
-                "agent_type"
+                "agent_type",
             ]
         ),
         axis=1,
@@ -524,15 +523,9 @@ def do_release(dir: str = OUTPUT_DIR, kghub: bool = False):
             )
             # index files on s3 after upload
             sh.multi_indexer(
-                *f"-v --prefix https://kghub.io/kg-monarch/ -b kg-hub-public-data -r kg-monarch -x".split(
-                    " "
-                )
-            )
-            sh.gsutil(
-                *f"-q -m cp -a public-read ./index.html s3://kg-hub-public-data/kg-monarch".split(
-                    " "
-                )
+                *f"-v --prefix https://kghub.io/kg-monarch/ -b kg-hub-public-data -r kg-monarch -x".split(" ")
             )
+            sh.gsutil(*f"-q -m cp -a public-read ./index.html s3://kg-hub-public-data/kg-monarch".split(" "))
 
         logger.debug("Cleaning up files...")
         sh.rm(f"output/{release_ver}")

diff --git a/src/monarch_ingest/ingests/alliance/gene.py b/src/monarch_ingest/ingests/alliance/gene.py
@@ -1,4 +1,4 @@
-from koza.cli_runner import get_koza_app
+from koza.cli_utils import get_koza_app
 from source_translation import source_map
 
 from biolink_model.datamodel.pydanticmodel_v2 import Gene
@@ -48,21 +48,18 @@
         id=gene_id,
         symbol=row["symbol"],
         name=row["symbol"],
-        full_name=row["name"].replace("\r",""), # Replacement to remove stray carriage returns in XenBase files
+        full_name=row["name"].replace("\r", ""),  # Replacement to remove stray carriage returns in XenBase files
         # No place in the schema for gene type (SO term) right now
         # type=row["soTermId"],
         in_taxon=[in_taxon],
         in_taxon_label=in_taxon_label,
-        provided_by=[source]
+        provided_by=[source],
     )
 
     if row["basicGeneticEntity"]["crossReferences"]:
-        gene.xref = [
-            koza_app.curie_cleaner.clean(xref["id"])
-            for xref in row["basicGeneticEntity"]["crossReferences"]
-        ]
+        gene.xref = [koza_app.curie_cleaner.clean(xref["id"]) for xref in row["basicGeneticEntity"]["crossReferences"]]
     if "synonyms" in row["basicGeneticEntity"].keys():
         # more handling for errant carriage returns
-        gene.synonym = [synonym.replace("\r","") for synonym in row["basicGeneticEntity"]["synonyms"] ]
+        gene.synonym = [synonym.replace("\r", "") for synonym in row["basicGeneticEntity"]["synonyms"]]
 
     koza_app.write(gene)
diff --git a/src/monarch_ingest/ingests/alliance/gene_to_expression.py b/src/monarch_ingest/ingests/alliance/gene_to_expression.py
@@ -1,7 +1,6 @@
-
 import uuid
 
-from koza.cli_runner import get_koza_app
+from koza.cli_utils import get_koza_app
 from source_translation import source_map
 
 from biolink_model.datamodel.pydanticmodel_v2 import GeneToExpressionSiteAssociation, KnowledgeLevelEnum, AgentTypeEnum
@@ -33,30 +32,27 @@
         #       but may have an UBERON term that we can use
         # stage_term_id = get_data(row, "whenExpressed.stageUberonSlimTerm.uberonTerm")
 
-
-
         publication_ids = [get_data(row, "evidence.publicationId")]
 
         xref = get_data(row, "crossReference.id")
         if xref:
             publication_ids.append(xref)
 
-
         # Our current ingest policy is to first use a reported Anatomical structure term...
         if anatomical_entity_id:
             koza_app.write(
                 GeneToExpressionSiteAssociation(
                     id="uuid:" + str(uuid.uuid1()),
                     subject=gene_id,
-                    predicate='biolink:expressed_in',
+                    predicate="biolink:expressed_in",
                     object=anatomical_entity_id,
                     stage_qualifier=stage_term_id,
                     qualifiers=([get_data(row, "assay")] if get_data(row, "assay") else None),
                     publications=publication_ids,
                     aggregator_knowledge_source=["infores:monarchinitiative", "infores:alliancegenome"],
                     primary_knowledge_source=source,
                     knowledge_level=KnowledgeLevelEnum.knowledge_assertion,
-                    agent_type=AgentTypeEnum.manual_agent
+                    agent_type=AgentTypeEnum.manual_agent,
                 )
             )
 
@@ -67,15 +63,15 @@
                 GeneToExpressionSiteAssociation(
                     id="uuid:" + str(uuid.uuid1()),
                     subject=gene_id,
-                    predicate='biolink:expressed_in',
+                    predicate="biolink:expressed_in",
                     object=cellular_component_id,
                     stage_qualifier=stage_term_id,
                     qualifiers=([get_data(row, "assay")] if get_data(row, "assay") else None),
                     publications=publication_ids,
                     aggregator_knowledge_source=["infores:monarchinitiative", "infores:alliancegenome"],
                     primary_knowledge_source=source,
                     knowledge_level=KnowledgeLevelEnum.knowledge_assertion,
-                    agent_type=AgentTypeEnum.manual_agent
+                    agent_type=AgentTypeEnum.manual_agent,
                 )
             )
         else:
@@ -85,6 +81,4 @@
             )
 
     except Exception as exc:
-        logger.error(
-            f"Alliance gene expression ingest parsing exception for data row:\n\t'{str(row)}'\n{str(exc)}"
-        )
+        logger.error(f"Alliance gene expression ingest parsing exception for data row:\n\t'{str(row)}'\n{str(exc)}")
diff --git a/src/monarch_ingest/ingests/alliance/gene_to_phenotype.py b/src/monarch_ingest/ingests/alliance/gene_to_phenotype.py
@@ -2,10 +2,14 @@
 
 import uuid
 
-from koza.cli_runner import get_koza_app
+from koza.cli_utils import get_koza_app
 from source_translation import source_map
 
-from biolink_model.datamodel.pydanticmodel_v2 import GeneToPhenotypicFeatureAssociation, KnowledgeLevelEnum, AgentTypeEnum
+from biolink_model.datamodel.pydanticmodel_v2 import (
+    GeneToPhenotypicFeatureAssociation,
+    KnowledgeLevelEnum,
+    AgentTypeEnum,
+)
 
 
 from loguru import logger
@@ -42,9 +46,8 @@
             publications=[row["evidence"]["publicationId"]],
             aggregator_knowledge_source=["infores:monarchinitiative", "infores:alliancegenome"],
             primary_knowledge_source=source,
-            knowledge_level = KnowledgeLevelEnum.knowledge_assertion,
-            agent_type = AgentTypeEnum.manual_agent
-
+            knowledge_level=KnowledgeLevelEnum.knowledge_assertion,
+            agent_type=AgentTypeEnum.manual_agent,
         )
 
         if "conditionRelations" in row.keys() and row["conditionRelations"] is not None:

diff --git a/src/monarch_ingest/ingests/alliance/publication.py b/src/monarch_ingest/ingests/alliance/publication.py
@@ -1,4 +1,4 @@
-from koza.cli_runner import get_koza_app
+from koza.cli_utils import get_koza_app
 from dateutil.parser import parse, ParserError
 
 from biolink_model.datamodel.pydanticmodel_v2 import Publication
@@ -9,9 +9,7 @@
 while (row := koza_app.get_row()) is not None:
 
     # TODO: remove DOI exclusion once curie regex can handle them
-    xrefs = [
-        xref["id"] for xref in row["crossReferences"] if not xref["id"].startswith("DOI:")
-    ]
+    xrefs = [xref["id"] for xref in row["crossReferences"] if not xref["id"].startswith("DOI:")]
 
     # Parse creation date for different time formats
     creation_date = row["datePublished"]
@@ -21,10 +19,12 @@
         creation_date = None
 
     source: str
-    if 'MODReferenceTypes' in row and \
-            len(row['MODReferenceTypes']) > 0 and \
-            'source' in row['MODReferenceTypes'][0] and \
-            row['MODReferenceTypes'][0]['source'] in source_map:
+    if (
+        'MODReferenceTypes' in row
+        and len(row['MODReferenceTypes']) > 0
+        and 'source' in row['MODReferenceTypes'][0]
+        and row['MODReferenceTypes'][0]['source'] in source_map
+    ):
         source = source_map[row['MODReferenceTypes'][0]['source']]
     else:  # default source
         source = "infores:alliancegenome"
@@ -36,7 +36,7 @@
         xref=xrefs,
         type=[koza_app.translation_table.resolve_term("publication")],
         creation_date=creation_date,
-        provided_by=[source]
+        provided_by=[source],
     )
 
     if "authors" in row.keys():

diff --git a/src/monarch_ingest/ingests/bgee/gene_to_expression.py b/src/monarch_ingest/ingests/bgee/gene_to_expression.py
@@ -1,4 +1,4 @@
-from koza.cli_runner import get_koza_app
+from koza.cli_utils import get_koza_app
 from monarch_ingest.ingests.bgee.gene_to_expression_utils import process_koza_source