Skip to content

Commit

Permalink
format, update linting rule
Browse files (browse the repository at this point in the history)
  • Loading branch information
glass-ships committed May 16, 2024
1 parent e5084f1 commit 4544837
Show file tree
Hide file tree
Showing 81 changed files with 480 additions and 445 deletions.
5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -63,4 +63,7 @@ skip-string-normalization = true
line-length = 120
ignore = [
"F541", # f-strings with no placeholders
]

]
[tool.ruff.lint.per-file-ignores]
"tests/**.py" = ["F811"] # redefinition of unused imports (mock_koza)
3 changes: 2 additions & 1 deletion src/monarch_ingest/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from importlib import metadata
__version__ = metadata.version("monarch_ingest")

__version__ = metadata.version("monarch_ingest")
13 changes: 3 additions & 10 deletions src/monarch_ingest/cli_utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import csv
import gc
import os
import pkgutil
import sys
import tarfile
import yaml
Expand Down Expand Up @@ -192,7 +191,7 @@ def transform_phenio(
"primary_knowledge_source",
"aggregator_knowledge_source",
"knowledge_level",
"agent_type"
"agent_type",
]
),
axis=1,
Expand Down Expand Up @@ -524,15 +523,9 @@ def do_release(dir: str = OUTPUT_DIR, kghub: bool = False):
)
# index files on s3 after upload
sh.multi_indexer(
*f"-v --prefix https://kghub.io/kg-monarch/ -b kg-hub-public-data -r kg-monarch -x".split(
" "
)
)
sh.gsutil(
*f"-q -m cp -a public-read ./index.html s3://kg-hub-public-data/kg-monarch".split(
" "
)
*f"-v --prefix https://kghub.io/kg-monarch/ -b kg-hub-public-data -r kg-monarch -x".split(" ")
)
sh.gsutil(*f"-q -m cp -a public-read ./index.html s3://kg-hub-public-data/kg-monarch".split(" "))

logger.debug("Cleaning up files...")
sh.rm(f"output/{release_ver}")
Expand Down
11 changes: 4 additions & 7 deletions src/monarch_ingest/ingests/alliance/gene.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,21 +48,18 @@
id=gene_id,
symbol=row["symbol"],
name=row["symbol"],
full_name=row["name"].replace("\r",""), # Replacement to remove stray carriage returns in XenBase files
full_name=row["name"].replace("\r", ""), # Replacement to remove stray carriage returns in XenBase files
# No place in the schema for gene type (SO term) right now
# type=row["soTermId"],
in_taxon=[in_taxon],
in_taxon_label=in_taxon_label,
provided_by=[source]
provided_by=[source],
)

if row["basicGeneticEntity"]["crossReferences"]:
gene.xref = [
koza_app.curie_cleaner.clean(xref["id"])
for xref in row["basicGeneticEntity"]["crossReferences"]
]
gene.xref = [koza_app.curie_cleaner.clean(xref["id"]) for xref in row["basicGeneticEntity"]["crossReferences"]]
if "synonyms" in row["basicGeneticEntity"].keys():
# more handling for errant carriage returns
gene.synonym = [synonym.replace("\r","") for synonym in row["basicGeneticEntity"]["synonyms"] ]
gene.synonym = [synonym.replace("\r", "") for synonym in row["basicGeneticEntity"]["synonyms"]]

koza_app.write(gene)
11 changes: 7 additions & 4 deletions src/monarch_ingest/ingests/alliance/gene_to_phenotype.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@
from koza.cli_utils import get_koza_app
from source_translation import source_map

from biolink_model.datamodel.pydanticmodel_v2 import GeneToPhenotypicFeatureAssociation, KnowledgeLevelEnum, AgentTypeEnum
from biolink_model.datamodel.pydanticmodel_v2 import (
GeneToPhenotypicFeatureAssociation,
KnowledgeLevelEnum,
AgentTypeEnum,
)


from loguru import logger
Expand Down Expand Up @@ -42,9 +46,8 @@
publications=[row["evidence"]["publicationId"]],
aggregator_knowledge_source=["infores:monarchinitiative", "infores:alliancegenome"],
primary_knowledge_source=source,
knowledge_level = KnowledgeLevelEnum.knowledge_assertion,
agent_type = AgentTypeEnum.manual_agent

knowledge_level=KnowledgeLevelEnum.knowledge_assertion,
agent_type=AgentTypeEnum.manual_agent,
)

if "conditionRelations" in row.keys() and row["conditionRelations"] is not None:
Expand Down
16 changes: 8 additions & 8 deletions src/monarch_ingest/ingests/alliance/publication.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,7 @@
while (row := koza_app.get_row()) is not None:

# TODO: remove DOI exclusion once curie regex can handle them
xrefs = [
xref["id"] for xref in row["crossReferences"] if not xref["id"].startswith("DOI:")
]
xrefs = [xref["id"] for xref in row["crossReferences"] if not xref["id"].startswith("DOI:")]

# Parse creation date for different time formats
creation_date = row["datePublished"]
Expand All @@ -21,10 +19,12 @@
creation_date = None

source: str
if 'MODReferenceTypes' in row and \
len(row['MODReferenceTypes']) > 0 and \
'source' in row['MODReferenceTypes'][0] and \
row['MODReferenceTypes'][0]['source'] in source_map:
if (
'MODReferenceTypes' in row
and len(row['MODReferenceTypes']) > 0
and 'source' in row['MODReferenceTypes'][0]
and row['MODReferenceTypes'][0]['source'] in source_map
):
source = source_map[row['MODReferenceTypes'][0]['source']]
else: # default source
source = "infores:alliancegenome"
Expand All @@ -36,7 +36,7 @@
xref=xrefs,
type=[koza_app.translation_table.resolve_term("publication")],
creation_date=creation_date,
provided_by=[source]
provided_by=[source],
)

if "authors" in row.keys():
Expand Down
47 changes: 24 additions & 23 deletions src/monarch_ingest/ingests/bgee/gene_to_expression_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,17 @@
def filter_group_by_rank(rows: List, col: str, largest_n: int = 0, smallest_n: int = 0) -> List[Dict]:
"""Function to filter a group of Koza rows by values largest or smallest values in column:
Get the top and/or bottom n rows ranked based on column:
Get the top and/or bottom n rows ranked based on column:
Args:
rows (List): The Koza object to read rows from.
col (str): The column to perform ranking and filtering.
largest_n (int): The number of rows to return from the largest ranking.
smallest_n (int): The number of rows to return from the smallest ranking.
Args:
rows (List): The Koza object to read rows from.
col (str): The column to perform ranking and filtering.
largest_n (int): The number of rows to return from the largest ranking.
smallest_n (int): The number of rows to return from the smallest ranking.
Returns:
List[Dict]: Returns a list of n rows in Koza dict format sorted by rank in column.
Returns:
List[Dict]: Returns a list of n rows in Koza dict format sorted by rank in column.
"""
df = pd.DataFrame(rows)
largest_df = df.nlargest(largest_n, col, keep="first")
Expand All @@ -29,11 +29,11 @@ def filter_group_by_rank(rows: List, col: str, largest_n: int = 0, smallest_n: i
def write_group(rows: List, koza_app: KozaApp):
"""Function to write a group of Koza rows to KozaApp object output:
Write list of rows in Koza format to KozaApp output:
Write list of rows in Koza format to KozaApp output:
Args:
rows (List): A list of rows to output to KozaApp.
koza_app (KozaApp): The KozaApp to use for output of rows.
Args:
rows (List): A list of rows to output to KozaApp.
koza_app (KozaApp): The KozaApp to use for output of rows.
"""
for row in rows:
association = GeneToExpressionSiteAssociation(
Expand All @@ -44,23 +44,24 @@ def write_group(rows: List, koza_app: KozaApp):
primary_knowledge_source="infores:bgee",
aggregator_knowledge_source=["infores:monarchinitiative"],
knowledge_level=KnowledgeLevelEnum.knowledge_assertion,
agent_type=AgentTypeEnum.not_provided)
agent_type=AgentTypeEnum.not_provided,
)

koza_app.write(association)


def get_row_group(koza_app: KozaApp, col: str = 'Gene ID') -> Union[List, None]:
"""Function to read a group of Koza rows from a KozaApp:
Get a group of rows from KozaApp grouped on column:
Get a group of rows from KozaApp grouped on column:
Args:
koza_app (KozaApp): The Koza object to read rows from.
col (str): The column to group rows based on.
Args:
koza_app (KozaApp): The Koza object to read rows from.
col (str): The column to group rows based on.
Returns:
List/None: Returns a list of rows in Koza dict format grouped by column.
Returns:
List/None: Returns a list of rows in Koza dict format grouped by column.
"""
if not hasattr(koza_app, 'previous_row'):
koza_app.previous_row = koza_app.get_row()
Expand All @@ -81,11 +82,11 @@ def get_row_group(koza_app: KozaApp, col: str = 'Gene ID') -> Union[List, None]:
def process_koza_source(koza_app: KozaApp):
"""Function to filter a group of Koza rows by values largest or smallest values in column:
Get the top and/or bottom n rows ranked based on column:
Get the top and/or bottom n rows ranked based on column:
Args:
koza_app (KozaApp): The Koza object to process for ingest.
Args:
koza_app (KozaApp): The Koza object to process for ingest.
"""
while(row_group := get_row_group(koza_app)) is not None:
while (row_group := get_row_group(koza_app)) is not None:
rank_filtered_rows = filter_group_by_rank(row_group, col='Expression rank', smallest_n=10)
write_group(rank_filtered_rows, koza_app)
10 changes: 7 additions & 3 deletions src/monarch_ingest/ingests/biogrid/biogrid.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,12 @@
publications = get_publication_ids(row['Publication Identifiers'])

# Only keep interactions using NCBIGene or UniProtKB identifiers, could also filter on taxid
if gid_a.startswith("NCBIGene:") or gid_a.startswith("UniProtKB:") \
and gid_b.startswith("NCBIGene:") or gid_b.startswith("UniProtKB:"):
if (
gid_a.startswith("NCBIGene:")
or gid_a.startswith("UniProtKB:")
and gid_b.startswith("NCBIGene:")
or gid_b.startswith("UniProtKB:")
):
association = PairwiseGeneToGeneInteraction(
id="uuid:" + str(uuid.uuid1()),
subject=gid_a,
Expand All @@ -27,7 +31,7 @@
primary_knowledge_source="infores:biogrid",
aggregator_knowledge_source=["infores:monarchinitiative"],
knowledge_level=KnowledgeLevelEnum.knowledge_assertion,
agent_type=AgentTypeEnum.not_provided
agent_type=AgentTypeEnum.not_provided,
)

koza_app.write(association)
20 changes: 10 additions & 10 deletions src/monarch_ingest/ingests/biogrid/biogrid_util.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""
Some functions to assist parsing of BioGRID fields.
"""
from sys import stderr

from typing import List, Optional

from loguru import logger
Expand All @@ -14,9 +14,7 @@ def get_gene_id(raw_id: str) -> str:
:param raw_id: str, raw BioGRID input string (a pseudo-CURIE)
:return:
"""
gid = (raw_id
.replace("entrez gene/locuslink:", "NCBIGene:")
.replace("uniprot/swiss-prot:", "UniProtKB:"))
gid = raw_id.replace("entrez gene/locuslink:", "NCBIGene:").replace("uniprot/swiss-prot:", "UniProtKB:")

return gid

Expand All @@ -27,14 +25,14 @@ def get_gene_id(raw_id: str) -> str:
"two hybrid": "ECO:0000024",
"affinity chromatography technology": "ECO:0000079",
"genetic interference": "ECO:0000011",
"pull down": "ECO:0000025", # not totally sure about this one
"pull down": "ECO:0000025", # not totally sure about this one
"enzymatic study": "ECO:0000005",
"x-ray crystallography": "ECO:0001823",
"far western blotting": "ECO:0000076",
"fluorescent resonance energy transfer": "ECO:0001048",
"imaging technique": "ECO:0000324", # not totally sure about this one
"imaging technique": "ECO:0000324", # not totally sure about this one
"protein complementation assay": "ECO:0006256", # not totally sure about this one
"biochemical": "ECO:0000172" # not totally sure about this one
"biochemical": "ECO:0000172", # not totally sure about this one
}


Expand All @@ -54,11 +52,13 @@ def get_evidence(methods: str) -> Optional[List[str]]:
# databaseName:identifier(methodName)
method = method.rstrip(")").split('(')[-1]
if method not in EVIDENCE_CODE_MAPPINGS.keys():
err_msg = f"Unknown interaction detection method '{method}'. " +\
"Assigning default code ECO:0000006 == 'experimental evidence', the ECO root."
err_msg = (
f"Unknown interaction detection method '{method}'. "
+ "Assigning default code ECO:0000006 == 'experimental evidence', the ECO root."
)
logger.warning(err_msg)
EVIDENCE_CODE_MAPPINGS[method] = "ECO:0000006"

evidence_codes.append(EVIDENCE_CODE_MAPPINGS[method])

return evidence_codes if evidence_codes else None
Expand Down
9 changes: 6 additions & 3 deletions src/monarch_ingest/ingests/ctd/chemical_to_disease.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@

from koza.cli_utils import get_koza_app

from biolink_model.datamodel.pydanticmodel_v2 import ChemicalToDiseaseOrPhenotypicFeatureAssociation, \
KnowledgeLevelEnum, AgentTypeEnum
from biolink_model.datamodel.pydanticmodel_v2 import (
ChemicalToDiseaseOrPhenotypicFeatureAssociation,
KnowledgeLevelEnum,
AgentTypeEnum,
)
from monarch_ingest.constants import BIOLINK_TREATS_OR_APPLIED_OR_STUDIED_TO_TREAT

koza_app = get_koza_app("ctd_chemical_to_disease")
Expand All @@ -28,7 +31,7 @@
aggregator_knowledge_source=["infores:monarchinitiative"],
primary_knowledge_source="infores:ctd",
knowledge_level=KnowledgeLevelEnum.knowledge_assertion,
agent_type=AgentTypeEnum.manual_agent
agent_type=AgentTypeEnum.manual_agent,
)

koza_app.write(association)
5 changes: 2 additions & 3 deletions src/monarch_ingest/ingests/dictybase/gene.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,11 @@
in_taxon_label = taxon_labels[in_taxon]['label'] if in_taxon in taxon_labels else "Dictyostelium discoideum"

while (row := koza_app.get_row()) is not None:

synonyms = []
if row['Synonyms'] is not None:
synonyms = row['Synonyms'].split(", ")


gene = Gene(
id='dictyBase:' + row['GENE ID'],
symbol=row['Gene Name'],
Expand All @@ -23,7 +22,7 @@
synonym=synonyms,
in_taxon=[in_taxon],
in_taxon_label=in_taxon_label,
provided_by=["infores:dictybase"]
provided_by=["infores:dictybase"],
)

koza_app.write(gene)
12 changes: 7 additions & 5 deletions src/monarch_ingest/ingests/dictybase/gene_to_phenotype.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import uuid
from typing import Optional, Tuple

from koza.cli_utils import get_koza_app
from monarch_ingest.ingests.dictybase.utils import parse_gene_id, parse_phenotypes
from monarch_ingest.ingests.dictybase.utils import parse_phenotypes

from biolink_model.datamodel.pydanticmodel_v2 import GeneToPhenotypicFeatureAssociation, KnowledgeLevelEnum, \
AgentTypeEnum
from biolink_model.datamodel.pydanticmodel_v2 import (
GeneToPhenotypicFeatureAssociation,
KnowledgeLevelEnum,
AgentTypeEnum,
)

koza_app = get_koza_app("dictybase_gene_to_phenotype")

Expand All @@ -32,7 +34,7 @@
aggregator_knowledge_source=["infores:monarchinitiative"],
primary_knowledge_source="infores:dictybase",
knowledge_level=KnowledgeLevelEnum.knowledge_assertion,
agent_type=AgentTypeEnum.manual_agent
agent_type=AgentTypeEnum.manual_agent,
)

koza_app.write(association)
1 change: 1 addition & 0 deletions src/monarch_ingest/ingests/dictybase/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
A few Dictybase parse utility functions
"""

from typing import Optional, Tuple, Dict, List


Expand Down
Loading

0 comments on commit 4544837

Please sign in to comment.