merged main

monarch-initiative · Aug 25, 2023 · 99cabad · 99cabad
2 parents c2fdc8f + 265e23a
commit 99cabad
Show file tree

Hide file tree

Showing 7 changed files with 143 additions and 116 deletions.
diff --git a/docs/Sources/index.md b/docs/Sources/index.md
@@ -0,0 +1,6 @@
+# Data Sources
+
+This section contains detailed information on all datasets and ontologies  
+ingested to create the Monarch knowledge graph.  
+
+To learn more about a specific dataset/ontology, click on the source name in the list to the left.  
diff --git a/mkdocs.yaml b/mkdocs.yaml
@@ -17,6 +17,7 @@ nav:
   - KG Build Process: 'KG-Build-Process/kg-build-process.md'
   - Principles: 'Principles/modeling-principles.md'
   - Sources:
+    - Overview: 'Sources/index.md'
     - Alliance: 'Sources/alliance.md'
     - BGee: 'Sources/bgee.md'
     - CTD: 'Sources/ctd.md'

diff --git a/poetry.lock b/poetry.lock
diff --git a/src/monarch_ingest/cli_utils.py b/src/monarch_ingest/cli_utils.py
@@ -350,6 +350,10 @@ def load_jsonl():
             edges_df = pandas.read_csv(edge_file, sep="\t", dtype="string", lineterminator="\n", quoting=csv.QUOTE_NONE,
                                    comment='#')
             edges_df["category"] = edges_df["category"].map(class_ancestor_dict)
+            # Translator expects only these two fields to carry the "biolink:" prefix, so
+            # they are duplicated here under prefixed column names (the unprefixed columns are kept)
+            edges_df["biolink:primary_knowledge_source"] = edges_df["primary_knowledge_source"]
+            edges_df["biolink:aggregator_knowledge_source"] = edges_df["aggregator_knowledge_source"]
             edges_df.to_json("output/monarch-kg_edges.jsonl", orient="records", lines=True)
             del edges_df
             gc.collect()

diff --git a/src/monarch_ingest/ingests.yaml b/src/monarch_ingest/ingests.yaml
@@ -20,8 +20,8 @@ goa_go_annotation:
   config: 'ingests/goa/go_annotation.yaml'
 hgnc_gene:
   config: 'ingests/hgnc/gene.yaml'
-hpoa_disease_phenotype:
-  config: 'ingests/hpoa/disease_phenotype.yaml'
+hpoa_disease_to_phenotype:
+  config: 'ingests/hpoa/disease_to_phenotype.yaml'
 hpoa_gene_to_disease:
   config: 'ingests/hpoa/gene_to_disease.yaml'
 hpoa_disease_mode_of_inheritance:

diff --git a/src/monarch_ingest/ingests/dictybase/gene.py b/src/monarch_ingest/ingests/dictybase/gene.py
@@ -5,13 +5,15 @@
 koza_app = get_koza_app("dictybase_gene")
 taxon_labels = koza_app.get_map("taxon-labels")
 
+in_taxon = "NCBITaxon:44689"
+in_taxon_label = taxon_labels[in_taxon]['label'] if in_taxon in taxon_labels else "Dictyostelium discoideum"
+
 while (row := koza_app.get_row()) is not None:
 
     synonyms = []
     if row['Synonyms'] is not None:
         synonyms = row['Synonyms'].split(", ")
 
-    in_taxon = "NCBITaxon:44689"
 
     gene = Gene(
         id='dictyBase:' + row['GENE ID'],
@@ -20,7 +22,7 @@
         full_name=row['Gene Name'],
         synonym=synonyms,
         in_taxon=[in_taxon],
-        in_taxon_label=taxon_labels[in_taxon]['label'],
+        in_taxon_label=in_taxon_label,
         provided_by=["infores:dictybase"]
     )
 

diff --git a/src/monarch_ingest/ingests/ncbi/gene.py b/src/monarch_ingest/ingests/ncbi/gene.py
@@ -5,10 +5,23 @@
 koza_app = get_koza_app("ncbi_gene")
 taxon_labels = koza_app.get_map("taxon-labels")
 
+# If a taxon label we need isn't in phenio's NCBITaxon subset, we can add it here
+extra_taxon_labels = {
+    'NCBITaxon:227321': 'Aspergillus nidulans FGSC A4'
+}
+
 while (row := koza_app.get_row()) is not None:
 
     in_taxon = 'NCBITaxon:' + row["tax_id"]
-    in_taxon_label = taxon_labels[in_taxon]["label"]
+
+
+    if in_taxon in taxon_labels:
+        in_taxon_label = taxon_labels[in_taxon]['label']
+    elif in_taxon in extra_taxon_labels:
+        in_taxon_label = extra_taxon_labels[in_taxon]
+    else:
+        raise ValueError(f"Taxon {in_taxon} not found in taxon-labels")
+
     gene = Gene(
         id='NCBIGene:' + row["GeneID"],
         symbol=row["Symbol"],