From 3774dec204f9d6aa47ef338553f40b28ec060493 Mon Sep 17 00:00:00 2001 From: cmungall Date: Mon, 31 Jul 2023 14:26:25 -0700 Subject: [PATCH] Adding generate-extract command, 158. Add cell type templates #159 This PR does two things: - Add a combined generate-extract command, fixes #158 - Adds cell type templates, fixes #159 --- src/ontogpt/cli.py | 28 ++- src/ontogpt/engines/knowledge_engine.py | 6 +- src/ontogpt/engines/spires_engine.py | 8 + src/ontogpt/templates/cell_type.py | 203 ++++++++++++++++++ src/ontogpt/templates/cell_type.yaml | 143 ++++++++++++ .../input/cases/cell-type-salivary-acinar.txt | 5 + 6 files changed, 387 insertions(+), 6 deletions(-) create mode 100644 src/ontogpt/templates/cell_type.py create mode 100644 src/ontogpt/templates/cell_type.yaml create mode 100644 tests/input/cases/cell-type-salivary-acinar.txt diff --git a/src/ontogpt/cli.py b/src/ontogpt/cli.py index 85c4917a5..e21cf4108 100644 --- a/src/ontogpt/cli.py +++ b/src/ontogpt/cli.py @@ -157,6 +157,11 @@ def write_extraction( default="yaml", help="Output format.", ) +auto_prefix_option = click.option( + "--auto-prefix", + default="AUTO", + help="Prefix to use for auto-generated classes. Default is AUTO.", +) @click.group() @@ -201,11 +206,7 @@ def main(verbose: int, quiet: bool, cache_db: str, skip_annotator): @click.option("--dictionary") @output_format_options @use_textract_options -@click.option( - "--auto-prefix", - default="AUTO", - help="Prefix to use for auto-generated classes. 
Default is AUTO.", -) +@auto_prefix_option @click.option( "--set-slot-value", "-S", @@ -304,6 +305,23 @@ def extract( write_extraction(results, output, output_format, ke) +@main.command() +@template_option +@model_option +@recurse_option +@output_option_wb +@output_format_options +@auto_prefix_option +@click.argument("entity") +def generate_extract(entity, template, output, output_format, **kwargs): + """Generate text using GPT and then extract knowledge from it.""" + logging.info(f"Creating for {template}") + ke = SPIRESEngine(template, **kwargs) + logging.debug(f"Input entity: {entity}") + results = ke.generate_and_extract(entity) + write_extraction(results, output, output_format) + + @main.command() @template_option @model_option diff --git a/src/ontogpt/engines/knowledge_engine.py b/src/ontogpt/engines/knowledge_engine.py index 018a4e704..71c1c67c3 100644 --- a/src/ontogpt/engines/knowledge_engine.py +++ b/src/ontogpt/engines/knowledge_engine.py @@ -155,7 +155,11 @@ def __post_init__(self): self.mappers = [get_adapter("translator:")] self.set_up_client() - self.encoding = tiktoken.encoding_for_model(self.client.model) + try: + self.encoding = tiktoken.encoding_for_model(self.client.model) + except KeyError: + self.encoding = tiktoken.encoding_for_model("gpt-3.5-turbo") + logger.error(f"Could not find encoding for model {self.client.model}") def set_api_key(self, key: str): self.api_key = key diff --git a/src/ontogpt/engines/spires_engine.py b/src/ontogpt/engines/spires_engine.py index 61423b634..6ed31ed81 100644 --- a/src/ontogpt/engines/spires_engine.py +++ b/src/ontogpt/engines/spires_engine.py @@ -92,10 +92,18 @@ def extract_from_text( named_entities=self.named_entities, ) + def _extract_from_text_to_dict(self, text: str, cls: ClassDefinition = None) -> RESPONSE_DICT: raw_text = self._raw_extract(text, cls) return self._parse_response_to_dict(raw_text, cls) + def generate_and_extract( + self, entity: str, **kwargs + ) -> ExtractionResult: + prompt = 
f"Generate a comprehensive description of {entity}.\n" + payload = self.client.complete(prompt) + return self.extract_from_text(payload, **kwargs) + def generalize( self, object: Union[pydantic.BaseModel, dict], examples: List[EXAMPLE] ) -> ExtractionResult: diff --git a/src/ontogpt/templates/cell_type.py b/src/ontogpt/templates/cell_type.py new file mode 100644 index 000000000..f992f21b4 --- /dev/null +++ b/src/ontogpt/templates/cell_type.py @@ -0,0 +1,203 @@ +from __future__ import annotations +from datetime import datetime, date +from enum import Enum +from typing import List, Dict, Optional, Any, Union, Literal +from pydantic import BaseModel as BaseModel, Field +from linkml_runtime.linkml_model import Decimal + +metamodel_version = "None" +version = "None" + +class WeakRefShimBaseModel(BaseModel): + __slots__ = '__weakref__' + +class ConfiguredBaseModel(WeakRefShimBaseModel, + validate_assignment = True, + validate_all = True, + underscore_attrs_are_private = True, + extra = 'forbid', + arbitrary_types_allowed = True): + pass + + +class BrainRegionIdentifier(str, Enum): + + + dummy = "dummy" + + +class NullDataOptions(str, Enum): + + UNSPECIFIED_METHOD_OF_ADMINISTRATION = "UNSPECIFIED_METHOD_OF_ADMINISTRATION" + NOT_APPLICABLE = "NOT_APPLICABLE" + NOT_MENTIONED = "NOT_MENTIONED" + + + +class CellTypeDocument(ConfiguredBaseModel): + + cell_type: Optional[str] = Field(None, description="""the name of the cell type described""") + range: Optional[str] = Field(None) + parents: Optional[List[str]] = Field(default_factory=list, description="""categorization""") + subtypes: Optional[List[str]] = Field(default_factory=list) + localizations: Optional[List[str]] = Field(default_factory=list) + genes: Optional[List[str]] = Field(default_factory=list) + diseases: Optional[List[str]] = Field(default_factory=list) + + + +class InterneuronDocument(CellTypeDocument): + + projects_to_or_from: Optional[List[str]] = Field(default_factory=list, description="""Brain structures 
from which this cell type projects into or receives projections from""") + cell_type: Optional[str] = Field(None, description="""the name of the cell type described""") + range: Optional[str] = Field(None) + parents: Optional[List[str]] = Field(default_factory=list, description="""categorization""") + subtypes: Optional[List[str]] = Field(default_factory=list) + localizations: Optional[List[str]] = Field(default_factory=list) + genes: Optional[List[str]] = Field(default_factory=list) + diseases: Optional[List[str]] = Field(default_factory=list) + + + +class ExtractionResult(ConfiguredBaseModel): + """ + A result of extracting knowledge on text + """ + input_id: Optional[str] = Field(None) + input_title: Optional[str] = Field(None) + input_text: Optional[str] = Field(None) + raw_completion_output: Optional[str] = Field(None) + prompt: Optional[str] = Field(None) + extracted_object: Optional[Any] = Field(None, description="""The complex objects extracted from the text""") + named_entities: Optional[List[Any]] = Field(default_factory=list, description="""Named entities extracted from the text""") + + + +class NamedEntity(ConfiguredBaseModel): + + id: str = Field(None, description="""A unique identifier for the named entity""") + label: Optional[str] = Field(None, description="""The label (name) of the named thing""") + + + +class Gene(NamedEntity): + + id: str = Field(None, description="""A unique identifier for the named entity""") + label: Optional[str] = Field(None, description="""The label (name) of the named thing""") + + + +class Pathway(NamedEntity): + + id: str = Field(None, description="""A unique identifier for the named entity""") + label: Optional[str] = Field(None, description="""The label (name) of the named thing""") + + + +class AnatomicalStructure(NamedEntity): + + id: str = Field(None, description="""A unique identifier for the named entity""") + label: Optional[str] = Field(None, description="""The label (name) of the named thing""") + + + +class 
BrainRegion(AnatomicalStructure): + + id: str = Field(None, description="""A unique identifier for the named entity""") + label: Optional[str] = Field(None, description="""The label (name) of the named thing""") + + + +class CellType(NamedEntity): + + id: str = Field(None, description="""A unique identifier for the named entity""") + label: Optional[str] = Field(None, description="""The label (name) of the named thing""") + + + +class Disease(NamedEntity): + + id: str = Field(None, description="""A unique identifier for the named entity""") + label: Optional[str] = Field(None, description="""The label (name) of the named thing""") + + + +class Drug(NamedEntity): + + id: str = Field(None, description="""A unique identifier for the named entity""") + label: Optional[str] = Field(None, description="""The label (name) of the named thing""") + + + +class CompoundExpression(ConfiguredBaseModel): + + None + + + +class Triple(CompoundExpression): + """ + Abstract parent for Relation Extraction tasks + """ + subject: Optional[str] = Field(None) + predicate: Optional[str] = Field(None) + object: Optional[str] = Field(None) + qualifier: Optional[str] = Field(None, description="""A qualifier for the statements, e.g. \"NOT\" for negation""") + subject_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the subject of the statement, e.g. \"high dose\" or \"intravenously administered\"""") + object_qualifier: Optional[str] = Field(None, description="""An optional qualifier or modifier for the object of the statement, e.g. 
\"severe\" or \"with additional complications\"""") + + + +class TextWithTriples(ConfiguredBaseModel): + + publication: Optional[Publication] = Field(None) + triples: Optional[List[Triple]] = Field(default_factory=list) + + + +class RelationshipType(NamedEntity): + + id: str = Field(None, description="""A unique identifier for the named entity""") + label: Optional[str] = Field(None, description="""The label (name) of the named thing""") + + + +class Publication(ConfiguredBaseModel): + + id: Optional[str] = Field(None, description="""The publication identifier""") + title: Optional[str] = Field(None, description="""The title of the publication""") + abstract: Optional[str] = Field(None, description="""The abstract of the publication""") + combined_text: Optional[str] = Field(None) + full_text: Optional[str] = Field(None, description="""The full text of the publication""") + + + +class AnnotatorResult(ConfiguredBaseModel): + + subject_text: Optional[str] = Field(None) + object_id: Optional[str] = Field(None) + object_text: Optional[str] = Field(None) + + + + +# Update forward refs +# see https://pydantic-docs.helpmanual.io/usage/postponed_annotations/ +CellTypeDocument.update_forward_refs() +InterneuronDocument.update_forward_refs() +ExtractionResult.update_forward_refs() +NamedEntity.update_forward_refs() +Gene.update_forward_refs() +Pathway.update_forward_refs() +AnatomicalStructure.update_forward_refs() +BrainRegion.update_forward_refs() +CellType.update_forward_refs() +Disease.update_forward_refs() +Drug.update_forward_refs() +CompoundExpression.update_forward_refs() +Triple.update_forward_refs() +TextWithTriples.update_forward_refs() +RelationshipType.update_forward_refs() +Publication.update_forward_refs() +AnnotatorResult.update_forward_refs() + diff --git a/src/ontogpt/templates/cell_type.yaml b/src/ontogpt/templates/cell_type.yaml new file mode 100644 index 000000000..770e62657 --- /dev/null +++ b/src/ontogpt/templates/cell_type.yaml @@ -0,0 +1,143 @@ +id: 
http://w3id.org/ontogpt/cell_type +name: cell_type +title: Cell Type +description: >- + A template for representing cell types +license: https://creativecommons.org/publicdomain/zero/1.0/ +prefixes: + linkml: https://w3id.org/linkml/ + cell_type: http://w3id.org/ontogpt/cell_type/ +keywords: + - disease + - indication + +default_prefix: cell_type +default_range: string + +imports: + - linkml:types + - core + +classes: + CellTypeDocument: + tree_root: true + attributes: + cell_type: + description: the name of the cell type described + range: CellType + + parents: + description: categorization + annotations: + prompt: semicolon-separated list of parent (broader) cell types + multivalued: true + range: CellType + + subtypes: + annotations: + prompt: semicolon-separated list of the main subtypes + multivalued: true + range: CellType + + localizations: + annotations: + prompt: semicolon-separated list of anatomical structures in which this cell type is localized + multivalued: true + range: AnatomicalStructure + + genes: + annotations: + prompt: semicolon-separated list of genes expressed in cells of this type + multivalued: true + range: Gene + + diseases: + annotations: + prompt: semicolon-separated list of diseases in which this cell type is implicated + multivalued: true + range: Disease + + InterneuronDocument: + is_a: CellTypeDocument + + attributes: + projects_to_or_from: + description: Brain structures from which this cell type projects into or receives projections from + annotations: + prompt: semicolon-separated list of anatomical structures from which this cell type projects from or into + multivalued: true + range: BrainRegion + + Gene: + is_a: NamedEntity + id_prefixes: + - HGNC + - MGI + - PR + - UniProtKB + annotations: + annotators: sqlite:obo:hgnc, bioportal:hgnc-nr + + Pathway: + is_a: NamedEntity + id_prefixes: + - GO + - PW + annotations: + annotators: sqlite:obo:go, sqlite:obo:pw + + AnatomicalStructure: + is_a: NamedEntity + id_prefixes: +
- UBERON + - FBbt + - WBbt + annotations: + annotators: sqlite:obo:uberon, sqlite:obo:fbbt, sqlite:obo:wbbt + + BrainRegion: + is_a: AnatomicalStructure + id_prefixes: + - UBERON + - FBbt + - WBbt + annotations: + annotators: sqlite:obo:uberon, sqlite:obo:fbbt, sqlite:obo:wbbt + slot_usage: + id: + values_from: + - BrainRegionIdentifier + + CellType: + is_a: NamedEntity + id_prefixes: + - CL + - FBbt + - WBbt + annotations: + annotators: sqlite:obo:cl, sqlite:obo:fbbt, sqlite:obo:wbbt + + Disease: + is_a: NamedEntity + id_prefixes: + - MONDO + - HP + annotations: + annotators: sqlite:obo:mondo, sqlite:obo:hp + + Drug: + is_a: NamedEntity + annotations: + annotators: sqlite:obo:chebi, sqlite:obo:drugbank + +enums: + + BrainRegionIdentifier: + description: Brain region (or for now, any nervous system part) + reachable_from: + source_ontology: obo:uberon + relationship_types: + - rdfs:subClassOf + - BFO:0000050 + source_nodes: + - UBERON:0001016 ## nervous system \ No newline at end of file diff --git a/tests/input/cases/cell-type-salivary-acinar.txt b/tests/input/cases/cell-type-salivary-acinar.txt new file mode 100644 index 000000000..fadf89f2d --- /dev/null +++ b/tests/input/cases/cell-type-salivary-acinar.txt @@ -0,0 +1,5 @@ +Acinar cells of the salivary gland are a critical part of the oral physiology and the digestive mechanism. These specialized cells are concentrated towards the terminal ends of the salivary glands, where they form sac-like acini or clusters, giving them their name. They are amphicrine in nature, which means they play both secretory and endocrine roles. + +The primary function of the acinar cells of the salivary gland is to manufacture and secrete saliva. This substance, comprised mainly of water, electrolytes, mucins, and enzymes, is vital for the initial stages of digestion and for the regular maintenance of oral health. The acinar cells produce the two main types of saliva: serous and mucinous. 
The serous type, predominantly produced by the parotid gland, contains the enzyme amylase responsible for the initial breakdown of complex carbohydrates. The mucinous type, more common in the submandibular and sublingual glands, contains mucin that aids in lubrication, facilitating the process of mastication and bolus formation. + +Moreover, acinar cells also express and respond to various endocrine, paracrine, and autocrine hormones that govern the quantity and composition of the saliva produced. This implies sensitivity and modulation according to the internal and external environment, dietary habits, and even circadian rhythms. Impairment of acinar cells, whether due to damage or related to a systemic condition such as Sjögren's Syndrome, could disrupt the optimal functioning of the oral and digestive systems. In conclusion, acinar cells of the salivary gland, despite their diminutive size, have a substantial role in oral health and overall homeostasis.