In [1]:
"""
Create a Forte pipeline.
"""
import forte
import os
import yaml

nlp = forte.pipeline.Pipeline()
# An example configuration for the pipeline is provided in this repo.
# The path is relative, so it is resolved against the current working directory.
config_file = os.path.join('examples', 'pipeline', 'inference', 'config.yml')
# Parse the YAML inside a `with` block so the file handle is closed as soon as
# parsing finishes instead of being left open until garbage collection.
with open(config_file, "r") as config_stream:
    raw_config = yaml.safe_load(config_stream)
# Wrap the parsed YAML in a Forte `Config`; later cells read its sections as
# attributes (e.g. `config.reader`, `config.indexer`).
config = forte.common.configuration.Config(raw_config, default_hparams=None)
print(config)

  from .autonotebook import tqdm as notebook_tqdm


{
  "allennlp": {
    "allow_parallel_entries": true,
    "cuda_devices": [
      0,
      1
    ],
    "overwrite_entries": false,
    "processors": "tokenize, srl",
    "srl_url": "https://storage.googleapis.com/allennlp-public-models/openie-model.2020.03.26.tar.gz",
    "tag_formalism": "srl"
  },
  "allennlp_query": {
    "allow_parallel_entries": true,
    "overwrite_entries": false,
    "processors": [
      "tokenize",
      "pos",
      "srl"
    ],
    "srl_url": "https://storage.googleapis.com/allennlp-public-models/structured-prediction-srl-bert.2020.12.15.tar.gz",
    "tag_formalism": "srl"
  },
  "boxer": {
    "pack_name": "query"
  },
  "indexer": {
    "field": "content",
    "index_config": {
      "algorithm": "bm25",
      "hosts": "localhost:9200",
      "index_name": "elastic_index"
    },
    "indexed_text_only": false,
    "query_pack_name": "query",
    "response_pack_name_prefix": "passage"
  },
  "query_creator": {
    "field": "content",
    "query_pack_name"

In [2]:
"""
Attach a reader to the pipeline.
"""
from forte.data.readers import MultiPackTerminalReader
# Register the terminal reader together with its config section, then call
# initialize() on whatever set_reader() hands back.
reader_attached = nlp.set_reader(MultiPackTerminalReader(), config=config.reader)
reader_attached.initialize()
# process_dataset() returns a Python iterator over the data produced by
# `MultiPackTerminalReader` (MultiPack in this case); take the first item and
# show the text of the pack at index 0.
query_text = next(nlp.process_dataset()).get_pack_at(0).text
print(query_text)



what does covid cause


In [3]:
"""
Add NLP processors to extract necessary language features such as
POS (Parts of Speech), Lemma and SRL (Semantic Role Labeling).
"""
from fortex.nltk.nltk_processors import NLTKLemmatizer, \
    NLTKSentenceSegmenter, NLTKWordTokenizer, NLTKPOSTagger
from fortex.allennlp.allennlp_processors import AllenNLPProcessor
from forte.data.selector import NameMatchSelector

# Selector built from the reader's pack name; it is handed to every processor
# added in this cell.
selector = NameMatchSelector(select_name=config.reader.pack_name)
# The NLTK processors take no explicit config, so add them in one pass
# (order matters: segmenter, tokenizer, POS tagger, lemmatizer).
for nltk_stage in (NLTKSentenceSegmenter, NLTKWordTokenizer,
                   NLTKPOSTagger, NLTKLemmatizer):
    nlp.add(nltk_stage(), selector=selector)
nlp.add(AllenNLPProcessor(), config=config.allennlp_query, selector=selector)

# Run the pipeline on one input and inspect the pack at index 0.
nlp.initialize()
from ft.onto.base_ontology import Token, Sentence, PredicateLink
data_pack = next(nlp.process_dataset()).get_pack_at(0)
# Component names used to filter entries by the processor that created them.
tokenizer_component = "fortex.nltk.nltk_processors.NLTKWordTokenizer"
srl_component = "fortex.allennlp.allennlp_processors.AllenNLPProcessor"
for sent in data_pack.get(Sentence):
    print("Tokens created by NLTK:")
    nltk_tokens = data_pack.get(Token, sent, components=[tokenizer_component])
    for token in nltk_tokens:
        print(f"    text: {data_pack.text[token.begin:token.end]}, pos: {token.pos}, lemma: {token.lemma}")
    print("Semantic role labels created by AllenNLP:")
    srl_links = data_pack.get(PredicateLink, sent, components=[srl_component])
    for pred in srl_links:
        # Each link connects a parent (printed as the verb) to a child
        # (printed as the noun), tagged with an argument type.
        verb = pred.get_parent()
        noun = pred.get_child()
        print(f"    verb: {data_pack.text[verb.begin:verb.end]}, noun: {data_pack.text[noun.begin:noun.end]}, noun_type: {pred.arg_type}")

  "Passing parameters through __init__ is deprecated,"
[nltk_data] Downloading package punkt to /home/murphy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/murphy/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/murphy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Tokens created by NLTK:
    text: what, pos: WP, lemma: what
    text: does, pos: VBZ, lemma: do
    text: covid, pos: NN, lemma: covid
    text: cause, pos: NN, lemma: cause
Semantic role labels created by AllenNLP:
    verb: cause, noun: what, noun_type: ARG1
    verb: cause, noun: covid, noun_type: ARG0


In [4]:
"""
Use the language features extracted above to create an Elasticsearch (ES) query
and retrieve relevant documents from an ES database.
"""
from typing import Any, Dict, Tuple
from forte.data.data_pack import DataPack
from forte.data.multi_pack import MultiPack
from forte.processors.base import QueryProcessor
from fortex.elastic.elastic_search_processor import ElasticSearchProcessor
# from composable_source.processors.elasticsearch_query_creator import ElasticSearchQueryCreator
from composable_source.utils.utils import query_preprocess


class ElasticSearchQueryCreator(QueryProcessor):
    """
    Query processor that turns the query pack of a multi-pack into an
    Elasticsearch ``match_phrase`` query.

    Complete implementation is available at composable_source/processors/elasticsearch_query_creator.py
    """

    @classmethod
    def default_configs(cls) -> Dict[str, Any]:
        """
        Default configuration values for this processor.

        :return: a dict with
            ``size`` -- copied into the ``size`` entry of the generated query,
            ``field`` -- the document field the phrase is matched against,
            ``query_pack_name`` -- name of the pack in the multi-pack that
            holds the query.
        """
        return {
            "size": 1000,
            "field": "content",
            "query_pack_name": "query"
        }

    def _process_query(self, input_pack: MultiPack) -> Tuple[DataPack, Dict[str, Any]]:
        """
        Look up the query pack in ``input_pack`` and build the search query for it.

        :param input_pack: multi-pack containing a pack named
            ``self.configs.query_pack_name``.
        :return: a tuple ``(query_pack, query)`` where ``query`` is the dict
            produced by :meth:`_build_query_nlp`.
        """
        query_pack = input_pack.get_pack(self.configs.query_pack_name)
        query_pack.pack_name = self.configs.query_pack_name
        query = self._build_query_nlp(query_pack)
        return query_pack, query

    def _build_query_nlp(self, input_pack: DataPack) -> Dict[str, Any]:
        """
        Build the ``match_phrase`` query body from the values returned by
        ``query_preprocess``.

        The phrase is chosen as follows:

        * ``arg0`` or ``arg1`` is missing -> the ``query`` value returned by
          ``query_preprocess`` is used unchanged;
        * ``is_answer_arg0`` is ``None`` -> ``"<arg0> <verb> <arg1>"``, lower-cased;
        * ``is_answer_arg0`` is truthy -> ``"<arg1> <verb>"``, lower-cased;
        * otherwise -> ``"<arg0> <verb>"``, lower-cased.

        :param input_pack: the query data pack handed to ``query_preprocess``.
        :return: a dict with a ``query`` entry (``match_phrase`` on
            ``self.configs.field`` with a slop of 10) and a ``size`` entry
            taken from ``self.configs.size``.
        """
        # query_preprocess returns six values; the fifth is not needed here.
        query, arg0, arg1, verb, _, is_answer_arg0 = query_preprocess(input_pack)
        if not arg0 or not arg1:
            # Without both arguments there is nothing to recombine, so fall
            # back to the query as returned by query_preprocess.
            processed_query = query
        # `elif`, not `if`: an independent chain here would always overwrite
        # the fallback above with a phrase built from the missing argument.
        elif is_answer_arg0 is None:
            processed_query = f'{arg0} {verb} {arg1}'.lower()
        elif is_answer_arg0:
            processed_query = f'{arg1} {verb}'.lower()
        else:
            processed_query = f'{arg0} {verb}'.lower()
        return {
            "query": {
                "match_phrase": {
                    self.configs.field: {
                        "query": processed_query,
                        "slop": 10  # how far apart we allow the terms to be
                    }
                }
            },
            "size": self.configs.size
        }

# Register the query builder first, then the processor that is configured
# with the indexer section.
query_creator = ElasticSearchQueryCreator()
nlp.add(query_creator, config=config.query_creator)
es_processor = ElasticSearchProcessor()
nlp.add(es_processor, config=config.indexer)

# Run the pipeline on one input and inspect the pack at index 1.
nlp.initialize()
data_pack = next(nlp.process_dataset()).get_pack_at(1)
from ft.onto.base_ontology import Document
# Bare last expression so the notebook displays the Document entry.
data_pack.get_single(Document)

[nltk_data] Downloading package punkt to /home/murphy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/murphy/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/murphy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
INFO:elasticsearch:GET http://localhost:9200/ [status:200 request:0.004s]
INFO:elasticsearch:POST http://localhost:9200/elastic_index/_search [status:200 request:0.061s]


Document(document_class=[], sentiment={}, classifications=<forte.data.ontology.core.FDict object at 0x7f6d1d822a58>)

In [5]:
"""
Add NLP processors to extract necessary language features such as Entity links and SRL.
"""
from fortex.spacy.spacy_processors import SpacyProcessor
from forte.data.selector import RegexNameMatchSelector
# Pack-name pattern built from the response pack prefix in the indexer config.
pattern = rf"{config.indexer.response_pack_name_prefix}_\d"
selector = RegexNameMatchSelector(select_name=pattern)
# Two spaCy stages, each with its own config section.
for spacy_config in (config.spacy1, config.spacy2):
    nlp.add(component=SpacyProcessor(), config=spacy_config, selector=selector)
nlp.add(AllenNLPProcessor(), config=config.allennlp, selector=selector)
# The NLTK stages take no explicit config.
for nltk_stage in (NLTKPOSTagger, NLTKLemmatizer):
    nlp.add(nltk_stage(), selector=selector)

# Run the pipeline on one input and inspect the pack at index 1.
from ftx.onto.clinical import MedicalEntityMention
from ft.onto.base_ontology import Sentence
nlp.initialize()
data_pack = next(nlp.process_dataset()).get_pack_at(1)
sent = data_pack.get_single(Sentence)

print(f"Sentence: {sent.text}")
print("Entities created by SciSpacy:")
medical_mentions = data_pack.get(MedicalEntityMention, sent)
for entity in medical_mentions:
    # One mention can carry several UMLS entities; list each of them.
    for umls in entity.umls_entities:
        print(f"    entity: {umls.name}, cui: {umls.cui}")
print("Semantic role labels created by AllenNLP:")
srl_links = data_pack.get(PredicateLink, sent)
for pred in srl_links:
    verb = pred.get_parent()
    noun = pred.get_child()
    print(f"    verb: {data_pack.text[verb.begin:verb.end]}, noun: {data_pack.text[noun.begin:noun.end]}, noun_type: {pred.arg_type}")

  "Passing parameters through __init__ is deprecated,"
[nltk_data] Downloading package punkt to /home/murphy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/murphy/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/murphy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/murphy/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/murphy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
INFO:elasticsearch:GET http://localhost:9200/ [status:200 request:0.004s]
INFO:elasticsearch:POST http://localhost:9200/elastic_index/_search [status:200 request:0.052s]
  extended_neighbors[em

Sentence: Assessing the Country-Level Excess All-Cause Mortality and the Impacts of Air Pollution and Human Activity during the COVID-19 Epidemic

Citation:
Entities created by SciSpacy:
    entity: Homo sapiens, cui: C0086418
    entity: Humanin, human, cui: C4318409
    entity: Bone Tissue, Human, cui: C4520924
    entity: Approved for Human Use Product, cui: C4055445
    entity: AR protein, human, cui: C1447749
    entity: COVID-19, cui: C5203670
Semantic role labels created by AllenNLP:
    verb: Assessing, noun: the Country-Level Excess All-Cause Mortality and the Impacts of Air Pollution and Human Activity, noun_type: ARG1
    verb: Assessing, noun: during the COVID-19 Epidemic

Citation, noun_type: ARGM-TMP


In [6]:
"""
Use a response creator to format the result data packs into human readable formats.
"""
from composable_source.processors.response_creator import ResponseCreator
# Final stage, configured with the `response` section of the config.
response_creator = ResponseCreator()
nlp.add(response_creator, config=config.response)

# Re-initialize the pipeline and run it once on a single query; the first item
# yielded by process_dataset() is kept in `data_pack`.
nlp.initialize()
data_pack = next(nlp.process_dataset())

[nltk_data] Downloading package punkt to /home/murphy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/murphy/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/murphy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/murphy/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/murphy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
INFO:elasticsearch:GET http://localhost:9200/ [status:200 request:0.005s]
INFO:elasticsearch:POST http://localhost:9200/elastic_index/_search [status:200 request:0.051s]


•Relation:
COVID-19	caused	(WHO 2020
•Source Sentence:
COVID-19 should be recorded on the medical certificate of cause of death for ALL decedents where the disease caused, or is assumed to have caused, or contributed to death (WHO 2020, 3).On the one hand, when COVID-19 is not part of the causal chain that leads directly to death, it should not be indicated as the underlying cause of death.(From Paper: , COVID-19 as the underlying cause of death: disentangling facts and values)
•UMLS Concepts:
 - covid-19
	Name: COVID-19	CUI: C5203670	Learn more at: https://www.ncbi.nlm.nih.gov/search/all/?term=C5203670
•Relation:
COVID-19	cause	permanent damage to olfactory and gustatory function
•Source Sentence:
Does COVID-19 cause permanent damage to olfactory and gustatory function?(From Paper: , Journal Pre-proofs Does COVID-19 cause permanent damage to olfactory and gustatory func- tion? Does COVID-19 cause permanent damage to olfactory and gustatory function? Self-reported olfactory and taste d