In [None]:
!pip install ipywidgets widgetsnbextension pandas-profiling
!pip install forte.elastic
!pip install composable_source.utils

In [4]:
import forte
from forte.data.readers import TerminalReader
from fortex.nltk.nltk_processors import NLTKLemmatizer, NLTKWordTokenizer, NLTKPOSTagger, NLTKSentenceSegmenter
from fortex.allennlp.allennlp_processors import AllenNLPProcessor

# Build the pipeline: a TerminalReader as the reader, followed by four NLTK
# processors and the AllenNLP processor.
nlp = forte.pipeline.Pipeline()
nlp.set_reader(TerminalReader())

# Each NLTK processor is instantiated and added in this fixed order.
for nltk_stage in (
    NLTKSentenceSegmenter,
    NLTKWordTokenizer,
    NLTKPOSTagger,
    NLTKLemmatizer,
):
    nlp.add(nltk_stage())

# Configuration handed to the AllenNLP processor below.
allennlp_config = dict(
    processors="tokenize, pos, srl",
    tag_formalism="srl",
    overwrite_entries=False,
    allow_parallel_entries=True,
    srl_url="https://storage.googleapis.com/allennlp-public-models/structured-prediction-srl-bert.2020.12.15.tar.gz",
)
nlp.add(AllenNLPProcessor(), config=allennlp_config)

<forte.pipeline.Pipeline at 0x2013e09ce10>

In [8]:
from ft.onto.base_ontology import Token, Sentence, PredicateLink

# Initialize the pipeline and take the first pack it produces.
nlp.initialize()
data_pack = next(nlp.process_dataset())

for sent in data_pack.get(Sentence):
    print("Tokens created by NLTK:")
    for token in data_pack.get(Token, sent, components=["fortex.nltk.nltk_processors.NLTKWordTokenizer"]):
        print(f" text: {token.text}, pos: {token.pos}, lemma: {token.lemma}")

    # The SRL report is scoped to `sent`, so it must run inside the sentence
    # loop: outside it, only the last sentence would be reported and `sent`
    # would be undefined when the pack has no sentence at all.
    print("Semantic role labels created by AllenNLP:")
    for pred in data_pack.get(PredicateLink, sent, components=["fortex.allennlp.allennlp_processors.AllenNLPProcessor"]):
        verb = pred.get_parent()
        noun = pred.get_child()
        print(f" verb: {verb.text}, noun: {data_pack.text[noun.begin:noun.end]}, noun_type: {pred.arg_type}")

[nltk_data] Downloading package punkt to C:\Users\Mohammad
[nltk_data]     Dareer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Mohammad Dareer\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to C:\Users\Mohammad
[nltk_data]     Dareer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Tokens created by NLTK:
 text: What, pos: WP, lemma: What
 text: does, pos: VBZ, lemma: do
 text: nlp, pos: JJ, lemma: nlp
 text: mean, pos: VB, lemma: mean
 text: ?, pos: ., lemma: ?
Semantic role labels created by AllenNLP:
 verb: mean, noun: What, noun_type: ARG1
 verb: mean, noun: nlp, noun_type: ARG0


In [10]:
from forte.data.caster import MultiPackBoxer
# Append a MultiPackBoxer configured with the pack name "query".
# This cell mutates the shared `nlp` pipeline, so every additional run appends
# another boxer.
# NOTE(review): the "value of the packs should be DataPack, but got
# ...MultiPack" error recorded in later outputs looks consistent with the
# boxer having been added more than once — confirm by rebuilding `nlp` from
# a fresh kernel and running each cell exactly once.
boxer_config = dict(pack_name="query")
nlp.add(MultiPackBoxer(), config=boxer_config)

<forte.pipeline.Pipeline at 0x2013e09ce10>

In [15]:
from typing import Any, Dict, Tuple
from forte.data.data_pack import DataPack
from forte.data.multi_pack import MultiPack
from forte.processors.base import QueryProcessor
from fortex.elastic.elastic_search_processor import ElasticSearchProcessor


class ElasticSearchQueryCreator(QueryProcessor):
    """
    Query processor that turns the question held in the query pack into an
    Elasticsearch ``match_phrase`` query.

    Configuration keys added on top of the parent defaults:
        size: value placed in the query's "size" entry (default 1000)
        field: document field the phrase is matched against (default "doc")
        query_pack_name: name of the pack that holds the question
        (default "query")
    """

    @classmethod
    def default_configs(cls) -> Dict[str, Any]:
        """
        Extend the parent's default configuration with this processor's keys.
        :return: the merged configuration dictionary
        """
        config = super().default_configs()
        config.update({
            "size": 1000,
            "field": "doc",
            "query_pack_name": "query"
        })
        return config

    def _process_query(self, input_pack: MultiPack) -> Tuple[DataPack, Dict[str, Any]]:
        """
        Fetch the query pack from the multi pack and build its query.
        :param input_pack: multi pack containing the pack named by the
            ``query_pack_name`` configuration
        :return: the query pack and the Elasticsearch query built from it
        """
        query_pack = input_pack.get_pack(self.configs.query_pack_name)
        query_pack.pack_name = self.configs.query_pack_name
        query = self._build_query_nlp(query_pack)
        return query_pack, query

    def _build_query_nlp(self, input_pack: DataPack) -> Dict[str, Any]:
        """
        Build a ``match_phrase`` query from the SRL analysis of the question.
        :param input_pack: pack holding the question and its annotations
        :return: Elasticsearch query body with "query" and "size" entries
        """
        query, arg0, arg1, verb, _, is_answer_arg0 = query_preprocess(input_pack)
        if not arg0 or not arg1:
            # Fallback: search for the question itself. query_preprocess
            # returns the sentence annotation, so use its text when present.
            processed_query = getattr(query, "text", query)
        elif is_answer_arg0 is None:
            # This chain must be `elif`: a separate `if` here would always
            # overwrite the fallback chosen above.
            processed_query = f'{arg0} {verb} {arg1}'.lower()
        elif is_answer_arg0:
            # The answer is arg0, so search with the other argument and the verb.
            processed_query = f'{arg1} {verb}'.lower()
        else:
            processed_query = f'{arg0} {verb}'.lower()
        return {
            "query": {
                "match_phrase": {
                    self.configs.field: {
                        "query": processed_query,
                        "slop": 10
                    }
                }
            },
            "size": self.configs.size
        }

In [16]:
# Append the query creator, overriding only "size"; the remaining keys keep
# the values from ElasticSearchQueryCreator.default_configs().
# NOTE(review): the creator's default "field" is "doc", while the
# ElasticSearchProcessor cell configures "field": "content" — confirm the two
# are meant to differ.
query_creator_config = dict(size=10)
nlp.add(ElasticSearchQueryCreator(), config=query_creator_config)

<forte.pipeline.Pipeline at 0x2013e09ce10>

In [17]:
from forte.data.ontology.top import Query
# Re-initialize the pipeline, take the first multi pack it produces and print
# the value of the single Query entry stored in its "query" pack.
nlp.initialize()
first_multi_pack = next(nlp.process_dataset())
query_entry = first_multi_pack.get_pack("query").get_single(Query)
print(query_entry.value)

[nltk_data] Downloading package punkt to C:\Users\Mohammad
[nltk_data]     Dareer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Mohammad Dareer\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to C:\Users\Mohammad
[nltk_data]     Dareer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
ERROR:forte.pipeline:value of the packs should be DataPack, but got forte.data.multi_pack.MultiPack
Traceback (most recent call last):
  File "c:\Users\Mohammad Dareer\anaconda3\envs\forte_qa\lib\site-packages\forte\pipeline.py", line 1089, in _process_with_component
    raw_job.alter_pack(component.cast(pack))
  File "c:\Users\Mohammad Dareer\anaconda3\envs\forte_qa\lib\site-packages\forte\data\caster.py", line 73, in cast
    p.add_pack_(pack,

ProcessExecutionException: Exception occurred when running forte.data.caster.MultiPackBoxer

In [18]:
from fortex.elastic import ElasticSearchProcessor
# Configuration for the Elasticsearch processor, then append it to the pipeline.
elastic_search_config = dict(
    query_pack_name="query",
    index_config=dict(
        index_name="cisi",
        hosts="localhost:9200",
        algorithm="bm25",
    ),
    field="content",
    response_pack_name_prefix="passage",
    indexed_text_only=False,
)
nlp.add(ElasticSearchProcessor(), config=elastic_search_config)

<forte.pipeline.Pipeline at 0x2013e09ce10>

In [19]:
# Run the full pipeline and, for every multi pack, print the question followed
# by the first 100 characters of each pack that is not the "query" pack.
nlp.initialize()
for m_pack in nlp.process_dataset():
    question_text = m_pack.get_pack('query').text
    print(f"Question: {question_text}")
    print("Results")
    for pack in m_pack.packs:
        if pack.pack_name == "query":
            continue
        preview = pack.text[:100]
        print(f" {preview}...")

[nltk_data] Downloading package punkt to C:\Users\Mohammad
[nltk_data]     Dareer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Mohammad Dareer\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to C:\Users\Mohammad
[nltk_data]     Dareer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
ERROR:forte.pipeline:value of the packs should be DataPack, but got forte.data.multi_pack.MultiPack
Traceback (most recent call last):
  File "c:\Users\Mohammad Dareer\anaconda3\envs\forte_qa\lib\site-packages\forte\pipeline.py", line 1089, in _process_with_component
    raw_job.alter_pack(component.cast(pack))
  File "c:\Users\Mohammad Dareer\anaconda3\envs\forte_qa\lib\site-packages\forte\data\caster.py", line 73, in cast
    p.add_pack_(pack,

ProcessExecutionException: Exception occurred when running forte.data.caster.MultiPackBoxer

In [14]:
from collections import defaultdict
from typing import Dict, DefaultDict
from ft.onto.base_ontology import Token, Sentence, PredicateLink, Annotation
from forte.data.data_pack import DataPack


def query_preprocess(input_pack: DataPack):
    """
    Extract nouns and verb from user input query.

    The SRL links (PredicateLink) of the query sentence are grouped by verb
    text. The (arg0, arg1, predicate) mentions are taken from the last verb
    for which collect_mentions finds both arguments. The NLTK tokens are then
    scanned for a WH-word and for the lemma of the predicate.

    :param input_pack: pack containing the query; its sentence is read with
        get_single(Sentence)
    :return:sentence: the query Sentence annotation
        arg0: subject text in query
        arg1: object text in query
        predicate: verb text in query
        verb_lemma: lemma of the token whose text equals the verb, or None
        is_answer_arg0: should subject(arg0) or object(arg1) be returned
        as answer; set from the last WH-word token found inside arg0 (True)
        or arg1 (False), None when no WH-word lies inside either
    :raises AssertionError: if no verb yields both arguments and a predicate
    """
    sentence = input_pack.get_single(Sentence)

    relations: DefaultDict[str, Dict[str, Dict[str, str]]] = defaultdict(dict)
    text_mention_mapping = {}

    # get all srl relations
    for link in input_pack.get(PredicateLink, sentence):
        verb = link.get_parent()
        verb_text = verb.text
        argument = link.get_child()
        argument_text = argument.text

        text_mention_mapping[verb_text] = verb
        text_mention_mapping[argument_text] = argument
        relations[verb_text][link.arg_type] = argument_text

    arg0, arg1, predicate = "", "", ""
    for verb_text, entity in relations.items():
        found_arg0, found_arg1, found_predicate = collect_mentions(
            text_mention_mapping, entity, verb_text
        )
        # collect_mentions returns empty strings when the verb lacks a usable
        # argument pair. Such a verb must not overwrite a result already
        # found for another verb. The check is by type so it does not depend
        # on how annotation objects implement equality.
        if isinstance(found_arg0, str) and isinstance(found_arg1, str):
            continue
        arg0, arg1, predicate = found_arg0, found_arg1, found_predicate

    assert (
        isinstance(arg0, Annotation)
        and isinstance(arg1, Annotation)
        and isinstance(predicate, Annotation)
    ), (
        "AllenNLP SRL cannot extract the two arguments or the "
        "predicate in your query, please check our examples "
        "or rephrase your question"
    )

    verb_lemma, is_answer_arg0 = None, None

    # check pos tag and lemma for tokens
    for token in input_pack.get(
        entry_type=Token,
        range_annotation=sentence,
        components=["fortex.nltk.nltk_processors.NLTKWordTokenizer"],
    ):
        # find WH words; the argument whose span contains one is the answer
        if token.pos in {"WP", "WP$", "WRB", "WDT"}:
            if arg0.begin <= token.begin and arg0.end >= token.end:
                is_answer_arg0 = True
            elif arg1.begin <= token.begin and arg1.end >= token.end:
                is_answer_arg0 = False

        # find verb lemma
        if token.text == predicate.text:
            verb_lemma = token.lemma

    return (
        sentence,
        arg0.text if arg0 else "",
        arg1.text if arg1 else "",
        predicate.text,
        verb_lemma,
        is_answer_arg0,
    )


def collect_mentions(text_mention_mapping, relation, verb_text):
    """
    Look up the mention objects for arg0, arg1 and the predicate of one verb.
    :param text_mention_mapping: mapping from mention text to mention object
    :param relation: mapping from argument type (e.g. "ARG0") to argument
        text for this verb
    :param verb_text: text of the verb, used as key for the predicate
    :return: (arg0, arg1, predicate) taken from text_mention_mapping, or
        three empty strings when get_arg_text yields an empty argument text
    """
    subject_text, object_text = get_arg_text(relation)

    argument_missing = subject_text == "" or object_text == ""
    if argument_missing:
        return ("",) * 3

    return (
        text_mention_mapping[subject_text],
        text_mention_mapping[object_text],
        text_mention_mapping[verb_text],
    )


def get_arg_text(relation):
    """
    Pick the subject-like and object-like argument texts of one verb.

    AllenNLP uses PropBank annotation, where each verb sense has numbered
    arguments (ARG-0, ARG-1, ...):
    ARG-0 is usually PROTO-AGENT
    ARG-1 is usually PROTO-PATIENT
    ARG-2 is usually benefactive, instrument, attribute

    The pair (ARG0, ARG1) is preferred; (ARG1, ARG2) is used only when the
    first pair is not fully present.
    :param relation: mapping from argument type to argument text
    :return: (arg0_text, arg1_text), or ("", "") when neither pair is
        fully present in relation
    """
    role_pairs = (("ARG0", "ARG1"), ("ARG1", "ARG2"))
    for first_role, second_role in role_pairs:
        # the first pair with both roles available wins
        if first_role in relation and second_role in relation:
            return relation[first_role], relation[second_role]
    return "", ""