In [6]:
from allennlp.predictors.predictor import Predictor as AllenNLPPredictor


class PythonPredictor:
    """Question-answering wrapper around a pretrained AllenNLP predictor.

    The model archive is loaded once, when the object is constructed;
    ``predict`` then answers one passage/question pair per call.
    """

    # Archive loaded when ``config`` does not name a different one.
    DEFAULT_MODEL_PATH = "../pretrained/bidaf-elmo-model-2018.11.30-charpad.tar.gz"

    def __init__(self, config=None):
        """
        :param config: optional dict. If it contains the key "model_path", that
        archive is loaded instead of ``DEFAULT_MODEL_PATH``. Any other value
        (including None) selects the default archive.
        """
        model_path = self.DEFAULT_MODEL_PATH
        # Only a dict is consulted, so callers that pass some other config
        # object keep getting the default model, exactly as before.
        if isinstance(config, dict):
            model_path = config.get("model_path", model_path)
        self.predictor = AllenNLPPredictor.from_path(model_path)

    def predict(self, payload):
        """
        :param payload: dict containing the keys "passage" and "question" - both keys point to string values.
        "passage" refers to the source doc that the model will look at while "question" refers to the question
        asked to the model.
        :returns: a string representing the most probable answer, according to the model.
        :raises KeyError: if "passage" or "question" is missing from payload.
        """
        # Fail early with a message that names every missing key rather than
        # only the first one a bare lookup would trip over.
        missing = [key for key in ("passage", "question") if key not in payload]
        if missing:
            raise KeyError(
                "payload is missing required key(s): " + ", ".join(missing)
            )
        prediction = self.predictor.predict(
            passage=payload["passage"], question=payload["question"]
        )
        # The underlying predictor returns a dict; only the answer text is exposed.
        return prediction["best_span_str"]


In [7]:
# Build the predictor once: the constructor loads the pretrained model archive,
# and the cells below reuse this `predictor` instance.
predictor = PythonPredictor()

_jsonnet not loaded, treating C:\Users\Melvin\AppData\Local\Temp\tmpl2i87cjg\config.json as json
_jsonnet not loaded, treating snippet as json
  "num_layers={}".format(dropout, num_layers))


In [8]:
# Smoke test: ask the model one question about a single hand-written passage.
psg = "The prosecution led evidence from 14 witnesses. These witnesses comprised officers from the Central Narcotics Bureau (CNB), police officers, analysts from the Health Sciences Authority (HSA) and 4 laypersons. The 4 laypersons included Mohd Yuswi Bin Mohd Yusof (Yuswi), PW9, to whom the accused passed the drugs to and Yuswi’s 3 children. As the trafficking charges were preferred against the accused arresting from Yuswi’s arrest, it would be appropriate to begin with his arrest."
qn = "how many witnesses were there?"

# The predictor expects both strings bundled under these two keys.
payload = {"passage": psg, "question": qn}
prediction = predictor.predict(payload)
print(prediction)

14


In [9]:
import json
# Load the passage/question payloads used to spot-check the model.
# The encoding is passed explicitly so the read does not depend on the
# platform's locale default (which is not UTF-8 on every system).
with open("data/qna-test.json", encoding="utf-8") as f:
    # The `with` block closes the file handle as soon as the data is parsed.
    test_data = json.load(f)

In [10]:
# Run the model over every test payload and show passage, question and answer,
# each followed by a blank line.
for payload in test_data:
    prediction = predictor.predict(payload)
    print(
        f"para: {payload['passage']}\n",
        f"qn: {payload['question']}\n",
        f"predicted ans: {prediction}\n",
        sep="\n",
    )

para: Liew Muhammad Zulkifli (the accused), male, aged 33, claimed trial and was convicted on 5 (enhanced) trafficking charges under the Misuse of Drugs Act (MDA), Chapter 185. The offences were committed on 14 April 2017, at about 11 p.m., in the vicinity of Block (Blk) 57 Lengkok Bahru, Singapore when he gave to one Mohd Yuswi Bin Mohd Yusof (Yuswi) 4 different type of Class A controlled drugs, namely 3.99 grams of methamphetamine, 1.18 grams of MDMA, 0.15 gram of ketamine, 0.58 gram methamphetamine and nimetazepam, a Class C controlled drug. (There were 9 other charges under the MDA which were stood down pending the outcome of the trial).

qn: was the accused convicted of trafficking charges?

predicted ans: Liew Muhammad Zulkifli

para: Liew Muhammad Zulkifli (the accused), male, aged 33, claimed trial and was convicted on 5 (enhanced) trafficking charges under the Misuse of Drugs Act (MDA), Chapter 185. The offences were committed on 14 April 2017, at about 11 p.m., in the vicinit

## From extracted text (in json) to answers
1. Present the whole document and the query to the system.
2. Search for potentially relevant paragraphs; this narrows the search space for the subsequent QnA NLP model.
3. From these paragraphs, search for the answer.
4. Print the top answer (or the top few answers).

In [52]:
import re

class primitiveSearchEngine:
    """Minimal keyword search: keeps the passages that contain every query term."""

    def __init__(self):
        pass

    def search(self, itr, queries):
        """
        Searches for the passages/paragraphs that contain a
        co-occurrence of the exact query terms, in any order.

        Terms are matched literally (regex metacharacters in a term have no
        special meaning), case-sensitively and as whole words: a term is not
        matched when it is directly preceded or followed by a word character.
        Terms may appear anywhere in a passage, including after a line break.

        :params itr: a dict containing strings to search through.
        :params queries: a list of query terms. A single string is treated as
            a one-term list. An empty list matches every passage.
        :returns: a dict of the form, {key: search_result}.
        """
        if isinstance(queries, str):
            # Iterating a bare string would search for each character separately.
            queries = [queries]

        # One pattern per term. The lookarounds behave like \b for ordinary
        # words but also work when a term starts or ends with punctuation.
        patterns = [
            re.compile(rf"(?<!\w){re.escape(term)}(?!\w)") for term in queries
        ]

        return {
            key: text
            for key, text in itr.items()
            if all(pattern.search(text) for pattern in patterns)
        }

In [None]:
# Porter stemming
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize

def tokenise(string):
    """Split ``string`` into sentences, then return all their word tokens as one flat list."""
    return [
        word
        for sent in sent_tokenize(string)
        for word in word_tokenize(sent)
    ]

# Shared stemmer instance, created on first use so that defining this cell
# does no work and repeated calls do not rebuild the stemmer.
_STEMMER = None


def stem(token):
    """Return the Porter stem of a single token (this stems; it does not tokenise)."""
    global _STEMMER
    if _STEMMER is None:
        _STEMMER = PorterStemmer()
    return _STEMMER.stem(token)

In [112]:
searchEngine = primitiveSearchEngine()

# Open the extracted texts that were stored as JSON files. The encoding is
# explicit so the read does not depend on the platform's locale default.
with open('data/sample.json', encoding='utf-8') as f:
    data_dict = json.load(f)

# The search engine matches each term as a whole word against the raw
# (unstemmed) paragraph text, so the terms are passed through as written.
# Stemming them first would turn 'convicted' into 'convict', which can no
# longer match the word 'convicted' in a paragraph.
queries = ['convicted']

# find paragraphs related to these keywords.
results = searchEngine.search(data_dict[0]['paragraphs'], queries)
for item in results.items():
    print(item)

('25', 'Accordingly, we convicted the accused of the offence of attempting to export to Australia the diamorphine in the quantity as charged contrary to s 7 read with s 12 of the Act. We stood down the alternative charge. We sentenced him to death.')


In [116]:
# Put the same question to the QnA model for every paragraph the search returned.
qn = "did the accused get convicted?"

for passage in results.values():
    payload = {"passage": passage, "question": qn}
    prediction = predictor.predict(payload)
    print(prediction + "\n")

diamorphine



In [73]:
# TODO: use a proper search engine from a Python library,
# ideally one with semantic search capabilities or PageRank-style ranking,
# and with advanced options such as AND/OR operators and filters.

## Test this system on the labelled MDA dataset
1. Compare against paragraph numbers. Did the auto system find the correct paragraph?
2. Compare the words in the automatic answer with those in the human-found answer. Is there convergence (a good overlap) between the words of the two answers?