In [1]:
from allennlp.predictors.predictor import Predictor as AllenNLPPredictor

class PythonPredictor:
    """Question-answering wrapper around a pretrained AllenNLP predictor archive."""

    # Archive loaded when the caller's config does not name another one.
    DEFAULT_MODEL_PATH = "../pretrained/bidaf-elmo-model-2018.11.30-charpad.tar.gz"

    def __init__(self, config=None):
        """
        :param config: optional dict. If it contains the key "model_path", that archive is
        loaded instead of DEFAULT_MODEL_PATH. Any other value (including None) loads the default.
        """
        model_path = self.DEFAULT_MODEL_PATH
        if isinstance(config, dict):
            model_path = config.get("model_path", model_path)
        self.predictor = AllenNLPPredictor.from_path(model_path)

    def predict(self, payload):
        """
        :param payload: dict containing the keys "passage" and "question" - both keys point to string values.
        "passage" refers to the source doc that the model will look at while "question" refers to the question
        asked to the model.
        :returns: the "best_span_str" entry of the model's prediction, i.e. the answer text.
        """
        # Reuse full_predict so the inference call lives in exactly one place.
        return self.full_predict(payload)["best_span_str"]

    def full_predict(self, payload):
        """
        :param payload: dict containing the keys "passage" and "question" - both keys point to string values.
        "passage" refers to the source doc that the model will look at while "question" refers to the question
        asked to the model.
        :returns: the prediction object returned by the underlying predictor, unmodified.
        :raises KeyError: if "passage" or "question" is missing from payload.
        """
        return self.predictor.predict(
            passage=payload["passage"], question=payload["question"]
        )

In [2]:
# Build the predictor once (this loads the pretrained archive) and show which
# concrete AllenNLP predictor class was instantiated for it.
predictor = PythonPredictor()
type(predictor.predictor)

_jsonnet not loaded, treating C:\Users\Melvin\AppData\Local\Temp\tmp1fj34r4e\config.json as json
_jsonnet not loaded, treating snippet as json
  "num_layers={}".format(dropout, num_layers))


allennlp.predictors.bidaf.BidafPredictor

In [3]:
# example prediction
psg = "The prosecution led evidence from 14 witnesses. These witnesses comprised officers from the Central Narcotics Bureau (CNB), police officers, analysts from the Health Sciences Authority (HSA) and 4 laypersons. The 4 laypersons included Mohd Yuswi Bin Mohd Yusof (Yuswi), PW9, to whom the accused passed the drugs to and Yuswi’s 3 children. As the trafficking charges were preferred against the accused arresting from Yuswi’s arrest, it would be appropriate to begin with his arrest."
qn = "how many witnesses were there?"

payload = {
    'passage': psg,
    'question': qn
}
# predict() returns only the "best_span_str" entry of the model output.
prediction = predictor.predict(payload)
print(prediction)

# full_predict() returns the whole prediction object for the same payload.
full_prediction = predictor.full_predict(payload)
print(full_prediction)

14
{'passage_question_attention': [[0.3176080584526062, 0.22527922689914703, 0.07190658152103424, 0.11300385743379593, 0.11031807214021683, 0.16188427805900574], [0.2194063514471054, 0.12254849821329117, 0.17734859883785248, 0.10546589642763138, 0.17083479464054108, 0.2043958604335785], [0.2679828107357025, 0.18773551285266876, 0.0692141205072403, 0.1258569061756134, 0.09786252677440643, 0.251348078250885], [0.1109834760427475, 0.05815255641937256, 0.4946731626987457, 0.10516639798879623, 0.14123159646987915, 0.08979285508394241], [0.26482462882995605, 0.11954894661903381, 0.13950295746326447, 0.2297879457473755, 0.14064475893974304, 0.10569076985120773], [0.3619944751262665, 0.5336117744445801, 0.037298474460840225, 0.02248273976147175, 0.0221631471067667, 0.022449364885687828], [0.01806076243519783, 0.020166311413049698, 0.8120173215866089, 0.08458580076694489, 0.04400130733847618, 0.021168548613786697], [0.14268618822097778, 0.09384894371032715, 0.07031748443841934, 0.10342743992805

## From extracted text (in json) to answers
1. present whole document and query to system.
2. search for potentially relevant paragraphs; this narrows the search space for the subsequent QnA NLP model. 
3. from these paragraphs, search for answer.
4. print top answer (or top few answers)

In [26]:
import re

class primitiveSearchEngine:
    """Minimal keyword search over a dict of text passages."""

    def __init__(self):
        pass

    def and_search(self, itr, queries):
        """
        Searches for the passages/paragraphs that contain a
        co-occurrence of the exact query terms, in any order.

        Each term is matched literally (regex metacharacters are escaped) and only as a
        whole word/phrase: it must not be directly preceded or followed by a word character.
        Terms may appear anywhere in the passage, including after a line break.
        An empty list of queries matches every passage.

        :params itr: a dict containing strings to search through.
        :params queries: a list of query terms.
        :returns: a dict of the form, {key: search_result}.
        """
        # One pattern per term. The lookarounds behave like \b for terms that start/end
        # with a word character, and still work for terms that start/end with punctuation.
        term_patterns = [
            re.compile(rf"(?<!\w){re.escape(term)}(?!\w)") for term in queries
        ]
        return {
            k: v
            for k, v in itr.items()
            if all(pattern.search(v) for pattern in term_patterns)
        }

    def or_search(self, itr, queries):
        """
        Searches for the paragraphs/strings that contain any of the query terms.
        Matching is a plain, case-sensitive substring test.

        :params itr: a dict containing strings to search through. the key can be a para number.
        :params queries: a list of query terms.
        :returns: a dict of the form, {key: search_result}.
        """
        # any() stops at the first matching term instead of scanning the rest.
        return {
            k: v for k, v in itr.items() if any(term in v for term in queries)
        }

    def rule1(self, itr, queries, scorethreshold):
        """
        Rule 1 is an OR search and gives an equal weightage to each keyword:
        a passage scores one point per query term it contains (case-sensitive substring test).

        :params itr: a dict containing strings to search through.
        :params queries: a list of query terms.
        :params scorethreshold: a passage is kept only if its score is strictly greater than this.
        :returns: a dictionary. key is subset of itr.keys() and value is rule1's relevance score.
                  entries are ordered from highest to lowest score; ties keep the order of itr.
        """
        scores = {}
        for para, text in itr.items():
            score = sum(1 for word in queries if word in text)
            if score > scorethreshold:
                scores[para] = score

        # sorted() is stable, so equally scored passages stay in their original order.
        return dict(sorted(scores.items(), key=lambda kv: kv[1], reverse=True))

In [4]:
import json

searchEngine = primitiveSearchEngine()

# Load the extracted texts that were stored as json files.
with open('data/sample.json') as sample_file:
    data_dict = json.load(sample_file)

# todo: add porter stemming to capture more words.
search_terms = ['convicted']

# Find the paragraphs of the first entry that are related to these keywords.
results = searchEngine.and_search(data_dict[0]['paragraphs'], search_terms)
for para_key, para_text in results.items():
    print((para_key, para_text))

('25', 'Accordingly, we convicted the accused of the offence of attempting to export to Australia the diamorphine in the quantity as charged contrary to s 7 read with s 12 of the Act. We stood down the alternative charge. We sentenced him to death.')


In [None]:
qn = "did the accused get convicted?"

# Put the same question to every paragraph returned by the search, one at a time,
# and print each answer followed by a blank line.
for para_text in results.values():
    payload = dict(passage=para_text, question=qn)
    prediction = predictor.predict(payload)
    print(prediction + "\n")

In [None]:
# todo: use a proper search engine from some python library..
# ..hopefully with some semantic search capabilities or pagerank.
# ..with some advanced options like AND/OR, filters

## Run predictions on documents with 1 simple type of question.

In [None]:
'''
1. pull out the paragraphs of a valid document
2. run search engine to find relevant paragraphs
3. concatenate relevant paragraphs into a new "source" document
4. feed source and question into QnA model, get answer.
5. manually evaluate if the answer is able to answer the question.
6. auto evaluate if the predicted answer is the same or overlaps with the ground-truth answer.

7. repeat 1-6 for every doc and every question we want to test.

a. we start with questions about whether the accused was convicted. find one good phrasing.
b. can consider testing other phrasings for the same question as well.
c. then we identify other high-prio questions to test.
d. tabulate model's accuracy in answering each type of question.
'''

In [5]:
import pandas as pd
import numpy as np
import json
# Load the raw case records from disk.
# NOTE(review): `cases` is not referenced by the cells visible below (output_df is built
# separately with pd.read_json on the same file) - confirm whether this load is still needed.
with open('data/cases.json') as f:
    cases = json.load(f)

In [6]:
# Set up the output_df dataframe from the .json output file.
# (based on glowsplint's test_output.py code)
output_df = pd.read_json('data/cases.json')

# Court names are replaced by their short codes; names missing from the map become NaN.
court_map = {
    'High Court': 'SGHC',
    'Court of Appeal': 'SGCA',
    'District Court': 'SGDC',
    'Court of Criminal Appeal': 'SGCA'
}

output_df['Court'] = output_df.Court.map(court_map)
ref_columns = {
    0: 'refSLR',
    1: 'ref'
}

# Split reference into SLR reference and normal reference
output_df[list(ref_columns.values())] = pd.DataFrame(
    output_df.reference.apply(pd.Series)).rename(columns=ref_columns)

# Rows with no second reference: move the only reference into 'ref' and blank out 'refSLR'.
ref_na = output_df['ref'].isna()
sole_reference = output_df.loc[ref_na, 'refSLR']
output_df.loc[ref_na, 'ref'] = sole_reference
output_df.loc[ref_na, 'refSLR'] = np.nan
output_df = output_df.drop('reference', axis=1)

# Construct the unique reference: one row per party, excluding the public prosecutor.
exploded_output = output_df.explode('parties')
output_df = exploded_output.loc[
    exploded_output.parties.str.lower() != "public prosecutor"
].copy()
output_df['unique_ref'] = output_df['ref'] + ' ' + output_df['parties']

# regex= is spelled out on every call: the default changed between pandas versions, and
# without regex=True the ' +' pattern is treated literally and runs of spaces survive.
output_df['unique_ref'] = (
    output_df['unique_ref']
    .str.upper()
    .str.replace("AND ANOTHER", "", regex=False)
    .str.replace("AND OTHERS", "", regex=False)
    .str.replace(' +', ' ', regex=True)
    .str.strip()
)

# Keep the columns used downstream and rename without mutating a sliced frame in place.
output_df = output_df[['unique_ref', 'Date', 'Court',
                       'coram', 'counsel', 'parties', 'paragraphs']].rename(
    columns={'parties': 'accused'})

In [7]:
# Preview the first rows of output_df.
output_df.head()

Unnamed: 0,unique_ref,Date,Court,coram,counsel,accused,paragraphs
0,[2000] SGCA 8 CHIN SIONG KIAN,2000-02-12,SGCA,"[L P Thean JA, Chao Hick Tin JA]","{'prosecution': ['Low Cheong Yeow'], 'defence'...",Chin Siong Kian,"{'1': 'On 6 September 1999 the appellant, toge..."
1,[2019] SGCA 81 MOHD AKEBAL S/O GHULAM JILANI,2019-11-28,SGCA,"[Sundaresh Menon CJ, Tay Yong Kwang JA, Steven...","{'prosecution': ['Chin Jincheng', 'in Criminal...",Mohd Akebal s/o Ghulam Jilani,{'1': 'These appeals arise from the joint tria...
1,[2019] SGCA 81 MOHAMMED RUSLI BIN ABDUL RAHMAN,2019-11-28,SGCA,"[Sundaresh Menon CJ, Tay Yong Kwang JA, Steven...","{'prosecution': ['Chin Jincheng', 'in Criminal...",Mohammed Rusli Bin Abdul Rahman,{'1': 'These appeals arise from the joint tria...
2,[1989] SGHC 75 GOH AH LIM,1989-08-24,SGHC,"[Lai Kew Chai J, F A Chua J]","{'prosecution': ['Lee Sing Lit'], 'defence': [...",Goh Ah Lim,"{'1': 'The accused, a male Chinese aged 46, fa..."
3,[1989] SGHC 9 KADIR BIN AWANG,1989-02-03,SGHC,"[T S Sinnathuray J, Joseph Grimberg JC]","{'prosecution': ['Lee Sing Lit'], 'defence': [...",Kadir bin Awang,{'1': 'Kadir bin Awang (“the accused”) was cha...


In [11]:
# keep only the cases that are definitely appeal cases, i.e. rows whose unique_ref
# contains "SGCA". rows with a missing unique_ref are treated as non-appeals.
is_appeal = output_df["unique_ref"].str.contains("SGCA", regex=False, na=False)
appeals_indices = np.flatnonzero(is_appeal.to_numpy()).tolist()

# positional selection: the index labels of output_df are not unique after the explode above.
appeals_df = output_df.iloc[appeals_indices]
appeals_df.head()

Unnamed: 0,unique_ref,Date,Court,coram,counsel,accused,paragraphs
0,[2000] SGCA 8 CHIN SIONG KIAN,2000-02-12,SGCA,"[L P Thean JA, Chao Hick Tin JA]","{'prosecution': ['Low Cheong Yeow'], 'defence'...",Chin Siong Kian,"{'1': 'On 6 September 1999 the appellant, toge..."
1,[2019] SGCA 81 MOHD AKEBAL S/O GHULAM JILANI,2019-11-28,SGCA,"[Sundaresh Menon CJ, Tay Yong Kwang JA, Steven...","{'prosecution': ['Chin Jincheng', 'in Criminal...",Mohd Akebal s/o Ghulam Jilani,{'1': 'These appeals arise from the joint tria...
1,[2019] SGCA 81 MOHAMMED RUSLI BIN ABDUL RAHMAN,2019-11-28,SGCA,"[Sundaresh Menon CJ, Tay Yong Kwang JA, Steven...","{'prosecution': ['Chin Jincheng', 'in Criminal...",Mohammed Rusli Bin Abdul Rahman,{'1': 'These appeals arise from the joint tria...
5,[1991] SGCA 14 SIM AH CHEOH,1991-05-31,SGCA,"[Yong Pung How CJ, Chan Sek Keong J, L P Thean J]","{'prosecution': ['Chan Seng Onn'], 'defence': ...",Sim Ah Cheoh and others,"{'1': 'The first appellant, Sim Ah Cheoh (“Sim..."
7,[1992] SGCA 30 KO MUN CHEUNG,1992-04-30,SGCA,"[L P Thean J, Goh Joon Seng J, S Rajendran J]","{'prosecution': [], 'defence': ['P Suppiah', '...",Ko Mun Cheung and another,"{'1': 'The two appellants, Ko Mun Cheung (“Ko”..."


In [30]:
searchEngine = primitiveSearchEngine()

# Keyword lists for locating paragraphs about the outcome of a case: one for trials, and
# an extended one for appeals that adds appeal-specific verbs.
# NOTE(review): the predict_all call visible further down passes `search_terms`, not
# either of these lists - confirm which keyword set is intended.
ConvictionTrialQ = ["accordingly", "acquit", "charge", "convict", "element", "guilty", "made out", "prove", "reasonable doubt", "reasons", "satisfied", "sentence", "therefore" ]
ConvictionAppealQ = ConvictionTrialQ + ["affirm", "allow", "dismiss"]
#search_terms = ['acquit', 'acquitted', 'acquital', 'dismiss', 'allowed', 'rejected']
# todo: add porter stemming to capture more words.


In [47]:
def predict_all(cases_df, search_terms, qn, max_paras=3, score_threshold=0,
                verbose=True, search_engine=None, qa_predictor=None):
    """
    Answer one question for every case in cases_df.

    For each row, the case's paragraphs are ranked with the search engine's rule1, the
    top-scoring paragraphs are joined (separated by a single space) into one passage, and
    that passage is given to the QnA predictor together with the question.

    todo: put under the predictor class.

    :param cases_df: dataframe with a 'paragraphs' column holding {key: paragraph text} dicts.
    :param search_terms: list of keywords handed to rule1.
    :param qn: the question to ask, as a string.
    :param max_paras: maximum number of top-ranked paragraphs to combine per case.
    :param score_threshold: rule1 threshold; a paragraph must score strictly above it.
    :param verbose: if True, print progress, the combined passage and its keyword score.
    :param search_engine: object exposing rule1(itr, queries, scorethreshold);
                          defaults to the notebook-level `searchEngine`.
    :param qa_predictor: object exposing predict(payload);
                         defaults to the notebook-level `predictor`.
    :returns: list with one prediction per row; 'NA' where no relevant paragraph was found.
    """
    engine = searchEngine if search_engine is None else search_engine
    model = predictor if qa_predictor is None else qa_predictor
    last_index = len(cases_df) - 1

    pred = []
    for i, paragraphs in enumerate(cases_df['paragraphs']):
        if verbose:
            print(f"{i}/{last_index}")

        # find paragraphs related to search terms; can also try finding sentences later on.
        ranked = engine.rule1(paragraphs, search_terms, score_threshold)
        top_hits = list(ranked.items())[:max_paras]

        # Join with a space so the last sentence of one paragraph does not run into
        # the first word of the next.
        combined_psg = " ".join(paragraphs[key] for key, _ in top_hits)
        total_score = sum(score for _, score in top_hits)

        if verbose:
            print(combined_psg)
            print(f"keywords score: {total_score}.")
            print()

        prediction = 'NA'  # prediction defaults to NA if no relevant paras found.
        if combined_psg.strip():
            payload = {
                'passage': combined_psg,
                'question': qn
            }
            prediction = model.predict(payload)
        pred.append(prediction)
    return pred

In [48]:
# Ask the appeal-outcome question for every appeal case and show the raw answers.
# NOTE(review): `search_terms` is whatever an earlier cell last assigned (the visible
# assignment is ['convicted']); ConvictionAppealQ is defined above but not used here -
# confirm which keyword list this run is meant to use.
pred = predict_all(appeals_df, search_terms, "was the appeal dismissed?")
print(pred)

0/130
On 6 September 1999 the appellant, together with another Wan Yue Kong (“Wan”), was convicted and sentenced to death by Judicial Commissioner Amarjeet Singh (“the trial judge”) under a joint charge of drug trafficking. He appealed against his conviction and sentence. At the conclusion of the hearing we dismissed the appeal and now give our reasons. We should add that Wan did not file any appeal.Both the appellant and Wan were jointly tried in the same court. At the close of the Prosecution’s case the court allowed the Prosecution’s application that the capital charges against the two accused be amended to a joint capital charge. In allowing the amendment, the trial judge permitted counsel to further cross-examine any of the Prosecution’s witnesses if they so wished. Counsel for the appellant did avail himself of this opportunity.In the meantime, Wan returned to the coffee shop. After finishing his food, he told Chong that he was leaving for Tiong Bahru Plaza. Chong agreed to follo

In [50]:
path = input('enter path to save csv file:\n')

# One row per appeal case: its unique reference next to the model's answer.
to_save = pd.DataFrame({
    "unique_ref": appeals_df['unique_ref'].values,
    "appeal_prediction": pred,
})
to_save.to_csv(path)

enter path to save csv file:
 data/appeals_pred.csv
