In [4]:
from allennlp.predictors.predictor import Predictor as AllenNLPPredictor

class PythonPredictor:
    def __init__(self, config=None):
        self.predictor = AllenNLPPredictor.from_path(
            "../pretrained/bidaf-elmo-model-2018.11.30-charpad.tar.gz"
        )

    def predict(self, payload, full=False):
        """
        :param payload: dict containing the keys "passage" and "question" - both keys point to string values. 
        "passage" refers to the source doc that the model will look at while "question" refers to the question 
        asked to the model.
        :returns: a string representing the most probable answer, according to the model.
        """
        prediction = self.predictor.predict(
            passage=payload["passage"], question=payload["question"]
        )
        if full:
            return prediction
        else:
            return prediction["best_span_str"]

In [5]:
predictor = PythonPredictor()
type(predictor.predictor)

_jsonnet not loaded, treating C:\Users\Melvin\AppData\Local\Temp\tmppt3airpm\config.json as json
_jsonnet not loaded, treating snippet as json
  "num_layers={}".format(dropout, num_layers))


allennlp.predictors.bidaf.BidafPredictor

### Example prediction

In [6]:
# example prediction
payload = {
    "passage": "The trial judges accepted that both the appellants had come into Singapore only with a view to boarding a flight to Amsterdam the next day. They, however, rejected the submission made on behalf of the appellants that bringing drugs into Singapore with a view solely of exporting them would not be an offence under s 7 of the Act. They also rejected Ko’s defence that he did not know that what he was carrying was diamorphine. Accordingly, they convicted the appellants. Against the convictions, this appeal was brought. At the conclusion we dismissed it, and we now give our reasons.Ground (a) can be disposed of very briefly. By s 18(2) of the Act a rebuttable presumption arose that Ko knew the nature of the drug that he was carrying. Once the presumption arose, the onus of discharging it was on Ko. Having heard Ko’s defence, the trial judges were satisfied that he had not discharged the presumption. We have reviewed the record and it is clear that the trial judges were entitled on the evidence before them to arrive at this finding. We saw no reason to interfere.and submitted that s 7 was applicable only when it was sought to punish a master or captain who had contravened s 20. We could not accept that submission. In common with a number of other similar provisions in the Act, what s 20 does is to raise a presumption as to knowledge. By s 20, if it is proved that a drug was found in a ship or aircraft, then the presumption would arise that the drug was imported in the ship or aircraft with the knowledge of the master or captain. No doubt, in such a case, a master or captain may be charged for violating s 7 of the Act but that does not mean to say that s 7 is confined in its operations only to the master of a ship or captain of an aircraft used for the import of drugs. We see no reason why s 7 should not operate against (say) a passenger in a ship or aircraft who was importing drugs. Against such a passenger the presumption under s 20 as to knowledge would obviously not be applicable but (as in this case) the presumption under s 18(2) would apply.",
    "question": "was the appeal dismissed?"
}
prediction = predictor.predict(payload, full=True)
prediction['best_span_str']

'rejected Ko’s defence that he did not know that what he was carrying was diamorphine. Accordingly, they convicted the appellants. Against the convictions, this appeal was brought. At the conclusion we dismissed it'

## From extracted text (in json) to answers
1. present whole document and query to system.
2. search for potentially relevant paragraphs; this narrows the search space for the subsequent QnA NLP model. 
2. from these paragraphs, search for answer.
3. print top answer (or top few answers)

In [7]:
import re

class primitiveSearchEngine:
    def __init__(self):
        pass
        
    def and_search(self, itr, queries):
        """
        Searches for the passages/paragraphs that contain a 
        co-occurence of the exact query terms, in any order.
        
        :params itr: a dict containing strings to search through.
        :params queries: a list of query terms.
        :returns: a dict of the form, {key: search_result}.
        """
        regex = "^"
        for term in queries:
            # regex = regex + term + '|'
            regex = regex + rf"(?=.*\b{term}\b)"
        regex = regex + ".*$"
        
        # note: this regex pattern searches for the co-occurence of the
        # exact specified terms, in any order.
        
        pattern = re.compile(regex)
        
        results = {}
        
        for k, v  in itr.items():
            match = pattern.search(v)
            
            if match:
                results[k] = v
        return results
    
    def or_search(self, itr, queries):
        """
        Searches for the paragraphs/strings that contain any of the query terms.
        
        :params itr: a dict containing strings to search through. they key can be a para number.
        :params queries: a list of query terms.
        :returns: a dict of the form, {key: search_result}.
        """
        results = {}
        
        for k, v  in itr.items():
            for term in queries:
                if term in v:
                    results[k] = v
            
        return results
    
    def rule1(self, itr, queries, scorethreshold):
        """
        Rule 1 is an OR search and gives an equal weightage to each keyword
        
        :param scorethreshold: integer. min number of relevant terms that must appear in 
        a text (could be a paragaph). 
        :returns: a dictionary. keys are a subset of itr.keys() and ea value is rule1's 
        relevance score.
        """
        output = dict()
        for para in itr:
            score = 0
            for word in queries:
                if word in itr[para]:
                    score += 1 
            if score >= scorethreshold: 
                output[para] = score

        output = {k: v for k, v in sorted(output.items(), key=lambda x: x[1], reverse=True)}
        return output

In [8]:
# Porter stemming
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize

def tokenise(string): # works on any arbitrary string
    tokens = []
    for sentence in sent_tokenize(string):
        for token in word_tokenize(sentence):
            tokens.append(token)
    return tokens

def stem(token): # tokenizes any particular token
    return PorterStemmer().stem(token)

## Run predictions on documents with 1 simple type of question.

In [9]:
import pandas as pd
import numpy as np
import json
# with open('data/cases.json') as f:
#     cases = json.load(f)

In [10]:
from TestCaseExtractor import TestCaseExtractor
tester = TestCaseExtractor(path='data/cases.json')
output_df = tester.output_df_aligned

In [11]:
# filter out cases that are definitely appeal cases
appeals_indices = []
for i in range(len(output_df)):
    if 'SGCA' in output_df.iloc[i]["Court"]:
        appeals_indices.append(i)

appeals_df = output_df.iloc[appeals_indices]
appeals_df.head()

Unnamed: 0,unique_ref,case_id,date,Court,coram,counsel,listed_parties,accused,paragraphs
3,[1991] SGCA 14 SIM AH CHEOH,Criminal Appeal No 12 of 1988,1991-05-31,SGCA,"[Yong Pung How CJ, Chan Sek Keong J, L P Thean J]","{'prosecution': ['Chan Seng Onn'], 'defence': ...","[Sim Ah Cheoh and others, Public Prosecutor]",Sim Ah Cheoh and others,"{'1': 'The first appellant, Sim Ah Cheoh (“Sim..."
5,[1992] SGCA 30 KO MUN CHEUNG,Criminal Appeal No 4 of 1990,1992-04-30,SGCA,"[L P Thean J, Goh Joon Seng J, S Rajendran J]","{'prosecution': [], 'defence': ['Ismail Hamid'...","[Ko Mun Cheung and another, Public Prosecutor]",Ko Mun Cheung and another,"{'1': 'The two appellants, Ko Mun Cheung (“Ko”..."
6,[1992] SGCA 71 NG KWOK CHUN,Criminal Appeal No 24 of 1991,1992-10-29,SGCA,"[F A Chua J, Chao Hick Tin J, Goh Phai Cheng JC]","{'prosecution': ['Bala Reddy'], 'defence': ['L...","[Ng Kwok Chun and another, Public Prosecutor]",Ng Kwok Chun and another,{'1': 'This appeal raised a short but signific...
11,[1993] SGCA 14 TSE PO CHUNG NATHAN,Criminal Appeal No 2 of 1992,1993-02-22,SGCA,"[Yong Pung How CJ, Chao Hick Tin J, Warren L H...","{'prosecution': ['Bala Reddy'], 'defence': ['P...","[Tse Po Chung Nathan and another, Public Prose...",Tse Po Chung Nathan and another,{'1': 'This appeal raises an important but ide...
12,[1993] SGCA 15 VIRAT KAEWNERN,Criminal Appeal No 20 of 1992,1993-02-27,SGCA,"[Yong Pung How CJ, Chao Hick Tin J, Warren L H...","{'prosecution': [], 'defence': ['N K Rajah wit...","[Public Prosecutor, Virat Kaewnern]",Virat Kaewnern,{'1': 'This is an appeal by the Public Prosecu...


In [12]:
# find the appellant of each case, and put into df.
appellants = list()
for i in range(len(appeals_df)):
    if appeals_df.iloc[i]['listed_parties'][0].lower() != "public prosecutor":
        appellant = appeals_df.iloc[i]['accused']
        and_index = appellant.find("and ")
        if and_index != -1:
            # if 'X and others' then cut away ' and others'
            appellant = appellant[0:and_index - 1]
    else:
        appellant = "public prosecutor"
    appellants.append(appellant)
print(appellants)
appeals_df = appeals_df.assign(appellants=appellants)
appeals_df = appeals_df.reset_index(drop=True)
appeals_df.head()

['Sim Ah Cheoh', 'Ko Mun Cheung', 'Ng Kwok Chun', 'Tse Po Chung Nathan', 'public prosecutor', 'public prosecutor', 'Fung Yuk Shing', 'public prosecutor', 'Kong Weng Chong', 'Van Damme Johannes', 'Hyecinth Ihejirika', 'Ubaka Chris Chinenye', 'Navarat Maykha', 'Mat Repin bin Mamat', 'Tsang Kai Mong Elke', 'Chan Chi Pun', 'Low Kok Wai', 'Anyanwu', 'Jasbir Singh', 'public prosecutor', 'Hartej Sidhu', 'Lam Cheuk Wang', 'Don Promphinit', 'Mou Pui Peng', 'Osman bin Din', 'Tan Yew Lee', 'public prosecutor', 'Lee Yuan Kwang', 'public prosecutor', 'Goh Joon Tong', 'Foong Seow Ngui', 'Lee Meng Hong', 'Lim Lye Huat Benny', 'Wong Yoke Wah', 'Mazlan bin Sulaiman', 'Muhamed Hazani bin Ghani alias Abu Hanifah', 'Tan Meng Jee', 'Abdul Raman bin Yusof', 'Muhammad Jefrry', 'Teh Thiam Huat', 'Chua Kiat Ann', 'Tan Lay Keat', 'Tan Chuan Ten', 'Low Cheng Soon', 'Lau Boon Huat', 'Fun Seong Cheng', 'Yong Yow Chee', 'Yeo Choon Huat', 'Mohd Ariff bin Mat Rifin', 'Loh Kim Cheng', 'Heng Aik Ren Thomas', 'Chou Kooi

Unnamed: 0,unique_ref,case_id,date,Court,coram,counsel,listed_parties,accused,paragraphs,appellants
0,[1991] SGCA 14 SIM AH CHEOH,Criminal Appeal No 12 of 1988,1991-05-31,SGCA,"[Yong Pung How CJ, Chan Sek Keong J, L P Thean J]","{'prosecution': ['Chan Seng Onn'], 'defence': ...","[Sim Ah Cheoh and others, Public Prosecutor]",Sim Ah Cheoh and others,"{'1': 'The first appellant, Sim Ah Cheoh (“Sim...",Sim Ah Cheoh
1,[1992] SGCA 30 KO MUN CHEUNG,Criminal Appeal No 4 of 1990,1992-04-30,SGCA,"[L P Thean J, Goh Joon Seng J, S Rajendran J]","{'prosecution': [], 'defence': ['Ismail Hamid'...","[Ko Mun Cheung and another, Public Prosecutor]",Ko Mun Cheung and another,"{'1': 'The two appellants, Ko Mun Cheung (“Ko”...",Ko Mun Cheung
2,[1992] SGCA 71 NG KWOK CHUN,Criminal Appeal No 24 of 1991,1992-10-29,SGCA,"[F A Chua J, Chao Hick Tin J, Goh Phai Cheng JC]","{'prosecution': ['Bala Reddy'], 'defence': ['L...","[Ng Kwok Chun and another, Public Prosecutor]",Ng Kwok Chun and another,{'1': 'This appeal raised a short but signific...,Ng Kwok Chun
3,[1993] SGCA 14 TSE PO CHUNG NATHAN,Criminal Appeal No 2 of 1992,1993-02-22,SGCA,"[Yong Pung How CJ, Chao Hick Tin J, Warren L H...","{'prosecution': ['Bala Reddy'], 'defence': ['P...","[Tse Po Chung Nathan and another, Public Prose...",Tse Po Chung Nathan and another,{'1': 'This appeal raises an important but ide...,Tse Po Chung Nathan
4,[1993] SGCA 15 VIRAT KAEWNERN,Criminal Appeal No 20 of 1992,1993-02-27,SGCA,"[Yong Pung How CJ, Chao Hick Tin J, Warren L H...","{'prosecution': [], 'defence': ['N K Rajah wit...","[Public Prosecutor, Virat Kaewnern]",Virat Kaewnern,{'1': 'This is an appeal by the Public Prosecu...,public prosecutor


### Search Engine for finding relevant phrases in case file.

In [13]:
searchEngine = primitiveSearchEngine()

# Query lists
# Note that strings are searched, not words. So "element" will also count in "elements"; "rebut" in "rebutted"
ConvictionTrialQ = ["accordingly", "acquit", "charge", "convict", "element", "guilty", "made out", "prove", "reasonable doubt", "reasons", "satisfied", "sentence", "therefore" ]
ConvictionAppealQ = ConvictionTrialQ + ["affirm", "allow", "dismiss"]
PresumptionQ = ["balance of probabilities", "evidence", "failed to", "fails to", "MDA", "presumption", "reasonable doubt", "rebut"]
TraffickingQ = PresumptionQ + ["17(c)", "trafficking"]
PossessionQ = PresumptionQ + ["18(1)", "possession", ]
KnowledgeQ = PresumptionQ + ["18(2)", "actual", "knowledge"]
CourierQ = ["33B", "certificate", "courier", "MDA", "substantive assistance"]
SentenceQ = ["cane", "caning", "convict", "death", "impose", "imprisonment", "mandatory", "months", "punish", "sentence", "stroke", "years"]

#add porter stemming to capture more relevant words.
search_terms = ConvictionAppealQ
search_terms = [stem(term) for term in search_terms] # remove if stemming doesn't help

In [14]:
import csv
with open('appeals_rules/dismissed2.csv', newline='') as f:
    reader = csv.reader(f)
    yes_rules = list(reader)

with open('appeals_rules/allowed2.csv', newline='') as f:
    reader = csv.reader(f)
    no_rules = list(reader)
    
def sentence2outcome(sent):
    # TODO: add rule for ambiguous/unclear or '-1'.

    for rule in yes_rules[0]:
        if rule in sent:
            return 1
            break

    for rule in no_rules[0]:
        if rule in sent:
            return 0
            break
    return -1

In [15]:
from nltk.tokenize.treebank import TreebankWordDetokenizer
def predict_multiple(cases_df, search_terms, print_output=True):
    """
    todo: put under the predictor class.
    """
    preds = []
    sentences = []
    outcomes = []
    for i in range(len(cases_df)):
        print(f"{i+1}/{len(cases_df)}")
        appellant = cases_df.iloc[i]['appellants']
        
        qn = f"was {appellant}'s appeal dismissed?"
        #qn = f"was {appellant}'s appeal allowed?"
        # todo: modify this code chunk to ask multiple question-phrasings at once
        # instead of only one question-phrasing per document.
        
        # Find paragraphs related to search terms; can also try finding sentences instead, later on.
        # rule1() yields item numbers (e.g. paragraph numbers) that contain ANY of the search terms. 
        SCORE_THRESHOLD = 2  # arbitrary threshold for "rule1"; result must be greater than thres.
        results = searchEngine.rule1(cases_df.iloc[i]['paragraphs'], search_terms, SCORE_THRESHOLD)
        
        # Concatenate top relevant paragraphs into one chunk to feed as input to the QnA model.
        combined_psg = ""
        total_score = 0
        MAX_PARAS = 4
        num_paras = 0
        
        item_numbers = [int(key) for key in results.keys()]  # rule1()'s results contains {ref number:score} pairs. 
        descending_keys =  sorted(item_numbers, reverse=True)

        for key in descending_keys:
            '''
            Higher paragraph (or sentence) number is given priority for addition to the input
            chunk to the QnA model as the intuition is that the case's conclusion is likely
            found at the end of the case doc.
            
            alternatively, the top items for inclusion into the chunk can be those items that
            had the highest keyword scores.
            '''
            num_paras = num_paras + 1
            if num_paras <= MAX_PARAS:
                combined_psg = combined_psg + " " + cases_df.iloc[i]['paragraphs'][str(key)]
                total_score = total_score + results[str(key)]
            else:
                break
    
        sentence_str = 'NA' # prediction defaults to NA if no relevant paras found.
        if combined_psg:
            payload = {
                'passage': combined_psg,
                'question': qn
            }
            prediction = predictor.predict(payload, full=True)
            predicted_span = prediction['best_span_str']
            span_indices = prediction['best_span']
            
            tokens = prediction['passage_tokens']
            start_index = span_indices[0]
            end_index = span_indices[1]
            
            sent_start_index = start_index
            while tokens[sent_start_index] != '.' and sent_start_index > 0:
                sent_start_index = sent_start_index - 1
            sent_end_index = end_index
            while sent_end_index < len(tokens):
                if tokens[sent_end_index] != '.':
                    sent_end_index = sent_end_index + 1
                else:
                    break
            
            if tokens[sent_start_index] == '.':
                sentence = tokens[sent_start_index + 1 : sent_end_index]
            else:
                sentence = tokens[sent_start_index : sent_end_index]
            detokenizer = TreebankWordDetokenizer()
            sentence_str = detokenizer.detokenize(sentence)  # the output is not perfectly formatted.
            
        preds.append(predicted_span)
        sentences.append(sentence_str)
        
        outcome = sentence2outcome(sentence_str)
        outcomes.append(outcome)
        
        if print_output:
            # TODO: use logger.
            print(f"qn: {qn}")
            print("relevant paragraphs:\n" + combined_psg)
            print(f"relevant paragraph numbers: {descending_keys[:MAX_PARAS]}")
            print(f"keywords score: {total_score}.")
            print()
            print(f"predicted span: {predicted_span}")
            print(f"full sentence or sequence: {sentence_str}")
            #print(f"item reference number: {answer_item_num}")  # determining this isn't straightforward.
            #print(f"start and end indices of tokens in passage: {span_indices}")
            print()
    return preds, sentences, outcomes

In [16]:
preds, sentences, outcomes = predict_multiple(appeals_df[5:7], search_terms)

1/2
qn: was public prosecutor's appeal dismissed?
relevant paragraphs:
 We would record our grave doubts over the correctness of the appellant’s actions in the days after the respondent’s acquittal below. However, in view of our decision to dismiss the appeal, and in the absence of full argument on the nature and extent of the court’s inherent jurisdiction over its criminal procedure, we would decline to express any definitive view on Mr Sant Singh’s contentions although we are of the preliminary view that such a power to stay criminal proceedings in circumstances where it can be shown that the accused could not have a fair trial exists. The learned DPP conceded in his reply that the appeal would fail if we found against him on the issue of joint custody or joint possession of the Gucci bag. However, in view of counsel’s arguments on the question of common intention, we will deal briefly with the views we formed on this element of the charge against the respondent. In R v Strong The Ti

In [17]:
outcomes

[1, 1]

In [17]:
# save results
path = input('enter path to save csv file:\n')
to_save = pd.DataFrame()
to_save.insert(0, "unique_ref", appeals_df['unique_ref'].values)
to_save.insert(1, "raw_answer", preds)
to_save.insert(2, "final_answer", sentences)
to_save.insert(3, "outcome", outcomes)
to_save.to_csv(path)

enter path to save csv file:
 data/appeals_02.csv
