In [1]:
from allennlp.predictors.predictor import Predictor as AllenNLPPredictor

class PythonPredictor:
    def __init__(self, config=None):
        self.predictor = AllenNLPPredictor.from_path(
            "bidaf-elmo-model-2018.11.30-charpad.tar"
        )

    def predict(self, payload):
        """
        :param payload: dict containing the keys "passage" and "question" - both keys point to string values. 
        "passage" refers to the source doc that the model will look at while "question" refers to the question 
        asked to the model.
        :returns: a string representing the most probable answer, according to the model.
        """
        prediction = self.predictor.predict(
            passage=payload["passage"], question=payload["question"]
        )
        return prediction["best_span_str"]
    
    def full_predict(self, payload):
        """
        :param payload: dict containing the keys "passage" and "question" - both keys point to string values. 
        "passage" refers to the source doc that the model will look at while "question" refers to the question 
        asked to the model.
        :returns: a dict representing the predictions made by model.
        """
        prediction = self.predictor.predict(
            passage=payload["passage"], question=payload["question"]
        )
        return prediction

In [2]:
predictor = PythonPredictor()
type(predictor.predictor)

_jsonnet not loaded, treating C:\Users\melvin\AppData\Local\Temp\tmpdvmveebe\config.json as json
_jsonnet not loaded, treating snippet as json
  "num_layers={}".format(dropout, num_layers))


allennlp.predictors.bidaf.BidafPredictor

In [3]:
# example prediction
payload = {
    "passage": "The trial judges accepted that both the appellants had come into Singapore only with a view to boarding a flight to Amsterdam the next day. They, however, rejected the submission made on behalf of the appellants that bringing drugs into Singapore with a view solely of exporting them would not be an offence under s 7 of the Act. They also rejected Ko’s defence that he did not know that what he was carrying was diamorphine. Accordingly, they convicted the appellants. Against the convictions, this appeal was brought. At the conclusion we dismissed it, and we now give our reasons.Ground (a) can be disposed of very briefly. By s 18(2) of the Act a rebuttable presumption arose that Ko knew the nature of the drug that he was carrying. Once the presumption arose, the onus of discharging it was on Ko. Having heard Ko’s defence, the trial judges were satisfied that he had not discharged the presumption. We have reviewed the record and it is clear that the trial judges were entitled on the evidence before them to arrive at this finding. We saw no reason to interfere.and submitted that s 7 was applicable only when it was sought to punish a master or captain who had contravened s 20. We could not accept that submission. In common with a number of other similar provisions in the Act, what s 20 does is to raise a presumption as to knowledge. By s 20, if it is proved that a drug was found in a ship or aircraft, then the presumption would arise that the drug was imported in the ship or aircraft with the knowledge of the master or captain. No doubt, in such a case, a master or captain may be charged for violating s 7 of the Act but that does not mean to say that s 7 is confined in its operations only to the master of a ship or captain of an aircraft used for the import of drugs. We see no reason why s 7 should not operate against (say) a passenger in a ship or aircraft who was importing drugs. Against such a passenger the presumption under s 20 as to knowledge would obviously not be applicable but (as in this case) the presumption under s 18(2) would apply.",
    "question": "was the appeal dismissed?"
}
prediction = predictor.full_predict(payload)
prediction.keys()
prediction['best_span_str']

'rejected Ko’s defence that he did not know that what he was carrying was diamorphine. Accordingly, they convicted the appellants. Against the convictions, this appeal was brought. At the conclusion we dismissed it'

## From extracted text (in json) to answers
1. present whole document and query to system.
2. search for potentially relevant paragraphs; this narrows the search space for the subsequent QnA NLP model. 
2. from these paragraphs, search for answer.
3. print top answer (or top few answers)

In [4]:
import re

class primitiveSearchEngine:
    def __init__(self):
        pass
        
    def and_search(self, itr, queries):
        """
        Searches for the passages/paragraphs that contain a 
        co-occurence of the exact query terms, in any order.
        
        :params itr: a dict containing strings to search through.
        :params queries: a list of query terms.
        :returns: a dict of the form, {key: search_result}.
        """
        regex = "^"
        for term in queries:
            # regex = regex + term + '|'
            regex = regex + rf"(?=.*\b{term}\b)"
        regex = regex + ".*$"
        
        # note: this regex pattern searches for the co-occurence of the
        # exact specified terms, in any order.
        
        pattern = re.compile(regex)
        
        results = {}
        
        for k, v  in itr.items():
            match = pattern.search(v)
            
            if match:
                results[k] = v
        return results
    
    def or_search(self, itr, queries):
        """
        Searches for the paragraphs/strings that contain any of the query terms.
        
        :params itr: a dict containing strings to search through. they key can be a para number.
        :params queries: a list of query terms.
        :returns: a dict of the form, {key: search_result}.
        """
        results = {}
        
        for k, v  in itr.items():
            for term in queries:
                if term in v:
                    results[k] = v
            
        return results
    
    def rule1(self, itr, queries, scorethreshold):
        """
        Rule 1 is an OR search and gives an equal weightage to each keyword
        
        :param scorethreshold: integer. min number of relevant terms that must appear in 
        a text (could be a paragaph). 
        :returns: a dictionary. keys are a subset of itr.keys() and ea value is rule1's 
        relevance score.
        """
        output = dict()
        for para in itr:
            score = 0
            for word in queries:
                if word in itr[para]:
                    score += 1 
            if score >= scorethreshold: 
                output[para] = score

        output = {k: v for k, v in sorted(output.items(), key=lambda x: x[1], reverse=True)}
        return output

In [26]:
# Porter stemming
import nltk
nltk.download('punkt')
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize

def tokenise(string): # works on any arbitrary string
    tokens = []
    for sentence in sent_tokenize(string):
        for token in word_tokenize(sentence):
            tokens.append(token)
    return tokens

def stem(token): # tokenizes any particular token
    return PorterStemmer().stem(token)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\melvin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


## Run predictions on documents with 1 simple type of question.

In [7]:
'''
1. pull out the paragraphs of a valid document
2. run search engine to find relevant paragraphs
3. concatenate relevant paragraphs into a new "source" document
4. feed source and question into QnA model, get answer.
5. manually evaluate if the answer is able to answer the question.
6. auto evaluate if the predicted answer is the same or overlaps with the ground-truth answer.

7. repeat 1-6 for every doc and every question we want to test.

a. we start with questions about whether the accused was convicted. find one good phrasing.
b. can consider testing other phrasings for the same question as well.
c. then we identify other high-prio questions to test.
d. tabulate model's accuracy in answering each type of question.
'''

'\n1. pull out the paragraphs of a valid document\n2. run search engine to find relevant paragraphs\n3. concatenate relevant paragraphs into a new "source" document\n4. feed source and question into QnA model, get answer.\n5. manually evaluate if the answer is able to answer the question.\n6. auto evaluate if the predicted answer is the same or overlaps with the ground-truth answer.\n\n7. repeat 1-6 for every doc and every question we want to test.\n\na. we start with questions about whether the accused was convicted. find one good phrasing.\nb. can consider testing other phrasings for the same question as well.\nc. then we identify other high-prio questions to test.\nd. tabulate model\'s accuracy in answering each type of question.\n'

In [7]:
import pandas as pd
import numpy as np
import json
# with open('data/cases.json') as f:
#     cases = json.load(f)

In [8]:
'''
Set up the output_df dataframe from the .json output file.
(based on glowsplint's test_output.py code)
'''
output_df = pd.read_json('data/cases.json')

court_map = {
    'High Court': 'SGHC',
    'Court of Appeal': 'SGCA',
    'District Court': 'SGDC',
    'Court of Criminal Appeal': 'SGCA'
}

output_df['Court'] = output_df.Court.map(court_map)
ref_columns = {
    0: 'refSLR',
    1: 'ref'
}

# Split reference into SLR reference and normal reference
output_df[list(ref_columns.values())] = pd.DataFrame(
    output_df.reference.apply(pd.Series)).rename(columns=ref_columns)
ref_na = output_df['ref'].isna()
output_df.loc[ref_na, 'ref'], output_df.loc[ref_na,'refSLR'] = output_df.refSLR[ref_na], np.nan
output_df.drop('reference', axis=1, inplace=True)

# retain the ordered list of parties in one column, before exploding output_df['parties'] next.
output_df = output_df.assign(listed_parties=output_df.parties)

# Construct the unique reference
exploded_output = output_df.explode('parties')
output_df = exploded_output.loc[exploded_output.parties.str.lower(
) != "public prosecutor"].copy()
output_df['unique_ref'] = output_df['ref'] + \
    ' ' + output_df['parties']
output_df.unique_ref = output_df.unique_ref.str.upper().str.replace(
    "AND ANOTHER", "").str.replace("AND OTHERS", "").str.replace(' +', ' ').str.strip()
output_df = output_df[['unique_ref', 'Date', 'Court',
                                 'coram', 'counsel', 'listed_parties', 'parties', 'paragraphs']]
output_df.rename(columns={'parties': 'accused'}, inplace=True)

In [9]:
output_df.head()

Unnamed: 0,unique_ref,Date,Court,coram,counsel,listed_parties,accused,paragraphs
0,[2000] SGCA 8 CHIN SIONG KIAN,2000-02-12,SGCA,"[L P Thean JA, Chao Hick Tin JA]","{'prosecution': ['Low Cheong Yeow'], 'defence'...","[Chin Siong Kian, Public Prosecutor]",Chin Siong Kian,"{'1': 'On 6 September 1999 the appellant, toge..."
1,[2019] SGCA 81 MOHD AKEBAL S/O GHULAM JILANI,2019-11-28,SGCA,"[Sundaresh Menon CJ, Tay Yong Kwang JA, Steven...","{'prosecution': ['Chin Jincheng', 'in Criminal...","[Mohd Akebal s/o Ghulam Jilani, Public Prosecu...",Mohd Akebal s/o Ghulam Jilani,{'1': 'These appeals arise from the joint tria...
1,[2019] SGCA 81 MOHAMMED RUSLI BIN ABDUL RAHMAN,2019-11-28,SGCA,"[Sundaresh Menon CJ, Tay Yong Kwang JA, Steven...","{'prosecution': ['Chin Jincheng', 'in Criminal...","[Mohd Akebal s/o Ghulam Jilani, Public Prosecu...",Mohammed Rusli Bin Abdul Rahman,{'1': 'These appeals arise from the joint tria...
2,[1989] SGHC 75 GOH AH LIM,1989-08-24,SGHC,"[Lai Kew Chai J, F A Chua J]","{'prosecution': ['Lee Sing Lit'], 'defence': [...","[Public Prosecutor, Goh Ah Lim]",Goh Ah Lim,"{'1': 'The accused, a male Chinese aged 46, fa..."
3,[1989] SGHC 9 KADIR BIN AWANG,1989-02-03,SGHC,"[T S Sinnathuray J, Joseph Grimberg JC]","{'prosecution': ['Lee Sing Lit'], 'defence': [...","[Public Prosecutor, Kadir bin Awang]",Kadir bin Awang,{'1': 'Kadir bin Awang (“the accused”) was cha...


In [10]:
# filter out cases that are definitely appeal cases
appeals_indices = []
for i in range(len(output_df)):
    if 'SGCA' in output_df.iloc[i]["Court"]:
        appeals_indices.append(i)

appeals_df = output_df.iloc[appeals_indices]
appeals_df.head()

Unnamed: 0,unique_ref,Date,Court,coram,counsel,listed_parties,accused,paragraphs
0,[2000] SGCA 8 CHIN SIONG KIAN,2000-02-12,SGCA,"[L P Thean JA, Chao Hick Tin JA]","{'prosecution': ['Low Cheong Yeow'], 'defence'...","[Chin Siong Kian, Public Prosecutor]",Chin Siong Kian,"{'1': 'On 6 September 1999 the appellant, toge..."
1,[2019] SGCA 81 MOHD AKEBAL S/O GHULAM JILANI,2019-11-28,SGCA,"[Sundaresh Menon CJ, Tay Yong Kwang JA, Steven...","{'prosecution': ['Chin Jincheng', 'in Criminal...","[Mohd Akebal s/o Ghulam Jilani, Public Prosecu...",Mohd Akebal s/o Ghulam Jilani,{'1': 'These appeals arise from the joint tria...
1,[2019] SGCA 81 MOHAMMED RUSLI BIN ABDUL RAHMAN,2019-11-28,SGCA,"[Sundaresh Menon CJ, Tay Yong Kwang JA, Steven...","{'prosecution': ['Chin Jincheng', 'in Criminal...","[Mohd Akebal s/o Ghulam Jilani, Public Prosecu...",Mohammed Rusli Bin Abdul Rahman,{'1': 'These appeals arise from the joint tria...
5,[1991] SGCA 14 SIM AH CHEOH,1991-05-31,SGCA,"[Yong Pung How CJ, Chan Sek Keong J, L P Thean J]","{'prosecution': ['Chan Seng Onn'], 'defence': ...","[Sim Ah Cheoh and others, Public Prosecutor]",Sim Ah Cheoh and others,"{'1': 'The first appellant, Sim Ah Cheoh (“Sim..."
7,[1992] SGCA 30 KO MUN CHEUNG,1992-04-30,SGCA,"[L P Thean J, Goh Joon Seng J, S Rajendran J]","{'prosecution': [], 'defence': ['P Suppiah', '...","[Ko Mun Cheung and another, Public Prosecutor]",Ko Mun Cheung and another,"{'1': 'The two appellants, Ko Mun Cheung (“Ko”..."


In [11]:
searchEngine = primitiveSearchEngine()

ConvictionTrialQ = ["accordingly", "acquit", "charge", "convict", "element", "guilty", "made out", "prove", "reasonable doubt", "reasons", "satisfied", "sentence", "therefore" ]
ConvictionAppealQ = ConvictionTrialQ + ["affirm", "allow", "dismiss"]

#add porter stemming to capture more relevant words.
search_terms = ConvictionAppealQ
search_terms = [stem(term) for term in search_terms] # remove if stemming doesn't help

In [49]:
from random import choice
from nltk.tokenize.treebank import TreebankWordDetokenizer
def predict_multiple(cases_df, search_terms, print_output=True):
    """
    todo: put under the predictor class.
    """
    preds = []
    for i in range(len(cases_df)):
        print(f"{i+1}/{len(cases_df)}")
        # determine appellant.
        if cases_df.iloc[i]['listed_parties'][0].lower() != "public prosecutor":
            appellant = cases_df.iloc[i]['accused']
            and_index = appellant.find("and")
            if and_index != -1:
                # if 'X and others' then cut away ' and others'
                appellant = appellant[0:and_index - 1]
        else:
            appellant = "public prosecutor"
        
        qn = f"was {appellant}'s appeal dismissed?"
        #qn = f"was {appellant}'s appeal allowed?"
        # todo: modify this code chunk to ask multiple question-phrasings at once
        # instead of only one question-phrasing per document.
        
        sentences = {}
        item_num = 1
        for para_num, para in cases_df.iloc[i]['paragraphs'].items():
            for sent in sent_tokenize(para):
                sentences[item_num] = sent
                item_num = item_num + 1
        
        # Find sentences related to search terms.
        # rule1() yields item numbers that contain ANY of the search terms. 
        SCORE_THRESHOLD = 1  # arbitrary threshold for "rule1"; result must be greater than thres.
        results = searchEngine.rule1(sentences, search_terms, SCORE_THRESHOLD)
        
        predicted_sent = 'NA' # prediction defaults to NA if no relevant paras found.
        if len(results) > 0:    
            top_item_keys = list()
            keys_list = list(results.keys())
            max_score = results[keys_list[0]]  # because keys_list is presorted according to
            # descending score
            for item_num, score in results.items():
                if score == max_score:
                    top_item_keys.append(item_num)
                else:
                    break

            random_key = choice(top_item_keys)
            predicted_sent = sentences[random_key] 
    
        preds.append(predicted_sent)
        
        if print_output:
            # TODO: use logger.
            print(f"number of sentences found: {len(sentences)}")       
            print(f"number of candidate sentences found: {len(top_item_keys)}")       
            print(f"qn: {qn}")
            print(f"predicted sent: {predicted_sent}")
            print()
    return preds

In [50]:
preds = predict_multiple(appeals_df, search_terms)
len(preds)

1/131
number of sentences found: 224
number of candidate sentences found: 2
qn: was Chin Siong Kian's appeal dismissed?
predicted sent: The trial judge therefore convicted the appellant and Wan under the amended joint charge and sentenced them to suffer death.

2/131
number of sentences found: 107
number of candidate sentences found: 4
qn: was Mohd Akebal s/o Ghulam Jilani's appeal dismissed?
predicted sent: (a)     Akebal was convicted on a single charge of trafficking not less than 29.06g of diamorphine, an offence under s 5(1)(a) of the Misuse of Drugs Act (Cap 185, 2008 Rev Ed) (“MDA”), and sentenced to death.

3/131
number of sentences found: 107
number of candidate sentences found: 4
qn: was Mohammed Rusli Bin Abdul Rahman's appeal dismissed?
predicted sent: (c)     Andi was convicted on one charge of possessing not less than 29.06g of diamorphine for the purpose of trafficking under s 5(1)(a) read with s 5(2) of the MDA, and sentenced to life imprisonment and 15 strokes of the c

131

In [57]:
# display length of each predicted answer in terms of no. of tokens.

tokenized_answers = list()
for sent in preds:
    tokens = word_tokenize(sent)
    tokenized_answers.append(tokens)

lengths = [len(answer) for answer in tokenized_answers]
print(f"min sent length: {min(lengths)}")
print(f"max sent length {max(lengths)}")

min sent length: 6
max sent length 162


In [58]:
path = input('enter path to save csv file:\n')
to_save = pd.DataFrame()
to_save.insert(0, "unique_ref", appeals_df['unique_ref'].values)
to_save.insert(1, "predicted_answer", preds)
to_save.to_csv(path)

enter path to save csv file:
 data/appeals_base.csv
