In [55]:
# !wget https://huggingface.co/datasets/malaysia-ai/dedup-text-dataset/resolve/main/malaysia-ejudgement.jsonl

In [None]:
# !pip install -q -U google-generativeai

In [43]:
import json
import os
from glob import glob
from typing import List
from time import time, sleep

from tenacity import Retrying, RetryError, \
                     stop_after_attempt, wait_fixed

from tqdm import tqdm
from pydantic import BaseModel

from huggingface_hub import HfApi
import google.generativeai as genai

In [44]:
# Configure model credentials. Get your own API key and export it as the
# GOOGLE_API_KEY environment variable rather than pasting it into this cell,
# so the secret is never stored in a saved or shared copy of the notebook.
# Falls back to an empty key when the variable is not set.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", ""))

model = genai.GenerativeModel('gemini-1.5-flash')

In [20]:
class SyntheticQASchema(BaseModel):
    """Structure of one synthetic Q&A pair: a question and its answer."""
    question: str
    answer: str


# pydantic v2 deprecates `.schema()` in favour of `.model_json_schema()`;
# fall back to the old name so the cell still runs on pydantic v1.
if hasattr(SyntheticQASchema, 'model_json_schema'):
    schema = SyntheticQASchema.model_json_schema()
else:
    schema = SyntheticQASchema.schema()
schema

{'properties': {'question': {'title': 'Question', 'type': 'string'},
  'answer': {'title': 'Answer', 'type': 'string'}},
 'required': ['question', 'answer'],
 'title': 'SyntheticQASchema',
 'type': 'object'}

### 1.0 Append chunks together (to get 100k context length)

#### 1.1 Count token length in dataset

In [31]:
# Peek at the first record of the raw dataset, then stop reading.
with open('malaysia-ejudgement.jsonl') as raw_file:
    for raw_line in tqdm(raw_file):
        first_record = json.loads(raw_line)
        print(first_record)
        break

0it [00:00, ?it/s]

Microsoft Word - DRAFT GOJ HUP HUAT CONSTRUCTION AND ENGINEERING SDN BHD v. PECCA LEATHER SDN BHD  Another


 1   IN THE HIGH COURT OF MALAYA AT KANGAR 

IN THE STATE OF PERLIS, MALAYSIA 

(COMMERCIAL DIVISION) 

COMPANIES WINDING -UP NO RA-28NCC-4-03/2022 

POST COMPANIES (WINDING UP) APPLICATION NO.: RA-28PW-4-05/2023 

(COMPANIES WINDING-UP NO.:RA-28NCC-4-03/2022)   In the matter of sections 465 (1)(e), and 

466(1)(a), 471, 492, 492 and 494 of the 

Companies Act, 2016,   And   In the matter of HUP HUAT 

CONSTRUCTION AND ENGINEERING SDN. 

BHD. (Company No.783563-M) (In 

Liquidation),   And   In the matter of the Companies (Winding-

Up) Rules 1972,   And   In the matter of Ooi Lee Wei (NRIC : No.: 

770220-09-5197), the Applicant 

09/02/2024 12:30:38

RA-28PW-4-05/2023 Kand. 105

S/N I6I3gBthU6C4y3ED3Q9Q
**Note : Serial number will be used to verify the originality of this document via eFILING portal



 2   BETWEEN   PECCA LEATHER SDN BHD   ... PETITIONER   AND   HUP HUAT CONS




In [39]:
def count_token(filename):
    """Count model tokens for every record of a JSONL file.

    Reads `filename` line by line, JSON-decodes each line, asks the
    module-level `model` for the token count of the decoded record, and writes
    one JSON object per line, `{"count": <total_tokens>, "prompt": <record>}`,
    to `count_token_<stem>.jsonl`, where `<stem>` is the part of `filename`
    before its first dot.

    A failing `count_tokens` call is retried without limit, pausing 60 seconds
    between attempts.

    Args:
        filename: Path of the input JSONL file.
    """
    filename_write = f'count_token_{filename.split(".")[0]}.jsonl'
    print(f'Writing to: {filename_write}')

    with open(filename_write, 'w') as f, open(filename) as f2:
        for x in tqdm(f2):
            x = json.loads(x)

            data = {}
            while True:
                try:
                    data['count'] = model.count_tokens(x).total_tokens
                    break
                except Exception:
                    # The notebook imports `time` as the function time.time,
                    # so `time.sleep` does not exist; call the imported
                    # `sleep` directly. Catching Exception (not a bare
                    # except) keeps Ctrl-C able to stop the retry loop.
                    sleep(60)

            data['prompt'] = x

            json.dump(data, f)
            f.write('\n')

In [40]:
# Token-count every record of the raw dataset; the results are written to
# count_token_malaysia-ejudgement.jsonl.
count_token('malaysia-ejudgement.jsonl')

Writing to: count_token_malaysia-ejudgement.jsonl


17204it [23:24, 12.25it/s]


In [41]:
# Peek at the first token-counted record, then stop reading.
with open('count_token_malaysia-ejudgement.jsonl') as counted_file:
    for counted_line in tqdm(counted_file):
        first_counted = json.loads(counted_line)
        print(first_counted)
        break

0it [00:00, ?it/s]

{'count': 2916, 'prompt': 'Microsoft Word - DRAFT GOJ HUP HUAT CONSTRUCTION AND ENGINEERING SDN BHD v. PECCA LEATHER SDN BHD  Another\n\n\n 1   IN THE HIGH COURT OF MALAYA AT KANGAR \n\nIN THE STATE OF PERLIS, MALAYSIA \n\n(COMMERCIAL DIVISION) \n\nCOMPANIES WINDING -UP NO RA-28NCC-4-03/2022 \n\nPOST COMPANIES (WINDING UP) APPLICATION NO.: RA-28PW-4-05/2023 \n\n(COMPANIES WINDING-UP NO.:RA-28NCC-4-03/2022)   In the matter of sections 465 (1)(e), and \n\n466(1)(a), 471, 492, 492 and 494 of the \n\nCompanies Act, 2016,   And   In the matter of HUP HUAT \n\nCONSTRUCTION AND ENGINEERING SDN. \n\nBHD. (Company No.783563-M) (In \n\nLiquidation),   And   In the matter of the Companies (Winding-\n\nUp) Rules 1972,   And   In the matter of Ooi Lee Wei (NRIC : No.: \n\n770220-09-5197), the Applicant \n\n09/02/2024 12:30:38\n\nRA-28PW-4-05/2023 Kand. 105\n\nS/N I6I3gBthU6C4y3ED3Q9Q\n**Note : Serial number will be used to verify the originality of this document via eFILING portal\n\n\n\n 2   BETWE




#### 1.2 Append examples with each other to reach 100k Context Length

In [42]:
# Merge consecutive documents until their summed token count exceeds the
# target, then write the merged text out as one long-context record.
TARGET_CONTEXT_TOKENS = 100000

fw = 'malaysia-ejudgement_context_append.jsonl'
f = 'count_token_malaysia-ejudgement.jsonl'
chunk_temp = []
count = 0

with open(fw, 'w') as fopen, open(f) as fopen1:
    for x in tqdm(fopen1):
        x = json.loads(x)
        count += x['count']
        # append, not extend: extending a list with a str adds it one
        # character at a time. The joined text is the same either way, but
        # append keeps one list element per document.
        chunk_temp.append(x['prompt'])

        if count <= TARGET_CONTEXT_TOKENS:
            continue

        data = {'count': count, 'context': ''.join(chunk_temp)}
        json.dump(data, fopen)
        fopen.write('\n')

        # reset the running total and the accumulated text for the next group
        count = 0
        chunk_temp = []

# NOTE(review): documents accumulated after the last flush (a final group
# whose total never exceeds the target) stay in `chunk_temp` and are not
# written to the output file - confirm that dropping them is intended.

17204it [00:13, 1254.89it/s]


### 2.0 Generate Q&A from datasets

#### 2.1 Testing

In [12]:
# Path of the merged long-context records written in section 1.2.
fw = 'malaysia-ejudgement_context_append.jsonl'

In [13]:
# Load every merged long-context record into memory for inspection.
data_monitor = []

with open(fw) as merged_file:
    data_monitor.extend(json.loads(line) for line in tqdm(merged_file))

1689it [00:01, 964.31it/s]


In [6]:
# Inspect the first merged record: its token count and concatenated text.
data_monitor[0]

{'count': 108787,
 'context': 'Microsoft Word - DRAFT GOJ HUP HUAT CONSTRUCTION AND ENGINEERING SDN BHD v. PECCA LEATHER SDN BHD  Another\n\n\n 1   IN THE HIGH COURT OF MALAYA AT KANGAR \n\nIN THE STATE OF PERLIS, MALAYSIA \n\n(COMMERCIAL DIVISION) \n\nCOMPANIES WINDING -UP NO RA-28NCC-4-03/2022 \n\nPOST COMPANIES (WINDING UP) APPLICATION NO.: RA-28PW-4-05/2023 \n\n(COMPANIES WINDING-UP NO.:RA-28NCC-4-03/2022)   In the matter of sections 465 (1)(e), and \n\n466(1)(a), 471, 492, 492 and 494 of the \n\nCompanies Act, 2016,   And   In the matter of HUP HUAT \n\nCONSTRUCTION AND ENGINEERING SDN. \n\nBHD. (Company No.783563-M) (In \n\nLiquidation),   And   In the matter of the Companies (Winding-\n\nUp) Rules 1972,   And   In the matter of Ooi Lee Wei (NRIC : No.: \n\n770220-09-5197), the Applicant \n\n09/02/2024 12:30:38\n\nRA-28PW-4-05/2023 Kand. 105\n\nS/N I6I3gBthU6C4y3ED3Q9Q\n**Note : Serial number will be used to verify the originality of this document via eFILING portal\n\n\n\n 2   B

In [14]:
# Number of merged long-context records.
len(data_monitor)

1689

In [15]:
# Check count accuracy: this is the stored token total for one merged record,
# i.e. the sum of the per-document counts that were added together.
data_monitor[3]['count']

107861

In [16]:
# Verify with the model: re-count the tokens of the merged text of the same
# record so the result can be compared with the stored total.
model.count_tokens(data_monitor[3]['context'])

total_tokens: 107861

In [17]:
# Prompt template asking for a structured JSON reply.
# Placeholders: {context} (source text) and {json_model} (target schema).
# NOTE(review): no cell visible in this notebook formats or sends this
# template; the plain-text `prefix1` is the one used for generation.
prefix = """
Context:
```
{context}
```

Provide a response in a structured JSON format that matches the following model: {json_model}
Based on Context above, generate a complex question.

"""

In [18]:
# Plain-text prompt template: asks for one complex question and its answer,
# replied in English under "question:" and "answer:" labels.
# Placeholder: {context} (source text).
prefix1 = """
Context:
```
{context}
```

Based on Context above, generate a complex question.

answer this format, and reply only in English:

question: 

answer:

"""

In [19]:
# Test on one example: format the plain-text prompt with a single merged
# record (index 3), send it to the model, and display the raw response object.
prompt = prefix1.format(context=data_monitor[3]['context'])
response = model.generate_content(prompt)

response

response:
GenerateContentResponse(
    done=True,
    iterator=None,
    result=protos.GenerateContentResponse({
      "candidates": [
        {
          "content": {
            "parts": [
              {
                "text": "question:  In the case of WSA Precision Sdn Bhd v Lee Kok Cheong, the Defendant was accused of breaching his fiduciary duty by establishing competing companies and selling the Plaintiff's goods at discounted prices. However, the Defendant argued that the Plaintiff was aware of his involvement in the competing companies and that the discounts were authorized by the Plaintiff's management. Considering the specific details of the case, including the evidence presented by both parties and the legal principles involved, how does the court reconcile the Defendant's claims of awareness and authorization with the Plaintiff's allegations of breach of fiduciary duty? \n\nanswer: This is a complex question that requires a deep understanding of the case and its context.

#### 2.2 Inference

In [20]:
# Prompt template used for the full inference run.
# NOTE(review): this re-defines `prefix1` with the same text as the definition
# in the testing section (2.1); one of the two definitions could be removed.
prefix1 = """
Context:
```
{context}
```

Based on Context above, generate a complex question.

answer this format, and reply only in English:

question: 

answer:

"""

In [21]:
fw = 'malaysia-ejudgement_synthetic_qa_sets_1.jsonl'
fr = 'malaysia-ejudgement_context_append.jsonl'

# Checkpoint: documents with an index up to and including this one are
# skipped; generation starts at the next index.
LAST_DONE_INDEX = 573
# Each document gets MAX_ATTEMPTS tries, RETRY_WAIT_SECONDS apart.
RETRY_WAIT_SECONDS = 60 * 5
MAX_ATTEMPTS = 2

# NOTE(review): mode 'w' truncates any existing output file, so results a
# previous run stored in this same file are lost when resuming from the
# checkpoint - presumably earlier results live in a different file; confirm.
with open(fw, 'w') as fwrite, open(fr) as fread:
    for i, x in enumerate(tqdm(fread)):
        # Skip before decoding the line or building the prompt, so skipped
        # documents cost nothing.
        if i <= LAST_DONE_INDEX:
            continue

        x = json.loads(x)
        prompt = prefix1.format(context=x['context'])

        try:
            for attempt in Retrying(wait=wait_fixed(RETRY_WAIT_SECONDS),
                                    stop=stop_after_attempt(MAX_ATTEMPTS)):
                with attempt:
                    response = model.generate_content(prompt)

                    final_res = {
                        'context_length': x['count'],
                        'context': x['context'],
                        'qna': response.candidates[0].content.parts[0].text,
                    }

                    json.dump(final_res, fwrite)
                    fwrite.write('\n')
        except RetryError:
            # Every attempt failed: report the document index and move on.
            print(f'Error at document with this index: {i}')

575it [05:07,  1.41s/it] 

Error at document with this index: 574


576it [10:12,  3.39s/it]

Error at document with this index: 575


586it [17:36, 47.03s/it]

Error at document with this index: 585


610it [29:38, 108.67s/it]

Error at document with this index: 609


616it [36:28, 116.08s/it]

Error at document with this index: 615


623it [43:02, 110.08s/it]

Error at document with this index: 622


629it [49:06, 91.27s/it] 

Error at document with this index: 628


638it [55:57, 103.11s/it]

Error at document with this index: 637


643it [1:02:00, 116.10s/it]

Error at document with this index: 642


655it [1:10:03, 104.77s/it]

Error at document with this index: 654


659it [1:16:01, 124.72s/it]

Error at document with this index: 658


660it [1:21:05, 178.49s/it]

Error at document with this index: 659


667it [1:27:53, 118.69s/it]

Error at document with this index: 666


678it [1:35:58, 105.98s/it]

Error at document with this index: 677


679it [1:41:05, 164.96s/it]

Error at document with this index: 678


697it [1:49:49, 84.21s/it] 

Error at document with this index: 696


700it [1:55:31, 123.85s/it]

Error at document with this index: 699


789it [2:34:58, 102.04s/it]

Error at document with this index: 788


790it [2:40:04, 163.37s/it]

Error at document with this index: 789


793it [2:45:39, 152.61s/it]

Error at document with this index: 792


799it [2:52:15, 122.05s/it]

Error at document with this index: 798


809it [2:59:42, 107.55s/it]

Error at document with this index: 808


818it [3:07:18, 111.83s/it]

Error at document with this index: 817


823it [3:13:10, 116.54s/it]

Error at document with this index: 822


838it [3:22:11, 104.12s/it]

Error at document with this index: 837


839it [3:27:17, 164.57s/it]

Error at document with this index: 838


849it [3:34:30, 107.52s/it]

Error at document with this index: 848


853it [3:40:15, 123.68s/it]

Error at document with this index: 852


854it [3:45:19, 177.78s/it]

Error at document with this index: 853


871it [3:54:21, 101.68s/it]

Error at document with this index: 870


880it [4:02:21, 107.27s/it]

Error at document with this index: 879


894it [4:10:54, 104.35s/it]

Error at document with this index: 893


895it [4:15:59, 164.66s/it]

Error at document with this index: 894


899it [4:21:42, 136.98s/it]

Error at document with this index: 898


914it [4:30:39, 103.80s/it]

Error at document with this index: 913


959it [4:47:44, 104.81s/it]

Error at document with this index: 958


1004it [5:05:43, 106.75s/it]

Error at document with this index: 1003


1034it [5:20:01, 103.06s/it]

Error at document with this index: 1033


1050it [5:30:08, 109.09s/it]

Error at document with this index: 1049


1092it [5:46:20, 101.84s/it]

Error at document with this index: 1091


1095it [5:51:58, 132.40s/it]

Error at document with this index: 1094


1100it [5:58:19, 124.57s/it]

Error at document with this index: 1099


1101it [6:03:25, 178.82s/it]

Error at document with this index: 1100


1103it [6:08:50, 183.40s/it]

Error at document with this index: 1102


1105it [6:14:09, 184.56s/it]

Error at document with this index: 1104


1144it [6:29:55, 103.56s/it]

Error at document with this index: 1143


1155it [6:37:17, 101.22s/it]

Error at document with this index: 1154


1172it [6:46:44, 102.98s/it]

Error at document with this index: 1171


1186it [6:55:13, 103.08s/it]

Error at document with this index: 1185


1202it [7:04:41, 103.18s/it]

Error at document with this index: 1201


1216it [7:13:06, 103.59s/it]

Error at document with this index: 1215


1244it [7:25:52, 103.49s/it]

Error at document with this index: 1243


1275it [7:45:30, 108.27s/it]

Error at document with this index: 1274


1284it [7:52:56, 106.72s/it]

Error at document with this index: 1283


1291it [7:59:16, 108.65s/it]

Error at document with this index: 1290


1299it [8:06:06, 107.01s/it]

Error at document with this index: 1298


1304it [8:12:11, 117.43s/it]

Error at document with this index: 1303


1321it [8:21:42, 102.73s/it]

Error at document with this index: 1320


1324it [8:27:11, 131.36s/it]

Error at document with this index: 1323


1328it [8:32:57, 128.82s/it]

Error at document with this index: 1327


1331it [8:38:28, 140.65s/it]

Error at document with this index: 1330


1332it [8:43:34, 190.21s/it]

Error at document with this index: 1331


1366it [8:56:42, 100.73s/it]

Error at document with this index: 1365


1393it [9:10:29, 102.25s/it]

Error at document with this index: 1392


1394it [9:15:39, 164.46s/it]

Error at document with this index: 1393


1400it [9:22:21, 122.15s/it]

Error at document with this index: 1399


1443it [9:38:54, 105.33s/it]

Error at document with this index: 1442


1450it [9:45:13, 107.91s/it]

Error at document with this index: 1449


1468it [9:55:23, 103.78s/it]

Error at document with this index: 1467


1469it [10:00:28, 164.31s/it]

Error at document with this index: 1468


1485it [10:09:20, 105.87s/it]

Error at document with this index: 1484


1493it [10:16:20, 108.70s/it]

Error at document with this index: 1492


1500it [10:23:20, 110.45s/it]

Error at document with this index: 1499


1583it [10:50:53, 102.08s/it]

Error at document with this index: 1582


1627it [11:07:35, 102.80s/it]

Error at document with this index: 1626


1640it [11:16:01, 105.26s/it]

Error at document with this index: 1639


1646it [11:22:09, 111.62s/it]

Error at document with this index: 1645


1651it [11:28:15, 117.99s/it]

Error at document with this index: 1650


1657it [11:34:44, 114.55s/it]

Error at document with this index: 1656


1658it [11:39:47, 171.26s/it]

Error at document with this index: 1657


1677it [11:49:19, 101.25s/it]

Error at document with this index: 1676


1689it [11:52:35, 25.31s/it] 


In [9]:
# Read the generated Q&A records back in to check the final results.
data = []

with open('malaysia-ejudgement_synthetic_qa_sets_1.jsonl') as qa_file:
    data.extend(map(json.loads, qa_file))

In [10]:
# Fields stored for each generated record.
data[0].keys()

dict_keys(['context_length', 'context', 'qna'])

In [23]:
# Number of records written by the inference run.
len(data)

1034

In [24]:
# Raw model reply for the first record: free text holding both the question
# and the answer.
data[0]['qna']

'question: In the context of the provided legal documents, how does the principle of indefeasibility of title, as enshrined in the National Land Code (KTN), interact with the alleged representations made by TNB employees to the Defendants regarding their occupancy of the land? Does the  doctrine of laches, or the existence of a TOL (Lesen Pendudukan Sementara), potentially diminish or negate the enforceability of TNB\'s indefeasible title in this case? \n\nanswer: This is a complex legal question with multiple layers. The provided case analysis suggests that TNB has a valid and indefeasible title to the land based on its registration in the land register. This principle, enshrined in Section 89 of the KTN, generally renders such registered titles unassailable, barring specific exceptions like fraud. However, the Defendants claim they received assurances from TNB employees allowing their continued occupancy, despite lacking a formal written agreement. \n\nThe issue hinges on whether TNB

### 3.0 Split Q&A into columns and get 'Explanation' column

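Note: the replies above are free text, so they have to be split into question and answer here. Requesting structured output with the `SyntheticQASchema` defined earlier would probably have made this parsing step unnecessary.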

In [45]:
# Split each raw reply into question and answer, ask the model to explain the
# answer, and write the restructured record to the parsed output file.
with open('parsed_qna_malaysia_ejudgement.jsonl', 'w') as f_write:
    with open('malaysia-ejudgement_synthetic_qa_sets_1.jsonl') as f:
        # enumerate (rather than a counter incremented at the end of the loop
        # body) keeps `i` equal to the record's position in the input file
        # even after a record has been skipped.
        for i, l in enumerate(tqdm(f)):
            try:
                l = json.loads(l)
                splitted = l['qna'].split('\n\nanswer:')
                if len(splitted) != 2:
                    # Some replies separate the two parts with '\n \n'; retry
                    # with that separator and log the unexpected first split.
                    first_split = splitted
                    splitted = l['qna'].split('\n \nanswer:')
                    print(i)
                    print(first_split)
                    print('')

                # 'stion:' matches the label whatever the case of "Question".
                question = splitted[0].split('stion:')[1].strip()
                answer = splitted[1].strip()

                # NOTE(review): the "Context" sent to the model here is the
                # generated Q&A text (`qna`), not the source document
                # (`context`) - confirm this is intended.
                context = l["qna"]
                appended_prompt = f"""
                    Context:
                    ```
                    {context}
                    ```

                    According to the context, please explain why you answer this
                    """

                l['raw_context'] = l.pop('context')
                l['question'] = question
                l['answer'] = answer
                l["answer_explanation"] = model.generate_content(appended_prompt) \
                                               .candidates[0].content.parts[0].text
                l['count_token'] = l.pop('context_length')
                l.pop('qna')

                # write l dictionary to f_write
                f_write.write(f'{json.dumps(l)}\n')
            except IndexError:
                # Raised when the model returns no candidate/part (blocked
                # output) or when the reply cannot be split into a question
                # and an answer; the record is skipped.
                print(f"Model coundn't generate for this index: {i}")

165it [08:54,  3.15s/it]

165
['question:  In the context of the provided legal documents, explain the legal principles of "Res Judicata" and "Issue Estoppel" and how they were applied (or not applied) in the decision regarding the Plaintiffs\' claims against the Defendant. \n \nanswer: The provided legal documents highlight a complex case involving the Plaintiffs (AirAsia Berhad and AirAsia X Berhad) and the Defendant (Malaysia Airports (Sepang) Sdn Bhd).  The Defendant argued that the Plaintiffs were barred from pursuing their claims due to the doctrine of Res Judicata, which prevents parties from relitigating issues already decided by a court. The Defendant specifically argued the Plaintiffs\' claims were subject to Issue Estoppel, which prohibits the relitigation of specific issues already decided in previous cases.\n\nThe Judicial Commissioner\'s decision focused on the absence of a clear and conclusive settlement agreement between the parties.  The Court found that there was no binding agreement to resolv

214it [11:27,  3.11s/it]

213
['question: In the context of the legal proceedings between CMM1 Investment Limited and the Minister of Finance Malaysia, does the applicant\'s application for judicial review represent a valid challenge to the respondent\'s inaction, despite the absence of a direct decision by the Minister? Does the application raise genuine issues regarding the Minister\'s failure to consider the applicability of the Labuan Business Activity Tax Act 1990 (LBATA) and the applicant\'s legitimate expectations stemming from the Exemption Order?\n \nanswer: The answer is multifaceted and hinges on several key aspects of the legal framework governing judicial review in Malaysia. Firstly, the concept of "deemed decision" – where inaction by a public authority can be construed as a decision – is a crucial element. Secondly, the application\'s arguability, based on the applicant\'s claim that the LBATA should have been applied instead of the ITA, raises a critical question about the Minister\'s alleged fa

880it [48:30,  2.29s/it]

Model coundn't generate for this index: 877


1034it [56:46,  3.29s/it]


In [46]:
# Read the parsed records back in to check the final results.
data = []

with open('parsed_qna_malaysia_ejudgement.jsonl') as parsed_file:
    data.extend(json.loads(line) for line in tqdm(parsed_file))

1031it [00:01, 860.04it/s]


In [47]:
# Fields of a parsed record.
data[0].keys()

dict_keys(['raw_context', 'question', 'answer', 'answer_explanation', 'count_token'])

In [48]:
# Question extracted from the first record's raw reply.
data[0]['question']

"In the context of the provided legal documents, how does the principle of indefeasibility of title, as enshrined in the National Land Code (KTN), interact with the alleged representations made by TNB employees to the Defendants regarding their occupancy of the land? Does the  doctrine of laches, or the existence of a TOL (Lesen Pendudukan Sementara), potentially diminish or negate the enforceability of TNB's indefeasible title in this case?"

In [49]:
# Answer extracted from the first record's raw reply.
data[0]['answer']

'This is a complex legal question with multiple layers. The provided case analysis suggests that TNB has a valid and indefeasible title to the land based on its registration in the land register. This principle, enshrined in Section 89 of the KTN, generally renders such registered titles unassailable, barring specific exceptions like fraud. However, the Defendants claim they received assurances from TNB employees allowing their continued occupancy, despite lacking a formal written agreement. \n\nThe issue hinges on whether TNB\'s actions or inaction, including its employees\' statements, could constitute a "laches" defense, where the delay in asserting ownership leads to a loss of rights. This defense hinges on a finding of unfair prejudice to the Defendants due to TNB\'s delay and their reliance on the representations. The TOL, while a short-term permit, also creates ambiguity. If deemed a legitimate grant by a proper authority, it could arguably be seen as a formal recognition of occ

In [50]:
# Model-generated explanation of the first record's answer.
data[0]["answer_explanation"]

"The answer provided is a concise and accurate explanation of the legal complexities surrounding the interplay between the principle of indefeasibility of title, laches, and the existence of a TOL (Lesen Pendudukan Sementara) in the context of the provided legal documents. \n\nHere's why:\n\n1. **Understanding the Key Concepts:** The answer correctly identifies and defines the core legal principles involved: \n    * **Indefeasibility of Title:** This principle, enshrined in the National Land Code (KTN), protects registered land titles from challenges unless specific exceptions like fraud are proven. The answer correctly highlights Section 89 of the KTN as the relevant provision.\n    * **Laches:**  This doctrine refers to the delay in asserting a legal right, which can result in the loss of that right if it causes unfair prejudice to the opposing party. The answer explains how this doctrine could potentially impact the case by examining the effect of TNB's inaction and the representati

In [54]:
# Export the dataset: push the parsed Q&A file to the Hugging Face Hub
# dataset repository under the same file name.
parsed_file = 'parsed_qna_malaysia_ejudgement.jsonl'

api = HfApi()
api.upload_file(
    repo_type='dataset',
    repo_id='malaysia-ai/long-context-malaysia-ejudgement-QA',
    path_in_repo=parsed_file,
    path_or_fileobj=parsed_file,
)

parsed_qna_malaysia_ejudgement.jsonl:   0%|          | 0.00/402M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/malaysia-ai/long-context-malaysia-ejudgement-QA/commit/f46e5fc9e3c18bbc3079f8cc4d8afe2553beba4e', commit_message='Upload parsed_qna_malaysia_ejudgement.jsonl with huggingface_hub', commit_description='', oid='f46e5fc9e3c18bbc3079f8cc4d8afe2553beba4e', pr_url=None, pr_revision=None, pr_num=None)