In [1]:
import json

# Load the document records from disk. JSON text is UTF-8 by specification, so
# the encoding is passed explicitly instead of relying on the platform default
# (which is not UTF-8 on every system and can fail on non-ASCII characters).
with open('allDocument.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Preview the first record to confirm the structure of the loaded data
data[0]


{'question': "In the case of Nasr v NRMA Insurance [2006] NSWSC 1018, why was the plaintiff's appeal lodged out of time?",
 'text': "In Nasr v NRMA Insurance [2006] NSWSC 1018, the plaintiff's appeal was lodged out of time because the summons was filed on 8 June 2006, seven months after the decision of the Local Court was made on 4 October 2005. No explanation was provided for this delay.",
 'doc_id': '81ceb3c0'}

In [2]:
from collections import defaultdict

In [3]:
# Group the records by their 'doc_id': every record sharing an id is appended
# to the same list, in the order the records appear in `data`.
hashes = defaultdict(list)
for doc in data:
    hashes[doc['doc_id']].append(doc)

In [4]:
# Spot-check the grouping: show every record stored under one known doc_id
hashes['81ceb3c0']

[{'question': "In the case of Nasr v NRMA Insurance [2006] NSWSC 1018, why was the plaintiff's appeal lodged out of time?",
  'text': "In Nasr v NRMA Insurance [2006] NSWSC 1018, the plaintiff's appeal was lodged out of time because the summons was filed on 8 June 2006, seven months after the decision of the Local Court was made on 4 October 2005. No explanation was provided for this delay.",
  'doc_id': '81ceb3c0'}]

In [18]:
import ollama

# Prompt sent to the model for each record. The `{question}` and `{text}`
# placeholders are filled with `str.format`; the output example lists exactly
# three items so it agrees with the "Formulate 3 questions" instruction.
promptTemplate = """
    You emulate a user who will use the legal document assistant.
    Formulate 3 questions this user might ask based on a legal documents record. The record
    should contain the answer to the questions, and the questions should be complete and not too short.
    If possible, use as fewer words as possible from the record. 

    The record:
    question: {question}
    text: {text}

    Provide the output in array like this format below:

    ["question1", "question2", "question3"]
    """.strip()

def generate_questions(doc: dict, model: str = "qwen:0.5b") -> str:
    """Ask the chat model to write user-style questions for one record.

    Args:
        doc: A record whose keys fill the placeholders of `promptTemplate`
            (`question` and `text`); extra keys are ignored by `str.format`.
        model: Name of the Ollama model to query. Defaults to the model tag
            this notebook was originally run with.

    Returns:
        The content of the model's reply message, returned as-is. The reply
        is not parsed or validated here.

    Raises:
        KeyError: If `doc` lacks a key that `promptTemplate` refers to.
    """
    prompt = promptTemplate.format(**doc)

    # A single, non-streamed request: the whole reply comes back in one response.
    llm = ollama.chat(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return llm['message']['content']



In [12]:
from tqdm.auto import tqdm 

In [19]:
# Build a doc_id -> model reply mapping. A doc_id that was already handled is
# skipped, so the model is queried at most once per id.
results = {}
for doc in tqdm(data):
    doc_id = doc['doc_id']
    if doc_id not in results:
        results[doc_id] = generate_questions(doc)

  0%|          | 47/27109 [16:41<189:15:37, 25.18s/it]

In [None]:
# Print the raw model reply stored for one known doc_id
print(results['81ceb3c0'])

[
  "the plaintiff's appeal was lodged out of time because the summons was filed on 8 June 2006, seven months after the decision of the Local Court was made on 4 October 2005. No explanation was provided for this delay."
]
