In [3]:
from openai import OpenAI
import pickle
from dotenv import load_dotenv
from tqdm import tqdm

# Load variables from a .env file into the process environment
# (presumably this is where the OpenAI credentials come from — TODO confirm).
load_dotenv()

True

In [4]:
# Load the article chunks; each element is a (chunk_text, size_label) tuple,
# as the preview below shows.
# NOTE(review): pickle.load can execute arbitrary code while deserializing —
# only load this file if it comes from a trusted source.
with open('../../data/document_chunks/article_wise_chunks_categorized_english.pkl', 'rb') as file:
    article_chunks = pickle.load(file)

# Preview the first three chunks.
article_chunks[:3]

[('The Republic and its territories\n11.\t(1)\tPakistan shall be Federal Republic to be known as the Islamic Republic of Pakistan, hereinafter referred to as Pakistan.\n\n2[(2)\tThe territories of Pakistan shall comprise—\n\nthe\tProvinces\tof\t3[Balochistan],\tthe\t4[Khyber Pakhtunkhwa], the Punjab and 5[Sindh];\nthe Islamabad Capital Territory, hereinafter referred to as the Federal Capital; 6[and]\n6[(c)\t*\t*\t*\t*\t*\t*]\n\n6[(c)] such States and territories as are or may be included in Pakistan, whether by accession or otherwise.\n\n(3) 7[Majlis-e-Shoora (Parliament)] may by law admit into the Federation new States or areas on such terms and conditions as it thinks fit.]',
  'Medium'),
 ('Islam to be State religion\n\nIslam shall be the State religion of Pakistan. ',
  'Small'),
 ('The   Objectives\tResolution   to   form\tpart\tof substantive provisions\n\n8[2A. The principles and provisions set out in the Objectives Resolution reproduced in the Annex are hereby made substantive

In [5]:
def quantify_questions(section_list):
    """Translate each section's size label into a number of questions.

    Args:
        section_list: Iterable of 2-item entries whose second item is one of
            'Small', 'Medium' or 'Large'. The first item is ignored.

    Returns:
        A list of ints (1, 2 or 3), one per entry, in input order.

    Raises:
        KeyError: If an entry carries a size label other than the three above.
    """
    questions_per_size = dict(Small=1, Medium=2, Large=3)

    counts = []
    for _text, size_label in section_list:
        counts.append(questions_per_size[size_label])
    return counts

# One question count per chunk, in the same order as article_chunks.
number_of_questions = quantify_questions(article_chunks)
# Preview the first ten counts.
number_of_questions[:10]

[2, 1, 1, 1, 2, 2, 2, 2, 3, 1]

In [4]:
def generate_questions_and_answers(article_text, number_of_questions, model="gpt-4", client=None):
    """Ask an OpenAI chat model for question/answer pairs about one article.

    Args:
        article_text: Text of the article chunk; embedded verbatim in the
            user prompt between triple backticks.
        number_of_questions: Upper bound on the number of question/answer
            pairs requested in the prompt.
        model: Name of the chat model passed to the API.
        client: Optional pre-built client exposing
            ``chat.completions.create``. When omitted, a new ``OpenAI()``
            client is created for this call. Passing a shared client lets a
            loop over many articles reuse it instead of building a new
            client for every request.

    Returns:
        The ``content`` of the first choice's message, exactly as returned by
        the API. The prompt asks for "Question: ... / Answer: ..." pairs
        separated by blank lines, or 'NONE'.
        NOTE(review): the API may return no text content (``None``) for some
        responses — confirm callers tolerate that.
    """
    # System prompt. The first line deliberately ends with a space before the
    # newline; it is written as an escape so editors that strip trailing
    # whitespace cannot silently change the prompt.
    system_prompt = (
        "You are an expert in interpreting Pakistani legal documents. \n"
        "Given an article text from a legal document, you generate a set of "
        "questions and their corresponding answers."
    )

    # Task prompt. As above, the trailing space + newline after "answer(s)" is
    # spelled as an escape sequence to keep the prompt text stable.
    task_prompt = f"""Your task is to generate up to {number_of_questions} unique question(s) and answer(s) \nbased on the following article text:


Article Text:
```{article_text}```


Instructions:
- If it is not possible to generate {number_of_questions} question(s), provide as many questions and answers as you can based on the content.
- If no question can be generated, respond with 'NONE'.
- Ensure that each question is unique and related to the text provided.
- The answer should be based on the information provided in the article text only.
- Do not mention article numbers in the questions.
- Do not mention about the article text in the questions.
- You are to ask questions as a layman would ask naturally, not as a legal expert. They should be from the perspective of a person who is trying to find out what the constitution says about a particular topic.
- The questions should not be too specific or too general.
- Avoid using the same words or phrases from the article text in the questions.
- The answers can use the same words or phrases from the article text.

Example Output Format:
Question: What is the official name of the country Pakistan?
Answer: The official name of the country is the Islamic Republic of Pakistan.

Question: Which regions are part of Pakistan?
Answer: The regions mentioned as parts of Pakistan are Balochistan, Khyber Pakhtunkhwa, Punjab, Sindh, and the Islamabad Capital Territory."""

    # Only build a client when the caller did not supply one.
    if client is None:
        client = OpenAI()

    # temperature=0 requests the least random sampling for repeatable output.
    completion = client.chat.completions.create(
        model=model,
        temperature=0,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": task_prompt}
        ]
    )

    return completion.choices[0].message.content

In [6]:
# Collect one raw model response per article chunk, in chunk order.
gpt_responses = []
for position, chunk in enumerate(tqdm(article_chunks, desc="Processing Articles")):
    chunk_text = chunk[0]
    gpt_responses.append(
        generate_questions_and_answers(
            article_text=chunk_text,
            number_of_questions=number_of_questions[position],
            model="gpt-4o",
        )
    )

Processing Articles: 100%|██████████| 312/312 [10:46<00:00,  2.07s/it]


In [8]:
# Save the list of raw responses to a binary (pickle) file.
# NOTE(review): the filename is spelled 'gpt_respnoses' (sic); it is left
# unchanged here because other code may load this exact path.
with open('../../data/augmented_datasets/gpt_respnoses.pkl', 'wb') as file:
    pickle.dump(gpt_responses, file)

In [12]:
import re

def parse_qa_list(qa_list):
    """Extract (question, answer) tuples from raw model responses.

    Each response is scanned for blocks of the form::

        Question: <question text>
        Answer: <answer text>

    where an answer runs until the next blank line or the end of the
    response.

    Args:
        qa_list: Iterable of response strings. An entry may also be ``None``
            (a response without text content).

    Returns:
        A list with one element per input entry, in input order. Each element
        is a list of ``(question, answer)`` tuples; it is empty when the entry
        is ``None`` or contains no matching block (e.g. the response 'NONE').
        Keeping an element for every entry preserves the position of each
        response, so positions can still be used as chunk indices.
    """
    # Compiled once instead of being looked up for every entry. DOTALL lets
    # '.' span newlines so the lazy groups can stop at the delimiters.
    pair_pattern = re.compile(r'Question: (.*?)\nAnswer: (.*?)(?:\n\n|$)', re.DOTALL)

    parsed_list = []
    for entry in qa_list:
        if entry is None:
            # No text to parse; keep an empty slot so indices stay aligned
            # instead of failing with a TypeError inside the regex engine.
            parsed_list.append([])
            continue
        # With two groups, findall already returns a list of 2-tuples.
        parsed_list.append(pair_pattern.findall(entry))

    return parsed_list

### Parsed format: `[[(q1, a1), (q2, a2)], [(q1, a1)], ...]` — one list of (question, answer) tuples per response

In [14]:
# Parse every raw response into a list of (question, answer) tuples.
parsed_list_of_qa_sets = parse_qa_list(gpt_responses)

In [15]:
# Preview the parsed Q&A sets for the first three chunks.
parsed_list_of_qa_sets[:3]

[[('What type of government does Pakistan have?',
   'Pakistan is a Federal Republic known as the Islamic Republic of Pakistan.'),
  ('Can new states or areas become part of Pakistan?',
   'Yes, the Majlis-e-Shoora (Parliament) may admit new states or areas into the Federation by law on terms and conditions it thinks fit.')],
 [('What is the designated religion of Pakistan?',
   'Islam shall be the State religion of Pakistan.')],
 [('What has been made a substantive part of the Constitution according to the provided text?',
   'The principles and provisions set out in the Objectives Resolution have been made a substantive part of the Constitution.')]]

## Convert to CSV

In [16]:
import csv

def save_qa_to_csv(parsed_list, filename):
    """Flatten per-chunk Q&A pairs into a CSV file.

    Args:
        parsed_list: Sequence where the element at position ``k`` is an
            iterable of ``(question, answer)`` pairs belonging to chunk ``k``.
        filename: Path of the CSV file to create (overwritten if it exists).

    The file is UTF-8 encoded, starts with the header row
    ``Question,Answer,Answer Chunk Index`` and contains one row per pair,
    where the last column is the position of the pair's chunk in
    ``parsed_list``. Chunks without pairs contribute no rows.
    """
    header = ['Question', 'Answer', 'Answer Chunk Index']

    # newline='' leaves line-ending handling to the csv module.
    with open(filename, 'w', newline='', encoding='utf-8') as out:
        sheet = csv.writer(out)
        sheet.writerow(header)
        sheet.writerows(
            (question, answer, position)
            for position, pairs in enumerate(parsed_list)
            for question, answer in pairs
        )

In [17]:
# Export the parsed Q&A pairs, with their chunk indices, to CSV.
save_qa_to_csv(parsed_list_of_qa_sets, '../../data/augmented_datasets/qa_dataset_eng_v1.csv')

In [2]:
import pandas as pd

# Read the exported CSV back and display it to check its contents.
pd.read_csv('../../data/augmented_datasets/qa_dataset_eng_v1.csv')

Unnamed: 0,Question,Answer,Answer Chunk Index
0,What type of government does Pakistan have?,Pakistan is a Federal Republic known as the Is...,0
1,Can new states or areas become part of Pakistan?,"Yes, the Majlis-e-Shoora (Parliament) may admi...",0
2,What is the designated religion of Pakistan?,Islam shall be the State religion of Pakistan.,1
3,What has been made a substantive part of the C...,The principles and provisions set out in the O...,2
4,What is the State's responsibility regarding e...,The State shall ensure the elimination of all ...,3
...,...,...,...
616,Who is responsible for handling accounts that ...,The Auditor-General is responsible for handlin...,309
617,What powers does the Auditor-General have rega...,The Auditor-General has the same powers and fu...,309
618,Will existing taxes continue to be collected a...,"Yes, all taxes and fees levied under any law i...",310
619,When was the Proclamation of Emergency mention...,The Proclamation of Emergency was originally i...,311
