In [1]:
import pickle
from dotenv import load_dotenv
from tqdm import tqdm
import pandas as pd
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field, validator
from langchain_openai import ChatOpenAI
from pydantic.v1 import BaseModel, Field
from langchain.output_parsers import PydanticOutputParser
import os
import json
from tqdm.notebook import tqdm
import langsmith

load_dotenv()

True

In [2]:
# Load the article-wise context chunks and expose each row's positional label
# as an explicit "index" column alongside the original columns.
df_contexts = pd.read_csv(
    '../../data/document_chunks/article_wise_chunks_eng_urdu_processed.csv'
).assign(index=lambda frame: frame.index)
df_contexts.head()

Unnamed: 0,English_Context_Chunks,Urdu_Context_Chunks,urdu_cleaned,Context_Index,index
0,The Republic and its territories\n11.\t(1)\tPa...,ﺍ۔ ﻣﻤﻠﮑﺖ ﭘﺎﮐﺴﺘﺎﻥ ﺍﯾﮏ ﻭﻓﺎﻗﯽ ﺟﻤﮩﻮﺭﯾﮧ ﮨﻮﮔﯽ ﺟﺲ ﮐﺎ...,۱۔ مملکت پاکستان ایک وفاقی جمہوریہ ہوگی جس کا ...,0,0
1,Islam to be State religion\n\nIslam shall be t...,ﺍﺳﻼﻡ ﻣﻠﮑﺘﯽ ﻣﺬﮨﺐ ﮨﻮﮔﺎ۔,اسلام ریاست کا مذہب ہوگا۔,1,1
2,The Objectives\tResolution to form\tpart...,ﺍﺳﻼﻡ ﭘﺎﮐﺴﺘﺎﻥ ﮐﺎﻣﻤﻠﮑﺘﯽ ﻣﺬﮨﺐ ﮨﻮﮔﺎ۔\n\n۲۲ ﺍﻟﻒ۔ ﺿﻤ...,اسلام پاکستان کا مملکتی مذہب ہوگا۔\n\n۲۲ الف۔ ...,2,2
3,Elimination of exploitation\n\nThe State shall...,ﺍﺳﺘﺤﺼﺎﻝ ﮐﺎ ﺧﺎﺗﻤﮧ۔ ﻣ۳۔ﻤﻠﮑﺖ ﺍﺳﺘﺤﺼﺎﻝ ﮐﯽ ﺗﻤﺎﻡ ﺍﻗﺴﺎ...,استحصال کا خاتمہ\n\nریاست استحصال کی تمام اقسا...,3,3
4,Right of individuals to be dealt with in accor...,ﻗﺎﻧﺍﻮﻓﺮﻥﺍﺩﻭﮐﻏﺎﯿﺮﺣﮦﻖﮐﮐﮯﮧ ﺍﻥ ﺳﮯ۴۔۴ﮨﺮﺷﮩﺮﯼ ﮐﺎﺧﻮﺍﮨﮑ...,افراد کا حق کہ ان کے ساتھ قانون کے مطابق سلوک ...,4,4


In [3]:
# Load the English question/answer dataset and preview its first rows.
df_qa = pd.read_csv('../../data/augmented_datasets/qa_dataset_eng_v1.csv')
df_qa.head()

Unnamed: 0,Question,Answer,Answer Chunk Index
0,What type of government does Pakistan have?,Pakistan is a Federal Republic known as the Is...,0
1,Can new states or areas become part of Pakistan?,"Yes, the Majlis-e-Shoora (Parliament) may admi...",0
2,What is the designated religion of Pakistan?,Islam shall be the State religion of Pakistan.,1
3,What has been made a substantive part of the C...,The principles and provisions set out in the O...,2
4,What is the State's responsibility regarding e...,The State shall ensure the elimination of all ...,3


In [5]:
# Attach the matching context chunk to every Q&A pair (inner join on the chunk
# index), then keep only the first row for each distinct question text.
df = df_qa.merge(
    df_contexts,
    how='inner',
    left_on='Answer Chunk Index',
    right_on='index',
).drop_duplicates(subset='Question')

In [6]:
# Schema for one translated Q&A pair; it is handed to PydanticOutputParser below,
# so the field descriptions are part of what the parser describes to the model.
# NOTE(review): a class docstring would presumably surface as a schema
# "description" in the format instructions — confirm before adding one.
class QnAPair(BaseModel):
    question: str = Field(description="the question in Urdu")
    answer: str = Field(description="the answer in Urdu")

In [7]:
# Output parser that validates model completions against the QnAPair schema.
parser = PydanticOutputParser(pydantic_object=QnAPair)

In [8]:
# Chat model used for the translation calls: gpt-4o with temperature set to 0.
model = ChatOpenAI(model="gpt-4o", temperature=0)

In [9]:
# Prompt text for translating one English Q&A pair into Urdu. Placeholders:
# {context_en}, {question}, {answer}, {context_ur} are supplied per call;
# {format_instructions} is filled once from the parser when `prompt` is built.
# NOTE(review): the recorded run shows two failure modes with this prompt — the
# model returning the JSON schema itself, and unescaped double quotes inside the
# JSON string values. Both surface as parser errors in the loop output.
PROMPT_TEMPLATE = """
You are provided with the **context in English** as well as the **Q&A pair in English**. You will also be given the **context in Urdu**. Your task is to translate the English Q&A pair into Urdu, ensuring that the translation aligns with the style and tone of the Urdu context provided.

**Context in English:**
```{context_en}```

**Question (in English):**  
```{question}```

**Answer (in English):** 
```{answer}```


**Context in Urdu:** 
```{context_ur}```

{format_instructions}
"""

# Bind the template to its four runtime variables; the parser's format
# instructions are pre-filled as a partial variable.
prompt = PromptTemplate(
    template=PROMPT_TEMPLATE,
    input_variables=["question", "answer", "context_en", "context_ur"],
    partial_variables={
        "format_instructions": parser.get_format_instructions()},
)

In [10]:
# Compose the pipeline: the prompt feeds the model, whose output feeds the parser.
chain = prompt | model | parser

In [11]:
def translate_qna_pair(question, answer, context_en, context_ur):
    """Translate one English Q&A pair into Urdu using the module-level ``chain``.

    Args:
        question: The question text in English.
        answer: The answer text in English.
        context_en: English context passed to the prompt.
        context_ur: Urdu context passed to the prompt.

    Returns:
        A dict with the keys ``urdu_question``, ``urdu_answer``,
        ``english_question`` and ``english_answer``.

    Any exception raised by ``chain.invoke`` propagates to the caller.
    """
    chain_inputs = dict(
        question=question,
        answer=answer,
        context_en=context_en,
        context_ur=context_ur,
    )
    parsed = chain.invoke(chain_inputs)

    # Keep the English originals next to the Urdu output in the record.
    translation = {}
    translation['urdu_question'] = parsed.question
    translation['urdu_answer'] = parsed.answer
    translation['english_question'] = question
    translation['english_answer'] = answer
    return translation

In [12]:
RESPONSE_FILE = "../../data/english_to_urdu_translations/openai_translations_v2.json"

# Make sure the output directory exists; without it the open() below fails on a
# fresh checkout.
os.makedirs(os.path.dirname(RESPONSE_FILE), exist_ok=True)

# Seed the results file with an empty JSON array on first use only, so results
# from an earlier run are never overwritten.
if not os.path.exists(RESPONSE_FILE):
    with open(RESPONSE_FILE, 'w', encoding='utf-8') as f:
        json.dump([], f)

In [13]:
def update_file(filepath, new_data):
    """Append one record to the JSON array stored at ``filepath``.

    The whole array is read, ``new_data`` is appended, and the file is
    rewritten. The new content is written to a sibling ``<filepath>.tmp`` file
    and then moved over the original, so an interruption during the write does
    not truncate results that were already saved.

    Args:
        filepath: Path of a JSON file containing a list. A missing file is
            treated as an empty list.
        new_data: JSON-serialisable record to append.

    Returns:
        None.

    Raises:
        json.JSONDecodeError: If the existing file is not valid JSON.
        TypeError: If ``new_data`` cannot be serialised to JSON.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        data = []

    data.append(new_data)

    tmp_path = f"{filepath}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps Urdu text readable in the file instead of
            # \uXXXX escapes; the file is still valid JSON for json.load.
            json.dump(data, f, indent=2, ensure_ascii=False)
        os.replace(tmp_path, filepath)
    except BaseException:
        # Do not leave a half-written temp file behind; the original is untouched.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

In [14]:
error_count = 0

# Translate every merged Q&A row and persist each outcome right away, so an
# interrupted run keeps whatever was already processed. Any exception from the
# translation call is recorded as an error entry instead of stopping the loop.
for index, row in tqdm(df.iterrows(), total=len(df)):
    question, answer = row["Question"], row["Answer"]
    context_en, context_urdu = row["English_Context_Chunks"], row["urdu_cleaned"]
    try:
        data_to_append = translate_qna_pair(
            question,
            answer,
            context_en=context_en,
            context_ur=context_urdu,
        )
    except Exception as error:
        error_count += 1
        print(f'Error (#{error_count}): {error}')
        print(question)
        print('---' * 30)

        # Error record: the English pair plus the stringified exception.
        data_to_append = dict(
            english_question=question,
            english_answer=answer,
            error=f"{error}",
        )

    update_file(RESPONSE_FILE, data_to_append)

  0%|          | 0/619 [00:00<?, ?it/s]

Error (#1): Failed to parse QnAPair from completion {"properties": {"question": {"title": "Question", "description": "the question in Urdu", "type": "string"}, "answer": {"title": "Answer", "description": "the answer in Urdu", "type": "string"}}, "required": ["question", "answer"]}. Got: 2 validation errors for QnAPair
question
  field required (type=value_error.missing)
answer
  field required (type=value_error.missing)
Under what circumstances can the President appoint an Additional Judge to a High Court?
------------------------------------------------------------------------------------------
Error (#2): Invalid json output: ```json
{
  "question": "اس سیاق و سباق میں "چیف جسٹس" سے کیا مراد ہے؟",
  "answer": ""چیف جسٹس" سے عدالت کا چیف جسٹس مراد ہے۔"
}
```
What does the term "Chief Justice" refer to in this context?
------------------------------------------------------------------------------------------
Error (#3): Failed to parse QnAPair from completion {"properties": {"question

## Error Log

In [None]:
"""Error (#1): Failed to parse QnAPair from completion {"properties": {"question": {"title": "Question", "description": "the question in Urdu", "type": "string"}, "answer": {"title": "Answer", "description": "the answer in Urdu", "type": "string"}}, "required": ["question", "answer"]}. Got: 2 validation errors for QnAPair
question
  field required (type=value_error.missing)
answer
  field required (type=value_error.missing)
Under what circumstances can the President appoint an Additional Judge to a High Court?
------------------------------------------------------------------------------------------
Error (#2): Invalid json output: ```json
{
  "question": "اس سیاق و سباق میں "چیف جسٹس" سے کیا مراد ہے؟",
  "answer": ""چیف جسٹس" سے عدالت کا چیف جسٹس مراد ہے۔"
}
```
What does the term "Chief Justice" refer to in this context?
------------------------------------------------------------------------------------------
Error (#3): Failed to parse QnAPair from completion {"properties": {"question": {"title": "Question", "description": "the question in Urdu", "type": "string"}, "answer": {"title": "Answer", "description": "the answer in Urdu", "type": "string"}}, "required": ["question", "answer"]}. Got: 2 validation errors for QnAPair
question
  field required (type=value_error.missing)
answer
  field required (type=value_error.missing)
What is included in the definition of "law" according to this chapter?
------------------------------------------------------------------------------------------
"""

In [15]:
# English questions whose translation failed in the bulk run (taken from the error log).
error_qs = [
    'What is included in the definition of "law" according to this chapter?',
    'What does the term "Chief Justice" refer to in this context?',
    'Under what circumstances can the President appoint an Additional Judge to a High Court?',
]

In [23]:
# Subset of the merged frame whose question is listed in error_qs,
# i.e. the records that gave errors.
failed_mask = df['Question'].isin(error_qs)
error_df = df.loc[failed_mask]
error_df

Unnamed: 0,Question,Answer,Answer Chunk Index,English_Context_Chunks,Urdu_Context_Chunks,urdu_cleaned,Context_Index,index
393,Under what circumstances can the President app...,The President can appoint an Additional Judge ...,206,Additional Judges\n\nAt any time when—\n\nthe ...,ﺍﺿﺎﻧﯽ ﺟﺞ۔ ۷۹۱۔\n\nﺍﻟﻒﺍﻟﻒ ﮐﺴﯽ ﻋﺪﺍﻟﺖ ﻋﺎﻟﯿﮧ ﮐﮯﮐﺴﯽ...,اضافی جج\n\nکسی عدالت عالیہ کے کسی جج کا عہدہ ...,206,206
410,"What does the term ""Chief Justice"" refer to in...","""Chief Justice"" means Chief Justice of the Court.",214,"Definitions\n\n203B. In this Chapter, unless t...",۳۰۲۔ﺏ۔ ﺍﺱ ﺑﺎﺏ ﻣﯿﮟ،ﺗﺎﻭﻗﺘﯿﮑﮧ ﮐﻮﺋﯽ ﺍﻣﺮﻣﻮﺿﻮﻉ ﯾﺎﺳﯿﺎ...,```urdu\n۳۰۲۔ب۔ اس باب میں، تاوقتیکہ کوئی امر ...,214,214
411,"What is included in the definition of ""law"" ac...","""Law"" includes any custom or usage having the ...",214,"Definitions\n\n203B. In this Chapter, unless t...",۳۰۲۔ﺏ۔ ﺍﺱ ﺑﺎﺏ ﻣﯿﮟ،ﺗﺎﻭﻗﺘﯿﮑﮧ ﮐﻮﺋﯽ ﺍﻣﺮﻣﻮﺿﻮﻉ ﯾﺎﺳﯿﺎ...,```urdu\n۳۰۲۔ب۔ اس باب میں، تاوقتیکہ کوئی امر ...,214,214


In [24]:
# Second pass over the failed records only. Results are appended to the same
# file, so the error entries written by the earlier pass stay in it as well.
# error_count carries on from the bulk-run cell.
for index, row in tqdm(error_df.iterrows(), total=len(error_df)):
    question, answer = row["Question"], row["Answer"]
    context_en, context_urdu = row["English_Context_Chunks"], row["urdu_cleaned"]
    try:
        data_to_append = translate_qna_pair(
            question,
            answer,
            context_en=context_en,
            context_ur=context_urdu,
        )
    except Exception as error:
        error_count += 1
        print(f'Error (#{error_count}): {error}')
        print(question)
        print('---' * 30)

        # Error record: the English pair plus the stringified exception.
        data_to_append = dict(
            english_question=question,
            english_answer=answer,
            error=f"{error}",
        )

    update_file(RESPONSE_FILE, data_to_append)

  0%|          | 0/3 [00:00<?, ?it/s]

Error (#4): Failed to parse QnAPair from completion {"properties": {"question": {"title": "Question", "description": "the question in Urdu", "type": "string"}, "answer": {"title": "Answer", "description": "the answer in Urdu", "type": "string"}}, "required": ["question", "answer"]}. Got: 2 validation errors for QnAPair
question
  field required (type=value_error.missing)
answer
  field required (type=value_error.missing)
What is included in the definition of "law" according to this chapter?
------------------------------------------------------------------------------------------


In [29]:
# Inspect every field of the third row of error_df (the record that failed again
# on retry). A notebook cell only renders its last bare expression, so the
# fields are passed to display() to make all four visible.
failed_record = error_df.iloc[2]
display(
    failed_record.Question,
    failed_record.Answer,
    failed_record.urdu_cleaned,
    failed_record.English_Context_Chunks,
)

'Definitions\n\n203B. In this Chapter, unless there is anything repugnant in the subject or context,—\n\n1[(a)\t"Chief Justice" means Chief Justice of the Court;]\n\n"Court" means the Federal Shariat Court constituted in pursuance of Article 203C ;\n\n2[(bb)\t"Judge" means Judge of the Court;]\n\n"law" includes any custom or usage having the force of law but does not include the Constitution, Muslim personal law, any law relating to the procedure of any court or tribunal or, until the expiration of 3[ten] years from the commencement of this Chapter, any fiscal law or any law relating to the levy and collection of taxes and fees or banking or insurance practice and procedure ; and'

In [30]:
# Retry the third row of error_df on its own; an exception here is not caught,
# so a failure stops the cell instead of producing an error record.
retry_row = error_df.iloc[2]
data_to_append = translate_qna_pair(
    question=retry_row.Question,
    answer=retry_row.Answer,
    context_en=retry_row.English_Context_Chunks,
    context_ur=retry_row.urdu_cleaned,
)

In [31]:
# Append the current data_to_append record to the results file.
update_file(RESPONSE_FILE, data_to_append)