In [7]:
import json
import uuid
from tqdm.auto import tqdm
from collections import defaultdict

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with open('../data/ura_faqs.json','r') as faqs:
    faqs = json.load(faqs)

In [3]:
def generate_faq_id(faq):
    return str(uuid.uuid4())[:8]

In [4]:
for faq in faqs:
    faq['id'] = generate_faq_id(faq)

In [5]:
faqs

[{'question': 'What is Voluntary Disclosure?',
  'answer': 'Voluntary disclosure is a process where the taxpayer discloses information related to tax liabilities, misstatements or omissions his or her tax declarations to Uganda Revenue Authority (URA) without being prompted by any action or threat of action by URA.Please note that;A voluntary disclosure must be complete and accurate, covering all relevant periods where there was previously inaccurate, incomplete or unreported information regarding the taxpayer’s affairsA taxpayer who is subject to ongoing compliance action in respect of a given tax head and a particular tax period may nonetheless make voluntary disclosure in relation to a different tax head in the same or different period or the same tax head in a different period. This is allowed provided that the information that is disclosed would not inevitably have been discovered by the ongoing compliance action',
  'section': 'General FAQs',
  'id': 'fdff035c'},
 {'question': 'I

In [6]:
final_faqs = []

for faq in tqdm(faqs):
    if faq['question'] == '' and faq['answer'] == '':
        continue
    final_faqs.append(faq)

100%|██████████| 219/219 [00:00<00:00, 1179143.23it/s]


In [7]:
final_faqs

[{'question': 'What is Voluntary Disclosure?',
  'answer': 'Voluntary disclosure is a process where the taxpayer discloses information related to tax liabilities, misstatements or omissions his or her tax declarations to Uganda Revenue Authority (URA) without being prompted by any action or threat of action by URA.Please note that;A voluntary disclosure must be complete and accurate, covering all relevant periods where there was previously inaccurate, incomplete or unreported information regarding the taxpayer’s affairsA taxpayer who is subject to ongoing compliance action in respect of a given tax head and a particular tax period may nonetheless make voluntary disclosure in relation to a different tax head in the same or different period or the same tax head in a different period. This is allowed provided that the information that is disclosed would not inevitably have been discovered by the ongoing compliance action',
  'section': 'General FAQs',
  'id': 'fdff035c'},
 {'question': 'I

In [8]:
hashes = defaultdict(list)

for faq in final_faqs:
    faq_id = faq['id']
    hashes[faq_id].append(faq)

In [9]:
len(hashes),len(final_faqs)

(217, 217)

In [10]:
with open('../data/faqs-with-ids.json','wt') as file_out:
    json.dump(final_faqs,file_out,indent=2)

In [11]:
prompt_template = """
You emulate a Ugandan citizen who pays taxes to the Uganda Revenue Authority.
Formulate 5 questions this citizen might ask based on an FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

question: {question}
answer: {answer}
section: {section}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", "question3","question4","question5"]
""".strip()

In [13]:
from openai import OpenAI
client = OpenAI(api_key=api_key)

In [14]:
import pandas as pd

In [15]:
df_faq = pd.DataFrame(final_faqs)
df_faqs = df_faq.sample(20)
faqs_sample = df_faqs.to_dict(orient='records')

In [16]:
faqs_sample

[{'question': 'How do I Register for Tax Agent?',
  'answer': 'Please note that you need to have a new TIN to access the services. This is done onlineGo to the web portal (ura.go.ug) and login into your account using your tin and password, select E-Registration under eservices and click on registration under tax agent registration.Go to Tax Type Details and tick the tax type/s that you will be filing for your clients, fill in the Activation date using the date selector and submit.You will receive an Acknowledgement Receipt that bears a reference number and an approval notice will be sent to your registered email address once the application is approved.',
  'section': 'Domestic Taxes FAQs',
  'id': '8c2d1a34'},
 {'question': 'What is Stamp Duty?',
  'answer': 'Stamp Duty is amount of money (duty) payable on every document that confers/gives any right or liability upon being created, transferred, limited, extended, extinguished or recorded.These documents are referred to as instruments 

In [17]:
def generate_questions(faq):
    prompt = prompt_template.format(**faq)

    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}]
    )

    json_response = response.choices[0].message.content
    return json_response

In [18]:
results = {}

for faq in tqdm(final_faqs): 
    faq_id = faq['id']
    if faq_id in results:
        continue

    questions = generate_questions(faq)
    results[faq_id] = questions

100%|██████████| 217/217 [08:11<00:00,  2.26s/it]


In [20]:
len(results)

217

In [21]:
parsed_results = {}

for faq_id, json_questions in results.items():
    parsed_results[faq_id] = json.loads(json_questions)

In [22]:
faq_index = {faq['id']:faq for faq in faqs}

In [23]:
final_results = []

for faq_id, questions in parsed_results.items():
    faq = faq_index[faq_id]
    for q in questions:
        final_results.append((q, faq, faq_id))

In [24]:
final_results 

[('What does voluntary disclosure mean for Ugandan taxpayers?',
  {'question': 'What is Voluntary Disclosure?',
   'answer': 'Voluntary disclosure is a process where the taxpayer discloses information related to tax liabilities, misstatements or omissions his or her tax declarations to Uganda Revenue Authority (URA) without being prompted by any action or threat of action by URA.Please note that;A voluntary disclosure must be complete and accurate, covering all relevant periods where there was previously inaccurate, incomplete or unreported information regarding the taxpayer’s affairsA taxpayer who is subject to ongoing compliance action in respect of a given tax head and a particular tax period may nonetheless make voluntary disclosure in relation to a different tax head in the same or different period or the same tax head in a different period. This is allowed provided that the information that is disclosed would not inevitably have been discovered by the ongoing compliance action',


In [25]:
import pandas as pd

In [26]:
df = pd.DataFrame(final_results, columns=['question', 'answer', 'document'])

In [27]:
df.to_csv('../data/ground-truth-data.csv', index=False)

## Upload to Blob Storage

In [28]:
%pip install azure-storage-blob azure-identity

Collecting azure-identity
  Downloading azure_identity-1.18.0-py3-none-any.whl.metadata (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting msal>=1.30.0 (from azure-identity)
  Downloading msal-1.31.0-py3-none-any.whl.metadata (11 kB)
Collecting msal-extensions>=1.2.0 (from azure-identity)
  Downloading msal_extensions-1.2.0-py3-none-any.whl.metadata (7.6 kB)
Collecting PyJWT<3,>=1.0.0 (from PyJWT[crypto]<3,>=1.0.0->msal>=1.30.0->azure-identity)
  Downloading PyJWT-2.9.0-py3-none-any.whl.metadata (3.0 kB)
Collecting portalocker<3,>=1.4 (from msal-extensions>=1.2.0->azure-identity)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Downloading azure_identity-1.18.0-py3-none-any.whl (187 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m187.2/187.2 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading msal-1.31.0-py3-none-any.whl (113 kB)
[2K   [90m━━━━━━━━

In [26]:
import os
from azure.identity import DefaultAzureCredential
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient

In [27]:
container_name = 'urafaqs'
account_url = 'https://urafaqs.blob.core.windows.net'

default_credential = DefaultAzureCredential()

In [28]:
from azure.storage.blob import BlobServiceClient

def upload_blob_file(container_name,file_name):
    blob_service_client = BlobServiceClient(account_url=account_url, credential=default_credential)
    container_client = blob_service_client.get_container_client(container=container_name)
    with open(os.path.join('../data',file_name),'rb') as data:
        blob_name = file_name
        return container_client.upload_blob(name=blob_name, data=data, overwrite=True)


In [29]:
upload_blob_file("urafaqs","faqs-with-ids.json")

HttpResponseError: This request is not authorized to perform this operation using this permission.
RequestId:b297d012-001e-003e-71b7-0fa8c2000000
Time:2024-09-26T01:56:31.3442238Z
ErrorCode:AuthorizationPermissionMismatch
Content: <?xml version="1.0" encoding="utf-8"?><Error><Code>AuthorizationPermissionMismatch</Code><Message>This request is not authorized to perform this operation using this permission.
RequestId:b297d012-001e-003e-71b7-0fa8c2000000
Time:2024-09-26T01:56:31.3442238Z</Message></Error>