In [4]:
import json
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch

In [5]:
# Name of the sentence-transformers model used for every embedding in this notebook.
# Its output is stored in dense_vector fields declared with dims=384 in the index
# mapping, so a replacement model must produce vectors of that size.
model_name = "all-MiniLM-L6-v2"

In [6]:
# Load the FAQ records (dicts with question / answer / section / id) from disk.
# The file is read explicitly as UTF-8: the answers contain non-ASCII characters
# (e.g. curly apostrophes) and the platform default encoding is not always UTF-8.
# The file handle gets its own name so it is never confused with the parsed data.
with open('../data/faqs-with-ids.json', 'r', encoding='utf-8') as faqs_file:
    faqs = json.load(faqs_file)

In [7]:
# Display the loaded FAQ records to check their structure.
faqs

[{'question': 'What is Voluntary Disclosure?',
  'answer': 'Voluntary disclosure is a process where the taxpayer discloses information related to tax liabilities, misstatements or omissions his or her tax declarations to Uganda Revenue Authority (URA) without being prompted by any action or threat of action by URA.Please note that;A voluntary disclosure must be complete and accurate, covering all relevant periods where there was previously inaccurate, incomplete or unreported information regarding the taxpayer’s affairsA taxpayer who is subject to ongoing compliance action in respect of a given tax head and a particular tax period may nonetheless make voluntary disclosure in relation to a different tax head in the same or different period or the same tax head in a different period. This is allowed provided that the information that is disclosed would not inevitably have been discovered by the ongoing compliance action',
  'section': 'General FAQs',
  'id': 'f0c1432d'},
 {'question': 'I

In [8]:
# Load the sentence-embedding model selected by model_name.
model = SentenceTransformer(model_name)



In [15]:
# Embed every FAQ record in place, adding the three vector fields that are
# stored as dense_vector fields in the Elasticsearch index.
for faq in tqdm(faqs):
    question = faq['question']
    answer = faq['answer']

    # Combined text so that one vector represents the question together with its answer.
    qa = question + ' ' + answer

    faq['question_vector'] = model.encode(question)
    faq['answer_vector'] = model.encode(answer)
    # Fix: this field used to embed only the section name, leaving `qa` unused,
    # so searches on question_answer_vector matched section labels, not content.
    faq['question_answer_vector'] = model.encode(qa)

100%|██████████| 219/219 [00:12<00:00, 17.95it/s]


In [16]:
# Connect to the Elasticsearch node at localhost:9200 and check it responds
# (ping() shows True in the recorded output when the node is reachable).
es = Elasticsearch('http://localhost:9200')
es.ping()

True

In [17]:
# Field groups used to build the mapping: plain-text fields for keyword search
# and one dense-vector field per embedding stored on each FAQ record.
_text_fields = ("question", "answer", "section")
_vector_fields = ("question_vector", "answer_vector", "question_answer_vector")

index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            **{field: {"type": "text"} for field in _text_fields},
            # Every embedding field shares the same definition: 384-dim vectors,
            # indexed for kNN search and compared with cosine similarity.
            **{
                field: {
                    "type": "dense_vector",
                    "dims": 384,
                    "index": True,
                    "similarity": "cosine",
                }
                for field in _vector_fields
            },
        }
    },
}

index_name = "ura_faqs"

# Drop any existing index first so re-running this cell starts from a clean
# slate, then create it with the settings above.
es.indices.delete(index=index_name, ignore_unavailable=True)
es.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'ura_faqs'})

In [18]:
# Send each FAQ record (text fields plus its embedding vectors) to the index,
# one request per document, with a progress bar.
for faq in tqdm(faqs):
    es.index(document=faq, index=index_name)

100%|██████████| 219/219 [00:02<00:00, 73.24it/s]


In [19]:
def elastic_search_hybrid(field, query, query_vector,index_name="ura_faqs"):
    """Run a combined kNN + keyword search against Elasticsearch.

    Parameters
    ----------
    field : str
        Name of the dense-vector field the kNN clause searches.
    query : str
        Text matched against the question, answer and section fields.
    query_vector
        Embedding of the query, passed as the kNN query vector.
    index_name : str
        Index to search (defaults to "ura_faqs").

    Returns
    -------
    list of dict
        One dict per hit (at most 5) with the keys question, answer and
        section. If anything goes wrong, the error is printed and an empty
        list is returned.
    """
    wanted_fields = ["question", "answer", "section"]

    # Both clauses carry a boost of 0.5 so that the vector and keyword scores
    # are weighted equally in the combined ranking.
    request = {
        "knn": {
            "field": field,
            "query_vector": query_vector,
            "k": 1,
            "num_candidates": 10000,
            "boost": 0.5,
        },
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": wanted_fields,
                        "type": "best_fields",
                        "boost": 0.5,
                    }
                }
            }
        },
        "size": 5,
        "_source": wanted_fields,
    }

    try:
        response = es.search(index=index_name, body=request)
        sources = [hit['_source'] for hit in response['hits']['hits']]
        # Copy only the fields of interest, in a fixed key order.
        return [{name: source[name] for name in wanted_fields} for source in sources]
    except Exception as err:
        print(f"Error during hybrid search: {err}")
        return []


In [22]:
def faq_question(q):
    """Embed the question `q` and return hybrid-search hits from the "ura_faqs" index.

    The kNN part of the search runs against the "question_answer_vector"
    field; the result is whatever elastic_search_hybrid returns.
    """
    return elastic_search_hybrid(
        "question_answer_vector",
        q,
        model.encode(q),
        index_name="ura_faqs",
    )

In [23]:
# Try the hybrid search with a sample question.
faq_question("What is a tin?")

[{'question': 'What is a TIN',
  'answer': 'Taxpayer Identification Number in Uganda is a unique identifying number assigned to every taxpayer by Uganda Revenue Authority (URA) for tax administration purposes. Any person who is likely to transact in any tax related business with URA, shall be required to apply for a TIN. The TIN is therefore an administrative requirement and applies to all taxpayers regardless of the taxtransaction.',
  'section': 'Domestic Taxes FAQs'},
 {'question': 'Individual TIN Application',
  'answer': 'What is TIN Registration- IndividualIt is a TIN registration process used by an individual to obtain a TIN from URA. An application is done online by completing and uploading an excel Template. The TIN application is subject to verification and approval by a URA staff.Individual registration',
  'section': 'Domestic Taxes FAQs'},
 {'question': 'Instant TIN Application',
  'answer': 'What is an Instant TIN?It is a TIN registration process that is done using an onl