In [1]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment before any
# API client is constructed. override=True lets values from the .env file
# replace variables that are already set in the environment.
# NOTE(review): presumably this supplies the OpenAI API key used below — confirm.
load_dotenv(override=True)

True

In [2]:
from langchain_openai import OpenAIEmbeddings

In [3]:
from datasets import load_from_disk
from tqdm import tqdm
import json
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [4]:
# Load the previously saved dataset from disk. The path is relative, so it
# depends on the directory the notebook kernel was started in.
dataset_dict = load_from_disk("../../data/complete_dataset/")

# Show the available splits, their columns and row counts.
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['question_eng', 'question_urdu', 'context_eng', 'context_urdu', 'answer_eng', 'answer_urdu', 'context_index', '__index_level_0__'],
        num_rows: 495
    })
    validation: Dataset({
        features: ['question_eng', 'question_urdu', 'context_eng', 'context_urdu', 'answer_eng', 'answer_urdu', 'context_index', '__index_level_0__'],
        num_rows: 124
    })
})


In [5]:
# Embedding client shared by the questions and the context documents below.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [6]:
# Build the retrieval corpus (`docs`) and the evaluation queries (`questions`).
#
# `docs` holds every distinct Urdu context exactly once: contexts from the
# validation split come first (in order of first appearance), followed by any
# training contexts not already present. Each validation row becomes one entry
# in `questions`, carrying the index of its gold context inside `docs`.
docs = []
questions = []
repeated = set()      # contexts already present in `docs`
doc_position = {}     # context text -> its index in `docs` (O(1) instead of docs.index)

print('Validation')
for test_point in dataset_dict['validation']:
    context = test_point['context_urdu']
    question = test_point['question_urdu']

    if context not in repeated:
        repeated.add(context)
        doc_position[context] = len(docs)
        docs.append(context)

    questions.append({'question': question, 'context': context, 'context_idx': doc_position[context]})

# Training contexts only enlarge the corpus (as distractors); no questions are
# taken from the training split.
print('Training')
for test_point in dataset_dict['train']:
    context = test_point['context_urdu']

    if context not in repeated:
        repeated.add(context)
        doc_position[context] = len(docs)
        docs.append(context)

Validation
Training


In [7]:
# Sanity check: (number of validation questions, number of distinct contexts in the corpus).
len(questions), len(docs)

(124, 305)

In [8]:
# Embed every validation question and every corpus context. Both results are
# indexed positionally later on: question_embeddings[i] belongs to questions[i]
# and docs_embeddings[j] belongs to docs[j].
# Note: these calls go to the embedding service, so re-running this cell repeats the requests.
question_embeddings = embeddings.embed_documents([q['question'] for q in questions])
docs_embeddings = embeddings.embed_documents(docs)

In [13]:
# Length of a single context embedding vector.
len(docs_embeddings[0])

3072

In [9]:
def get_nearest_answer(question_embedding, answer_embeddings):
    """Rank candidate embeddings by cosine similarity to one query embedding.

    Args:
        question_embedding: A single embedding vector for the query.
        answer_embeddings: A sequence of candidate embedding vectors.

    Returns:
        Indices into ``answer_embeddings`` ordered from the most similar
        candidate to the least similar one (the full ranking, not just the
        single best match).
    """
    # cosine_similarity works on 2-D inputs, so the query is wrapped in a list;
    # the result has a single row with one score per candidate.
    similarity_row = cosine_similarity([question_embedding], answer_embeddings)

    # argsort yields ascending order; reverse it so the best match comes first.
    ascending = np.argsort(similarity_row)[0]
    return ascending[::-1]

In [10]:
def get_top_k_acc(k):
    """Print the top-k retrieval accuracy over the validation questions.

    For each entry of the module-level ``questions`` list, the matching vector
    in ``question_embeddings`` is ranked against ``docs_embeddings``; the
    question counts as correct when its ``context_idx`` appears among the
    first ``k`` ranked document indices.

    Args:
        k: Number of top-ranked documents to inspect per question.

    Returns:
        None. The accuracy is printed as a percentage with two decimals.
    """
    total = len(questions)
    progress = tqdm(enumerate(questions), desc=f"Getting top {k}", total=total)

    # Each membership test yields a bool; summing them counts the hits.
    correct = sum(
        record['context_idx'] in get_nearest_answer(question_embeddings[position], docs_embeddings)[:k]
        for position, record in progress
    )

    print(f'Top {k} Acc: {(correct / total) * 100 :.2f}%')

In [12]:
# Report retrieval accuracy when the gold context must appear in the top 1, 3 and 5 results.
get_top_k_acc(1)
get_top_k_acc(3)
get_top_k_acc(5)

Getting top 1: 100%|██████████| 124/124 [00:07<00:00, 16.27it/s]


Top 1 Acc: 53.23%


Getting top 3: 100%|██████████| 124/124 [00:08<00:00, 15.20it/s]


Top 3 Acc: 72.58%


Getting top 5: 100%|██████████| 124/124 [00:07<00:00, 15.62it/s]

Top 5 Acc: 77.42%





Top 1 Acc: `53.23%`  
Top 3 Acc: `72.58%`  
Top 5 Acc: `77.42%`  