In [6]:
import pickle
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
# Location of the pickled article-chunk records
file_path = "../../data/embeddings/article_chunks_openai_embeddings_large.pkl"

# Deserialize the records.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only point this at a file you produced yourself or otherwise trust.
with open(file_path, 'rb') as pkl_handle:
    data = pickle.load(pkl_handle)

# Sanity check: show the first record
print(data[:1])

# Gather every record's 'embedding' entry into a single NumPy array
chunk_vectors = [record['embedding'] for record in data]
embeddings = np.array(chunk_vectors)
print(embeddings.shape)

[{'article': 'The Republic and its territories\n11.\t(1)\tPakistan shall be Federal Republic to be known as the Islamic Republic of Pakistan, hereinafter referred to as Pakistan.\n\n2[(2)\tThe territories of Pakistan shall comprise—\n\nthe\tProvinces\tof\t3[Balochistan],\tthe\t4[Khyber Pakhtunkhwa], the Punjab and 5[Sindh];\nthe Islamabad Capital Territory, hereinafter referred to as the Federal Capital; 6[and]\n6[(c)\t*\t*\t*\t*\t*\t*]\n\n6[(c)] such States and territories as are or may be included in Pakistan, whether by accession or otherwise.\n\n(3) 7[Majlis-e-Shoora (Parliament)] may by law admit into the Federation new States or areas on such terms and conditions as it thinks fit.]', 'embedding': [0.027545789256691933, 0.022039029747247696, -0.0017291101394221187, 0.03685569018125534, -0.03678370639681816, 0.013520951382815838, -0.010317672975361347, -0.0008180655422620475, -0.03690367937088013, -0.017803985625505447, 0.02068333514034748, 0.028817500919103622, 0.00321827433072030

In [8]:
# Question/answer evaluation set; its 'Answer Chunk Index' column is later
# compared against the retrieved chunk index.
qa_csv_path = "../../data/augmented_datasets/qa_dataset_eng_v1.csv"
df = pd.read_csv(qa_csv_path)
df

Unnamed: 0,Question,Answer,Answer Chunk Index
0,What type of government does Pakistan have?,Pakistan is a Federal Republic known as the Is...,0
1,Can new states or areas become part of Pakistan?,"Yes, the Majlis-e-Shoora (Parliament) may admi...",0
2,What is the designated religion of Pakistan?,Islam shall be the State religion of Pakistan.,1
3,What has been made a substantive part of the C...,The principles and provisions set out in the O...,2
4,What is the State's responsibility regarding e...,The State shall ensure the elimination of all ...,3
...,...,...,...
616,Who is responsible for handling accounts that ...,The Auditor-General is responsible for handlin...,309
617,What powers does the Auditor-General have rega...,The Auditor-General has the same powers and fu...,309
618,Will existing taxes continue to be collected a...,"Yes, all taxes and fees levied under any law i...",310
619,When was the Proclamation of Emergency mention...,The Proclamation of Emergency was originally i...,311


In [9]:
# Load variables from a .env file (python-dotenv) before the client is created.
load_dotenv()
# The client is built with no explicit arguments; presumably it picks up its
# API key from the environment populated above — confirm OPENAI_API_KEY is set.
client = OpenAI()

def get_openai_embedding(text, model="text-embedding-3-small"):
    """Request an embedding for ``text`` through the module-level ``client``.

    Newline characters in ``text`` are replaced with single spaces before the
    request is sent.

    Args:
        text: The string to embed.
        model: Name of the embedding model passed to the API call.

    Returns:
        The ``embedding`` attribute of the first item in the response's
        ``data`` (presumably a list of floats — confirm against the OpenAI
        client documentation).
    """
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding

In [11]:
# The code below uses OpenAI's API!!! Be careful with the number of requests you make
def find_most_similar_index(question, embeddings):
    """Return the position of the stored embedding closest to ``question``.

    The question is embedded with the "text-embedding-3-large" model via
    ``get_openai_embedding`` (one API request per call) and compared against
    every row of ``embeddings`` using cosine similarity.

    Args:
        question: The question text to embed.
        embeddings: Array of candidate embeddings, one per row; assumed to
            have the same dimensionality as the question embedding.

    Returns:
        The index (as returned by ``np.argmax``) of the highest-scoring row.
    """
    query_vector = np.array(get_openai_embedding(question, model="text-embedding-3-large"))
    # cosine_similarity expects 2-D inputs, so present the query as one row
    scores = cosine_similarity(query_vector.reshape(1, -1), embeddings)
    return np.argmax(scores)

def _predict_for_question(q):
    """Look up the best-matching chunk index for a single question string."""
    return find_most_similar_index(q, embeddings)


# One embeddings request is issued for every row of the 'Question' column
df['Predicted_Index'] = df['Question'].apply(_predict_for_question)

In [12]:
# Display the frame again, now including the Predicted_Index column
df

Unnamed: 0,Question,Answer,Answer Chunk Index,Predicted_Index
0,What type of government does Pakistan have?,Pakistan is a Federal Republic known as the Is...,0,0
1,Can new states or areas become part of Pakistan?,"Yes, the Majlis-e-Shoora (Parliament) may admi...",0,0
2,What is the designated religion of Pakistan?,Islam shall be the State religion of Pakistan.,1,1
3,What has been made a substantive part of the C...,The principles and provisions set out in the O...,2,2
4,What is the State's responsibility regarding e...,The State shall ensure the elimination of all ...,3,3
...,...,...,...,...
616,Who is responsible for handling accounts that ...,The Auditor-General is responsible for handlin...,309,309
617,What powers does the Auditor-General have rega...,The Auditor-General has the same powers and fu...,309,309
618,Will existing taxes continue to be collected a...,"Yes, all taxes and fees levied under any law i...",310,310
619,When was the Proclamation of Emergency mention...,The Proclamation of Emergency was originally i...,311,311


In [17]:
# Pull the predictions column out as a plain list (this is what gets pickled below)
predicted_indices = df['Predicted_Index'].to_list()

In [21]:
# Persist the full list of predicted chunk indices (one per question) to disk
with open('../../data/predictions/retrieval_indices_openai_large_cosine.pkl', 'wb') as f:
    pickle.dump(predicted_indices, f)

## Top-1 accuracy

In [14]:
# A prediction is a hit when the retrieved index equals the labelled chunk index
is_hit = df['Answer Chunk Index'].eq(df['Predicted_Index'])

correct_predictions = is_hit.sum()
total_predictions = df.shape[0]

# Share of hits, expressed as a percentage
accuracy = correct_predictions / total_predictions * 100

print(f'Accuracy: {accuracy:.2f}%')

Accuracy: 83.09%
