# Imports

In [24]:
import os
import pickle

from dotenv import load_dotenv
from openai import OpenAI

## Embedding Functions

In [10]:
# Load variables from a .env file into the environment before the client is
# built (presumably this is where the OpenAI API key comes from — confirm).
load_dotenv()
# Module-level client; get_openai_embedding below reads it as a global.
client = OpenAI()

def get_openai_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for a single piece of text.

    Newlines in ``text`` are replaced by spaces before the request is sent.
    The request goes through the module-level ``client``.

    Args:
        text: The string to embed.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response's
        ``data`` list.
    """
    single_line = " ".join(text.split("\n"))
    response = client.embeddings.create(model=model, input=[single_line])
    first_item = response.data[0]
    return first_item.embedding


## Data Processing

In [14]:
# loading article chunks
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# run this against a chunk file produced by a trusted step of this project.
with open('../../data/document_chunks/article_wise_chunks_raw_english.pkl', 'rb') as file:
    article_chunks = pickle.load(file)

# Preview the first three chunks as a quick sanity check of the loaded data.
article_chunks[:3]

['The Republic and its territories\n11.\t(1)\tPakistan shall be Federal Republic to be known as the Islamic Republic of Pakistan, hereinafter referred to as Pakistan.\n\n2[(2)\tThe territories of Pakistan shall comprise—\n\nthe\tProvinces\tof\t3[Balochistan],\tthe\t4[Khyber Pakhtunkhwa], the Punjab and 5[Sindh];\nthe Islamabad Capital Territory, hereinafter referred to as the Federal Capital; 6[and]\n6[(c)\t*\t*\t*\t*\t*\t*]\n\n6[(c)] such States and territories as are or may be included in Pakistan, whether by accession or otherwise.\n\n(3) 7[Majlis-e-Shoora (Parliament)] may by law admit into the Federation new States or areas on such terms and conditions as it thinks fit.]',
 'Islam to be State religion\n\nIslam shall be the State religion of Pakistan. ',
 'The   Objectives\tResolution   to   form\tpart\tof substantive provisions\n\n8[2A. The principles and provisions set out in the Objectives Resolution reproduced in the Annex are hereby made substantive part of the Constitution an

In [16]:
# Embed every chunk with the function's default model (text-embedding-3-small).
# This issues one API request per chunk, so the cell is slow and billable.
article_chunks_embeddings_small = [
    get_openai_embedding(article_text) for article_text in article_chunks
]

In [20]:
# Embed every chunk again, this time with text-embedding-3-large.
# This issues one API request per chunk, so the cell is slow and billable.
article_chunks_embeddings_large = [
    get_openai_embedding(article_text, model="text-embedding-3-large")
    for article_text in article_chunks
]

In [21]:
# Number of embeddings produced by each model, one count per line (small, then large).
print(
    len(article_chunks_embeddings_small),
    len(article_chunks_embeddings_large),
    sep="\n",
)

312
312


In [41]:
# Length of the first embedding vector from each model.
print(f"Small embedding size: {len(article_chunks_embeddings_small[0])}")
print(f"Large embedding size: {len(article_chunks_embeddings_large[0])}")

Small embedding size: 1536
Large embedding size: 3072


In [25]:
# Preparing list of dictionaries for embeddings
def _attach_embeddings(chunks, embeddings):
    """Pair each article chunk with its embedding vector.

    Args:
        chunks: Sequence of article texts.
        embeddings: Sequence of embedding vectors in the same order as
            ``chunks``.

    Returns:
        A list of ``{"article": ..., "embedding": ...}`` dicts, one per chunk,
        in the original order.

    Raises:
        ValueError: If the two sequences differ in length. Without this check
            ``zip`` would silently drop the unmatched tail.
    """
    if len(chunks) != len(embeddings):
        raise ValueError(
            f"Expected one embedding per chunk, got {len(chunks)} chunks "
            f"and {len(embeddings)} embeddings"
        )
    return [
        {"article": chunk, "embedding": embedding}
        for chunk, embedding in zip(chunks, embeddings)
    ]


data_small = _attach_embeddings(article_chunks, article_chunks_embeddings_small)

data_large = _attach_embeddings(article_chunks, article_chunks_embeddings_large)

In [31]:
# Spot-check one record; note this displays the record's entire embedding list.
data_large[5]

{'article': 'Loyalty to State and obedience to Constitution and law\n\n(1)      Loyalty to the State is the basic duty of every citizen.\n\n(2) Obedience to the Constitution and law is the 1[inviolable] obligation of every citizen wherever he may be and of every other person for the time being within Pakistan. ',
 'embedding': [-0.020150519907474518,
  -0.005689247045665979,
  -0.001411561737768352,
  0.05625738948583603,
  0.002169852377846837,
  0.014064347371459007,
  -0.03604071959853172,
  0.012344342656433582,
  -0.006496326066553593,
  -0.007938483729958534,
  0.020163750275969505,
  0.008368485607206821,
  0.028763774782419205,
  -0.027176078408956528,
  -0.016445893794298172,
  -0.025972073897719383,
  0.015149273909628391,
  0.03201855346560478,
  0.0031274319626390934,
  -0.03670226037502289,
  0.013098498806357384,
  -0.031859781593084335,
  0.046307824552059174,
  0.01340942271053791,
  0.0025022763293236494,
  0.018853899091482162,
  0.016763431951403618,
  0.007151251193

In [27]:
# Save the data to pickle files
# Create the output directory first: open(..., 'wb') does not create missing
# parent directories, and failing here would discard embeddings that were
# already fetched from the API.
os.makedirs('../../data/embeddings', exist_ok=True)

# Records built with the text-embedding-3-small vectors.
with open('../../data/embeddings/article_chunks_openai_embeddings_small.pkl', 'wb') as f:
    pickle.dump(data_small, f)

# Records built with the text-embedding-3-large vectors.
with open('../../data/embeddings/article_chunks_openai_embeddings_large.pkl', 'wb') as f:
    pickle.dump(data_large, f)