In [1]:
import sys
import os
from datetime import datetime
from tqdm import tqdm
import json
sys.path.append("./src")
from kblam.utils.data_utils import DataPoint
from kblam.gpt_session import GPT

In [2]:
def _setup_default_token(
    secret_name="gcr-gpt4o-450k-1",
    vault_url="https://alexandriakeyvault.vault.azure.net/",
) -> str:
    """Fetch an API key from Azure Key Vault and return it.

    Despite the name, this function does not configure anything globally; it
    only returns the secret so the caller can pass it on as a token.

    Args:
        secret_name: Name of the secret to read from the vault.
        vault_url: URL of the Key Vault to query.

    Returns:
        The ``value`` attribute of the retrieved secret (expected to be the
        API key string).
    """
    # Imported inside the function so the azure packages are only required
    # when a token is actually requested.
    from azure.identity import DefaultAzureCredential
    from azure.keyvault.secrets import SecretClient

    # Get GPT-4 API key
    # seems to be working only locally
    default_credential = DefaultAzureCredential(
        exclude_shared_token_cache_credential=True,
        exclude_interactive_browser_credential=False,
    )
    secret_client = SecretClient(vault_url, default_credential)
    return secret_client.get_secret(secret_name).value

# Chat-completions client for the gpt-4o deployment. enrich_qa reads this
# module-level name when it calls api_call_chat. The API key is fetched from
# Key Vault at construction time via _setup_default_token().
gpt = GPT(
    model_name="gpt-4o",
    endpoint_url="https://gcrgpt4aoai1c.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-04-01-preview",
    temperature=0.5,
    token = _setup_default_token()
)

In [None]:
# NOTE(review): these are absolute, machine-specific paths; consider moving
# them to a configurable data directory so the notebook runs elsewhere.
nq_path = r"C:\Users\t-isazawat\Documents\Experiments\KBLaM Rebuttal\Natural Questions\NQ-open.dev.jsonl"
nq_kb_path = r"C:\Users\t-isazawat\Documents\Experiments\KBLaM Rebuttal\Natural Questions\NQ-open.dev-kb.jsonl"
qa = []

# One JSON object per line. The encoding is pinned to UTF-8 so the result does
# not depend on the platform default codec, the file is streamed line by line
# instead of being read into memory first, and blank lines (e.g. a trailing
# newline at the end of the file) are skipped rather than crashing json.loads.
with open(nq_path, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            qa.append(json.loads(line))


In [5]:
# Display the first loaded record to check its structure.
qa[0]

{'question': 'when was the last time anyone was on the moon',
 'answer': ['14 December 1972 UTC', 'December 1972']}

In [6]:
# Few-shot chat prefix for triple extraction: one system instruction followed
# by five user/assistant example pairs. enrich_qa appends the actual
# question/answer as a final user message after these.
# NOTE(review): the last two examples use the same questions as the first two
# records shown in the outputs of this notebook, so those records appear to be
# both demonstrations and evaluated items -- confirm this is intended.
# NOTE(review): the examples end the question with "?" and the answer with ".",
# while enrich_qa formats its message as "Q: ... A: ..." without them.
prompt = [
            # Task instruction, including the fallback predicates to prefer.
            {
                "role": "system",
                "content": "Generate the knowledge triple that is necessary to answer the given question-answer pair. Prioritise the knowledge triple that is most relevant to the question-answer pair, and provide the knowledge triple in the form of (subject, predicate, object). The knowledge triple should be a fact that is not explicitly mentioned in the question-answer pair. If there is some ambiguity, you should choose the following predicates: 'description', 'objective', and 'purpose'.",
            },
            # Worked examples: each user turn is a Q/A pair, each assistant
            # turn is the expected "(subject, predicate, object)" string.
            {
                "role": "user",
                "content": "Q: What is the description of Alexandria? A: The description of Alexandria is an automatic knowledge base construction system.",
            },
            {
                "role": "assistant",
                "content": "(Alexandria, description, automatic knowledge base construction system)"
            },
            {
                "role": "user",
                "content": "Q: What is the objective of Alexandria? A: The objective of Alexandria is to generate knowledge triples that are necessary to answer a given question-answer pair.",
            },
            {
                "role": "assistant",
                "content": "(Alexandria, objective, generate knowledge triples that are necessary to answer a given question-answer pair)"
            },
            {
                "role": "user",
                "content": "Q: Who is Joe Biden? A: Joe Biden is the President of the United States.",
            },
            {
                "role": "assistant",
                "content": "(Joe Biden, description, President of the United States)"
            },
            {
                "role": "user",
                "content": "Q: when was the last time anyone was on the moon? A: 14 December 1972 UTC.",
            },
            {
                "role": "assistant",
                "content": "(last time on the moon, date, 14 December 1972 UTC)"
            },
            {
                "role": "user",
                "content": "Q: who wrote he ain't heavy he's my brother lyrics? A: Bobby Scott.",
            },
            {
                "role": "assistant",
                "content": "(he ain't heavy he's my brother lyrics, author, Bobby Scott)"
            }
        ]


In [None]:


def enrich_qa(qa, prompt, client=None):
    """Attach a model-generated knowledge triple to each QA record.

    For every record with a non-empty "answer" list, the few-shot ``prompt`` is
    extended with a user message built from the question and the FIRST answer,
    sent to the chat client, and the response is stored under the
    "knowledge_triple" key.

    Args:
        qa: Iterable of dicts with a "question" entry and an "answer" list.
        prompt: List of chat messages used as the prefix of every request.
        client: Object exposing ``api_call_chat(messages=...)``. Defaults to
            the module-level ``gpt`` as bound when this function is called.
            Pass it explicitly to avoid depending on that global, which this
            notebook later rebinds to an embeddings client.

    Returns:
        The same ``qa`` object that was passed in; its records are modified in
        place.
    """
    if client is None:
        client = gpt

    for qa_item in tqdm(qa, "Generating triples from QA"):
        if not qa_item["answer"]:
            # Give every record the key so consumers can read
            # qa_item["knowledge_triple"] without a KeyError; an existing
            # value is left untouched.
            qa_item.setdefault("knowledge_triple", None)
            continue
        messages = prompt + [
            {"role": "user", "content": f"Q: {qa_item['question']} A: {qa_item['answer'][0]}"}
        ]
        qa_item["knowledge_triple"] = client.api_call_chat(messages=messages)
    return qa

# Only the first 100 records are sent to the model. The slice copies the list
# but not the dicts, so these same records inside `qa` also receive the
# "knowledge_triple" key.
enriched_2 = enrich_qa(qa[:100], prompt)

Generating QA...: 100%|██████████| 100/100 [03:02<00:00,  1.83s/it]


In [11]:
# Print the raw model response stored on each record, one per line.
for el in enriched_2:
    print(el["knowledge_triple"])

(last manned moon landing, date, 14 December 1972 UTC)
(he ain't heavy he's my brother lyrics, author, Bobby Scott and Bob Russell)
(The Bastard Executioner, number of seasons, one)
(Philadelphia Eagles, last Super Bowl win, 2017)
(South Carolina, won, last year's NCAA women's basketball championship)
(Isle of Wight, became an island, during the last Ice Age)
(Love Yourself by Justin Bieber, subject, Rihanna)
(England, ruler in 1616, James I)
(hot coffee mod in San Andreas, description, normally inaccessible mini-game)
(802.11a standard, maximum data rate, 54 Mbit/s)
(Chhattisgarh, location, centre of India)
(I Ran All the Way Home, performer, The Impalas)
(Wallace, origin, Scottish surname)
(Ben Stone, portrayed by, Michael Moriarty)
(Niketa Calame, role, voice of Nala in The Lion King)
(Max Shippee, role, Gram on The Young and the Restless)
(new citizens, take, United States Oath of Allegiance)
(Darth Vader, identity, Anakin Skywalker)
(Sarah, gave birth at age, 100)
(bb gun, minimum

In [None]:
def get_tuple(tuple_string):
    """Parse a "(subject, predicate, object)" string into a tuple of strings.

    Surrounding whitespace is ignored, and at most one opening and one closing
    parenthesis are removed, so parentheses that belong to the content (e.g. an
    object ending in "f(z)") are preserved. The text is split on the first two
    commas only, so commas inside the object stay part of the object.

    Args:
        tuple_string: Text such as "(Alexandria, description, a KB system)".

    Returns:
        A tuple of whitespace-stripped strings with at most three elements;
        fewer if the input contains fewer than two commas.
    """
    text = tuple_string.strip()
    if text.startswith("("):
        text = text[1:]
    if text.endswith(")"):
        text = text[:-1]

    # maxsplit=2: everything after the second comma is the object.
    return tuple(part.strip() for part in text.split(",", 2))

def create_kb(enriched_qa):
    """Build a list of DataPoint entries from QA records carrying triples.

    Records without a (truthy) "knowledge_triple" are skipped. Records whose
    triple parses into fewer than three parts are skipped with a warning
    instead of raising IndexError. If parsing yields more than three parts
    (the object itself contained commas), the extra parts are joined back into
    the description rather than dropped.

    Args:
        enriched_qa: Iterable of dicts with "question", "answer" (list) and,
            optionally, "knowledge_triple" (string) entries.

    Returns:
        A list of DataPoint objects, one per usable record, with Q/A taken from
        the record (first answer only) and name / description_type /
        description taken from the parsed triple.
    """
    import warnings

    kb = []
    for qa_item in enriched_qa:
        # .get: records that were never enriched have no such key at all.
        raw_triple = qa_item.get("knowledge_triple")
        if not raw_triple:
            continue

        triple = get_tuple(raw_triple)
        if len(triple) < 3:
            warnings.warn(
                f"Skipping malformed knowledge triple {raw_triple!r} "
                f"for question {qa_item['question']!r}"
            )
            continue

        name, description_type = triple[0], triple[1]
        # Re-join in case the object was split on its own commas.
        description = ", ".join(triple[2:])
        kb.append(DataPoint(
            Q=qa_item["question"],
            A=qa_item["answer"][0],
            name=name,
            description_type=description_type,
            description=description,
            key_string=f"the {description_type} of {name}",
        ))
    return kb

# Convert the enriched records into DataPoint entries.
kb = create_kb(enriched_2)


AttributeError: 'DataPoint' object has no attribute 'to_dict'

In [7]:
import dataclasses

# NOTE(review): "kb.json" in the working directory must already exist for this
# cell to run; the disabled line below is the statement that would write it
# from `kb`. Re-enable it (or guard it with an existence check) so the notebook
# survives a fresh Restart & Run All.
# json.dump([dataclasses.asdict(el) for el in kb], open("kb.json", "w"), indent=4)
out_path = r"C:\Users\t-isazawat\Documents\Experiments\KBLaM Rebuttal\Natural Questions\nq_kb\kb.json"

# Context managers close both files deterministically, so the written JSON is
# flushed to disk before anything else reads it.
with open("kb.json", "r", encoding="utf-8") as f:
    reloaded_kb = [DataPoint(**el) for el in json.load(f)]

# Create the destination folder if it is missing instead of failing on open().
os.makedirs(os.path.dirname(out_path), exist_ok=True)
with open(out_path, "w", encoding="utf-8") as f:
    json.dump([dataclasses.asdict(el) for el in reloaded_kb], f, indent=4)

In [4]:
# NOTE(review): this rebinds the module-level name `gpt` from the chat client
# to an embeddings client. enrich_qa looks up the global `gpt`, so re-running
# it after this cell would use the embeddings client -- consider a distinct
# name for this client.
gpt = GPT(
    "ada-embeddings",
    token=_setup_default_token("gpt4-api-key1"),
    endpoint_url="https://gcraoai1sw1.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2023-03-15-preview"
    )


dataset = reloaded_kb
key_embeds = []
value_embeds = []
batch_size = 100

# Split the KB into consecutive slices of at most batch_size entries.
chunks = [dataset[i : i + batch_size] for i in range(0, len(dataset), batch_size)]

# Two embedding requests per chunk: one for the key strings and one for the
# descriptions. Presumably generate_embeddings returns one embedding per input
# string, in input order, so the two lists stay aligned with `dataset` --
# TODO confirm.
for chunk in tqdm(chunks):
    key_embeds.extend(gpt.generate_embeddings([entity.key_string for entity in chunk]))
    value_embeds.extend(gpt.generate_embeddings([entity.description for entity in chunk]))

100%|██████████| 1/1 [00:01<00:00,  1.22s/it]


In [9]:
import numpy as np

# NOTE(review): absolute, machine-specific output location; consider making it
# configurable.
out_folder = r"C:\Users\t-isazawat\Documents\Experiments\KBLaM Rebuttal\Natural Questions\nq_embeds"

# Make sure the destination exists so the embeddings computed earlier are not
# lost to a missing-directory error at save time.
os.makedirs(out_folder, exist_ok=True)

# Key and value embeddings are written as two separate .npy arrays.
np.save(os.path.join(out_folder, "embd_key.npy"), np.array(key_embeds))
np.save(os.path.join(out_folder, "embd_value.npy"), np.array(value_embeds))