In [None]:
import getpass
import json
import os
import random
import re
from operator import add
from os.path import dirname, join
from typing import Literal

import pandas as pd
from dotenv import load_dotenv
from langchain_core.messages import HumanMessage, convert_to_messages
from langchain_openai import ChatOpenAI
from langgraph.graph import StateGraph, START, MessagesState, END
from langgraph.types import Command
from typing_extensions import TypedDict, Annotated

from Agent import BTEx

In [None]:
# Make the OpenAI API key available through the process environment.
# NOTE(review): the .env location below is an absolute, machine-specific path;
# consider making it configurable before sharing this notebook.
load_dotenv("/Users/mastorga/Documents/BTE-LLM/.env")

# When the key is missing or empty, ask for it interactively instead.
if "OPENAI_API_KEY" not in os.environ or not os.environ["OPENAI_API_KEY"]:
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Please enter OpenAI API Key: ")

In [None]:
# Chat model shared by the cells below (temperature fixed at 0).
llm = ChatOpenAI(
    model="gpt-4o",  # change the model name here if needed
    temperature=0,
)

In [None]:
def get_llm_response(question, model=None):
    """Send a prompt to a chat model, echo the exchange, and return the reply.

    Args:
        question: Prompt text. It is printed and then passed to ``invoke``.
        model: Optional object exposing ``invoke(prompt)`` whose result has a
            ``content`` attribute. When omitted, the notebook-level ``llm`` is
            used, so existing single-argument calls behave as before.

    Returns:
        The ``content`` attribute of the object returned by ``invoke``.
    """
    # Resolve the default at call time so a re-created ``llm`` is picked up
    # and the function no longer silently depends on one specific global.
    chat_model = llm if model is None else model

    print(f"{question}\n\n")

    response = chat_model.invoke(question).content

    print(f"LLM answer: {response}")

    return response

In [None]:
# Load the single-hop multiple-choice question set.
# NOTE(review): absolute, machine-specific path; consider a configurable data dir.
# The context manager closes the file handle once parsing is done (the
# original left it open); the encoding is set explicitly for JSON text.
with open("/Users/mastorga/Documents/BTE-LLM/Prototype/data/MCQ_1hop.json", encoding="utf-8") as mcq_file:
    MCQsinglehop = json.load(mcq_file)

print(len(MCQsinglehop))

In [None]:
# Draw the evaluation subset used by every scoring cell below.
SAMPLE_SIZE = 50  # number of questions to evaluate
SAMPLE_SEED = 42  # fixed so a kernel restart re-selects the same questions

# A dedicated, seeded generator keeps the draw reproducible without touching
# the global `random` state; the size is clamped because random.sample raises
# ValueError when asked for more items than the dataset holds.
MCQsinglehop_test = random.Random(SAMPLE_SEED).sample(
    MCQsinglehop, min(SAMPLE_SIZE, len(MCQsinglehop))
)

# Show the sample size and a short preview instead of dumping every question.
print(f"Sampled {len(MCQsinglehop_test)} of {len(MCQsinglehop)} questions")
print(MCQsinglehop_test[:3])

In [None]:
# LLM only: score the bare model (no retrieval) on the sampled questions.
LLM_tally = {}      # question number (1-based) -> outcome label
LLM_correctct = 0   # number of questions scored CORRECT

for index, item in enumerate(MCQsinglehop_test):
    print(f"Question {index+1}. \n")
    question = "IMPORTANT: only return the number corresponding to the correct answer. \n" + item.get("question", "")

    LLM_ans = get_llm_response(question)

    # Normalise the expected answer to text so a numeric or null JSON value
    # cannot raise TypeError during matching.
    raw_answer = item.get("answer")
    groundtruth = "" if raw_answer is None else str(raw_answer).strip()

    print(f"Correct answer: {groundtruth}\n\n")

    if not groundtruth:
        # "" is a substring of every string, so an item without an answer key
        # used to be scored CORRECT; report it separately instead.
        LLM_tally[index+1] = "NO GROUND TRUTH"
        print("\nNO GROUND TRUTH")
    elif re.search(rf"(?<!\w){re.escape(groundtruth)}(?!\w)", str(LLM_ans)):
        # Standalone-token match: a plain `in` test would accept "1" inside
        # "10" or "BRCA1".
        LLM_tally[index+1] = "CORRECT"
        print("\nCORRECT")
        LLM_correctct += 1
    else:
        LLM_tally[index+1] = "INCORRECT"
        print("\nINCORRECT")

    print("----------------------------------")

print(LLM_tally)

# Guard the ratio so an empty test set reports 0.0 instead of raising.
LLM_total = len(MCQsinglehop_test)
LLM_accuracy = LLM_correctct / LLM_total if LLM_total else 0.0
print(f"\n Score: {LLM_correctct}/{LLM_total} \n Accuracy: {LLM_accuracy}")

In [None]:
# BTE-RAG 50m5k: score BTEx called with maxresults=50, k=5.
BTEx_50m5k_tally = {}      # question number (1-based) -> outcome label
BTEx_50m5k_correctct = 0   # number of questions scored CORRECT

for index, item in enumerate(MCQsinglehop_test):
    print(f"Question {index+1}. \n")
    question = "IMPORTANT: only return the number corresponding to the correct answer. \n" + item.get("question", "")

    BTEx_50m5k_ans = BTEx(question, maxresults=50, k=5)

    # Normalise the expected answer to text so a numeric or null JSON value
    # cannot raise TypeError during matching.
    raw_answer = item.get("answer")
    groundtruth = "" if raw_answer is None else str(raw_answer).strip()

    print(f"Correct answer: {groundtruth}\n\n")

    if not BTEx_50m5k_ans:
        # A falsy result (e.g. None or "") means BTEx produced no answer.
        BTEx_50m5k_tally[index+1] = "NO ANSWER"
        print("\nNO ANSWER")
    elif not groundtruth:
        # "" is a substring of every string, so an item without an answer key
        # used to be scored CORRECT; report it separately instead.
        BTEx_50m5k_tally[index+1] = "NO GROUND TRUTH"
        print("\nNO GROUND TRUTH")
    elif re.search(rf"(?<!\w){re.escape(groundtruth)}(?!\w)", str(BTEx_50m5k_ans)):
        # Standalone-token match: a plain `in` test would accept "1" inside
        # "10" or "BRCA1". BTEx's return type is not visible from this
        # notebook, so it is rendered with str() first - TODO confirm it is text.
        BTEx_50m5k_tally[index+1] = "CORRECT"
        print("\nCORRECT")
        BTEx_50m5k_correctct += 1
    else:
        BTEx_50m5k_tally[index+1] = "INCORRECT"
        print("\nINCORRECT")

    print("----------------------------------")

print(BTEx_50m5k_tally)

# Guard the ratio so an empty test set reports 0.0 instead of raising.
BTEx_50m5k_total = len(MCQsinglehop_test)
BTEx_50m5k_accuracy = BTEx_50m5k_correctct / BTEx_50m5k_total if BTEx_50m5k_total else 0.0
print(f"\n Score: {BTEx_50m5k_correctct}/{BTEx_50m5k_total} \n Accuracy: {BTEx_50m5k_accuracy}")

In [None]:
# BTE-RAG 50m10k: score BTEx called with maxresults=50, k=10.
BTEx_50m10k_tally = {}      # question number (1-based) -> outcome label
BTEx_50m10k_correctct = 0   # number of questions scored CORRECT

for index, item in enumerate(MCQsinglehop_test):
    print(f"Question {index+1}. \n")
    question = "IMPORTANT: only return the number corresponding to the correct answer. \n" + item.get("question", "")

    BTEx_50m10k_ans = BTEx(question, maxresults=50, k=10)

    # Normalise the expected answer to text so a numeric or null JSON value
    # cannot raise TypeError during matching.
    raw_answer = item.get("answer")
    groundtruth = "" if raw_answer is None else str(raw_answer).strip()

    print(f"Correct answer: {groundtruth}\n\n")

    if not BTEx_50m10k_ans:
        # A falsy result (e.g. None or "") means BTEx produced no answer.
        BTEx_50m10k_tally[index+1] = "NO ANSWER"
        print("\nNO ANSWER")
    elif not groundtruth:
        # "" is a substring of every string, so an item without an answer key
        # used to be scored CORRECT; report it separately instead.
        BTEx_50m10k_tally[index+1] = "NO GROUND TRUTH"
        print("\nNO GROUND TRUTH")
    elif re.search(rf"(?<!\w){re.escape(groundtruth)}(?!\w)", str(BTEx_50m10k_ans)):
        # Standalone-token match: a plain `in` test would accept "1" inside
        # "10" or "BRCA1". BTEx's return type is not visible from this
        # notebook, so it is rendered with str() first - TODO confirm it is text.
        BTEx_50m10k_tally[index+1] = "CORRECT"
        print("\nCORRECT")
        BTEx_50m10k_correctct += 1
    else:
        BTEx_50m10k_tally[index+1] = "INCORRECT"
        print("\nINCORRECT")

    print("----------------------------------")

print(BTEx_50m10k_tally)

# Guard the ratio so an empty test set reports 0.0 instead of raising.
BTEx_50m10k_total = len(MCQsinglehop_test)
BTEx_50m10k_accuracy = BTEx_50m10k_correctct / BTEx_50m10k_total if BTEx_50m10k_total else 0.0
print(f"\n Score: {BTEx_50m10k_correctct}/{BTEx_50m10k_total} \n Accuracy: {BTEx_50m10k_accuracy}")

In [None]:
# BTE-RAG 100m5k: score BTEx called with maxresults=100, k=5.
BTEx_100m5k_tally = {}      # question number (1-based) -> outcome label
BTEx_100m5k_correctct = 0   # number of questions scored CORRECT

for index, item in enumerate(MCQsinglehop_test):
    print(f"Question {index+1}. \n")
    question = "IMPORTANT: only return the number corresponding to the correct answer. \n" + item.get("question", "")

    BTEx_100m5k_ans = BTEx(question, maxresults=100, k=5)

    # Normalise the expected answer to text so a numeric or null JSON value
    # cannot raise TypeError during matching.
    raw_answer = item.get("answer")
    groundtruth = "" if raw_answer is None else str(raw_answer).strip()

    print(f"Correct answer: {groundtruth}\n\n")

    if not BTEx_100m5k_ans:
        # A falsy result (e.g. None or "") means BTEx produced no answer.
        BTEx_100m5k_tally[index+1] = "NO ANSWER"
        print("\nNO ANSWER")
    elif not groundtruth:
        # "" is a substring of every string, so an item without an answer key
        # used to be scored CORRECT; report it separately instead.
        BTEx_100m5k_tally[index+1] = "NO GROUND TRUTH"
        print("\nNO GROUND TRUTH")
    elif re.search(rf"(?<!\w){re.escape(groundtruth)}(?!\w)", str(BTEx_100m5k_ans)):
        # Standalone-token match: a plain `in` test would accept "1" inside
        # "10" or "BRCA1". BTEx's return type is not visible from this
        # notebook, so it is rendered with str() first - TODO confirm it is text.
        BTEx_100m5k_tally[index+1] = "CORRECT"
        print("\nCORRECT")
        BTEx_100m5k_correctct += 1
    else:
        BTEx_100m5k_tally[index+1] = "INCORRECT"
        print("\nINCORRECT")

    print("----------------------------------")

print(BTEx_100m5k_tally)

# Guard the ratio so an empty test set reports 0.0 instead of raising.
BTEx_100m5k_total = len(MCQsinglehop_test)
BTEx_100m5k_accuracy = BTEx_100m5k_correctct / BTEx_100m5k_total if BTEx_100m5k_total else 0.0
print(f"\n Score: {BTEx_100m5k_correctct}/{BTEx_100m5k_total} \n Accuracy: {BTEx_100m5k_accuracy}")

In [None]:
# BTE-RAG 100m10k: score BTEx called with maxresults=100, k=10.
BTEx_100m10k_tally = {}      # question number (1-based) -> outcome label
BTEx_100m10k_correctct = 0   # number of questions scored CORRECT

for index, item in enumerate(MCQsinglehop_test):
    print(f"Question {index+1}. \n")
    question = "IMPORTANT: only return the number corresponding to the correct answer. \n" + item.get("question", "")

    BTEx_100m10k_ans = BTEx(question, maxresults=100, k=10)

    # Normalise the expected answer to text so a numeric or null JSON value
    # cannot raise TypeError during matching.
    raw_answer = item.get("answer")
    groundtruth = "" if raw_answer is None else str(raw_answer).strip()

    print(f"Correct answer: {groundtruth}\n\n")

    if not BTEx_100m10k_ans:
        # A falsy result (e.g. None or "") means BTEx produced no answer.
        BTEx_100m10k_tally[index+1] = "NO ANSWER"
        print("\nNO ANSWER")
    elif not groundtruth:
        # "" is a substring of every string, so an item without an answer key
        # used to be scored CORRECT; report it separately instead.
        BTEx_100m10k_tally[index+1] = "NO GROUND TRUTH"
        print("\nNO GROUND TRUTH")
    elif re.search(rf"(?<!\w){re.escape(groundtruth)}(?!\w)", str(BTEx_100m10k_ans)):
        # Standalone-token match: a plain `in` test would accept "1" inside
        # "10" or "BRCA1". BTEx's return type is not visible from this
        # notebook, so it is rendered with str() first - TODO confirm it is text.
        BTEx_100m10k_tally[index+1] = "CORRECT"
        print("\nCORRECT")
        BTEx_100m10k_correctct += 1
    else:
        BTEx_100m10k_tally[index+1] = "INCORRECT"
        print("\nINCORRECT")

    print("----------------------------------")

print(BTEx_100m10k_tally)

# Guard the ratio so an empty test set reports 0.0 instead of raising.
BTEx_100m10k_total = len(MCQsinglehop_test)
BTEx_100m10k_accuracy = BTEx_100m10k_correctct / BTEx_100m10k_total if BTEx_100m10k_total else 0.0
print(f"\n Score: {BTEx_100m10k_correctct}/{BTEx_100m10k_total} \n Accuracy: {BTEx_100m10k_accuracy}")