In [1]:
import json
import os

# Run the notebook from the project root so relative paths such as `.env`
# resolve. The location can be overridden per machine with the CS7637_RAG_ROOT
# environment variable; the original path is kept as the default.
project_root = os.environ.get(
    "CS7637_RAG_ROOT",
    "/home/nyein/georgia_tech_masters/cs7637/cs7637-rag",
)
os.chdir(project_root)
cwd = os.getcwd()
print(f"cwd={cwd}")

cwd=/home/nyein/georgia_tech_masters/cs7637/rag


In [2]:
from dotenv import load_dotenv
from pathlib import Path
# Load variables from the `.env` file in the current working directory.
# The call is the last expression so its return value is shown as the cell output.
dotenv_path = Path(".env")
load_dotenv(dotenv_path=dotenv_path)

True

In [3]:
def load_openai_keys(use_personal=False):
    """Return an OpenAI API key read from the environment.

    Args:
        use_personal: When truthy, read ``OPENAI_API_KEY``; otherwise read
            ``chanwah_openai_api_key``.

    Returns:
        The value of the selected environment variable, or ``None`` when it
        is not set. A message naming the chosen key is printed as a side
        effect.
    """
    if not use_personal:
        print("Using chanwah's key")
        return os.environ.get("chanwah_openai_api_key")
    print("Using personal key")
    return os.environ.get("OPENAI_API_KEY")
# Select the personal key and export it as OPENAI_API_KEY so later cells can
# pick it up from the environment.
openai_key = load_openai_keys(True)
if openai_key is None:
    # os.environ only accepts strings, so assigning a missing (None) key would
    # fail with an opaque TypeError; fail here with an actionable message.
    raise RuntimeError(
        "OpenAI API key not found in the environment; check the .env file."
    )
os.environ["OPENAI_API_KEY"] = openai_key

Using personal key


In [4]:
from langchain_openai.embeddings import OpenAIEmbeddings
# Embedding client configured for the "text-embedding-3-small" model.
embedding_manager = OpenAIEmbeddings(model="text-embedding-3-small")

In [5]:
from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector

# Connection URL for the Postgres database backing the vector store. It can be
# overridden with the PGVECTOR_CONNECTION environment variable so credentials
# need not live in the notebook; the original local URL is kept as the default.
connection = os.getenv(
    "PGVECTOR_CONNECTION",
    "postgresql+psycopg://langchain:langchain@localhost:6024/langchain",  # Uses psycopg3!
)
collection_name = "my_docs"

# Vector store handle over the collection above, embedding with `embedding_manager`.
vector_store = PGVector(
    embeddings=embedding_manager,
    collection_name=collection_name,
    connection=connection,
    use_jsonb=True,
)

In [6]:
def get_relevant_lecture_slide(
    query: str,
    num_distinct_lect: int = 3,
    k: int = 10,
) -> list:
    """Return the best-matching chunk for each of the top distinct lectures.

    Runs a similarity search against ``vector_store`` and walks the hits in
    the order returned, keeping only the first hit seen for each
    ``lecture_number``.

    Args:
        query: Text to search for.
        num_distinct_lect: Maximum number of distinct lectures to return.
        k: Number of hits to request from the vector store. It is raised to
            ``num_distinct_lect`` when smaller, since fewer hits could never
            yield that many distinct lectures.

    Returns:
        A list of at most ``num_distinct_lect`` dicts with the keys
        ``lecture_number``, ``lecture_title`` and ``lecture`` (the chunk
        text). The list is shorter when the hits cover fewer lectures and
        empty when ``num_distinct_lect`` is not positive.

    Raises:
        KeyError: If a hit's metadata lacks ``lecture_number`` or
            ``lecture_title``.
    """
    if num_distinct_lect <= 0:
        # Nothing was asked for, so skip the search round-trip entirely.
        return []

    hits = vector_store.similarity_search(
        query,
        k=max(k, num_distinct_lect),
    )

    distinct_lect = []
    seen_lecture_numbers = set()
    for doc in hits:
        lecture_number = doc.metadata['lecture_number']
        if lecture_number in seen_lecture_numbers:
            # A hit earlier in the results already represents this lecture.
            continue
        seen_lecture_numbers.add(lecture_number)
        distinct_lect.append(
            {
                "lecture_number": lecture_number,
                "lecture_title": doc.metadata['lecture_title'],
                "lecture": doc.page_content,
            }
        )
        if len(distinct_lect) >= num_distinct_lect:
            break
    return distinct_lect

In [7]:
from app.chains.prompts import QA_CHAIN_PROMPT_TEMPLATE

In [8]:
from langchain_openai.chat_models import ChatOpenAI
# Chat client pinned to a dated GPT-4o snapshot, with temperature 0.0 and
# caching turned off; the key selected earlier is passed in explicitly.
gpt4o = ChatOpenAI(
    openai_api_key=openai_key,
    model="gpt-4o-2024-08-06",
    temperature=0.0,
    cache=False,
)

In [9]:
from typing import List
def _strip_code_fence(text: str) -> str:
    """Return ``text`` without a surrounding Markdown code fence, if present."""
    stripped = text.strip()
    if stripped.startswith("```") and stripped.endswith("```"):
        first_newline = stripped.find("\n")
        if first_newline != -1:
            # Drop the opening fence line (it may carry a language tag such
            # as "json") and the closing fence.
            stripped = stripped[first_newline + 1:-3]
    return stripped.strip()


def qa(
    question: str,
    question_type: int,
    options: List[str],
    num_distinct_lect: int = 3,
):
    """Answer a question with the chat model, using retrieved lecture notes.

    Retrieves lecture chunks for ``question``, fills
    ``QA_CHAIN_PROMPT_TEMPLATE`` with them and the question details, sends
    the prompt to ``gpt4o`` and parses the reply as JSON.

    Args:
        question: The question text; also used as the retrieval query.
        question_type: Value substituted for ``question_type`` in the prompt.
        options: Answer options, serialized as JSON into the prompt.
        num_distinct_lect: Maximum number of distinct lectures to retrieve.

    Returns:
        The JSON value decoded from the model's reply.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON, even after
            removing an enclosing Markdown code fence.
    """
    relevant_lectures = get_relevant_lecture_slide(
        query=question,
        num_distinct_lect=num_distinct_lect
    )

    prompt = QA_CHAIN_PROMPT_TEMPLATE.format(
        lecture_notes=json.dumps(relevant_lectures, indent=4),
        question=question,
        question_type=question_type,
        answer_options=json.dumps(options, indent=4),
    )

    reply = gpt4o.invoke(prompt)
    # Chat models sometimes wrap JSON in a ```json ... ``` fence, which
    # json.loads rejects; unwrap it so such replies still parse.
    return json.loads(_strip_code_fence(reply.content))

In [11]:
# Example run: ask one question with three answer options and pretty-print
# the parsed reply.
query = "What is a frame in AI?"
question_type = 1
options = [
    "1. A frame is a data structure used to represent a stereotypical situation.",
    "2. A frame is something you use to hang a picture.",
    "3. A frame is a type of computer monitor.",
]
resp = qa(question=query, question_type=question_type, options=options)
print(json.dumps(resp, indent=4))

{
    "cited_lecture_notes": [
        "07, Frames"
    ],
    "answers": [
        "1, A frame is a data structure used to represent a stereotypical situation."
    ]
}
