In [8]:
import os
import json

from langchain_community.vectorstores import FAISS
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader

In [9]:
load_dotenv()

## constants
AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
AZURE_DEPLOYMENT_NAME = os.getenv("AZURE_DEPLOYMENT_NAME")
AZURE_ENDPOINT=os.getenv("AZURE_ENDPOINT")

In [10]:
embeddings = AzureOpenAIEmbeddings(
    openai_api_key=AZURE_OPENAI_KEY,
    azure_endpoint=AZURE_ENDPOINT,
    azure_deployment="text-embedding-ada-002",
    openai_api_version="2023-08-01-preview",
)

llm = AzureChatOpenAI(
    openai_api_key=AZURE_OPENAI_KEY,
    azure_endpoint=AZURE_ENDPOINT,
    openai_api_version="2023-08-01-preview",
    deployment_name=AZURE_DEPLOYMENT_NAME,
    temperature=0.9,
)

In [28]:
prompt_no_context = """
Answer a multiple choice question based on your knowledge.
If you know the answer or you can deduce it, respond with A, B, C or D
If you don't know the answer, respond with E.
If you are not sure which asnwer is correct or there are more than one correct answers, respond with E.

Also provide the reasoning why you chose the option you chose.

Question: {question}

Choices:
A) {text_1}
B) {text_2}
C) {text_3}
D) {text_4}
E) I don't know

Response:
"""

In [29]:
with open("../modified_questions.jsonl", "r") as f:
    lines = f.readlines()
    lines = [json.loads(item) for item in lines]

In [30]:
def query_gpt(question) -> str:
    formatted_prompt = prompt_no_context.format(
        question=question["question"],
        text_1=question["wrong_answer_1"],
        text_2=question["correct_answer"],
        text_3=question["wrong_answer_2"],
        text_4=question["wrong_answer_3"]
    )

    response = llm.invoke(formatted_prompt)
    response = response.content

    print(response)

    if response.strip() in ["A", "B", "C", "D", "E"]:
        return response.strip()
    else:
        print(f"Failed to respond with value {response}")
        return None

In [31]:
statistics = {
    "correct": 0,
    "incorrect": 0,
    "unknown": 0,
    "failed": 0
}

data = {}

for i, line in enumerate(lines):
    question = line["multichoice_question"]
    response = query_gpt(question)
    print(f"{i} - Question: {question['question']}\n\tResponse: {response}")
    data[i] = response
    if response is None:
        statistics["failed"] += 1
    elif response == "E":
        statistics["unknown"] += 1
    elif response == "B":
        statistics["correct"] += 1
    else:
        statistics["incorrect"] += 1

for key, value in statistics.items():
    print(key, value)

B) Tesla, Inc.'s enhanced financial performance, marked by growing revenue, higher EBITA Margin, and lower debt, alongside LG Energy Solution, Ltd.'s stable outlook and positive issuer rating, suggests a reinforcing market position for Tesla, backed by a healthy EV battery industry.

Reasoning:
The question states that Tesla, Inc. has improved financial ratios, such as a higher EBITA Margin and reduced debt levels. This indicates that Tesla is performing well financially. Furthermore, it is implied that LG Energy Solution, Ltd. also has a stable outlook and positive issuer rating, suggesting that it too is financially healthy.

Since both companies are doing well, it is reasonable to surmise that Tesla's market position is reinforced by its own strong performance as well as by the health of the battery industry as evidenced by LG's creditworthiness. This suggests that the electric vehicle industry, including both car manufacturers like Tesla and battery suppliers like LG Energy Solutio

In [15]:
filtered_data = []
for k, v in data.items():
    if v != "B":
        filtered_data.append(lines[k])

In [16]:
with open("clean_questions.jsonl", "w") as f:
    for line in filtered_data:
        f.write(json.dumps(line) + "\n")

In [17]:
base_path = "../data/moodys_data"
pdfs = [os.path.join(base_path, filename) for filename in os.listdir(base_path) if filename.endswith(".pdf")]

text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=1024,
    chunk_overlap=128,
    length_function=len,
    is_separator_regex=False,
)

def load_and_split(path, splitter):
    loader = PyPDFLoader(path)
    documents = loader.load()
    return splitter.split_documents(documents)

documents = [load_and_split(path, text_splitter) for path in pdfs]
documents = sum(documents, [])

In [None]:
vectorstore = FAISS.from_documents(documents, embeddings)
vectorstore.save_local("rag_faiss_index")

In [18]:
vectorstore = FAISS.load_local("rag_faiss_index", embeddings, allow_dangerous_deserialization=True)

In [19]:
prompt_with_context = """
Answer a multiple choice question based on your knowledge and context provided.

YOUR RESPONSE MUST BE ONE OF 5 LETTERS, DON"T ADD ANYTHING ELSE.

Context: {context}

Question: {question}

Choices:
A) {text_1}
B) {text_2}
C) {text_3}
D) {text_4}
E) I don't know

If you know the answer or you can deduce it, respond with A, B, C or D
If you don't know the answer, respond with E.
If you are not sure which asnwer is correct or there are more than one correct answers, respond with E.

Response:
"""

In [20]:
def query_rag(question, k) -> str:
    contexts = vectorstore.similarity_search(
        query=question["question"],
        k=k,
    )

    context_texts = [context.page_content for context in contexts]

    context = "\n".join(context_texts)
    
    formatted_prompt = prompt_with_context.format(
        context=context,
        question=question["question"],
        text_1=question["wrong_answer_1"],
        text_2=question["correct_answer"],
        text_3=question["wrong_answer_2"],
        text_4=question["wrong_answer_3"]
    )

    response = llm.invoke(formatted_prompt)
    response = response.content

    print(f"Response: {response}")

    if response.strip() in ["A", "B", "C", "D", "E"]:
        return response.strip()
    else:
        print(f"Failed to respond with value {response}")
        return None

In [21]:
rag_statistics = {
    "correct": 0,
    "incorrect": 0,
    "unknown": 0,
    "failed": 0
}

rag_data = {}

for i, line in enumerate(filtered_data):
    question = line["multichoice_question"]
    response = query_rag(question, 5)
    rag_data[i] = response
    if response is None:
        rag_statistics["failed"] += 1
    elif response == "E":
        rag_statistics["unknown"] += 1
    elif response == "B":
        rag_statistics["correct"] += 1
    else:
        rag_statistics["incorrect"] += 1

for key, value in rag_statistics.items():
    print(key, value)

Response: B
Response: B
Response: B
Response: B
correct 4
incorrect 0
unknown 0
failed 0
