In [2]:
import os

# Directory holding the parsed paper texts. Forward slashes are used so the
# relative path resolves on both Windows and POSIX systems.
directory = "../agent/papers/parsed_txt"

# Read every .txt file in the directory into memory. The listing is sorted
# because os.listdir returns entries in arbitrary order; sorting keeps the
# order of the displayed lengths stable between runs.
txt_files_content = []
for filename in sorted(os.listdir(directory)):
    if filename.endswith(".txt"):
        file_path = os.path.join(directory, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            txt_files_content.append(file.read())

# Character count of each loaded file (the length of this list is the number
# of files). Only the last expression of a cell is displayed.
[len(txt) for txt in txt_files_content]

[176247, 98415, 48448, 61007]

In [5]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv
import os

# Pull variables from a local .env file (if present) into the process
# environment so the OpenAI client can find its key.
load_dotenv()

# Fail fast with a readable message. Re-assigning the variable to itself was a
# no-op when the key existed and raised an opaque TypeError when it did not.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment or in a .env file."
    )

llm = ChatOpenAI(model="gpt-4.1-mini", temperature=0.3)

# Prompt requesting question/answer pairs for the text supplied as {context}.
# An earlier questions-only prompt bound to the same name was dead code
# (immediately overwritten by this one) and has been dropped.
prompt = ChatPromptTemplate.from_template(
    "Given the following academic text, generate 10 example questions "
    "using the context across the whole file along with "
    "their respective answers.\n\nText:\n{context}"
)

In [4]:
from pydantic import BaseModel, Field
from typing import List


# Schema for a single generated question/answer pair.
# NOTE(review): the Field descriptions are presumably surfaced to the model as
# schema metadata when this class is used for structured output — treat them
# as runtime text rather than comments; confirm before rewording.
class QAItem(BaseModel):
    # Required string field holding the question text.
    question: str = Field(..., description="The generated mock question.")
    # Required string field holding the answer text.
    answer: str = Field(..., description="The answer to the mock question.")

# Top-level schema handed to llm.with_structured_output: a container for the
# list of QAItem pairs.
class MockQA(BaseModel):
    """A list of mock questions and answers generated from the provided text."""
    # Required list of QAItem entries; the output-writing loop iterates over it.
    qa: List[QAItem] = Field(..., description="A list of mock question and answer pairs.")


# Bind the MockQA schema to the chat model via with_structured_output; the
# generation loop reads `.qa` from whatever this runnable returns.
structured_llm = llm.with_structured_output(MockQA)

In [6]:
# Text written after every answer so the pairs can be split apart later.
qa_seperator = "\n\n=== qa break ===\n\n"
input_dir = "../agent/papers/parsed_txt"
output_dir = "mock_qa"
# Only the first max_context_chars characters of each paper are sent to the
# model, so questions are drawn from the beginning of longer files only.
max_context_chars = 30000
os.makedirs(output_dir, exist_ok=True)

# The chain does not depend on the file being processed, so build it once
# instead of on every loop iteration.
chain = prompt | structured_llm

# Sorted so files are processed in a stable order (os.listdir order is arbitrary).
for filename in sorted(os.listdir(input_dir)):
    if not filename.endswith(".txt"):
        continue

    file_path = os.path.join(input_dir, filename)
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    context = text[:max_context_chars]
    results = chain.invoke({"context": context})

    # Derive "<name>_qa.txt" by swapping only the final extension; str.replace
    # would also rewrite any earlier ".txt" inside the file name.
    stem = os.path.splitext(filename)[0]
    out_path = os.path.join(output_dir, stem + "_qa.txt")

    # Save each question/answer pair, followed by the separator.
    with open(out_path, "w", encoding="utf-8") as out_f:
        for item in results.qa:
            out_f.write("Question:\n" + item.question + "\n")
            out_f.write("Answer:\n" + item.answer + qa_seperator)

