## Extract documents from the database and generate question-answer pairs per chunk or per document

In [None]:
import logging
from sqlalchemy import select
import ipywidgets as widgets

from notebooks.question_answer_generator.generate_q_a_pairs import process_document_or_chunk, write_question_answer_json_to_csv
from src.db.models.document import Document
from src.app_config import app_config

# Module-level logger for this notebook.
logger = logging.getLogger(__name__)

# How many question/answer pairs to request for each chunk or document.
# ipywidgets truncates descriptions that exceed the default label width, so
# the description width is set to "initial" to let the full label render.
num_qa_per_chunk_or_doc = widgets.IntText(
    value=5,
    description='Number of questions per chunk/document:',
    disabled=False,
    style={'description_width': 'initial'},
)
display(num_qa_per_chunk_or_doc)

# OpenAI model used to generate the pairs; same label-width treatment so both
# controls render consistently.
llm_model = widgets.Dropdown(
    options=['gpt-3.5-turbo-instruct', 'gpt-4o', 'gpt-4o-mini'],
    value='gpt-4o',
    description='OpenAI LLM Model:',
    disabled=False,
    style={'description_width': 'initial'},
)
display(llm_model)

In [None]:
# Ask which granularity to generate questions for. The answer is normalized
# (surrounding whitespace removed, lower-cased) so that inputs such as "Chunk"
# or " chunk " still match the exact "chunk" comparison made by the processing
# cell; any other answer selects per-document generation there.
question_gen_selection = input("Generate questions by chunk or document?").strip().lower()

In [None]:
# Generate question/answer pairs for every document in the database and write
# them to a CSV file, either once per chunk or once per whole document.
with app_config.db_session() as db_session:
    documents = db_session.execute(select(Document)).scalars().all()
    fields = ["question", "answer", "document_name", "document_source", "document_id", "chunk_id"]
    output_csv = "question_answer_pairs.csv"

    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

    # Read the widget values once so every call in this run uses the same settings.
    selected_llm = llm_model.value
    num_pairs = num_qa_per_chunk_or_doc.value

    # Tolerate stray whitespace or capitalisation in the typed answer; anything
    # other than "chunk" selects per-document generation.
    generate_per_chunk = question_gen_selection.strip().lower() == "chunk"

    logger.info("Start processing with llm %s", selected_llm)

    for document in documents:
        # Only touch document.chunks when chunk mode was requested; in document
        # mode the document itself is the single item to process.
        items = document.chunks if generate_per_chunk else [document]
        for item in items:
            # NOTE(review): the helper's keyword is `num_of_chunks`, but the value
            # passed is the requested number of questions — confirm against the helper.
            q_a_json = process_document_or_chunk(
                document=item,
                num_of_chunks=num_pairs,
                llm=selected_llm,
            )
            write_question_answer_json_to_csv(output_csv, fields, q_a_json)
    logger.info("Finished processing")