## Extract documents from the database and generate question-answer pairs from document or chunk content

In [4]:
import logging
import ipywidgets as widgets

from notebooks.metrics.generate_q_a_pairs import process_document_or_chunk, write_question_answer_json_to_csv
from src.db.models.document import Document
from src.app_config import app_config

logger = logging.getLogger(__name__)

# ipywidgets renders descriptions in a fixed-width label and truncates
# anything longer; 'initial' sizes each label to fit its full text.
_label_style = {'description_width': 'initial'}

# Value forwarded to process_document_or_chunk as `num_of_chunks` by the
# generation cell.
num_qa_per_chunk_or_doc = widgets.IntText(
    value=5,
    description='Number of questions per chunk/document:',
    style=_label_style,
)

# Model name forwarded as the `llm` argument by the generation cell.
llm_model = widgets.Dropdown(
    options=['gpt-3.5-turbo-instruct', 'gpt-4o', 'gpt-4o-mini'],
    value='gpt-4o',
    description='OpenAI LLM Model:',
    style=_label_style,
)

# 'chunk' makes the generation cell iterate each document's chunks;
# 'document' makes it process each document as a whole.
question_gen_selection = widgets.RadioButtons(
    options=['document', 'chunk'],
    value='chunk',
    description='Question source:',
    style=_label_style,
)

# Datasets used to filter Document rows (matched against Document.dataset).
question_dataset = widgets.SelectMultiple(
    options=['Imagine LA', 'CA EDD', 'BEM'],
    value=['Imagine LA', 'CA EDD'],
    description='Dataset source:',
    style=_label_style,
)

# Path handed to write_question_answer_json_to_csv for the output CSV.
file_name = widgets.Text(
    value='question_answer_pairs.csv',
    description='Filename:',
    style=_label_style,
)

display(
    num_qa_per_chunk_or_doc,
    llm_model,
    question_gen_selection,
    question_dataset,
    file_name,
)

IntText(value=5, description='Number of questions per chunk/document:')

Dropdown(description='OpenAI LLM Model:', index=1, options=('gpt-3.5-turbo-instruct', 'gpt-4o', 'gpt-4o-mini')…

RadioButtons(description='Question source:', index=1, options=('document', 'chunk'), value='chunk')

SelectMultiple(description='Dataset source:', index=(0, 1), options=('Imagine LA', 'CA EDD', 'BEM'), value=('I…

Text(value='question_answer_pairs.csv', description='Filename:')

In [6]:
# Generate question/answer pairs for every document in the selected datasets
# and write each result to the CSV file named in the `file_name` widget.
with app_config.db_session() as db_session:
    # Read the widget selections once so every document is processed with
    # the same settings and the loop below does no repeated lookups.
    selected_dataset = list(question_dataset.value)
    model_name = llm_model.value
    # Forwarded under the helper's keyword `num_of_chunks`; the widget labels
    # it as the number of questions per chunk/document.
    questions_per_item = num_qa_per_chunk_or_doc.value
    output_path = file_name.value
    per_chunk = question_gen_selection.value == "chunk"
    # NOTE(review): the same joined string of all selected datasets is passed
    # as `dataset` for every document, rather than `document.dataset` —
    # confirm this is intended.
    dataset_list = ", ".join(question_dataset.value)

    documents = db_session.query(Document).filter(Document.dataset.in_(selected_dataset)).all()

    # Column order handed to the CSV writer.
    fields = ["question", "answer", "document_name", "document_source", "dataset", "document_id", "chunk_id", "content_hash"]

    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
    logger.info("Start processing with llm %s", model_name)

    for document in documents:
        # Only read `document.chunks` in chunk mode; in document mode the
        # chunks are never used, so the attribute access (presumably a
        # relationship load — TODO confirm) is skipped.
        sources = document.chunks if per_chunk else [document]
        for source in sources:
            q_a_json = process_document_or_chunk(
                document_or_chunk=source,
                num_of_chunks=questions_per_item,
                llm=model_name,
                dataset=dataset_list,
            )
            write_question_answer_json_to_csv(output_path, fields, q_a_json)
    print(f"Finished processing, output saved to {output_path}")

2024-12-05 14:31:43,003 - INFO - Constructed database configuration
2024-12-05 14:31:43,051 - INFO - connected to postgres db
2024-12-05 14:31:43,063 - INFO - Start processing with llm gpt-4o-mini
[92m14:31:43 - LiteLLM:INFO[0m: utils.py:2720 - 
LiteLLM completion() model= gpt-4o-mini; provider = openai
2024-12-05 14:31:43,076 - INFO - 
LiteLLM completion() model= gpt-4o-mini; provider = openai
2024-12-05 14:31:45,353 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[92m14:31:45 - LiteLLM:INFO[0m: utils.py:889 - Wrapper: Completed Call, calling success_handler
2024-12-05 14:31:45,366 - INFO - Wrapper: Completed Call, calling success_handler
[92m14:31:45 - LiteLLM:INFO[0m: utils.py:2720 - 
LiteLLM completion() model= gpt-4o-mini; provider = openai
2024-12-05 14:31:45,379 - INFO - 
LiteLLM completion() model= gpt-4o-mini; provider = openai
2024-12-05 14:31:47,342 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1

Finished processing, output saved to question_answer_pairs.csv
