In [1]:
import openai
import config

In [2]:
import os 
import json
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.document_loaders import JSONLoader, DirectoryLoader, UnstructuredHTMLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI

In [3]:
from langchain.schema import HumanMessage, SystemMessage

In [4]:
import re

In [5]:
# API key for the LLM used to generate the QA pairs. It is read from the
# local `config` module and exported through the OPENAI_API_KEY environment variable.
os.environ["OPENAI_API_KEY"] = config.api_key

In [6]:
# Load the HTML documentation found under ../kbasedocs/ (the "**" in the glob
# searches subdirectories; "[!.]*" skips files whose names start with a dot).
# Each matched file is parsed with UnstructuredHTMLLoader.
html_dir_loader = DirectoryLoader('../kbasedocs/', glob="**/[!.]*.html", loader_cls=UnstructuredHTMLLoader)
data_html = html_dir_loader.load()

In [7]:
# Module-level chat model (temperature 0); extract_answers() reads this global.
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

In [8]:
# Chunking configuration: pieces of at most 1000 characters (measured with
# len) and a 20-character overlap between neighbouring pieces.
splitter_options = dict(chunk_size=1000, chunk_overlap=20, length_function=len)
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Split every loaded HTML document into chunks.
documents_html = text_splitter.split_documents(data_html)

In [9]:
# System prompt for the question-extraction step: it asks the model for a
# numbered list of questions that can be answered from the passed page alone.
extraction_system_prompt="You are an expert user extracting information to quiz people on documentation. You will be passed a page extracted from the documentation, write a numbered list of questions that can be answered based *solely* on the given text."
def create_extraction_conversation_messages(text):
    """
    Build the chat messages that ask the model to extract questions from a text.

    Args:
        text (str): The text the questions should be derived from.

    Returns:
        list: Two messages, in order: a system message carrying
        `extraction_system_prompt`, then a human message carrying `text`.
    """
    return [
        SystemMessage(content=extraction_system_prompt),
        HumanMessage(content=text),
    ]


In [10]:
def extract_questions_from_output(output):
    """
    Parse a numbered list of questions out of a model response.

    Lines that do not look like "<number>. <text>" (preambles, closing remarks)
    are ignored. Each question is returned without surrounding whitespace. If
    the last question does not end with '.', '!', '?' or ')', it is treated as
    truncated: a warning is printed and the question is dropped.

    Args:
        output (str): A string containing a numbered list of questions.

    Returns:
        list of str: The extracted questions, in order of appearance.
    """
    # A question line is: optional whitespace, a number, a dot, then the text.
    question_pattern = re.compile(r"^\s*\d+\.\s*(.+)$", re.MULTILINE)

    # `.` matches everything except "\n", so a capture can end with blanks or a
    # "\r" from Windows line endings; strip them so callers get clean text.
    stripped = (match.strip() for match in question_pattern.findall(output))

    # A line such as "3.   " captures only whitespace; that is not a question.
    questions = [question for question in stripped if question]

    # Drop a trailing question that looks cut off (no closing punctuation).
    if questions and not re.search(r"[.!?)]$", questions[-1]):
        print(f"WARNING: Popping incomplete question: '{questions[-1]}'")
        questions.pop()

    return questions

In [11]:
def extract_questions_and_ref(ref_text):
    """
    Ask the chat model for questions about a text and return them with the text.

    Args:
        ref_text (str): The text to generate questions from.

    Returns:
        tuple: `(reference_text, questions)` where `reference_text` is the
        content of the human message sent to the model and `questions` is the
        list of question strings parsed from the model's reply.
    """
    messages = create_extraction_conversation_messages(ref_text)

    # The second message is the human message holding the reference text.
    reference_text = messages[1].content

    chat_model = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")
    output = chat_model(messages)

    # Parse with the shared helper rather than a second, unanchored regex: the
    # helper only accepts lines that start with "<number>." (so a "2.0" inside
    # a preamble is not mistaken for a question) and drops a truncated last item.
    questions = extract_questions_from_output(output.content)
    return reference_text, questions

In [12]:
# System prompt for the answering step: it asks the model to answer a question
# using only the documentation page that is passed along with it.
answering_system_prompt="You are an expert user answering questions. You will be passed a page extracted from a documentation and a question. Generate a comprehensive and informative answer to the question based *solely* on the given text."


def create_answering_conversation_messages(question, text):
    """
    Build the chat messages that ask the model to answer a question from a text.

    Args:
        question (str): The question to be answered.
        text (str): The text the answer must be based on.

    Returns:
        list: Three messages, in order: a system message carrying
        `answering_system_prompt`, a human message carrying `text`, and a
        human message carrying `question`.
    """
    system_msg = SystemMessage(content=answering_system_prompt)
    # The reference text and the question are sent as two separate human turns.
    human_msgs = [HumanMessage(content=part) for part in (text, question)]
    return [system_msg, *human_msgs]

In [13]:
# Generate questions for the selected chunk(s). The two lists stay aligned:
# questions[i] is the list of questions produced for reference_text[i].
reference_text = []
questions = []
for doc in documents_html[305:306]:
    ref, extracted = extract_questions_and_ref(doc.page_content)
    reference_text.append(ref)
    questions.append(extracted)
# `questions` is intentionally left nested (one inner list per chunk).

In [14]:
# Display the chunk(s) selected for question generation (same slice as the loop above).
documents_html[305:306]

[Document(page_content='gut environment, exploring the trade-offs in information provided by applying each metabolic flux modeling approach. Overall, we conclude that no single community modeling approach is better than the others, and often there is much to be gained by applying multiple approaches synergistically when exploring the ecology of a microbial community.', metadata={'source': '../kbasedocs/workflows_metabolic-models_metabolic-flux-models.html'})]

In [15]:
def extract_answers(questions, ref_text):
    """
    Answer every question using the given reference text.

    Each question is sent to the module-level `llm` chat model together with
    `ref_text`, one model call per question.

    Args:
        questions (list of str): The questions to answer.
        ref_text (str): The text the answers should be based on.

    Returns:
        list of str: The content of the model's reply for each question, in
        the same order as `questions`.
    """
    return [
        llm(create_answering_conversation_messages(question, ref_text)).content
        for question in questions
    ]

In [16]:
# One list of answers per chunk, aligned with `questions` and `reference_text`.
answers = [
    extract_answers(chunk_questions, chunk_text)
    for chunk_questions, chunk_text in zip(questions, reference_text)
]

In [17]:
# Flatten the per-chunk results into one record per question/answer pair,
# keeping the chunk text each pair was generated from.
data = [
    {
        "question": question,
        "answer": answer,
        "reference_text": chunk_text,
    }
    for chunk_text, chunk_questions, chunk_answers in zip(reference_text, questions, answers)
    for question, answer in zip(chunk_questions, chunk_answers)
]

In [18]:
# Spot-check the first generated question/answer record.
print(data[0])

{'question': 'What is the focus of the document?', 'answer': 'The focus of the document is on exploring different metabolic flux modeling approaches in the context of the gut environment and discussing the trade-offs in the information provided by each approach. The document emphasizes that no single community modeling approach is superior to others and suggests that combining multiple approaches can be beneficial when studying the ecology of a microbial community.', 'reference_text': 'gut environment, exploring the trade-offs in information provided by applying each metabolic flux modeling approach. Overall, we conclude that no single community modeling approach is better than the others, and often there is much to be gained by applying multiple approaches synergistically when exploring the ecology of a microbial community.'}


In [20]:

output_filepath = 'pairs_n3.json'

# Write the QA records as indented JSON. The encoding is given explicitly so
# the file does not depend on the platform's default locale encoding.
with open(output_filepath, 'w', encoding='utf-8') as output_file:
    json.dump(data, output_file, indent=4)

# Report success only after the `with` block has flushed and closed the file.
print(f"Results have been saved to {output_filepath}.")

Results have been saved to pairs_n3.json.
