## Create ground truth data

In [1]:
import pandas as pd
import json

# Load a .jsonl file (one JSON document per line) into a DataFrame
def load_jsonl_to_df(file_path):
    """Read a JSON Lines file and return its records as a DataFrame.

    Args:
        file_path: Path of a text file holding one JSON document per line.

    Returns:
        pd.DataFrame: One row per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding (the data contains non-ASCII text).
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Skip blank lines (e.g. a trailing empty line) instead of
            # letting json.loads fail on them.
            if not line.strip():
                continue
            records.append(json.loads(line))
    return pd.DataFrame(records)

# Path to the chunked documents .jsonl file
file_path = '../data/processed_dataset/chunked_documents.jsonl'

# Read the file into a DataFrame
df = load_jsonl_to_df(file_path)

# Lift the needed metadata fields into top-level columns and keep only
# those columns plus the chunk text; the 'metadata' column is dropped
df_filtered = (
    df[['metadata', 'page_content']]
    .assign(
        location=lambda frame: frame['metadata'].apply(lambda meta: meta['location']),
        doc_id=lambda frame: frame['metadata'].apply(lambda meta: meta['doc_id']),
        chunk_id=lambda frame: frame['metadata'].apply(lambda meta: meta['chunk_id']),
    )
    [['location', 'doc_id', 'chunk_id', 'page_content']]
)

# Preview the first rows
df_filtered.head()


Unnamed: 0,location,doc_id,chunk_id,page_content
0,"Tromsø, Norway",0,0,Booking Confirmation and Ticket Instructions:T...
1,"Tromsø, Norway",0,1,Important Information and Meeting Instructions...
2,"Tromsø, Norway",0,2,Tour Duration and Northern Lights Sightings:Im...
3,"Tromsø, Norway",0,3,Cancellation Policy and Contact Information:Ca...
4,"Tromsø, Norway",0,4,Additional Support and Contact Information:Con...


In [62]:
from langchain.prompts import ChatPromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
import ast

# Chat model used to generate the questions
llm = ChatOpenAI(model='gpt-4o')

# Output parser placed at the end of the chain built in generate_questions
parser = StrOutputParser()

def _parse_question_list(raw: str) -> list:
    """Extract the list of question strings from a raw LLM reply.

    Only the text between the first '[' and the last ']' is parsed, so
    surrounding prose or Markdown code fences are ignored. The payload is
    decoded as JSON first and as a Python literal as a fallback (for
    single-quoted strings).

    Args:
        raw (str): The text returned by the chain.

    Returns:
        list: The parsed list of question strings.

    Raises:
        ValueError: If no list is found, the list cannot be parsed, or it
            contains anything other than strings.
    """
    start = raw.find('[')
    end = raw.rfind(']')
    if start == -1 or end < start:
        raise ValueError(f"LLM response does not contain a list of questions: {raw!r}")

    payload = raw[start:end + 1]
    try:
        parsed = json.loads(payload)
    except ValueError:
        # Not valid JSON (e.g. single-quoted strings); try a Python literal.
        try:
            parsed = ast.literal_eval(payload)
        except (ValueError, SyntaxError, TypeError) as exc:
            raise ValueError(f"Could not parse questions from LLM response: {raw!r}") from exc

    if not isinstance(parsed, list) or not all(isinstance(item, str) for item in parsed):
        raise ValueError(f"LLM response is not a list of strings: {raw!r}")
    return parsed


def generate_questions(page_content: str, location: str) -> list:
    """
    Asks the LLM for questions based on the provided page content and location.

    Args:
        page_content (str): The text content of the document.
        location (str): The location information relevant to the document.

    Returns:
        list: The question strings returned by the LLM. The prompt requests
            5 questions; the count is not enforced here.

    Raises:
        ValueError: If the LLM reply cannot be parsed into a list of strings.
    """

    # Combine page content and location to form the context.
    # NOTE(review): the template below also starts this section with
    # "Context:", so the label appears twice in the rendered prompt. Left
    # unchanged to keep the prompt text identical.
    context = f"Context: {page_content}\nLocation: {location}"
    # Define the prompt template
    prompt_template = """
    You are analyzing content from a part of a PDF file that is related to a travel booking document.
    Understand the context of the content.
    Formulate 5 questions this user might ask based on a provided context.
    Make the questions specific to this context.
    The record should contain the answer to the questions, and the questions should
    be complete and not too short. Use as fewer words as possible from the record. 

    Context: {context}

    Strictly return the questions in a list of strings:
    ["question1", "question2", ..., "question5"]
    """

    # Create the prompt template with LangChain
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", prompt_template),
        ]
    )

    # Build the chain: prompt -> chat model -> string output
    chain = prompt | llm | parser

    # Execute the chain; the context is passed as a template variable
    result = chain.invoke({"context": context})

    # Extract the generated questions, tolerating fences or extra prose
    return _parse_question_list(result)


In [44]:
def generate_ground_truth(df: pd.DataFrame) -> pd.DataFrame:
    """Build the ground-truth question table for every row of ``df``.

    Calls ``generate_questions`` once per row and emits one output row per
    returned question.

    Args:
        df (pd.DataFrame): Must provide the columns 'page_content',
            'location', 'doc_id' and 'chunk_id'.

    Returns:
        pd.DataFrame: Columns 'doc_id', 'chunk_id' and 'question'. These
            columns are present even when no question was generated.
    """
    columns = ['doc_id', 'chunk_id', 'question']
    questions_list = []

    rows = zip(df['page_content'], df['location'], df['doc_id'], df['chunk_id'])
    for page_content, location, doc_id, chunk_id in rows:
        # Generate questions based on the page content and location
        for question in generate_questions(page_content, location):
            questions_list.append({
                'doc_id': doc_id,
                'chunk_id': chunk_id,
                'question': question
            })

    # Passing the columns explicitly keeps the schema stable for empty
    # results, so column access and the CSV header still work downstream.
    return pd.DataFrame(questions_list, columns=columns)

In [57]:
# Smoke test: generate questions for a single hand-picked chunk
page_content = (
    "Booking Confirmation and Ticket Instructions:Thanks for your order, Nadine. "
    "For easy access to your ticket and to be able to manage your booking on the go, download our app. "
    "November 30, 2023, at 6:30 PM From Tromsø: Aurora Borealis Tour 4 Adults (Age 0 - 99) • English • 6 hours € 555. "
    "See activity details. Get the GetYourGuide app to access your activity. "
    "Access your activity with the GetYourGuide app. "
    "Just open your tickets in the app and you’ll be ready to go."
)
location = "Tromsø, Norway"
questions = generate_questions(page_content=page_content, location=location)


In [58]:
# Display the questions generated for the sample chunk
questions

['What is the date and time of the Aurora Borealis Tour in Tromsø?',
 'How many adults are included in the booking for the Aurora Borealis Tour?',
 'What is the total cost for the Aurora Borealis Tour for 4 adults?',
 'How long is the Aurora Borealis Tour in Tromsø?',
 'Which app should be downloaded for managing the booking and accessing tickets?']

In [63]:
# Generate questions for every chunk; generate_ground_truth calls
# generate_questions (one LLM invocation) once per row of df_filtered
ground_truth_df = generate_ground_truth(df_filtered)

In [64]:
# Preview up to the first 20 generated questions
ground_truth_df.head(20)

Unnamed: 0,doc_id,chunk_id,question
0,0,0,What is the date and time of the Aurora Boreal...
1,0,0,How many adults are included in the booking?
2,0,0,What is the total price for the Aurora Boreali...
3,0,0,In which language will the Aurora Borealis Tou...
4,0,0,What is the duration of the Aurora Borealis Tour?
5,0,1,Where should I go to meet my guide?
6,0,1,At what time should I arrive at the meeting po...
7,0,1,What is the address of the meeting point?
8,0,1,Where does the activity end?
9,0,1,What is the name of the hotel outside of which...


In [65]:
# Save the ground-truth questions to CSV, omitting the DataFrame index
ground_truth_df.to_csv('../data/ground-truth-data.csv', index=False)