### Importing the necessary libraries

In [1]:
import json
import warnings
import hashlib
from openai import OpenAI
from tqdm.auto import tqdm
import pandas as pd
import re

# Suppress all warnings to keep the cell outputs uncluttered.
# NOTE(review): this is a blanket filter, so deprecation notices from the imported libraries are hidden too.
warnings.filterwarnings("ignore")

### Data Loading and Preprocessing

In [2]:
# read the raw FAQ dump: a list of courses, each carrying its own list of documents
with open('documents.json', 'rb') as file:
    docs_raw = json.load(file)

# flatten into one record per FAQ entry, tagging every entry with the course it came from
documents = [
    {
        'course': course_dict['course'],
        **{field: entry[field] for field in ('section', 'question', 'text')},
    }
    for course_dict in docs_raw
    for entry in course_dict['documents']
]

In [3]:
# peek at the first flattened record to check which keys it carries
documents[0]

{'course': 'data-engineering-zoomcamp',
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel."}

### Generating Stable IDs for documents

This ID will be used to reference the document in the evaluation process.

In [4]:
# a simple function to generate hash ids based on the concatenation of all our dictionary values
def generate_doc_id(doc: dict) -> str:
    """Return a short, stable identifier derived from a document's field values.

    The values are joined with "-" in dictionary order, the result is hashed with
    MD5, and the first 8 hexadecimal characters of the digest are returned.

    An existing 'id' entry is left out of the hash. Without this, calling the
    function again on a document that already carries its id would hash the id
    as well and produce a different identifier.

    Args:
        doc: the document fields; values are converted with str() before joining,
            so non-string values no longer raise a TypeError.

    Returns:
        An 8-character lowercase hexadecimal string.
    """
    # concatenate the different fields together (skipping a previously generated id)
    combined = "-".join(str(value) for key, value in doc.items() if key != 'id')

    # encode the string to bytes, hash it and render the digest as a hexadecimal string
    hash_hex = hashlib.md5(combined.encode()).hexdigest()

    # only the first 8 characters of the hexadecimal string are used as the id
    return hash_hex[:8]

In [5]:
# using our function to generate the IDs key-value pairs
# a new dictionary is built for every document, so `documents` itself is left untouched and
# re-running this cell always hashes the same fields
documents_updated = [{**doc, 'id': generate_doc_id(doc)} for doc in documents]

documents_updated[0]

{'course': 'data-engineering-zoomcamp',
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'id': '7000acaa'}

In [6]:
# sanity check: the number of distinct ids should equal the number of documents
hashes = [entry['id'] for entry in documents_updated]

len(set(hashes)) == len(documents_updated)

True

In [11]:
# write the id-tagged documents to disk as pretty-printed JSON
with open('documents_with_ids.json', mode='w') as json_file:
    json.dump(documents_updated, fp=json_file, indent=4)

### Using an LLM to generate questions for each record ID

In [8]:
# initialising the OpenAI client that is used below to generate our questions for each record ID
# (no api_key is passed here, so the OPENAI_API_KEY environment variable has to be set beforehand)
openai_client = OpenAI()

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

In [8]:
# now to create our prompt template - we will use the template provided in the course
# the {section}, {question} and {text} placeholders are filled in per record with str.format

prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [9]:
# next we want to write a simple function that generates the question for each record:
def generate_questions(doc_dict, model='gpt-3.5-turbo'):
    """Ask the chat model to write student-style questions for one FAQ record.

    Args:
        doc_dict: record whose 'section', 'question' and 'text' entries fill the
            placeholders of prompt_template; any additional keys are simply
            ignored by str.format.
        model: name of the chat completion model to query. Defaults to
            'gpt-3.5-turbo', the model that used to be hard-coded here.

    Returns:
        The message content of the first completion choice, exactly as returned
        by the API. The prompt asks for a JSON list of questions, but the text
        is not parsed or validated here.
    """
    prompt = prompt_template.format(**doc_dict)

    response = openai_client.chat.completions.create(
        model=model,
        messages=[{'role': 'user', 'content': prompt}],
    )

    return response.choices[0].message.content

In [None]:
# try the generator on a single record before running it over the whole dataset
print(generate_questions(documents_updated[1]))

In [None]:
# build one result entry per document: its course, its id and the raw answer of the model
results = [
    {
        'Course': record['course'],
        'document_ID': record['id'],
        'Questions': generate_questions(record),
    }
    for record in tqdm(documents_updated)
]

In [None]:
# let's have a quick look at the first entry - 'Questions' still holds the raw, unparsed model output
results[0]

In [13]:
# lets write a function that is able to parse our questions in results from a JSON object into a python object

# Questions used when a response cannot be decoded at all. They were written by hand for the
# one record (about the pgcli "relhasoids" error) whose response originally failed to parse.
# NOTE(review): they are only correct for that record - any other record that ends up here
# receives unrelated questions, so an unexpected fallback should be investigated.
_FALLBACK_QUESTIONS = (
    "Why am I getting the error column c.relhasoids does not exist when using the command \\d <database name>?",
    "What should I do to resolve the error with pgcli?",
    "Should I uninstall pgcli to fix the issue?",
    "What steps should I take after reinstalling pgcli?",
    "Is restarting the PC necessary to resolve the error?",
)

# the only characters JSON allows directly after a backslash inside a string
_VALID_JSON_ESCAPES = '"\\/bfnrtu'


def _escape_stray_backslashes(raw: str) -> str:
    """Double every backslash that does not start a valid JSON escape sequence."""
    def fix(match):
        pair = match.group(0)
        # backslash + character is consumed as a pair, so an already escaped "\\\\" is kept intact
        return pair if pair[1] in _VALID_JSON_ESCAPES else '\\' + pair

    return re.sub(r'\\.', fix, raw)


def _decode_questions(raw):
    """Decode the model output as JSON, retrying once with stray backslashes escaped.

    Returns None when the text cannot be decoded either way.
    """
    try:
        return json.loads(raw)
    except (ValueError, TypeError):
        # json.JSONDecodeError is a ValueError; TypeError covers a missing (None) response
        pass

    if isinstance(raw, str):
        try:
            # a literal "\d" in a question is not a legal JSON escape; repair and retry
            return json.loads(_escape_stray_backslashes(raw))
        except ValueError:
            pass

    return None


def parse_results(res_dict: dict) -> dict:
    """Turn one raw result entry into a dict holding a python list of questions.

    Args:
        res_dict: entry with the keys 'Course', 'document_ID' and 'Questions',
            where 'Questions' is the JSON text returned by the model.

    Returns:
        A dict with 'Course' and 'document_ID' copied over and 'Question' set to
        a list: the decoded JSON list itself, or the values of a decoded JSON
        object. If the text cannot be decoded (even after escaping stray
        backslashes) or decodes to anything else, the hand-written fallback
        questions are used, as before.

    Raises:
        KeyError: if 'Course' or 'document_ID' is missing from res_dict.
    """
    decoded = _decode_questions(res_dict.get('Questions'))

    if isinstance(decoded, list):
        questions = decoded
    elif isinstance(decoded, dict):
        questions = list(decoded.values())
    else:
        questions = list(_FALLBACK_QUESTIONS)

    return {
        'Course': res_dict['Course'],
        'document_ID': res_dict['document_ID'],
        'Question': questions,
    }

In [None]:
# run the parser over every raw result
parsed_results = list(map(parse_results, results))

# spot-check a single parsed entry
parsed_results[108]

### Final Step - load the parsed results into a dataframe and write the .csv output

In [None]:
# turn the list of parsed result dictionaries into a pandas dataframe (one row per entry)
ground_truth = pd.DataFrame(parsed_results)
ground_truth.head()

In [16]:
# realised there were some issues with the question column - there are some outputs with nested listing
# hence we need to flatten the list further using the following function before exploding

def flatten_list(obj):
    """Normalise one 'Question' cell into a flat list of questions.

    Every element of the cell is handled on its own:
      * a nested list contributes all of its elements,
      * a dict contributes its first value (an empty dict contributes nothing),
      * anything else is kept as it is.

    Unlike a check on the first element only, this keeps the questions of every
    nested list, copes with cells that mix the shapes above and returns an empty
    list for an empty cell instead of raising an IndexError.

    Args:
        obj: the content of one 'Question' cell, normally a list.

    Returns:
        A new flat list; values that are not a list or tuple are returned unchanged.
    """
    if not isinstance(obj, (list, tuple)):
        return obj

    flat = []
    for item in obj:
        if isinstance(item, list):
            flat.extend(item)
        elif isinstance(item, dict):
            if item:
                # only the first value of each dict is used as the question text
                flat.append(next(iter(item.values())))
        else:
            flat.append(item)

    return flat

In [17]:
# flatten every 'Question' cell before the column is exploded
ground_truth['Question'] = ground_truth['Question'].apply(flatten_list)

In [18]:
# lets now explode the question column so that every list element gets its own row;
# ignore_index=True relabels the resulting rows 0..n-1 instead of repeating the source index
ground_truth = ground_truth.explode('Question', ignore_index=True)

In [None]:
# preview the exploded dataframe
ground_truth.head()

In [20]:
# defining a helper function to clean the question column in our ground truth dataset

def ques_clean(ques_string):
    """Strip a leading enumeration label from a generated question.

    The following prefixes are removed when they sit at the very start of the
    string and are followed by one whitespace character: "Q:", "Q<digits>:",
    "Q<digits>." and "<digits>.". Numbers may have more than one digit, so
    "10. ..." is cleaned just like "1. ...".

    Args:
        ques_string: the question text. Values that are not strings (for example
            a missing value) are returned unchanged instead of raising a TypeError.

    Returns:
        The question without its leading label, or the input itself if no label
        is present.
    """
    if not isinstance(ques_string, str):
        return ques_string

    pattern = r'^(?:Q:|Q[0-9]+[:.]|[0-9]+\.)\s'

    return re.sub(pattern, "", ques_string)

In [21]:
# remove leading enumeration labels from every question
ground_truth['Question'] = ground_truth['Question'].apply(ques_clean)

In [22]:
# finally write the cleaned ground-truth table to a .csv file (index=False leaves out the row index)

ground_truth.to_csv("ground-truth-data.csv", index=False)