### Importing the necessary libraries

In [2]:
import json
import warnings
import hashlib

# NOTE(review): with no category/module filter this suppresses every Python
# warning raised after this point, including deprecation warnings.
warnings.filterwarnings("ignore")

### Data Loading and Preprocessing

In [3]:
# Read the raw JSON export from disk.
with open('documents.json', 'rb') as file:
    docs_raw = json.load(file)

# Flatten it: emit one record per entry in each course's 'documents' list,
# carrying the parent's 'course' value along with the entry's own fields.
documents = []
for course_dict in docs_raw:
    for entry in course_dict['documents']:
        documents.append({
            'course': course_dict['course'],
            'section': entry['section'],
            'question': entry['question'],
            'text': entry['text'],
        })

In [6]:
# Display the first flattened record to inspect its fields.
documents[0]

{'course': 'data-engineering-zoomcamp',
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel."}

### Generating Stable IDs for documents

This ID is used to reference each document during the evaluation process.

In [23]:
def generate_doc_id(doc: dict) -> str:
    """Build a short, deterministic ID from a document's field values.

    The values of ``doc`` (in the dict's iteration order) are joined with "-",
    hashed with MD5, and the first 8 hexadecimal characters of the digest are
    returned.

    An existing ``'id'`` entry is left out of the hash, so a document that
    already carries a generated ID maps to the same ID again; hashing it too
    would yield a different value every time the function is re-applied to an
    already-updated document. Values are passed through ``str`` so that a
    non-string field does not make the join raise ``TypeError``.

    Args:
        doc: Mapping of field name to value describing one document.

    Returns:
        The first 8 characters of the MD5 hex digest of the joined values.
    """
    # concatenate every field value except a previously generated 'id'
    combined = "-".join(str(value) for key, value in doc.items() if key != "id")

    # encode the string to bytes, hash it, and render the digest as hexadecimal
    digest = hashlib.md5(combined.encode()).hexdigest()

    # keep only the first 8 hexadecimal characters; the truncation trades
    # collision resistance for a shorter ID
    return digest[:8]

In [25]:
# Attach an 'id' to every document. The dicts are updated in place, so
# `documents` and `documents_updated` hold the same dict objects.
# The hash is computed over a copy without any existing 'id', so re-running
# this cell reproduces the same IDs instead of hashing the previous one.
for doc in documents:
    fields_to_hash = {key: value for key, value in doc.items() if key != 'id'}
    doc['id'] = generate_doc_id(fields_to_hash)

documents_updated = list(documents)

documents_updated[0]

{'course': 'data-engineering-zoomcamp',
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'id': '7000acaa'}