In [12]:
import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
# Bound the wait so a stalled connection cannot hang the kernel, and fail
# loudly on an HTTP error instead of handing an error page to the JSON parser.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course groups into a single list, tagging every record with
# the name of the course it was listed under.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [13]:
import hashlib

def generate_doc_id(doc):
    """Return a short identifier derived from a document's content.

    The course name, the question and the first 30 characters of the answer
    text are joined with '-' and hashed with MD5; the first 8 hex digits of
    the digest are returned. Two documents that agree on all three parts
    therefore receive the same id, even if their texts differ further on.
    """
    key_material = "{}-{}-{}".format(doc['course'], doc['question'], doc['text'][:30])
    digest = hashlib.md5(key_material.encode()).hexdigest()
    return digest[:8]

In [14]:
from tqdm.auto import tqdm

In [15]:
# Attach a content-derived 'id' to every document, in place.
for doc in tqdm(documents):
    doc.update(id=generate_doc_id(doc))

100%|██████████| 948/948 [00:00<00:00, 173990.29it/s]


In [16]:
# Spot-check one record to confirm the 'course' and 'id' fields were attached.
documents[3]

{'text': "You don't need it. You're accepted. You can also just start learning and submitting homework without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.",
 'section': 'General course-related questions',
 'question': 'Course - I have registered for the Data Engineering Bootcamp. When can I expect to receive the confirmation email?',
 'course': 'data-engineering-zoomcamp',
 'id': '67e0af99'}

In [18]:
from collections import defaultdict

In [20]:
# Group the documents by generated id; any id shared by more than one
# document ends up with a list longer than one.
hashes = defaultdict(list)

for doc in documents:
    hashes[doc['id']].append(doc)

In [21]:
# Distinct ids vs. total documents: a smaller first number means some ids collide.
len(hashes),len(documents)

(947, 948)

In [24]:
# Report every id that is shared by more than one document, with its count.
for hash_id, group in hashes.items():
    if len(group) < 2:
        continue
    print(hash_id, len(group))

61734b60 2


In [26]:
# Look at the records behind the colliding id. generate_doc_id only hashes the
# first 30 characters of 'text', so texts that diverge later share an id.
hashes['61734b60']

[{'text': "They both do the same, it's just less typing from the script.\nAsked by Andrew Katoch, Added by Edidiong Esu",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '61734b60'},
 {'text': "They both do the same, it's just less typing from the script.",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '61734b60'}]

In [27]:
import json

In [29]:
# Persist the documents, now carrying their 'id' field, as indented JSON.
with open('documents-with-ids.json', mode='wt') as f_out:
    json.dump(documents, fp=f_out, indent=2)

In [32]:
# Prompt sent to the model for each FAQ record. The {section}, {question} and
# {text} placeholders are filled from a document dict via str.format, and the
# model is asked to answer with a bare JSON array of five questions.
prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [36]:
import os

api_key = os.getenv('api_key')
# Show only whether a key was found; echoing the key itself would store the
# secret in the notebook's saved output.
api_key is not None

In [None]:
from openai import OpenAI
# Use the key read from the environment in the cell above rather than a
# hard-coded None. When that variable is unset this still passes None, so the
# client falls back to its own default key lookup exactly as before.
client = OpenAI(api_key=api_key)

In [43]:
def generate_questions(doc):
    """Ask the chat model for student-style questions about one FAQ record.

    The record's fields are substituted into ``prompt_template`` and sent as a
    single user message to the ``gpt-4o`` model. The text of the first choice
    is returned unchanged; it is not parsed as JSON here.
    """
    user_message = {"role": "user", "content": prompt_template.format(**doc)}
    completion = client.chat.completions.create(
        model='gpt-4o',
        messages=[user_message],
    )
    return completion.choices[0].message.content

In [44]:
from tqdm.auto import tqdm

In [45]:
# Keep answers collected by an earlier, interrupted run of this cell: every
# call is slow and billed, so only start from an empty dict when nothing has
# been gathered yet. Run `del results` first to force a clean regeneration.
if 'results' not in globals():
    results = {}

for doc in tqdm(documents):
    doc_id = doc['id']

    # Already answered (resumed run) or a second document sharing this id.
    if doc_id in results:
        continue

    questions = generate_questions(doc)
    results[doc_id] = questions

  1%|          | 8/948 [00:13<26:46,  1.71s/it]


KeyboardInterrupt: 

In [46]:
# Inspect what has been generated so far: document id -> raw reply string from the model.
results

{'61cad26f': '[\n  "When does the course officially begin?",\n  "What is the exact timetable for the first live \'Office Hours\'?",\n  "Can I subscribe to a calendar for the course schedule?",\n  "Where should I register before the course starts?",\n  "Is there an official Telegram channel for announcements?"\n]',
 '342004b3': '[\n  "Where can I find the prerequisites for this course?",\n  "What prerequisites should I check on GitHub for this course?",\n  "Does the DataTalks.Club data engineering course have any prerequisites on GitHub?",\n  "What are the course prerequisites listed on DataTalksClub GitHub for data engineering?",\n  "Where is the prerequisite information for the DataTalks.Club data engineering course available?"\n]',
 '9f7ddc2c': '["Can I join the course after it has started?",\n "Am I allowed to register late and still submit homeworks?",\n "If I don\'t register on time, can I still participate in the course activities?",\n "Is it possible to join the course post star

Ground Truth Dataset -> https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-vector-search/eval/ground-truth-data.csv