In [59]:
import requests
import hashlib
from collections import defaultdict
import json
import os
from dotenv import load_dotenv

from openai import OpenAI
from pathlib import Path
from tqdm.auto import tqdm

In [1]:

# Download the FAQ JSON from GitHub. The payload is a list of courses, each
# with a 'course' name and a list of 'documents'.
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail here on an HTTP error status instead of with an opaque JSON decode
# error when the error page is parsed below.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course structure into one list, tagging every document
# with the name of the course it belongs to.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [29]:
def generate_document_id(doc):
    """Return a short, deterministic id for a FAQ record.

    The id is the first 8 hex characters of the MD5 digest of
    "<course>-<question>-<first 10 characters of text>".

    Args:
        doc: mapping with 'course', 'question' and 'text' keys.

    Returns:
        An 8-character lowercase hexadecimal string.
    """
    # The text prefix is part of the key, presumably so that records sharing
    # a course and question can still receive different ids.
    fingerprint = "{}-{}-{}".format(doc['course'], doc['question'], doc['text'][:10])
    digest = hashlib.md5(fingerprint.encode()).hexdigest()
    return digest[:8]

In [30]:
# Attach an 'id' key to every document in place.
for doc in documents:
    doc.update(id=generate_document_id(doc))

In [31]:
# Spot-check a single record (index 5), including its 'id' key.
documents[5]

{'text': "There are 3 Zoom Camps in a year, as of 2024. However, they are for separate courses:\nData-Engineering (Jan - Apr)\nMLOps (May - Aug)\nMachine Learning (Sep - Jan)\nThere's only one Data-Engineering Zoomcamp “live” cohort per year, for the certification. Same as for the other Zoomcamps.\nThey follow pretty much the same schedule for each cohort per zoomcamp. For Data-Engineering it is (generally) from Jan-Apr of the year. If you’re not interested in the Certificate, you can take any zoom camps at any time, at your own pace, out of sync with any “live” cohort.",
 'section': 'General course-related questions',
 'question': 'Course - how many Zoomcamps in a year?',
 'course': 'data-engineering-zoomcamp',
 'id': '2ed9b986'}

In [32]:
# Bucket the documents by id; a bucket holding more than one document
# means two documents were given the same id.
hashes = defaultdict(list)

for doc in documents:
    hashes[doc['id']].append(doc)

In [33]:
# Distinct ids vs. total documents; a smaller first number means some ids collide.
len(hashes), len(documents)

(947, 948)

In [34]:
# Print each id shared by more than one document, with the number of documents.
for doc_id, group in hashes.items():
    if len(group) <= 1:
        continue
    print(doc_id, len(group))

593f7569 2


In [35]:
# Inspect the documents that share the id '593f7569'.
hashes['593f7569']

[{'text': "They both do the same, it's just less typing from the script.\nAsked by Andrew Katoch, Added by Edidiong Esu",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'},
 {'text': "They both do the same, it's just less typing from the script.",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'}]

In [40]:
# Persist the documents (including their ids) as JSON indented by 2 spaces.
serialized_documents = json.dumps(documents, indent=2)
with open('document_with_ids.json', 'wt') as f_out:
    f_out.write(serialized_documents)

In [41]:
# Preview the first lines of document_with_ids.json.
!head document_with_ids.json

[
  {
    "text": "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  \u201cOffice Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon\u2019t forget to register in DataTalks.Club's Slack and join the channel.",
    "section": "General course-related questions",
    "question": "Course - When will the course start?",
    "course": "data-engineering-zoomcamp",
    "id": "c02e79ef"
  },
  {
    "text": "GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites",


In [50]:
from openai import OpenAI


# Resolving the path through `__file__` does not work in a Jupyter notebook,
# so the .env file is looked up in the parent of the current working directory.
cwd = os.getcwd()
parent_dir = Path(cwd).parent
env_path = parent_dir.joinpath('.env')

# Load the variables from that .env file, then build the client from the key.
load_dotenv(dotenv_path=env_path)
key = os.environ.get('OPENAI_API_KEY')
client = OpenAI(api_key=key)

In [51]:
# Prompt template filled in once per FAQ record. The {section}, {question}
# and {text} placeholders are substituted with str.format, and the model is
# asked to answer with a JSON array of five questions. Leading and trailing
# whitespace of the template is removed by .strip().
prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [52]:
# Render the prompt for one record (index 2) and print it for inspection.
# Keys of the record that the template does not reference are ignored.
doc = documents[2]
prompt = prompt_template.format_map(doc)
print(prompt)

You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]


In [55]:
# Send the rendered prompt as a single user message and keep the text of
# the first returned choice.
response = client.chat.completions.create(
    model='gpt-4o-mini',
    messages=[dict(role="user", content=prompt)],
)

json_response = response.choices[0].message.content
json_response


'["Is it possible to enroll in the course once it has already started?", "What happens if I miss the registration deadline for the course?", "Am I allowed to submit homework even if I join late?", "Are there specific deadlines I need to remember for the final projects?", "What should I keep in mind regarding project submissions as the course progresses?"]'

In [56]:
# Parse the reply string as JSON; this raises if the reply is not valid JSON.
json.loads(json_response)

['Is it possible to enroll in the course once it has already started?',
 'What happens if I miss the registration deadline for the course?',
 'Am I allowed to submit homework even if I join late?',
 'Are there specific deadlines I need to remember for the final projects?',
 'What should I keep in mind regarding project submissions as the course progresses?']

In [66]:
def generate_questions(doc, model='gpt-4o-mini'):
    """Ask the chat model for questions about one FAQ record.

    Args:
        doc: mapping providing the 'section', 'question' and 'text' values
            that fill the placeholders of `prompt_template`; extra keys
            are ignored.
        model: name of the chat model to query. Defaults to 'gpt-4o-mini'.

    Returns:
        The message content of the first choice, unparsed. The prompt asks
        for a JSON array of questions, but nothing here validates the reply.
    """
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content

In [67]:
# Raw model replies keyed by document id. Defined in its own cell, presumably
# so that re-running the generation loop does not discard collected replies.
results = {}

In [68]:

# Query the model once per document id. Ids already present in `results`
# are skipped, so no id is requested twice.
for doc in tqdm(documents):
    doc_id = doc['id']
    if doc_id not in results:
        results[doc_id] = generate_questions(doc)

100%|████████████████████████████████████████████████████████████████████████████████| 948/948 [28:50<00:00,  1.83s/it]


In [69]:
# Decode every raw reply into a Python object. json.loads raises on the
# first reply that is not valid JSON, leaving `doc_id` set to its id.
parsed_results = {}

for doc_id, raw_reply in results.items():
    decoded = json.loads(raw_reply)
    parsed_results[doc_id] = decoded

In [72]:
# Look-up table from document id to document. When two documents share an
# id, the later one replaces the earlier one.
doc_index = dict((d['id'], d) for d in documents)

In [74]:
# Flatten to one (question, course, document id) row per generated question.
final_results = []

for doc_id, questions in parsed_results.items():
    course = doc_index[doc_id]['course']
    final_results.extend((q, course, doc_id) for q in questions)

In [76]:
import pandas as pd

In [78]:
# One row per generated question; the 'document' column holds the document id.
df = pd.DataFrame(final_results, columns= ['question', 'course', 'document'])

In [79]:
# Write the rows to CSV without the DataFrame index column.
df.to_csv('ground_truth_data.csv', index=False)

In [80]:
# Preview the first lines of ground_truth_data.csv.
!head ground_truth_data.csv

question,course,document
What is the starting date and time for the course?,data-engineering-zoomcamp,c02e79ef
How do I subscribe to the course calendar?,data-engineering-zoomcamp,c02e79ef
Where can I find the registration link for the course?,data-engineering-zoomcamp,c02e79ef
Is there a Telegram channel for course announcements?,data-engineering-zoomcamp,c02e79ef
Do I need to register in DataTalks.Club's Slack?,data-engineering-zoomcamp,c02e79ef
What are the necessary qualifications to enroll in this course?,data-engineering-zoomcamp,1f6520ca
Is there a specific skill set required before taking this course?,data-engineering-zoomcamp,1f6520ca
Can you provide details on what prior knowledge is needed for this course?,data-engineering-zoomcamp,1f6520ca
Are there any recommended skills or tools to learn before starting this course?,data-engineering-zoomcamp,1f6520ca
