# **Downloading The FAQs**

In [2]:
# !pip install python-docx

In [7]:
# Import libraries
import io  # Import for handling binary data streams and file I/O operations.

import requests  # Import for making HTTP requests and handling responses.
import docx  # Import for working with Microsoft Word .docx files.

import json  # Import for JSON (JavaScript Object Notation) serialization and deserialization.


In [4]:
# Functions that actually do the work


def clean_line(line):
    """
    Clean up a line by stripping leading and trailing whitespace and Unicode BOM (Byte Order Mark).

    Args:
    - line (str): The line of text to clean.

    Returns:
    - str: The cleaned line.
    """
    line = line.strip()
    line = line.strip('\uFEFF')  # Strip Unicode BOM (Byte Order Mark)
    return line

def read_faq(file_id):
    """
    Read and parse an FAQ document from Google Docs exported as .docx format.

    Args:
    - file_id (str): The ID of the Google Docs file.

    Returns:
    - list: A list of dictionaries representing each FAQ entry, with keys 'text', 'section', and 'question'.
    """
    url = f'https://docs.google.com/document/d/{file_id}/export?format=docx'

    # Fetch the document from Google Docs
    response = requests.get(url)
    response.raise_for_status()  # Raise an exception for HTTP errors

    # Read the .docx content into a document object
    with io.BytesIO(response.content) as f_in:
        doc = docx.Document(f_in)

    questions = []

    question_heading_style = 'heading 2'
    section_heading_style = 'heading 1'

    heading_id = ''
    section_title = ''
    question_title = ''
    answer_text_so_far = ''

    # Iterate through paragraphs in the document
    for p in doc.paragraphs:
        style = p.style.name.lower()
        p_text = clean_line(p.text)

        if len(p_text) == 0:
            continue

        # Check for section heading style
        if style == section_heading_style:
            section_title = p_text
            continue

        # Check for question heading style
        if style == question_heading_style:
            # Store the previous answer if there is one
            answer_text_so_far = answer_text_so_far.strip()
            if answer_text_so_far != '' and section_title != '' and question_title != '':
                questions.append({
                    'text': answer_text_so_far,
                    'section': section_title,
                    'question': question_title,
                })
                answer_text_so_far = ''

            # Update the current question title
            question_title = p_text
            continue
        
        # Accumulate answer text under the current question
        answer_text_so_far += '\n' + p_text

    # Store the final answer if there is one
    answer_text_so_far = answer_text_so_far.strip()
    if answer_text_so_far != '' and section_title != '' and question_title != '':
        questions.append({
            'text': answer_text_so_far,
            'section': section_title,
            'question': question_title,
        })

    return questions


In [5]:
# Dictionary mapping FAQ document identifiers to their respective 
# Google Docs file IDs.
faq_documents = {
    'data-engineering-zoomcamp': '19bnYs80DwuUimHM65UV3sylsCn2j1vziPOwzBwQrebw',
    'machine-learning-zoomcamp': '1LpPanc33QJJ6BSsyxVg-pWNMplal84TdZtq10naIhD8',
    'mlops-zoomcamp': '12TlBfhIiKtyBv8RnsoJR6F72bkPDGEvPOItJIxaEzE0',
}


In [6]:
# Creates an empty document list
documents = []

# Iterate through each course and its corresponding Google Docs file ID
for course, file_id in faq_documents.items():
    print(course)  # Print the course identifier (optional, for debugging or monitoring progress)
    
    # Read FAQs from the Google Docs file associated with the current course
    course_documents = read_faq(file_id)
    
    # Create a dictionary entry for the current course with its FAQs
    course_entry = {
        'course': course,       # Store the course identifier
        'documents': course_documents  # Store the list of FAQs for the course
    }
    
    # Append the course entry dictionary to the documents list
    documents.append(course_entry)


data-engineering-zoomcamp
machine-learning-zoomcamp
mlops-zoomcamp


In [8]:
with open('documents.json', 'wt') as f_out:
    # Write the 'documents' list to 'documents.json' file
    # Using json.dump() to serialize 'documents' into JSON format
    # Indent=2 for pretty-printing with indentation level of 2 spaces
    json.dump(documents, f_out, indent=2)


In [9]:
# Display the first few lines of the documents.json file 
# using the 'head' command

!head documents.json

[
  {
    "course": "data-engineering-zoomcamp",
    "documents": [
      {
        "text": "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  \u201cOffice Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon\u2019t forget to register in DataTalks.Club's Slack and join the channel.",
        "section": "General course-related questions",
        "question": "Course - When will the course start?"
      },
      {
