In [1]:
# step 1
import requests
from bs4 import BeautifulSoup

# Step 1: Scrape all the Help Articles from the Notion help center

# URL of the Notion Help Center
base_url = 'https://www.notion.so/help'

def get_help_articles_links(base_url):
    response = requests.get(base_url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        # Find all links to the help articles
        article_links = soup.find_all('a', href=True)
        
        # Filter and get only the links that are help articles
        help_article_links = []
        for link in article_links:
            href = link['href']
            if '/help/' in href and 'notion.so' not in href:  # Ensure it's an internal help article link
                full_link = f'https://www.notion.so{href}'
                help_article_links.append(full_link)
        
        return help_article_links
    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return []

# Get all the help article links
help_articles_links = get_help_articles_links(base_url)

# Print the links to verify
for i, link in enumerate(help_articles_links, 1):
    print(f"Article {i}: {link}")


Article 1: https://www.notion.so/help/reference
Article 2: https://www.notion.so/help/guides
Article 3: https://www.notion.so/help/reference
Article 4: https://www.notion.so/help/category/new-to-notion
Article 5: https://www.notion.so/help/category/new-to-notion
Article 6: https://www.notion.so/help/start-here
Article 7: https://www.notion.so/help/what-is-a-block
Article 8: https://www.notion.so/help/create-your-first-page
Article 9: https://www.notion.so/help/category/new-to-notion
Article 10: https://www.notion.so/help/category/meet-your-workspace
Article 11: https://www.notion.so/help/category/meet-your-workspace
Article 12: https://www.notion.so/help/intro-to-workspaces
Article 13: https://www.notion.so/help/navigate-with-the-sidebar
Article 14: https://www.notion.so/help/create-delete-and-switch-workspaces
Article 15: https://www.notion.so/help/category/meet-your-workspace
Article 16: https://www.notion.so/help/category/write-edit-and-customize
Article 17: https://www.notion.so/he

In [7]:
# step 1
def process_help_articles_links(links):
    # Step 1: Remove duplicate links
    unique_links = list(set(links))
    
    # Step 2: Filter out notion-academy links
    filtered_links = [link for link in unique_links if not link.startswith('https://www.notion.so/help/notion-academy/')]

    # Step 3: Categorize links into reference, guides, and category
    reference_links = []
    guides_links = []
    category_links = []

    for link in filtered_links:
        if link.startswith('https://www.notion.so/help/category/'):
            category_links.append(link)
        elif link.startswith('https://www.notion.so/help/guides/'):
            guides_links.append(link)
        elif link.startswith('https://www.notion.so/help/'):
            if link not in ["https://www.notion.so/help/reference","https://www.notion.so/help/guides","https://www.notion.so/help/notion-academy"]:
                reference_links.append(link)

    return reference_links, guides_links, category_links

# Process the links
reference_links, guides_links, category_links = process_help_articles_links(help_articles_links)

# Print the results to verify
print("Reference Links:")
for link in reference_links:
    print(link)

print("\nGuides Links:")
for link in guides_links:
    print(link)

print("\nCategory Links:")
for link in category_links:
    print(link)


Reference Links:
https://www.notion.so/help/comments-mentions-and-reminders
https://www.notion.so/help/appearance-settings
https://www.notion.so/help/back-up-your-data
https://www.notion.so/help/qna
https://www.notion.so/help/monday
https://www.notion.so/help/import-data-into-notion
https://www.notion.so/help/upgrade-or-downgrade-your-plan
https://www.notion.so/help/notion-calendar-integrations
https://www.notion.so/help/synced-blocks
https://www.notion.so/help/database-templates
https://www.notion.so/help/images-files-and-media
https://www.notion.so/help/create-your-first-page
https://www.notion.so/help/columns-headings-and-dividers
https://www.notion.so/help/notion-for-startups
https://www.notion.so/help/audit-log
https://www.notion.so/help/how-to-protect-yourself-from-malvertising
https://www.notion.so/help/start-with-a-template
https://www.notion.so/help/notion-calendar-keyboard-shortcuts
https://www.notion.so/help/what-is-a-database
https://www.notion.so/help/notion-ai-connectors-

In [10]:
# step 2; test version, dont run this one
import requests
from bs4 import BeautifulSoup

def extract_article_content(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Extract the article title
        title_tag = soup.find('h1', class_='title_title__DWL5N title_titleSizeL__4C9l9 title_titleWeightBold__838EK title_titleFamilyInter__Ra6_Q title_titleColorDark__Pqy5I')
        title = f"title: {title_tag.get_text(separator=' ', strip=True)}" if title_tag else "title: No Title"
        
        # Extract the main content
        main_content_tag = soup.find('h2', class_='helpArticle_helpArticlePrologueCopy__0cmaN')
        main_content = f"main content: {main_content_tag.get_text(separator=' ', strip=True)}" if main_content_tag else "main content:"
        
        # Collect all text content
        text_content = [title, main_content]
        
        # Initialize variables to track current section
        current_h2 = None
        current_h3 = None
        
        # Iterate over all elements that could be relevant
        for element in soup.find_all(['h2', 'h3', 'p', 'ul']):
            if element.name == 'h2' and 'title_titleSizeM__e46NM' in element['class']:
                current_h2 = element.get_text(separator=" ", strip=True)
                current_h3 = None  # Reset h3 when a new h2 is found
                text_content.append(f"H2: {current_h2}")
            elif element.name == 'h3' and 'title_titleSizeS__om4Io' in element['class']:
                current_h3 = element.get_text(separator=" ", strip=True)
                text_content.append(f"  H3: {current_h3}")
            elif element.name == 'p' and current_h2:
                text_content.append(f"  Paragraph: {element.get_text(separator=' ', strip=True)}")
            elif element.name == 'ul' and current_h2:
                # For lists, iterate over each list item
                for li in element.find_all('li'):
                    list_item = li.get_text(separator=" ", strip=True)
                    text_content.append(f"    Paragraph: {list_item}")

        return "\n".join(text_content)
    else:
        print(f"Failed to retrieve the article. Status code: {response.status_code} for URL: {url}")
        return ""

# Extract content from all reference links
reference_articles_content = []

for link in reference_links:
    content = extract_article_content(link)
    if content:
        reference_articles_content.append(content)

# Print the extracted content to verify
for i, content in enumerate(reference_articles_content, 1):
    print(f"\nArticle {i} Content:\n")
    print(content)
    print("\n" + "="*80 + "\n")



Article 1 Content:

title: Comments, mentions & reminders
main content: There are several ways to communicate with your teammates in Notion. Our collaboration tools help you work with others asynchronously and remember important deadlines 💬
H2: Comments
  H3: Top-level page discussions
  Paragraph: If you want to give or get high-level feedback on your page, you can leave a comment at the top of it to start a discussion.
    Paragraph: Hover over the top of any page and click Add comment .
    Paragraph: @-mention colleagues to refer to them or bring them into the conversation.
    Paragraph: You can edit or delete comments. Just hover over them to edit, delete, or resolve them.
    Paragraph: You can always re-open resolved comments by clicking the # resolved comments button at the top of the page.
  Paragraph: Hover over the top of any page and click Add comment .
  Paragraph: @-mention colleagues to refer to them or bring them into the conversation.
  Paragraph: You can edit or del

In [12]:
# step 2
import requests
from bs4 import BeautifulSoup

def extract_article_content(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Extract the article title
        title_tag = soup.find('h1', class_='title_title__DWL5N title_titleSizeL__4C9l9 title_titleWeightBold__838EK title_titleFamilyInter__Ra6_Q title_titleColorDark__Pqy5I')
        title = f"title: {title_tag.get_text(separator=' ', strip=True)}" if title_tag else "title: No Title"
        
        # Extract the main content
        main_content_tag = soup.find('h2', class_='helpArticle_helpArticlePrologueCopy__0cmaN')
        main_content = f"main content: {main_content_tag.get_text(separator=' ', strip=True)}" if main_content_tag else "main content:"
        
        # Collect all text content
        text_content = [title, main_content]
        
        # Initialize variables to track current section
        current_h2 = None
        current_h3 = None
        paragraph_content = []
        
        # Iterate over all elements that could be relevant
        for element in soup.find_all(['h2', 'h3', 'p', 'ul']):
            if element.name == 'h2' and 'title_titleSizeM__e46NM' in element['class']:
                if current_h3 and paragraph_content:
                    text_content.append(f"  Paragraph: {' '.join(paragraph_content)}")
                    paragraph_content = []
                current_h2 = element.get_text(separator=" ", strip=True)
                text_content.append(f"H2: {current_h2}")
            elif element.name == 'h3' and 'title_titleSizeS__om4Io' in element['class']:
                if current_h3 and paragraph_content:
                    text_content.append(f"  Paragraph: {' '.join(paragraph_content)}")
                    paragraph_content = []
                current_h3 = element.get_text(separator=" ", strip=True)
                text_content.append(f"  H3: {current_h3}")
            elif element.name == 'p' and current_h2:
                paragraph_content.append(element.get_text(separator=" ", strip=True))
            elif element.name == 'ul' and current_h2:
                if paragraph_content:
                    text_content.append(f"  Paragraph: {' '.join(paragraph_content)}")
                    paragraph_content = []
                text_content.append("  list:")
                for li in element.find_all('li'):
                    list_item = li.get_text(separator=" ", strip=True)
                    text_content.append(f"    - {list_item}")
        
        # Append any remaining paragraph content
        if paragraph_content:
            text_content.append(f"  Paragraph: {' '.join(paragraph_content)}")
        
        return "\n".join(text_content)
    else:
        print(f"Failed to retrieve the article. Status code: {response.status_code} for URL: {url}")
        return ""

# Extract content from all reference links
reference_articles_content = []

for link in reference_links:
    content = extract_article_content(link)
    if content:
        reference_articles_content.append(content)

# Print the extracted content to verify
for i, content in enumerate(reference_articles_content, 1):
    print(f"\nArticle {i} Content:\n")
    print(content)
    print("\n" + "="*80 + "\n")



Article 1 Content:

title: Comments, mentions & reminders
main content: There are several ways to communicate with your teammates in Notion. Our collaboration tools help you work with others asynchronously and remember important deadlines 💬
H2: Comments
  H3: Top-level page discussions
  Paragraph: If you want to give or get high-level feedback on your page, you can leave a comment at the top of it to start a discussion.
  list:
    - Hover over the top of any page and click Add comment .
    - @-mention colleagues to refer to them or bring them into the conversation.
    - You can edit or delete comments. Just hover over them to edit, delete, or resolve them.
    - You can always re-open resolved comments by clicking the # resolved comments button at the top of the page.
  Paragraph: Hover over the top of any page and click Add comment . @-mention colleagues to refer to them or bring them into the conversation. You can edit or delete comments. Just hover over them to edit, delete, or

In [15]:
# step3
# I am actually a little bit confused about the requirements
# to keep the headers in contents in the same chunk
# so I just keep every list and sentences in the same chunk and max len is 750
# if you require to put all contents inside one h3 title in same chunk 
# it is not implemented here but easy to change to
# article_chunks = [] is the final result
def split_into_chunks(content, max_length=750):
    chunks = []
    current_chunk = []

    # Track the current length of the chunk
    current_length = 0
    
    # Split the content by lines
    lines = content.split('\n')
    
    for line in lines:
        line_length = len(line)
        
        # Check if adding this line would exceed the max length
        if current_length + line_length > max_length:
            # If it would, finish the current chunk and start a new one
            chunks.append('\n'.join(current_chunk))
            current_chunk = [line]
            current_length = line_length
        else:
            # Otherwise, just add the line to the current chunk
            current_chunk.append(line)
            current_length += line_length
    
    # Add any remaining content as the last chunk
    if current_chunk:
        chunks.append('\n'.join(current_chunk))
    
    return chunks

# Split each article's content into chunks
article_chunks = []

for content in reference_articles_content:
    chunks = split_into_chunks(content)
    article_chunks.extend(chunks)

# Print the chunks to verify
for i, chunk in enumerate(article_chunks, 1):
    print(f"\nChunk {i}:\n")
    print(chunk)
    print("\n" + "="*80 + "\n")




Chunk 1:

title: Comments, mentions & reminders
main content: There are several ways to communicate with your teammates in Notion. Our collaboration tools help you work with others asynchronously and remember important deadlines 💬
H2: Comments
  H3: Top-level page discussions
  Paragraph: If you want to give or get high-level feedback on your page, you can leave a comment at the top of it to start a discussion.
  list:
    - Hover over the top of any page and click Add comment .
    - @-mention colleagues to refer to them or bring them into the conversation.
    - You can edit or delete comments. Just hover over them to edit, delete, or resolve them.



Chunk 2:

    - You can always re-open resolved comments by clicking the # resolved comments button at the top of the page.
  Paragraph: Hover over the top of any page and click Add comment . @-mention colleagues to refer to them or bring them into the conversation. You can edit or delete comments. Just hover over them to edit, delete,