## Setup and Imports

In [12]:
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
import csv

## Save output as CSV

In [13]:
def save_chunks_to_csv(chunks, filename='notion_help_chunks.csv'):
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Chunk Number', 'Content'])  # Write header
        for i, chunk in enumerate(chunks, 1):
            writer.writerow([i, chunk])
    print(f"Chunks saved to {filename}")

## Web Crawler Function

In [14]:
def get_all_help_articles(base_url):
    articles = []
    visited = set()

    def crawl(url):
        if url in visited:
            return
        visited.add(url)

        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Check if this is an article page
        if soup.find('article'):
            articles.append(url)

        # Find all links on the page
        for link in soup.find_all('a', href=True):
            href = link['href']
            full_url = urljoin(base_url, href)
            if full_url.startswith(base_url) and full_url not in visited:
                crawl(full_url)

    crawl(base_url)
    return articles

## Content Extraction Function

In [15]:
def extract_core_content(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    article = soup.find('article')
    if not article:
        return ""

    # Extract text from all relevant elements
    content = []
    for elem in article.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li']):
        content.append(elem.get_text().strip())

    return "\n".join(content)

## Text Chunking Function

In [16]:
def chunk_text(text, max_length=750):
    chunks = []
    current_chunk = []
    current_length = 0

    for line in text.split('\n'):
        if current_length + len(line) > max_length and current_chunk:
            chunks.append('\n'.join(current_chunk))
            current_chunk = []
            current_length = 0
        
        current_chunk.append(line)
        current_length += len(line)

    if current_chunk:
        chunks.append('\n'.join(current_chunk))

    return chunks

## Main Execution Function

In [17]:
def main():
    base_url = "https://www.notion.so/help"
    articles = get_all_help_articles(base_url)
    
    all_chunks = []
    for article_url in articles:
        content = extract_core_content(article_url)
        chunks = chunk_text(content)
        all_chunks.extend(chunks)

    # Print or save the chunks as needed
    for i, chunk in enumerate(all_chunks):
        print(f"Chunk {i + 1}:\n{chunk}\n{'='*50}\n")
    
    # Save chunks to CSV
    save_chunks_to_csv(all_chunks)
if __name__ == "__main__":
    main()

Chunk 1:
101: Introduction

Chunk 2:
Start here
Curious about what Notion is? We'll show you right here, and in the articles to come. Let's go! 📍

Chunk 3:
Start here
Curious about what Notion is? We'll show you right here, and in the articles to come. Let's go! 📍
We like to describe Notion as a set of building blocks for creating things you love to use on your computer, such as:
Documents
Documents
Databases
Databases
Public websites
Public websites
Knowledge bases
Knowledge bases
Project management systems
Project management systems
The world's most beautiful notes... 😉
The world's most beautiful notes... 😉
Notion is different from other software in a few ways. And once you master these basics, you can pretty much build whatever you want.
Most importantly, don't worry about not knowing everything you can do right away. We'll discover it together. Click below to dive right in!


Chunk 4:
What is a block?
Think of Notion as a bottomless bin of building blocks. Build whatever you want, 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

