In [1]:
import xml.etree.ElementTree as ET
import time

import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from tqdm import tqdm

from langchain_community.document_loaders import WebBaseLoader

In [2]:
# Blog index page; this is the address the Selenium WebDriver opens.
URL = "https://outerbounds.com/blog"

In [11]:
# Set up the WebDriver: a Chrome session configured through ChromeOptions.
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # Runs Chrome in headless mode.
driver = webdriver.Chrome(options=options)

# Open the webpage at URL in that session.
driver.get(URL)

# Function to scroll the page in increments
def incremental_scroll(n=25, browser=None, pause=1.0):
    """Scroll the page down in small steps until its height stops growing.

    Each round reads ``document.body.scrollHeight``, scrolls down by that
    height split into ``n`` equal steps, and then re-reads the height. The
    loop ends after the first round in which the height did not change.

    Args:
        n: Number of scroll steps per round. Must be at least 1.
        browser: Object exposing ``execute_script(script)``. When omitted,
            the notebook-level ``driver`` is used.
        pause: Total seconds to wait per round; each step waits ``pause / n``.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    if browser is None:
        browser = driver  # fall back to the WebDriver created at notebook level

    last_height = browser.execute_script("return document.body.scrollHeight")
    while True:
        # One step is 1/n of the current page height, so n steps cover the page.
        step = browser.execute_script("return document.body.scrollHeight") / n
        for _ in range(n):
            browser.execute_script(f"window.scrollBy(0, {step});")
            time.sleep(pause / n)  # Wait after each increment
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break  # Height unchanged after a full round: nothing more was added
        last_height = new_height

# Scroll through the page so that any content added while scrolling is present,
# then snapshot the DOM. The browser is shut down in `finally` so that a failure
# while scrolling or parsing does not leave a headless Chrome process running.
try:
    incremental_scroll()
    # Parse page source with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    driver.quit()

# `soup` was built from the page-source string, so it stays usable after quit().
articles = soup.find_all('article')

# Print the text of every <article> element that was found.
for article in articles:
    print(article.text)  # Or any processing you need

Free Live Courses: Full-Stack Machine Learning and Building Systems with LLMsJoin Hugo Bowne-Anderson for two live courses on building full-stack machine learning systems.March 28, 2024  · 3 min readThe Outerbounds Team
New in Outerbounds: Access NVIDIA Cloud GPUsYou can now apply to access cloud GPUs directly in NVIDIA's DGX Cloud through Outerbounds - just add @nvidia to access GPUs at scale!March 18, 2024  · 4 min readThe Outerbounds Team
Develop ML/AI systems easily on all cloudsOuterbounds is now available on all cloud marketplaces. Learn why and how to develop ML/AI systems quickly without cloud lock-in.March 15, 2024  · 9 min readThe Outerbounds Team
Spring Tour 2024Outerbounds is going global in 2024. This page shows our event calendar for spring 2024.March 12, 2024  · 5 min readThe Outerbounds Team
The 6 Steps to Cost-Optimized ML/AI/Data WorkloadsWe share a six-step process for understanding cloud costs, attributing, and minimizing them. The process is conveniently integrated

In [16]:
# Bind the first parsed <article> element to its own name.
article = articles[0]

In [24]:
# Display the list of parsed <article> elements.
articles

[<article itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header class="header_slCM"><a href="/blog/announcing-full-stack-ml-courses/" itemprop="url"><img src="/blog/cover/announcing-full-stack-ml-courses.jpg"/></a><div class="headerContent_JF9x"><div><h2 itemprop="headline"><a href="/blog/announcing-full-stack-ml-courses/" itemprop="url">Free Live Courses: Full-Stack Machine Learning and Building Systems with LLMs</a></h2><p>Join Hugo Bowne-Anderson for two live courses on building full-stack machine learning systems.</p><time datetime="2024-03-28T01:25:58.000Z" itemprop="datePublished">March 28, 2024<!-- --> <!-- --> · 3 min read</time></div><div class="authors_Co8h"><div class="avatar"><a class="avatar__photo-link" href="https://outerbounds.com/" rel="noopener noreferrer" target="_blank"><img alt="The Outerbounds Team" class="avatar__photo" src="https://github.com/outerbounds.png"/></a><div class="avatar__intro" itemprop="author" itemscope="" itemtype="htt

In [32]:
# Absolute post URLs: the site origin followed by the href of the first <a>
# found inside each article.
urls = [
    "https://outerbounds.com" + post.find('a')['href']
    for post in articles
]

In [33]:
# Number of URLs collected.
len(urls)

75

In [34]:
# First collected URL.
urls[0]

'https://outerbounds.com/blog/announcing-full-stack-ml-courses/'

In [35]:
# Hand the first five URLs to LangChain's WebBaseLoader; `docs` holds whatever load() returns.
loader = WebBaseLoader(urls[:5])
docs = loader.load()

In [37]:
# Print the first loaded document.
print(docs[0])

page_content='\n\n\n\n\nFree Live Courses: Full-Stack Machine Learning and Building Systems with LLMs | Outerbounds\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSkip to main contentOuterboundsTry Metaflow SandboxStart for FreeOuterboundsPlatformMetaflowPricingResourcesData ScienceEngineeringMoreBlogAbout UsCareersSlackGithubYouTube← Back to main menuBlogFree Live Courses: Full-Stack Machine Learning and Building Systems with LLMsNews and UpdatesCommunityBest PracticesMarch 28, 2024  · 3 min readThe Outerbounds TeamFollowing the popular hands-on courses and workshops that we taught last year at SciPy, ODSC, and Uplimit, today we are announcing two new live courses that you can join online for free!The courses are taught by Hugo Bowne-Anderson whose courses on DataCamp and other platforms have been followed by over 4 million learners worldwide. The space is limited, so sign up to one or both of the courses below!Course 1: Full-Stack Machine Learning with MetaflowIn this course, you’ll learn how to\u200b

# Load sitemap

In [3]:
# Address of the site's XML sitemap.
sitemap='https://outerbounds.com/sitemap.xml'

In [4]:
# Download the sitemap. The timeout keeps the cell from hanging on a stalled
# connection, and raise_for_status() stops on an HTTP error instead of letting
# an error page be treated as sitemap XML.
response = requests.get(sitemap, timeout=30)
response.raise_for_status()

In [5]:
# Parse the sitemap XML and keep the text of every <loc> entry that contains
# 'blog', 'docs' or 'engineering'.
namespaces = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
sitemap_content = response.content
root = ET.fromstring(sitemap_content)
urls = [
    loc.text
    for loc in root.findall('ns:url/ns:loc', namespaces=namespaces)
    if any(section in loc.text for section in ('blog', 'docs', 'engineering'))
]

In [6]:
# Field names of the scraped dataset; the scraping loop keeps one list per field.
FIELDS = ['title', 'content', 'url', 'date', 'author', 'tags', 'canonical', 'alternate', 'description']

In [152]:
def _meta_content(head, prop, url, required=False):
    """Return the ``content`` attribute of ``<meta property=prop>`` inside ``head``.

    Args:
        head: The parsed ``<head>`` element, or None when the page has none.
        prop: Value of the ``property`` attribute to look up, e.g. ``"og:title"``.
        url: Address of the page; used only in the error message.
        required: When true, a missing tag raises instead of yielding ``pd.NA``.

    Returns:
        The tag's ``content`` attribute (None if the tag lacks that attribute),
        or ``pd.NA`` when the tag is absent and ``required`` is false.

    Raises:
        ValueError: If the tag is absent and ``required`` is true.
    """
    tag = head.find('meta', attrs={"property": prop}) if head is not None else None
    if tag is None:
        if required:
            # The message names the property that is actually missing.
            raise ValueError(f"{prop} not found for {url}")
        return pd.NA
    return tag.get('content')


# Scrape every URL into `data`, a dict holding one list per entry of FIELDS.
data = {field: [] for field in FIELDS}
for url in tqdm(urls):
    # Bound the wait per request and stop on HTTP errors rather than scraping
    # an error page as if it were content.
    page_response = requests.get(url, timeout=30)
    page_response.raise_for_status()
    page_content = page_response.content
    soup = BeautifulSoup(page_content, 'html.parser')
    head = soup.find('head')

    # Open Graph title and description are mandatory for every page.
    title = _meta_content(head, "og:title", url, required=True)
    description = _meta_content(head, "og:description", url, required=True)

    canonical = head.find('link', {'rel': 'canonical'})['href']
    alternate = head.find('link', {'rel': 'alternate'})['href']

    if 'blog' in url:
        # Pages without a post body (e.g. listing pages) get pd.NA as content.
        post = soup.find('div', id="post-content", class_="markdown firstLetter__Rps")
        content = str(post) if post is not None else pd.NA
        tags = _meta_content(head, "article:tag", url)
        author = _meta_content(head, "article:author", url)
        publish_date = _meta_content(head, "article:published_time", url)
    elif 'docs' in url or 'engineering' in url:
        # NOTE(review): recorded output shows <main class="docMainContainer_gTbr">,
        # which suggests this name is a class rather than an id, so the <main>
        # fallback is probably the lookup that matches - confirm.
        container = soup.find('div', id="docMainContainer_gTbr")
        if container is None:
            container = soup.find('main')
        if container is None:
            raise ValueError(f"docMainContainer_gTbr not found for {url}")
        content = str(container)
        tags = ','.join([a_tag.text for a_tag in soup.find_all('a', class_="tag_zVej tagRegular_sFm0")])
        author = pd.NA
        publish_date = pd.NA
    else:
        # Without this branch the previous page's content/tags/author/date
        # would be appended again for a URL of an unknown kind.
        raise ValueError(f"unrecognised page type for {url}")

    record = {
        'title': title,
        'content': content,
        'description': description,
        'url': url,
        'date': publish_date,
        'author': author,
        'canonical': canonical,
        'alternate': alternate,
        'tags': tags,
    }
    for field in FIELDS:
        data[field].append(record[field])

100%|██████████| 216/216 [00:22<00:00,  9.55it/s]


In [153]:
# Build a DataFrame from the per-field lists and preview its first rows.
df = pd.DataFrame(data)
df.head()

Unnamed: 0,title,content,url,date,author,tags,canonical,alternate
0,Building the ML-driven future | Outerbounds,,https://outerbounds.com/blog/,,,,https://outerbounds.com/blog/,https://outerbounds.com/blog/
1,Free Live Courses: Full-Stack Machine Learning...,"<div class=""markdown firstLetter__Rps"" id=""pos...",https://outerbounds.com/blog/announcing-full-s...,2024-03-28T01:25:58.000Z,https://outerbounds.com/,"infrastructure,machine learning,platform",https://outerbounds.com/blog/announcing-full-s...,https://outerbounds.com/blog/announcing-full-s...
2,Announcing the Metaflow Card Viewer | Outerbounds,"<div class=""markdown firstLetter__Rps"" id=""pos...",https://outerbounds.com/blog/announcing-metafl...,2023-04-13T19:08:08.000Z,https://outerbounds.com/,"sandbox,machine learning,data visualization",https://outerbounds.com/blog/announcing-metafl...,https://outerbounds.com/blog/announcing-metafl...
3,Announcing The Outerbounds Platform | Outerbounds,"<div class=""markdown firstLetter__Rps"" id=""pos...",https://outerbounds.com/blog/announcing-outerb...,2023-01-31T13:25:58.000Z,https://outerbounds.com/,"infrastructure,machine learning,platform",https://outerbounds.com/blog/announcing-outerb...,https://outerbounds.com/blog/announcing-outerb...
4,Building the ML-driven future | Outerbounds,,https://outerbounds.com/blog/archive/,,,,https://outerbounds.com/blog/,https://outerbounds.com/blog/


In [158]:
# Count the rows whose content is missing.
df.content.isna().sum()

2

In [160]:
# URLs of the rows whose content is missing.
df[df.content.isna()].url.values

array(['https://outerbounds.com/blog/',
       'https://outerbounds.com/blog/archive/'], dtype=object)

In [162]:
# Five randomly chosen content values (no seed is set, so the selection varies between runs).
df.content.sample(5)

12     <div class="markdown firstLetter__Rps" id="pos...
181    <main class="docMainContainer_gTbr"><div class...
92     <main class="docMainContainer_gTbr"><div class...
106    <main class="docMainContainer_gTbr"><div class...
83     <main class="docMainContainer_gTbr"><div class...
Name: content, dtype: object

In [171]:
# Pick one random page body. Missing values are dropped first so that the
# sample is always an HTML string and never pd.NA.
sample = df.content.dropna().sample(1).values[0]

In [172]:
# Render the sampled HTML inline. `display` and `HTML` come from the public
# IPython.display module; importing them from IPython.core.display is
# deprecated and emits a warning.
from IPython.display import display, HTML
display(HTML(sample))

  from IPython.core.display import display, HTML
