## Scraping a website and generating its summary

In [1]:
# imports

import os
import requests
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from openai import OpenAI

# If you get an error running this cell, then please head over to the troubleshooting notebook!

In [2]:
# Load environment variables from a file called .env

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Check the key.
# The whitespace test runs BEFORE the prefix test: a key with leading spaces or tabs
# also fails startswith("sk-proj-"), so checking the prefix first would report the
# wrong problem and the whitespace hint would never be shown for that case.

if not api_key:
    print("No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!")
elif api_key.strip() != api_key:
    print("An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook")
elif not api_key.startswith("sk-proj-"):
    print("An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook")
else:
    print("API key found and looks good so far!")


API key found and looks good so far!


In [7]:
# Create the API client used by summarize_website; no key is passed explicitly here.
# NOTE(review): presumably the client reads OPENAI_API_KEY from the environment - confirm.
openai = OpenAI()

# If this doesn't work, try Kernel menu >> Restart Kernel and Clear Outputs Of All Cells, then run the cells from the top of this notebook down.
# If it STILL doesn't work (horrors!) then please see the Troubleshooting notebook in this folder for full instructions

In [8]:
# Step 1: Create your prompts
def scrape_website(url, timeout=30):
    """Download a web page and return its text content.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``. Defaults to 30.

    Returns:
        The text of the page as a single string, with the individual text
        fragments separated by newlines.

    Raises:
        Exception: If the server responds with a status code other than 200.
    """
    # Fetch the webpage; the timeout keeps a stalled server from hanging the notebook
    response = requests.get(url, timeout=timeout)

    # Check if the request was successful
    if response.status_code != 200:
        raise Exception(f"Failed to fetch the webpage: {response.status_code}")

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Remove <script> and <style> elements up front so their source can never
    # end up in the extracted text, whatever the installed bs4 version does by default
    for tag in soup(["script", "style"]):
        tag.decompose()

    # Extract the remaining text, one fragment per line
    return soup.get_text(separator='\n')

def user_prompt_for_website(message):
    """Wrap scraped page text in the instructions sent as the user message.

    Args:
        message: Text of the website to be summarized.

    Returns:
        A single string: a fixed introduction, then ``message``, then the
        fixed summarization instructions.
    """
    intro = (
        "You are looking at the contents of a website with a lot statistics regarding the various LLMs"
        "\nThe contents of this website is as follows.\n\n"
    )
    instructions = (
        "\nprovide a concise summary of the website ignoring the html and javascript and any other navigational content\n\n"
        "\nIf it includes news or announcements, then summarize these too.\n\n"
    )
    return intro + message + instructions

# Content of the "system" message; read by messages_for_website for every request
system_prompt = "you are the website summary generator"

# Step 2: Make the messages list

def messages_for_website(message):
    """Build the two-entry chat message list for a summarization request.

    Args:
        message: Text of the website to be summarized.

    Returns:
        A list with the system message (module-level ``system_prompt``)
        followed by the user message produced by ``user_prompt_for_website``.
    """
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": user_prompt_for_website(message)}
    return [system_message, user_message]

# Step 3: Call OpenAI
def summarize_website(message, model="gpt-4o-mini"):
    """Ask the chat model for a summary of the given website text.

    Args:
        message: Text of the website to be summarized.
        model: Name of the chat model to call. Defaults to "gpt-4o-mini",
            the value that was previously hard-coded, so existing callers
            are unaffected.

    Returns:
        The ``content`` of the first choice in the completion response.
    """
    response = openai.chat.completions.create(
        model=model,
        messages=messages_for_website(message),
    )
    # Only the first choice is used
    return response.choices[0].message.content

def display_summary_website(message):
    """Summarize the given website text and render the result as Markdown.

    Args:
        message: Text of the website to be summarized.

    Returns:
        None; the summary is shown in the notebook output only.
    """
    display(Markdown(summarize_website(message)))

# Scrape the leaderboard page, then summarize it and render the summary as Markdown
website_text = scrape_website("https://www.vellum.ai/llm-leaderboard")
# display_summary_website has no return statement, so `response` is None here;
# the summary itself is only displayed, not stored.
# NOTE(review): the saved output under this cell describes a personal website rather
# than an LLM leaderboard - it looks stale; re-run the cell to confirm.
response = display_summary_website(website_text)


The website belongs to Edward Donner, a co-founder and CTO of Nebula.io, where he focuses on applying AI to enhance talent discovery and engagement. He enjoys coding, experimenting with large language models (LLMs), DJing, and electronic music production. The site features various posts and resources related to LLMs and AI, including workshops and guides.

Recent announcements include:
- January 23, 2025: Resources for a workshop on LLMs and agents.
- December 21, 2024: Introduction to a community of SuperDataScientists.
- November 13, 2024: Resources for mastering AI and LLM engineering.
- October 16, 2024: Resources for transitioning from software engineering to AI data science. 

Visitors are encouraged to connect with Edward through email and social media.