In [None]:
import os
from symtable import Class

import requests
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from openai import OpenAI

In [None]:
# Banner line identifying this notebook as the webpage summarizer demo.
print("Hello, This is webpage summarizer demo")

In [None]:
# Load environment variables via python-dotenv (typically from a .env file).
# override=True asks it to replace values that are already set in the
# process environment, so the file wins over stale shell variables.
load_dotenv(override=True)

In [None]:
def check_api_key(key):
    """Return a human-readable diagnosis for an OpenAI API key value.

    Args:
        key: the raw value read from the environment; may be None or empty.

    Returns:
        One of four fixed messages: key missing, key padded with whitespace,
        key lacking the expected "sk-proj-" prefix, or key looks fine.
    """
    if not key:
        return "No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!"
    # Whitespace is tested before the prefix: a key with a leading space or tab
    # would otherwise be reported as having the wrong prefix, which hides the
    # real problem from the user.
    if key.strip() != key:
        return "An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook"
    if not key.startswith("sk-proj-"):
        return "An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook"
    return "API key found and looks good so far!"


api_key = os.getenv('OPENAI_API_KEY')
print(check_api_key(api_key))

In [None]:
# Create the shared API client that summarize() uses for chat completions.
# No key is passed explicitly, so the client presumably reads OPENAI_API_KEY
# from the environment populated by load_dotenv - confirm against the SDK docs.
openai = OpenAI()

In [None]:
# A class to represent a Webpage
# If you're not familiar with Classes, check out the "Intermediate Python" notebook

# Some websites need you to use proper headers when fetching them:
headers = {
    # Desktop Chrome-on-Windows identity, split across lines for readability;
    # adjacent string literals are joined into one value.
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    )
}

#This class downloads a webpage, removes unnecessary elements, and extracts clean, readable text along with the title
class Website:
    """A web page downloaded once and reduced to its title and readable text.

    Attributes set by the constructor:
        url:   the address that was requested.
        title: text of the page's <title> element, or "No title found" when
               the element is missing or has no single string value.
        text:  newline-separated text of the page with script, style, img and
               input elements removed.
    """

    def __init__(self, url):
        """Fetch ``url`` and extract its title and cleaned text.

        Args:
            url: address of the page to download.

        Raises:
            Whatever ``requests.get`` raises on connection failures; HTTP error
            statuses are not checked, so an error page is parsed like any other.
        """
        self.url = url

        # 1. Fetch the page exactly once, sending the browser-like headers so
        #    that sites which reject the default client identity still respond.
        response = requests.get(url, headers=headers)

        # 2. Parse the body of that same response (previously the page was
        #    requested a second time here, without the headers).
        soup = BeautifulSoup(response.content, "html.parser")

        # .string is None when <title> is empty or has nested markup, so guard
        # both the element and its string value.
        has_title = soup.title is not None and soup.title.string is not None
        self.title = soup.title.string if has_title else "No title found"

        # 3. Strip non-readable elements and collect the remaining text.
        #    html.parser does not synthesise a <body>, so fall back to the whole
        #    document when the page has none instead of failing on None.
        root = soup.body if soup.body is not None else soup
        for irrelevant_tag in root(["script", "style", "img", "input"]):
            irrelevant_tag.decompose()  # remove the element and its contents from the tree
        # separator puts each text fragment on its own line; strip trims each fragment.
        self.text = root.get_text(separator="\n", strip=True)


In [None]:
# Standing instructions sent as the system message of every summary request.
system_prompt = (
    "You are an assistant that analyzes the contents of a website "
    "and provides a short summary, ignoring text that might be navigation related. "
    "Respond in markdown."
)

def user_prompt_for(website):
    """Build the user message asking for a summary of ``website``.

    Args:
        website: object exposing ``title`` and ``text`` attributes.

    Returns:
        A single string: a line naming the title, the fixed summarisation
        request, a blank line, then the page text.
    """
    heading = f"You are looking at a website titled {website.title}"
    request = (
        "\nThe contents of this website is as follows; "
        "please provide a short summary of this website in markdown. "
        "If it includes news or announcements, then summarize these too.\n\n"
    )
    return heading + request + website.text


In [None]:
def messages_for(website):
    """Assemble the chat message list for summarising ``website``.

    Returns:
        A two-element list: the system message carrying ``system_prompt``
        followed by the user message built by ``user_prompt_for``.
    """
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": user_prompt_for(website)}
    return [system_message, user_message]

In [None]:
def summarize(url, model="gpt-4o-mini"):
    """Fetch ``url`` and return the model's summary of the page.

    Args:
        url: address of the page; it is handed to ``Website`` for download.
        model: name of the chat model to request. Defaults to "gpt-4o-mini",
            the value that used to be hard-coded, so existing calls behave
            the same.

    Returns:
        The ``content`` of the first choice's message in the chat completion
        response.
    """
    website = Website(url)
    response = openai.chat.completions.create(
        model=model,
        messages=messages_for(website),
    )
    return response.choices[0].message.content

In [None]:
# The summary comes back as markdown, so render it rather than showing the raw
# string repr with escaped newlines.
display(Markdown(summarize("https://cnn.com")))