### Business brochure
- First, fetch a website.

- Use an LLM to scrape the main links from the website.

- Use an LLM to exclude certain info, summarize the main info, and respond in markdown.


In [2]:
import os
import requests #The `requests.get()` method is used to make HTTP GET requests to a specified URL.
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup #Beautiful Soup is a library that makes it easy to scrape information from web page
from IPython.display import Markdown, display, update_display
from openai import OpenAI


In [3]:
# Load variables from a .env file into the environment before reading the key.
load_dotenv()
# Read here but not passed to the client below; the OpenAI() call takes no
# arguments, so it presumably picks the key up from the environment itself.
api_key = os.getenv('OPENAI_API_KEY')

# Model name passed to the chat completion calls further down.
MODEL = "gpt-4o-mini"
# Shared client instance used by the functions below.
openai = OpenAI()

In [26]:
# The Website class below represents a scraped webpage: text only, with
# images removed.
# Some websites need you to use proper headers when fetching them:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links.

    Attributes:
        url: The address that was fetched.
        body: Raw response bytes, handed to BeautifulSoup as-is.
        title: Text of the <title> tag, or "No title found".
        text: Body text with script/style/img/input elements removed
            ("" when the page has no <body>).
        links: Every non-empty href found on <a> tags, exactly as written in
            the page (so entries may be relative).
    """

    def __init__(self, url, timeout=10):
        """
        Fetch `url` and extract its title, text and links.

        Args:
            url: Page to download.
            timeout: Seconds to wait on the server before requests raises
                instead of blocking indefinitely.
        """
        self.url = url
        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        # .string is None when <title> is empty or wraps nested tags, so fall
        # back to the placeholder rather than storing None.
        title = soup.title.string if soup.title else None
        self.title = title or "No title found"
        if soup.body:
            # Drop elements that carry no readable text before extracting it.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        """Return the title and text formatted as a labelled block for a prompt."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

#Using self.body = response.content

Using `response.content` here makes sense because it is passed to BeautifulSoup, which can handle both bytes and strings, and some HTML pages have different encodings that BeautifulSoup can detect more reliably from the raw content.
A quick decision flowchart:
Working with APIs that return JSON? → Use response.json()
Downloading files or binary data? → Use response.content
Working with text/HTML content? → Use response.text

## First step: Have GPT-4o-mini figure out which links are relevant

### Use a call to gpt-4o-mini to read the links on a webpage, and respond in structured JSON.  
It should decide which links are relevant, and replace relative links such as "/about" with "https://company.com/about".  
We will use "Multi-shot prompting" in which we provide different examples of how it should respond in the prompt.

This is an excellent use case for an LLM, because it requires nuanced understanding. Imagine trying to code this without LLMs by parsing and analyzing the webpage - it would be very hard!

Sidenote: there is a more advanced technique called "Structured Outputs" in which we require the model to respond according to a spec. We cover this technique in Week 8 during our autonomous Agentic AI project.

In [69]:
# Multi-shot prompting in the SYSTEM PROMPT: several worked examples show the
# model the exact JSON shape expected in its reply. Each piece ends with a
# newline so the concatenated sentences do not run into each other, and every
# example is valid JSON so the model is never shown a malformed reply.

link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company or event the user requests.\n"
link_system_prompt += "Examples of relevant information: About, News, Career, Upcoming events dates, locations.\n"

link_system_prompt += """Respond in JSON as in this example:

{
    "links": [
        {"type": "Chicago 26.2", "url": "https://full.url/goes/here/event/chicago-262"},
        {"type": "running clubs", "url": "https://full.url/chicago/running-clubs"}
    ]
}

"""

link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

# Counter-example: the kind of entry that must NOT appear in the reply.
link_system_prompt += """You should filter out all social media links. For instance, do not include:
{
    "links": [
        {"url": "https://www.instagram.com/rest of url"}
    ]
}

"""


In [54]:
#building USER PROMPT

def get_links_user_prompt(website):
    """Build the user message that lists a page's links for the model to filter.

    `website` must expose `url` (a string) and `links` (an iterable of strings).
    """
    intro = (
        f"Here is the list of links on the website of {website.url} - "
        "please decide which of these are relevant web links for the brochure, respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Advertise, Marketing, Privacy, Social Media, email links.\n"
    )
    # One link per line, exactly as scraped.
    link_lines = "\n".join(website.links)
    return f"{intro}Links (some might be relative links):\n{link_lines}"

### Use LLM for the links selection
This is where the LLM does the heavy lifting: we do not have to write code to exclude links for each individual website.

In [70]:
def get_links(url):
    """Ask the model which links on the page at `url` belong in a brochure.

    Scrapes the page, sends its links to the chat model together with the
    link-selection system prompt, and returns the reply parsed from JSON.
    """
    page = Website(url)
    conversation = [
        {"role": "system", "content": link_system_prompt},
        {"role": "user", "content": get_links_user_prompt(page)},
    ]
    # A JSON object reply is requested so it can go straight to json.loads.
    completion = openai.chat.completions.create(
        model=MODEL,
        messages=conversation,
        response_format={"type": "json_object"},
    )
    return json.loads(completion.choices[0].message.content)

## Second step: make the brochure!
Assemble all the details into another prompt for GPT-4o-mini.


In [71]:
def get_all_details(url):
    """Collect the text of the landing page and of every link the model selected.

    Args:
        url: Landing page of the site.

    Returns:
        One string: the landing page contents followed by a labelled section
        for each selected link.

    Entries in the model's reply that are not dictionaries, or that lack a
    string "url", are skipped instead of raising, because the reply's shape is
    not guaranteed. Relative URLs are resolved against `url` before fetching.
    """
    from urllib.parse import urljoin

    sections = ["Landing page:\n", Website(url).get_contents()]
    links = get_links(url)

    # Tolerate a reply that is not the expected {"links": [...]} object.
    entries = links.get("links") if isinstance(links, dict) else None
    if not isinstance(entries, list):
        entries = []

    for link in entries:
        if not isinstance(link, dict):
            continue
        target = link.get("url")
        if not isinstance(target, str) or not target:
            continue
        # Absolute URLs pass through unchanged; "/about" becomes a full address.
        page_url = urljoin(url, target)
        label = link.get("type") or page_url
        sections.append(f"\n\n{label}\n")
        sections.append(Website(page_url).get_contents())
    return "".join(sections)
        

In [72]:
# System prompt for the brochure-writing call.
system_prompt = (
    "You are an assistant that analyzes the contents of several relevant pages from a event website "
    "and creates a short brochure about the upcoming events. Respond in markdown."
)

In [73]:
def get_brochure_user_prompt(company_name, url, max_chars=5_000):
    """Build the user message for the brochure request.

    Args:
        company_name: Name inserted into the opening line of the prompt.
        url: Landing page whose contents (and selected sub-pages) are appended.
        max_chars: Maximum length of the returned prompt; anything beyond it
            is cut off. Pass None to disable truncation.

    Returns:
        The prompt text, truncated to `max_chars` characters.
    """
    user_prompt = f"You are looking at a website called: {company_name}\n"
    user_prompt += "Here are the contents of its main page and other relevant pages; use this information to build a short brochure including about, news, upcoming events. Respond  in markdown.\n"
    user_prompt += get_all_details(url)
    # Slicing with None keeps the whole string, so None means "no limit".
    return user_prompt[:max_chars]

In [74]:
def create_brochure(company_name, url):
    """Generate a brochure for the site at `url` and render it as Markdown.

    Sends the brochure system prompt and the assembled user prompt to the chat
    model, then displays the reply; nothing is returned.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
    ]
    completion = openai.chat.completions.create(model=MODEL, messages=conversation)
    display(Markdown(completion.choices[0].message.content))

In [76]:
#create_brochure("Kratos Sports","https://www.kratossportsve.com")