In [2]:
#imports for the company brochure generator
from dotenv import load_dotenv
import os
from IPython.display import display, Markdown
from bs4 import BeautifulSoup
import requests
from openai import OpenAI
import json

In [3]:
# Read settings from the project's .env file and stop early if a required one is missing
def _require_env(name):
    """Return the value of environment variable `name`; raise ValueError if it is unset or empty."""
    value = os.getenv(name)
    if value:
        return value
    raise ValueError(f"{name} is not set!!")


# override=True lets values in the .env file replace variables already set in the process
load_dotenv(dotenv_path="../config/.env", override=True)

openai_api_key = _require_env("OPENAI_API_KEY")
ollama_base_url = _require_env("OLLAMA_OPENAI_EP")

In [5]:
# Function to parse a website and gather its content information

# Browser-like User-Agent sent with every page request made by this notebook
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}


def fetch_website_content(url, max_chars=2_000, timeout=30):
    """
    Fetch a web page and return its title and body text as a single string.

    Args:
        url: Address of the page to download.
        max_chars: Maximum length of the returned string (defaults to 2,000).
        timeout: Seconds to wait on the server before requests gives up.

    Returns:
        The page title, a blank line, then the text of <body> with script,
        style, img and input elements removed, cut to `max_chars` characters.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    # Without a timeout a stalled server would block the notebook indefinitely.
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")

    # .string is None when <title> is empty or contains nested tags; fall back
    # to "" so the concatenation below cannot raise a TypeError.
    title = (soup.title.string if soup.title else None) or ""

    if soup.body:
        # Remove elements that carry no readable text before extracting it.
        for item in soup.body(["script", "style", "img", "input"]):
            item.decompose()
        text = soup.body.get_text(separator="\n", strip=True)
    else:
        text = ""
    return (title + "\n\n" + text)[:max_chars]
    

In [12]:
# Try the scraper on a live page; the cell output is the truncated title + body text
fetch_website_content("https://cnn.com")

"Breaking News, Latest News and Videos | CNN\n\nCNN values your feedback\n1. How relevant is this ad to you?\n2. Did you encounter any technical issues?\nVideo player was slow to load content\nVideo content never loaded\nAd froze or did not finish loading\nVideo content did not start after ad\nAudio on ad was too loud\nOther issues\nAd never loaded\nAd prevented/slowed the page from loading\nContent moved around while ad loaded\nAd was repetitive to ads I've seen previously\nOther issues\nCancel\nSubmit\nThank You!\nYour effort and contribution in providing this feedback is much\n                                        appreciated.\nClose\nAd Feedback\nClose icon\nUS\nWorld\nPolitics\nBusiness\nHealth\nEntertainment\nUnderscored\nStyle\nTravel\nSports\nScience\nClimate\nWeather\nUkraine-Russia War\nIsrael-Hamas War\nGames\nMore\nUS\nWorld\nPolitics\nBusiness\nHealth\nEntertainment\nUnderscored\nStyle\nTravel\nSports\nScience\nClimate\nWeather\nUkraine-Russia War\nIsrael-Hamas War\nGame

In [6]:
def fetch_website_links(url, timeout=30):
    """
    Return the links found on the website at the given url.

    Note: this downloads and parses the page again even if its content was
    already fetched. That is inefficient, but keeps the code in the lab simple.
    Feel free to use a class and optimize it!

    Args:
        url: Address of the page to scan.
        timeout: Seconds to wait on the server before requests gives up.

    Returns:
        A list of the non-empty href values of every <a> tag, in page order.
        Values are returned as written in the page (duplicates included).

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    # Without a timeout a stalled server would block the notebook indefinitely.
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")
    hrefs = (anchor.get("href") for anchor in soup.find_all("a"))
    # Anchors without an href (or with an empty one) are dropped.
    return [href for href in hrefs if href]

In [15]:
# List every link on the landing page; menus repeat, so the same URL can appear more than once
fetch_website_links("https://cnn.com")

['https://www.cnn.com',
 'https://www.cnn.com/us',
 'https://www.cnn.com/world',
 'https://www.cnn.com/politics',
 'https://www.cnn.com/business',
 'https://www.cnn.com/health',
 'https://www.cnn.com/entertainment',
 'https://www.cnn.com/cnn-underscored',
 'https://www.cnn.com/style',
 'https://www.cnn.com/travel',
 'https://www.cnn.com/sports',
 'https://www.cnn.com/science',
 'https://www.cnn.com/climate',
 'https://www.cnn.com/weather',
 'https://www.cnn.com/world/europe/ukraine',
 'https://www.cnn.com/world/middleeast/israel',
 'https://www.cnn.com/games',
 'https://www.cnn.com/us',
 'https://www.cnn.com/world',
 'https://www.cnn.com/politics',
 'https://www.cnn.com/business',
 'https://www.cnn.com/health',
 'https://www.cnn.com/entertainment',
 'https://www.cnn.com/cnn-underscored',
 'https://www.cnn.com/style',
 'https://www.cnn.com/travel',
 'https://www.cnn.com/sports',
 'https://www.cnn.com/science',
 'https://www.cnn.com/climate',
 'https://www.cnn.com/weather',
 'https://www

In [23]:
# Chat model used for the OpenAI calls in this notebook
MODEL_NAME="gpt-4.1-mini"

# System prompt for link selection: asks the model to pick brochure-relevant links
# and shows the JSON shape it should reply with ("links" -> list of {"type", "url"})
system_prompt = """
You are a helpful assistant that can review a list of links from a company's website and identify which of those
links are relevant for a brochure about the company, such as about page, company products, careers/jobs page.
You should respond is JSON format as provided in the example below.

{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

def get_relevant_links(url, links):
    """
    Ask the model which of a page's links are worth including in a company brochure.

    Args:
        url: The page the links were collected from.
        links: Iterable of href strings found on that page.

    Returns:
        The model's JSON reply parsed into a Python object. The system prompt
        asks for the shape {"links": [{"type": ..., "url": ...}, ...]}.

    Raises:
        json.JSONDecodeError: if the reply is not valid JSON.
    """
    # Pages repeat the same hrefs (header and footer menus); send each one only
    # once, in first-seen order, to keep the prompt short.
    links_str = "\n".join(dict.fromkeys(links))

    user_prompt = f"""
    You are provided links from the webpage {url}. Please decide which of those links are relevant to include
    in a brochure representing that company. Respond in full URLs and in JSON format. Do not include terms of service,
    privacy notices, email links etc.

    Below is the list of all the links from the company's webpage
    {links_str}

    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]
    openai_client = OpenAI()
    response = openai_client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        # JSON mode: the reply is a bare JSON object, so json.loads below does
        # not trip over markdown fences or surrounding prose.
        response_format={"type": "json_object"},
    )
    return json.loads(response.choices[0].message.content)

In [10]:
# Try link selection end to end on a live site
url = "https://cnn.com"
links = fetch_website_links(url)
# get_relevant_links already returns the parsed JSON object; passing it to
# json.loads a second time would raise a TypeError.
relevant_links = get_relevant_links(url, links)
print(relevant_links)

{'links': [{'type': 'about page', 'url': 'https://www.cnn.com/about'}, {'type': 'careers page', 'url': 'https://careers.wbd.com/cnnjobs'}, {'type': 'products/services page', 'url': 'https://www.cnn.com/subscription?source=sub_web_footersubnav-link'}, {'type': 'company leadership', 'url': 'https://www.cnn.com/profiles/cnn-leadership'}]}


In [21]:
def get_data_for_brochure(company, url):
    """
    Collect the text the brochure is written from.

    Fetches the landing page, asks the model which of its links are relevant,
    then fetches each of those pages.

    Args:
        company: Company name. Not used here; kept so existing calls keep working.
        url: Address of the company's landing page.

    Returns:
        A markdown-style string: a "Landing Page" section followed by one
        "Link" section per relevant page that could be fetched.

    Raises:
        requests.RequestException: if the landing page itself cannot be fetched.
    """
    contents = fetch_website_content(url)
    links = fetch_website_links(url)
    relevant_links = get_relevant_links(url, links)
    print(relevant_links)

    sections = [f"## Landing Page:\n\n{contents}\n## Relevant Links:\n"]
    # The link list is model-generated: tolerate a reply without "links",
    # entries without a URL, and URLs that cannot be fetched, so one bad
    # entry does not discard everything gathered so far.
    for link in relevant_links.get("links", []):
        link_url = link.get("url")
        if not link_url:
            continue
        try:
            page = fetch_website_content(link_url)
        except requests.RequestException as error:
            print(f"Skipping {link_url}: {error}")
            continue
        sections.append(f"\n\n### Link: {link.get('type', 'unknown')}\n")
        sections.append(page)
    return "".join(sections)

In [24]:
# Gather the brochure source text for CNN; prints the selected links, then shows the combined text
get_data_for_brochure("CNN", "https://cnn.com")

{'links': [{'type': 'about page', 'url': 'https://www.cnn.com/about'}, {'type': 'careers page', 'url': 'https://careers.wbd.com/cnnjobs'}, {'type': 'company products', 'url': 'https://www.cnn.com/subscription?source=sub_web_footersubnav-link'}, {'type': 'company products', 'url': 'https://www.cnn.com/subscription?source=sub_web_footerlink-link'}, {'type': 'newsletters page', 'url': 'https://www.cnn.com/newsletters'}, {'type': 'company leadership', 'url': 'https://www.cnn.com/profiles/cnn-leadership'}, {'type': 'company leadership', 'url': 'https://www.cnn.com/profiles'}]}


'## Landing Page:\n\nBreaking News, Latest News and Videos | CNN\n\nCNN values your feedback\n1. How relevant is this ad to you?\n2. Did you encounter any technical issues?\nVideo player was slow to load content\nVideo content never loaded\nAd froze or did not finish loading\nVideo content did not start after ad\nAudio on ad was too loud\nOther issues\nAd never loaded\nAd prevented/slowed the page from loading\nContent moved around while ad loaded\nAd was repetitive to ads I\'ve seen previously\nOther issues\nCancel\nSubmit\nThank You!\nYour effort and contribution in providing this feedback is much\n                                        appreciated.\nClose\nAd Feedback\nClose icon\nUS\nWorld\nPolitics\nBusiness\nHealth\nEntertainment\nUnderscored\nStyle\nTravel\nSports\nScience\nClimate\nWeather\nUkraine-Russia War\nIsrael-Hamas War\nGames\nMore\nUS\nWorld\nPolitics\nBusiness\nHealth\nEntertainment\nUnderscored\nStyle\nTravel\nSports\nScience\nClimate\nWeather\nUkraine-Russia War\nI

In [25]:
# System prompt for the brochure-writing call: sets the audience and asks for plain markdown output
brochure_system_prompt = """
You are an assistant that analyzes the contents of several relevant pages from a company website
and creates a short brochure about the company for prospective customers, investors and recruits.
Respond in markdown without code blocks.
Include details of company culture, customers and careers/jobs if you have the information.
"""

def create_brochure(company, url, max_chars=5_000):
    """
    Generate a short markdown brochure for a company from its website.

    Args:
        company: Company name, inserted into the prompt.
        url: Address of the company's landing page.
        max_chars: Maximum number of characters of the user prompt sent to the
            model (defaults to 5,000). Text beyond this limit is dropped, so
            pages gathered last may not reach the model at all.

    Returns:
        The model's reply, which the prompts ask to be markdown without code blocks.
    """
    user_prompt = f"""
    You are looking at a company called: {company}
    Here are the contents of its landing page and other relevant pages;
    use this information to build a short brochure of the company in markdown without code blocks.\n\n
    """

    user_prompt += get_data_for_brochure(company, url)

    openai_client = OpenAI()
    messages = [
        {"role": "system", "content": brochure_system_prompt},
        # Cap the prompt size to bound the amount of scraped text sent to the model.
        {"role": "user", "content": user_prompt[:max_chars]}
    ]
    response = openai_client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages
    )
    return response.choices[0].message.content

In [27]:
# Build the CNN brochure; this scrapes the site and makes two model calls (link selection, then writing)
brochure_content = create_brochure("CNN", "https://cnn.com")

{'links': [{'type': 'about page', 'url': 'https://www.cnn.com/about'}, {'type': 'careers page', 'url': 'https://careers.wbd.com/cnnjobs'}, {'type': 'company products', 'url': 'https://www.cnn.com/subscription?source=sub_web_footersubnav-link'}, {'type': 'company products', 'url': 'https://www.cnn.com/subscription?source=sub_web_footerlink-link'}, {'type': 'newsletters', 'url': 'https://www.cnn.com/newsletters'}]}


In [28]:
# Render the model's markdown reply as formatted output instead of raw text
display(Markdown(brochure_content))

# CNN Company Brochure

---

## About CNN

CNN (Cable News Network) is a global leader in breaking news and comprehensive news coverage. With a wide-reaching digital presence, CNN delivers the latest updates, videos, and in-depth reports spanning critical topics including US and world politics, business, health, entertainment, sports, science, climate change, and ongoing global conflicts such as the Ukraine-Russia and Israel-Hamas wars. CNN also offers specialized content through channels and editions such as CNN International, CNN en Español, and Arabic. The network’s commitment to delivering timely and relevant news is supported by various digital features like newsletters, podcasts, video content, interactive games, and more.

---

## Company Culture

CNN thrives on innovation, inclusivity, and the pursuit of truth to keep the public informed. The company values user feedback highly, actively encouraging audience participation to enhance user experience and content relevance. With a multi-faceted editorial approach, CNN balances hard-hitting investigative journalism with human interest stories and lifestyle content under brands like CNN Underscored and innovative initiatives exploring the future of work and technology. The environment nurtures creativity and excellence in storytelling, emphasizing accuracy and integrity in reporting.

---

## Customers and Audience

CNN serves a diverse global audience, providing content in multiple languages and adapting coverage to regional interests. Their customers range from everyday news consumers seeking factual updates to businesses and investors relying on financial markets and economic insights, as well as enthusiasts of entertainment, sports, and science. Through platforms like CNN Digital and CNN Max, the company offers accessible and engaging content tailored to various preferences, from quick headlines to immersive video documentaries and podcasts.

---

## Careers at CNN

CNN offers diverse career opportunities across a multitude of departments including:

- Consumer Products  
- Corporate & Administrative  
- Creative, Content & Editorial  
- Data & Research  
- Early Careers (Internships and entry-level roles)  
- Executive Leadership  
- Finance & Accounting  
- Game Development  
- Human Resources  
- Legal & Business Affairs  
- Marketing & Communications  
- Product Management  
- Production  
- Sales & Distribution  
- Studio Operations  
- Technology  

Employees can find roles within CNN and its associated brands such as HLN, CNN International, and CNN en Español, promoting growth and professional development in a dynamic media environment.

---

## Why Choose CNN?

- A global platform with an unmatched reputation in news and media  
- Commitment to quality, accuracy, and audience engagement  
- Innovative digital products and wide-ranging content offerings  
- Opportunities for professional growth in a variety of fields  
- Inclusive workplace culture valuing diverse perspectives and ideas  

Experience the power of live, reliable journalism and cutting-edge media innovation with CNN—where your story matters.

---

## Connect with CNN

- Visit [cnn.com](https://www.cnn.com) for the latest news and subscription options  
- Engage with personalized content through newsletters and podcasts  
- Explore career opportunities to join a world-class media team  

CNN: Bringing the world’s stories to your fingertips.