In [18]:
import os
import requests
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI
import json

In [2]:
# Environment loading, API-key sanity check, and shared constants

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Heuristic only: complain when the key is missing, lacks the 'sk-proj-' prefix,
# or is 10 characters or shorter. Nothing is raised either way.
if not api_key or not api_key.startswith('sk-proj-') or len(api_key) <= 10:
    print("There might be a problem with your API key? Please visit the troubleshooting notebook!")
else:
    print("API key looks good so far")

# Model name and client object shared by every request in this notebook
MODEL = 'gpt-4o-mini'
openai = OpenAI()

API key looks good so far


In [3]:
# A class to represent a Webpage

# Some websites need proper headers when they are fetched, so send a browser-style User-Agent:
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    )
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links.

    Attributes set by the constructor:
        url:   the URL that was requested
        body:  the raw response bytes
        title: text of the <title> tag, or "No title found" when the tag is
               missing or has no plain-text content
        text:  text of <body> after script/style/img/input elements are removed
               ("" when the page has no <body>)
        links: every non-empty href found on <a> tags, exactly as written in the
               page (so entries may be relative)
    """

    def __init__(self, url, timeout=30):
        """
        Download `url` and extract its title, text and links.

        Args:
            url: address of the page to fetch.
            timeout: seconds handed to requests.get, so that an unresponsive
                server cannot stall the notebook indefinitely.

        Raises:
            Any exception raised by requests.get (including a timeout) propagates.
        """
        self.url = url
        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        # .string is None when <title> is empty or wraps other tags; use the
        # placeholder in that case too, so get_contents() never prints "None".
        title_text = soup.title.string if soup.title else None
        self.title = title_text or "No title found"
        if soup.body:
            # Drop elements that carry no readable text before extracting it
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        self.links = [href for href in hrefs if href]

    def get_contents(self):
        """Return the page title and text as one labelled string."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"
    
    

In [6]:
# Try the Website class on CNN's home page and preview what it extracted
w = Website("https://www.cnn.com/")
print(w.get_contents()[:1000])  # Print the first 1000 characters of the content
print(w.links[:10])  # Print the first 10 links

Webpage Title:
Breaking News, Latest News and Videos | CNN
Webpage Contents:
CNN values your feedback
1. How relevant is this ad to you?
2. Did you encounter any technical issues?
Video player was slow to load content
Video content never loaded
Ad froze or did not finish loading
Video content did not start after ad
Audio on ad was too loud
Other issues
Ad never loaded
Ad prevented/slowed the page from loading
Content moved around while ad loaded
Ad was repetitive to ads I've seen previously
Other issues
Cancel
Submit
Thank You!
Your effort and contribution in providing this feedback is much
                                        appreciated.
Close
Ad Feedback
Close icon
US
World
Politics
Business
Health
Entertainment
Style
Travel
Sports
Science
Climate
Weather
Ukraine-Russia War
Israel-Hamas War
Underscored
Amazon Prime Day
Games
More
US
World
Politics
Business
Health
Entertainment
Style
Travel
Sports
Science
Climate
Weather
Ukraine-Russia War
Israel-Hamas War
Underscored
Amazon Prime

In [4]:
# System prompt for the link-selection request. The example embedded below is the
# format the model is told to imitate, so it must itself be valid JSON.
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}



In [5]:
def get_links_user_prompt(website):
    """
    Build the user message for the link-selection request.

    The message names the site (website.url), states the selection task, and
    ends with every entry of website.links, one per line.
    """
    intro = f"Here is the list of links on the website of {website.url} - "
    task = (
        "please decide which of these are relevant web links for a brochure about the company, "
        "respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Privacy, email links.\n"
    )
    listing_header = "Links (some might be relative links):\n"
    return "".join([intro, task, listing_header, "\n".join(website.links)])

In [8]:
def get_links(url):
    """
    Scrape `url` and have the model pick the links worth using in a brochure.

    The request sets response_format to json_object; the reply text is parsed
    with json.loads and the parsed value is returned.
    """
    page = Website(url)
    conversation = [
        {"role": "system", "content": link_system_prompt},
        {"role": "user", "content": get_links_user_prompt(page)},
    ]
    completion = openai.chat.completions.create(
        model=MODEL,
        messages=conversation,
        response_format={"type": "json_object"},
    )
    return json.loads(completion.choices[0].message.content)

In [None]:
# Ask the model which of the links on CNN's home page belong in a brochure,
# then show the parsed reply as the cell output
result = get_links("https://www.cnn.com/")
result

{'links': [{'type': 'about page', 'url': 'https://www.cnn.com/about'},
  {'type': 'careers page', 'url': 'https://careers.wbd.com/cnnjobs'},
  {'type': 'profiles page', 'url': 'https://www.cnn.com/profiles'},
  {'type': 'leadership page',
   'url': 'https://www.cnn.com/profiles/cnn-leadership'},
  {'type': 'newsletters page', 'url': 'https://www.cnn.com/newsletters'}]}

In [11]:
def get_all_details(url):
    """
    Collect the text of the landing page plus every page the model judged relevant.

    Args:
        url: landing-page URL of the company website.

    Returns:
        One string: "Landing page:" followed by the landing page contents, then,
        for each usable link, a blank line, the link's type and that page's contents.

    The link list is model-generated, so it is handled defensively: a reply
    without a "links" list yields only the landing page, entries without a "url"
    are ignored, relative URLs are resolved against `url`, and a linked page
    that cannot be fetched is reported and skipped instead of aborting the run.
    A failure to fetch the landing page itself still propagates.

    Note: get_links(url) builds its own Website(url), so the landing page is
    downloaded twice per call.
    """
    from urllib.parse import urljoin

    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Found links:", links)
    entries = links.get("links") if isinstance(links, dict) else None
    for link in entries or []:
        if not isinstance(link, dict) or not link.get("url"):
            continue
        # The model is asked for full URLs but may echo a relative href;
        # urljoin leaves absolute URLs unchanged.
        page_url = urljoin(url, link["url"])
        try:
            page = Website(page_url)
        except requests.RequestException as error:
            print(f"Skipping {page_url}: {error}")
            continue
        result += f"\n\n{link.get('type', 'page')}\n"
        result += page.get_contents()
    return result

# Run the full scrape for CNN; as the cell's last expression, the returned string is shown as output
get_all_details("https://www.cnn.com/")

Found links: {'links': [{'type': 'about page', 'url': 'https://www.cnn.com/about'}, {'type': 'careers page', 'url': 'https://careers.wbd.com/cnnjobs'}, {'type': 'company page', 'url': 'https://www.cnn.com/profiles'}, {'type': 'leadership page', 'url': 'https://www.cnn.com/profiles/cnn-leadership'}, {'type': 'newsletters page', 'url': 'https://www.cnn.com/newsletters'}]}


'Landing page:\nWebpage Title:\nBreaking News, Latest News and Videos | CNN\nWebpage Contents:\nCNN values your feedback\n1. How relevant is this ad to you?\n2. Did you encounter any technical issues?\nVideo player was slow to load content\nVideo content never loaded\nAd froze or did not finish loading\nVideo content did not start after ad\nAudio on ad was too loud\nOther issues\nAd never loaded\nAd prevented/slowed the page from loading\nContent moved around while ad loaded\nAd was repetitive to ads I\'ve seen previously\nOther issues\nCancel\nSubmit\nThank You!\nYour effort and contribution in providing this feedback is much\n                                        appreciated.\nClose\nAd Feedback\nClose icon\nUS\nWorld\nPolitics\nBusiness\nHealth\nEntertainment\nStyle\nTravel\nSports\nScience\nClimate\nWeather\nUkraine-Russia War\nIsrael-Hamas War\nUnderscored\nAmazon Prime Day\nGames\nMore\nUS\nWorld\nPolitics\nBusiness\nHealth\nEntertainment\nStyle\nTravel\nSports\nScience\nClimat

In [12]:
# System prompt for the brochure-writing request.
# A backslash continuation joins the lines with nothing in between, so each
# sentence boundary needs its own trailing space before the backslash.
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown. \
Include details of company culture, customers and careers/jobs if you have the information."

# Or uncomment the lines below for a more humorous brochure - this demonstrates how easy it is to incorporate 'tone':

# system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
# and creates a short humorous, entertaining, jokey brochure about the company for prospective customers, investors and recruits. Respond in markdown. \
# Include details of company culture, customers and careers/jobs if you have the information."

In [15]:
def get_brochure_user_prompt(company_name, url, max_chars=5_000):
    """
    Build the user message for the brochure request.

    Args:
        company_name: name inserted into the first line of the message.
        url: landing-page URL; its contents and those of the selected linked
            pages are gathered with get_all_details(url).
        max_chars: upper bound on the length of the returned message, applied
            to the whole message including the two introductory lines.
            Defaults to 5,000; pass None to disable truncation.

    Returns:
        The message text, cut to at most `max_chars` characters.
    """
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    return user_prompt[:max_chars]

# Preview the assembled prompt for CNN; building it runs the scraping and the link-selection request
print(get_brochure_user_prompt("CNN", "https://www.cnn.com/")[:1000])  # Print the first 1000 characters

Found links: {'links': [{'type': 'about page', 'url': 'https://www.cnn.com/about'}, {'type': 'careers page', 'url': 'https://careers.wbd.com/cnnjobs'}, {'type': 'profiles page', 'url': 'https://www.cnn.com/profiles'}, {'type': 'leadership page', 'url': 'https://www.cnn.com/profiles/cnn-leadership'}]}
You are looking at a company called: CNN
Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.
Landing page:
Webpage Title:
Breaking News, Latest News and Videos | CNN
Webpage Contents:
CNN values your feedback
1. How relevant is this ad to you?
2. Did you encounter any technical issues?
Video player was slow to load content
Video content never loaded
Ad froze or did not finish loading
Video content did not start after ad
Audio on ad was too loud
Other issues
Ad never loaded
Ad prevented/slowed the page from loading
Content moved around while ad loaded
Ad was repetitive to ads I've seen previously
Othe

In [14]:
def create_brochure(company_name, url):
    """
    Generate a brochure for the company in one non-streaming request and
    render the reply as Markdown in the notebook. Returns nothing.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
    ]
    completion = openai.chat.completions.create(model=MODEL, messages=conversation)
    brochure_markdown = completion.choices[0].message.content
    display(Markdown(brochure_markdown))

# Generate and render the CNN brochure in a single (non-streaming) request
create_brochure("CNN", "https://www.cnn.com/")

Found links: {'links': [{'type': 'about page', 'url': 'https://www.cnn.com/about'}, {'type': 'careers page', 'url': 'https://careers.wbd.com/cnnjobs'}, {'type': 'company profile', 'url': 'https://www.cnn.com/profiles'}, {'type': 'company leadership', 'url': 'https://www.cnn.com/profiles/cnn-leadership'}]}


# CNN Company Brochure

## Overview
CNN (Cable News Network) stands at the forefront of delivering timely and compelling news from around the globe. With roots in pioneering 24-hour news coverage, CNN has become a trusted source for breaking news, analysis, and in-depth features covering a variety of topics including politics, business, health, science, and entertainment.

---

## Our Mission
CNN is dedicated to providing accurate and dynamic news coverage to its audience. Our aim is to keep the public informed on evolving situations, with a commitment to journalistic integrity and innovation.

---

## Company Culture
At CNN, we believe that fostering a diverse and inclusive environment is key to our success. Our teams are composed of individuals from various backgrounds and experiences, bringing unique perspectives to our news coverage. We encourage collaboration and creativity, ensuring that every employee has a voice, and nurturing an atmosphere where innovation thrives.

---

## Customer Engagement
We are deeply committed to our audience's feedback. Whether it involves technical issues or content relevance, CNN values listener and viewer feedback as an essential part of our service. We encourage interactive engagement through multiple platforms, including live TV, podcasts, and online articles, catering to the varying preferences of our diverse audience.

---

## Careers at CNN
CNN is always on the lookout for passionate individuals who want to make a difference in journalism. Whether you're a seasoned professional or a new graduate eager to start your career, we offer a broad range of opportunities. Our job roles span areas like reporting, production, digital media, and marketing, all designed to promote professional development in a supportive and stimulating environment.

---

## Our Audience
CNN serves a global audience seeking trustworthy news. From politics and business to health and entertainment, we strive to provide comprehensive coverage that informs and shapes public discourse. Our audience ranges from casual viewers to industry professionals who rely on us for in-depth reporting and analysis.

---

## Join Us
If you’re interested in being part of a leading global news organization that values integrity, creativity, and feedback, we invite you to explore career opportunities with CNN. Together, we can ensure that important stories are reported, and critical issues are addressed.

---

For more information about our programs, team culture, and career opportunities, visit [CNN Careers](https://www.cnn.com/careers).

---

Stay informed, engage with us directly, and be part of the global conversation with CNN.

In [16]:
def stream_brochure(company_name, url):
    """
    Generate a brochure with a streaming request, re-rendering the Markdown
    output in place as each piece of the reply arrives. Returns nothing.

    Args:
        company_name: company name passed to get_brochure_user_prompt.
        url: landing-page URL passed to get_brochure_user_prompt.

    Code-fence markers are removed for display only: "```markdown" is stripped
    as a unit and then any remaining "```". The word "markdown" appearing in
    the brochure text itself is left alone.
    """
    stream = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
        ],
        stream=True
    )

    # Keep the unmodified reply so that a fence split across chunks
    # ("```mark" + "down") is still recognised once it is complete.
    raw_text = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        # A chunk may carry an empty choices list; there is no text to add then.
        if not chunk.choices:
            continue
        raw_text += chunk.choices[0].delta.content or ''
        cleaned = raw_text.replace("```markdown", "").replace("```", "")
        update_display(Markdown(cleaned), display_id=display_handle.display_id)

In [19]:
# Generate the CNN brochure again, this time rendering it incrementally as the reply streams in
stream_brochure("CNN", "https://www.cnn.com/")

Found links: {'links': [{'type': 'about page', 'url': 'https://www.cnn.com/about'}, {'type': 'careers page', 'url': 'https://careers.wbd.com/cnnjobs'}, {'type': 'company profile', 'url': 'https://www.cnn.com/profiles'}, {'type': 'leadership profile', 'url': 'https://www.cnn.com/profiles/cnn-leadership'}, {'type': 'newsletters', 'url': 'https://www.cnn.com/newsletters'}]}


# CNN Brochure

### Company Overview
CNN (Cable News Network) is a premier global news organization that provides breaking news, in-depth analysis, and compelling video content to audiences around the world. From politics to climate change, sports to entertainment, CNN covers a diverse array of topics, delivering accurate and timely reports that keep viewers informed on critical issues.

### Mission Statement
At CNN, our mission is clear: to provide the most trustworthy and impartial news and analysis in an ever-changing world. We strive to empower our audience to make informed decisions by presenting facts first.

---

### Our Audience
CNN serves millions of viewers worldwide, including those in the U.S., Europe, Asia, and beyond. Our audience comprises diverse ages and backgrounds, reflecting a wide spectrum of interests from politics and business to health and entertainment.

---

### Company Culture
CNN fosters a dynamic and inclusive work environment. Employees are encouraged to voice their opinions, share feedback, and contribute to the success of the organization. Our culture is built on collaboration, innovation, and a commitment to excellence. We believe in the power of diverse perspectives to create impactful journalism and strive to support continuous professional development and personal growth within our teams.

### Careers at CNN
Join a passionate team of journalists, media professionals, and content creators! CNN offers exciting career opportunities across various departments, including journalism, technology, marketing, and production. We seek innovative thinkers who are ready to make a difference in the world of news and media.

**Current Openings Include:**
- Reporters and Editors 
- Digital Content Producers 
- Technical Support Specialists 
- Media Analysts 

Discover more about your future career with CNN by visiting our [Careers page](#).

---

### Customer Engagement
CNN values your feedback! We actively encourage our audience to provide insights about their user experience, including technical issues or ad relevance. Your contributions help us improve our services and maintain our commitment to quality journalism.

---

### Connect with Us
Stay informed with the latest updates by subscribing to our newsletters or tuning into our live broadcasts. Access a world of news, analysis, and reports on our website or through our CNN app.

Explore more about us or check the latest news:
- [Live TV](#)
- [Watch CNN TV Schedules](#)
- [Subscribe to CNN](#)

### Contact Information
For additional inquiries or information, please reach out to us through our website. Join us in exploring the world of news, analytics, and engaging storytelling.

---

**CNN:** Where news is always happening. 
Find your place in the newsroom of the future!