In [25]:
import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [4]:
# Initialize and constants

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

if api_key and api_key.startswith('sk-proj-') and len(api_key)>10:
    print("API key looks good so far")
else:
    print("There might be a problem with your API key? Please visit the troubleshooting notebook!")
    
MODEL = 'gpt-4o-mini'
openai = OpenAI()

API key looks good so far


In [5]:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"


In [6]:
ed = Website("https://huggingface.co")
ed.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/spaces',
 '/models',
 '/zai-org/GLM-4.6',
 '/ServiceNow-AI/Apriel-1.5-15b-Thinker',
 '/deepseek-ai/DeepSeek-V3.2-Exp',
 '/neuphonic/neutts-air',
 '/inclusionAI/Ring-1T-preview',
 '/models',
 '/spaces/Wan-AI/Wan2.2-Animate',
 '/spaces/enzostvs/deepsite',
 '/spaces/zerogpu-aoti/wan2-2-fp8da-aoti-faster',
 '/spaces/ibm-granite/Granite-4.0-WebGPU',
 '/spaces/Selfit/ImageEditPro',
 '/spaces',
 '/datasets/openai/gdpval',
 '/datasets/Agent-Ark/Toucan-1.5M',
 '/datasets/fka/awesome-chatgpt-prompts',
 '/datasets/facebook/seamless-interaction',
 '/datasets/zai-org/CC-Bench-trajectories',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/microsoft',
 '/grammarly',
 '/Writer',
 '/docs/tran

In [7]:
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

In [8]:
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}



In [9]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

In [10]:
print(get_links_user_prompt(ed))

Here is the list of links on the website of https://huggingface.co - please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.
Links (some might be relative links):
/
/models
/datasets
/spaces
/docs
/enterprise
/pricing
/login
/join
/spaces
/models
/zai-org/GLM-4.6
/ServiceNow-AI/Apriel-1.5-15b-Thinker
/deepseek-ai/DeepSeek-V3.2-Exp
/neuphonic/neutts-air
/inclusionAI/Ring-1T-preview
/models
/spaces/Wan-AI/Wan2.2-Animate
/spaces/enzostvs/deepsite
/spaces/zerogpu-aoti/wan2-2-fp8da-aoti-faster
/spaces/ibm-granite/Granite-4.0-WebGPU
/spaces/Selfit/ImageEditPro
/spaces
/datasets/openai/gdpval
/datasets/Agent-Ark/Toucan-1.5M
/datasets/fka/awesome-chatgpt-prompts
/datasets/facebook/seamless-interaction
/datasets/zai-org/CC-Bench-trajectories
/datasets
/join
/pricing#endpoints
/pricing#spaces
/pricing
/enterprise
/enterprise
/enterprise
/enterprise
/enterprise
/ent

In [11]:
def get_links(url):
    website = Website(url)
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)}
      ],
        response_format={"type": "json_object"}
    )
    result = response.choices[0].message.content
    return json.loads(result)

In [12]:
huggingface = Website("https://huggingface.co")
huggingface.links


['/',
 '/models',
 '/datasets',
 '/spaces',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/spaces',
 '/models',
 '/zai-org/GLM-4.6',
 '/ServiceNow-AI/Apriel-1.5-15b-Thinker',
 '/deepseek-ai/DeepSeek-V3.2-Exp',
 '/neuphonic/neutts-air',
 '/inclusionAI/Ring-1T-preview',
 '/models',
 '/spaces/Wan-AI/Wan2.2-Animate',
 '/spaces/enzostvs/deepsite',
 '/spaces/zerogpu-aoti/wan2-2-fp8da-aoti-faster',
 '/spaces/ibm-granite/Granite-4.0-WebGPU',
 '/spaces/Selfit/ImageEditPro',
 '/spaces',
 '/datasets/openai/gdpval',
 '/datasets/Agent-Ark/Toucan-1.5M',
 '/datasets/fka/awesome-chatgpt-prompts',
 '/datasets/facebook/seamless-interaction',
 '/datasets/zai-org/CC-Bench-trajectories',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/microsoft',
 '/grammarly',
 '/Writer',
 '/docs/tran

In [13]:
get_links("https://huggingface.co")

{'links': [{'type': 'about page', 'url': 'https://huggingface.co/about'},
  {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'},
  {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'},
  {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'},
  {'type': 'blog page', 'url': 'https://huggingface.co/blog'},
  {'type': 'docs page', 'url': 'https://huggingface.co/docs'},
  {'type': 'community page', 'url': 'https://discuss.huggingface.co'},
  {'type': 'github page', 'url': 'https://github.com/huggingface'},
  {'type': 'twitter page', 'url': 'https://twitter.com/huggingface'},
  {'type': 'linkedin page',
   'url': 'https://www.linkedin.com/company/huggingface/'}]}

Second step: make the brochure!
Assemble all the details into another prompt to GPT4-o

In [15]:
def get_all_details(url):
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Found links:", links)
    for link in links["links"]:
        result += f"\n\n{link['type']}\n"
        result += Website(link["url"]).get_contents()
    return result

In [16]:
print(get_all_details("https://huggingface.co"))

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'discussion forum', 'url': 'https://discuss.huggingface.co'}, {'type': 'company GitHub', 'url': 'https://github.com/huggingface'}, {'type': 'company Twitter', 'url': 'https://twitter.com/huggingface'}, {'type': 'company LinkedIn', 'url': 'https://www.linkedin.com/company/huggingface/'}]}
Landing page:
Webpage Title:
Hugging Face – The AI community building the future.
Webpage Contents:
Hugging Face
Models
Datasets
Spaces
Community
Docs
Enterprise
Pricing
Log In
Sign Up
The AI community building the future.
The platform where the machine learning community collaborates on models, datasets, and applications.
Explore

In [26]:
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
Include details of company culture, customers and careers/jobs if you have the information."


In [18]:
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
    return user_prompt

In [19]:
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'company page', 'url': 'https://huggingface.co/enterprise'}]}


'You are looking at a company called: HuggingFace\nHere are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\nLanding page:\nWebpage Title:\nHugging Face – The AI community building the future.\nWebpage Contents:\nHugging Face\nModels\nDatasets\nSpaces\nCommunity\nDocs\nEnterprise\nPricing\nLog In\nSign Up\nThe AI community building the future.\nThe platform where the machine learning community collaborates on models, datasets, and applications.\nExplore AI Apps\nor\nBrowse 1M+ models\nTrending on\nthis week\nModels\nzai-org/GLM-4.6\nUpdated\n7 days ago\n•\n13.8k\n•\n483\nServiceNow-AI/Apriel-1.5-15b-Thinker\nUpdated\nabout 4 hours ago\n•\n5.65k\n•\n293\ndeepseek-ai/DeepSeek-V3.2-Exp\nUpdated\n7 days ago\n•\n18.1k\n•\n556\nneuphonic/neutts-air\nUpdated\n4 days ago\n•\n4.09k\n•\n223\ninclusionAI/Ring-1T-preview\nUpdated\n6 days ago\n•\n1.48k\n•\n242\nBrowse 1M+ models\nSpaces\nRunning\n1.5k\n1.5k\nWan2.

In [20]:
def create_brochure(company_name, url):
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
    )
    result = response.choices[0].message.content
    display(Markdown(result))

In [21]:
create_brochure("HuggingFace", "https://huggingface.co")


Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'community discussion', 'url': 'https://discuss.huggingface.co'}, {'type': 'GitHub page', 'url': 'https://github.com/huggingface'}, {'type': 'Twitter profile', 'url': 'https://twitter.com/huggingface'}, {'type': 'LinkedIn profile', 'url': 'https://www.linkedin.com/company/huggingface/'}]}


# Hugging Face Company Brochure

## About Us
**Hugging Face** is at the forefront of the AI community, dedicated to building the future of machine learning. Our platform empowers users to collaborate on models, datasets, and applications, fostering innovation and creativity. We are home to over **1 million models** and **250,000 datasets**, making us the leading resource for machine learning enthusiasts and professionals alike.

## What We Offer
### Models & Datasets
- **Browse** over **1M+ models** and **250K+ datasets**: Collaborate and contribute to a vast repository of AI resources.
- **Popular Models**: Explore trending models updated regularly for cutting-edge performance.

### Collaborative Spaces
- **Join Spaces**: Engage in collaborative application creation with running applications like AI-powered image editing and video generation tools.
- **Community-Driven**: Host and share your projects freely within the Hugging Face community.

### Enterprise Solutions
- Hugging Face provides **Compute and Enterprise solutions** to accelerate your deployment needs, tailored for teams requiring advanced security and efficient resource management starting at **$20/user/month**.

### Open Source Initiatives
- We lead the way in open source with initiatives such as **Transformers** and **Diffusers**, totaling thousands of contributors and frequent updates.

## Our Culture
At Hugging Face, our culture is centered around **collaboration, innovation, and community engagement**. We encourage a diverse and inclusive environment where everyone can contribute to the world of machine learning. Our team values curiosity, creativity, and a commitment to open-source principles, supporting each other in a dynamic workplace that thrives on shared knowledge.

## Our Customers
With over **50,000 organizations**, including industry giants like **Google, Microsoft, Amazon**, and **Grammarly**, leveraging our platform, Hugging Face is a trusted partner in advancing AI technologies. Our commitment to quality and expertise in the field enhances our customers' capabilities in the digital landscape.

## Careers at Hugging Face
Join our team of innovators! We are continuously looking for passionate individuals eager to advance their careers in AI. Our job listings are frequently updated, and we offer positions that encompass various skills and expertise. Discover opportunities to grow with us in a supportive and collaborative environment.

## Connect with Us
Explore our offerings, contribute to the community, and elevate your AI projects with Hugging Face. 

- **Website**: [Hugging Face](https://huggingface.co)
- **Join Us**: [Careers](https://huggingface.co/jobs)
- **Community**: Engage with us on GitHub, Twitter, LinkedIn, and Discord.

Together, let’s build the future of AI. 

---

_This brochure encapsulates the essence of Hugging Face — your partner in transforming AI dreams into reality._

Finally - a minor improvement for typewriter animation 

In [22]:
def stream_brochure(company_name, url):
    stream = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
        stream=True
    )
    
    response = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        response += chunk.choices[0].delta.content or ''
        response = response.replace("```","").replace("markdown", "")
        update_display(Markdown(response), display_id=display_handle.display_id)

In [24]:
stream_brochure("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'blog', 'url': 'https://huggingface.co/blog'}, {'type': 'company page', 'url': 'https://www.linkedin.com/company/huggingface/'}, {'type': 'twitter page', 'url': 'https://twitter.com/huggingface'}, {'type': 'github page', 'url': 'https://github.com/huggingface'}, {'type': 'discussion forum', 'url': 'https://discuss.huggingface.co'}, {'type': 'status page', 'url': 'https://status.huggingface.co/'}]}


# Hugging Face Brochure

## About Us
**Hugging Face** is at the forefront of the AI community, dedicated to building the future of machine learning. As a collaborative platform, we provide tools and resources for developers, researchers, and businesses to contribute to the rapidly evolving field of AI.

## Our Offerings
### Collaboration at Its Best
- **Models**: Access over **1 million models** for various applications, including text, image, and audio processing.
- **Datasets**: Explore **250k+ datasets** to fuel your machine learning projects.
- **Spaces**: Innovate and share AI applications with our user-friendly environment.

### Enterprise Solutions
- Tailored solutions for organizations looking to implement AI with enterprise-grade security and dedicated support. Starting at **$20/user/month**.

### Open Source
Our commitment to open-source technology is embodied through projects like:
- **Transformers**: State-of-the-art AI models for PyTorch.
- **Tokenizers**: Fast tokenizers for optimized performance.
- **Diffusers**: Advanced diffusion models.

## Who We Serve
Our platform is utilized by over **50,000 organizations**, including major players such as:
- **Microsoft**
- **Google**
- **Amazon**
- **Meta** 

These companies leverage our tools to enhance their AI capabilities, setting the standard in the industry.

## Company Culture
At Hugging Face, we foster a vibrant and inclusive culture centering on collaboration and innovation. We believe in:
- **Community Engagement**: Bringing together a diverse group of AI enthusiasts to share knowledge and ideas.
- **Continuous Learning**: Encouraging personal and professional growth among our team members.
- **Open Collaboration**: Promoting transparent communication and creativity within the workforce.

## Careers at Hugging Face
We're always on the lookout for passionate individuals eager to contribute to the AI revolution. If you share our vision and desire to make an impact, join us! Explore current job openings and become a part of our dynamic team.

## Connect with Us
Stay updated on our latest developments:
- [GitHub](https://github.com/huggingface)
- [Twitter](https://twitter.com/huggingface)
- [LinkedIn](https://www.linkedin.com/company/huggingface)
- [Discord](https://discord.gg/)

Together, let’s build the future of machine learning!

---

**For more information, visit us at [Hugging Face](https://huggingface.co)**