In [1]:
import os
import json
import requests
import openai
from typing import Any, Dict, List, Optional
from dotenv import load_dotenv
from IPython.display import Markdown, display, update_display
from bs4 import BeautifulSoup

In [33]:
# Pull variables from a local .env file into the process environment.
load_dotenv()

# Fail fast when the OpenAI credential is missing or empty.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise ValueError("Please set the OPENAI_API_KEY environment variable.")

# Chat model used by the completion calls in this notebook.
MODEL = "gpt-4o-mini"

# Request headers passed to requests.get when pages are scraped.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
}

In [34]:
class Website:
    """A scraped web page: its title, visible text and outgoing links.

    Attributes:
        url: The address that was fetched.
        title: Text of the ``<title>`` tag, or ``'No title'`` when it is
            missing or has no single string.
        body: Raw response payload (bytes) returned by ``requests``.
        links: Non-empty ``href`` values of every ``<a>`` tag, in document
            order. Relative links and duplicates are kept as found.
        text: Text of ``<body>`` after script/style/img/input elements
            have been removed; ``''`` when the page has no body.
    """
    url: str
    title: str
    body: bytes
    links: List[str]
    text: str

    def __init__(self, url, timeout=30):
        """Download ``url`` and parse it with BeautifulSoup.

        Args:
            url: Page to fetch.
            timeout: Seconds ``requests`` waits on the server. Without a
                timeout a stalled host blocks the notebook indefinitely.

        Raises:
            requests.RequestException: on connection failure, timeout or an
                invalid URL (HTTP error statuses are not raised).
        """
        self.url = url
        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        # .string is None when <title> is empty or contains nested tags.
        self.title = (soup.title.string if soup.title else None) or 'No title'
        if soup.body:
            # Drop elements that carry no readable text before extracting it.
            for irrelevant in soup(['script', 'style', 'img', 'input']):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator='\n', strip=True)
        else:
            self.text = ''
        self.links = self._drop_empty_links(
            anchor.get('href') for anchor in soup.find_all('a')
        )

    @staticmethod
    def _drop_empty_links(hrefs):
        """Return ``hrefs`` as a list without ``None`` or empty entries.

        Anchors without an ``href`` attribute yield ``None``; keeping them
        breaks any later ``str.join`` over the links.
        """
        return [href for href in hrefs if href]

    def get_contents(self):
        """Return the title and text formatted as one prompt-ready string."""
        # Built on one line: a backslash continuation inside the literal
        # would embed the source indentation in the returned text.
        return f"Webpage Title: \n{self.title}\nWebpage Contents: \n{self.text}\n\n"

In [None]:
# Scrape a sample oreilly.com page and show the links collected from it.
# `jp` is reused by a later cell that previews the link-selection prompt.
jp = Website('https://www.oreilly.com/pub/au/7412')
jp.links

['#maincontent',
 'https://www.oreilly.com/online-learning/teams.html',
 'https://www.oreilly.com/online-learning/government.html',
 'https://www.oreilly.com/online-learning/academic.html',
 'https://www.oreilly.com/online-learning/individuals.html',
 'https://www.oreilly.com/content-marketing-solutions.html',
 'https://www.oreilly.com',
 'https://www.oreilly.com/online-learning/teams.html',
 'https://www.oreilly.com/online-learning/government.html',
 'https://www.oreilly.com/online-learning/academic.html',
 'https://www.oreilly.com/online-learning/individuals.html',
 'https://www.oreilly.com/content-marketing-solutions.html',
 '/search/skills',
 'https://www.oreilly.com/online-learning/features.html',
 'https://www.oreilly.com/online-learning/features.html',
 'https://www.oreilly.com/online-learning/courses.html',
 'https://www.oreilly.com/online-learning/feature-certification.html',
 'https://www.oreilly.com/online-learning/intro-interactive-learning.html',
 'https://www.oreilly.com/

In [9]:
# System prompt for the link-selection call. The example must be valid JSON
# because the request asks the model for a JSON object in this shape.
link_system_prompt = """
    You are provided with a list of links found on a webpage.
    You are able to decide which of the links would be most relevant to include in a brochure about company.
    such as links to an About page, a Company page, or Careers/Jobs page.\n
    You should respond in JSON as in this example:\n
    {
        "links": [
            {"type": "about page", "url": "https://example.com/about"},
            {"type": "careers page", "url": "https://example.com/careers"}
        ]
    }
"""

In [10]:
def get_links_user_prompt(website):
    """Build the user message listing every link found on ``website``.

    Args:
        website: Object exposing ``url`` (str) and ``links`` (iterable of
            href values, which may contain ``None`` or empty entries).

    Returns:
        The instruction text followed by the links, one per line.
    """
    user_prompt = f"""Here is the list of links on the website {website.url}\n
    please decide which of the links would be most relevant to include in a brochure about company.
    respond with the full https:// url of the link.
    do not include terms of Service, Privacy Policy, or any other irrelevant links.\n\n
    """
    # Anchors without an href produce None, which str.join rejects with a
    # TypeError; skip those and empty strings.
    user_prompt += "\n".join(link for link in website.links if link)
    return user_prompt

In [11]:
# Preview the link-selection user prompt built from the page scraped into `jp`.
print(get_links_user_prompt(jp))

Here is the list of links on the website https://www.oreilly.com/pub/au/7412

    please decide which of the links would be most relevant to include in a brochure about company.
    respond with the full https:// url of the link.
    do not include terms of Service, Privacy Policy, or any other irrelevant links.


    #maincontent
https://www.oreilly.com/online-learning/teams.html
https://www.oreilly.com/online-learning/government.html
https://www.oreilly.com/online-learning/academic.html
https://www.oreilly.com/online-learning/individuals.html
https://www.oreilly.com/content-marketing-solutions.html
https://www.oreilly.com
https://www.oreilly.com/online-learning/teams.html
https://www.oreilly.com/online-learning/government.html
https://www.oreilly.com/online-learning/academic.html
https://www.oreilly.com/online-learning/individuals.html
https://www.oreilly.com/content-marketing-solutions.html
/search/skills
https://www.oreilly.com/online-learning/features.html
https://www.oreilly.com/

In [12]:
def get_links(url):
    """Ask the chat model which links on ``url`` belong in a brochure.

    The page is scraped, its links are sent to the model together with the
    link-selection system prompt, and the reply is requested as a JSON object.

    Args:
        url: Address of the page whose links are evaluated.

    Returns:
        The model's reply decoded with ``json.loads``.
    """
    page = Website(url)
    conversation = [
        {"role": "system", "content": link_system_prompt},
        {"role": "user", "content": get_links_user_prompt(page)},
    ]
    reply = openai.chat.completions.create(
        model=MODEL,
        messages=conversation,
        response_format={"type": "json_object"},
    )
    return json.loads(reply.choices[0].message.content)

In [13]:
# Run link selection on the sample page; the cell shows the decoded JSON reply.
get_links('https://www.oreilly.com/pub/au/7412')

{'links': [{'type': 'about page', 'url': 'https://www.oreilly.com/about/'},
  {'type': 'careers page', 'url': 'https://www.oreilly.com/careers/'},
  {'type': 'contact page',
   'url': 'https://www.oreilly.com/about/contact.html'},
  {'type': 'press page', 'url': 'https://www.oreilly.com/press/'},
  {'type': 'diversity page', 'url': 'https://www.oreilly.com/diversity/'}]}

In [24]:
def get_all_details(url):
    """Collect the text of the landing page and of each model-selected link.

    Args:
        url: Landing page of the site.

    Returns:
        One string: the landing-page contents followed by a section per
        selected link, each headed by the link's ``type``.

    Links that cannot be fetched are reported with ``print`` and skipped, so
    one bad URL does not discard the pages already collected.
    """
    from urllib.parse import urljoin  # local import keeps this cell self-contained

    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Found links:", links)

    # The reply is model-generated JSON: tolerate a missing "links" key and
    # entries that lack a usable "url".
    for link in links.get('links') or []:
        target = link.get('url') if isinstance(link, dict) else None
        if not target:
            continue
        # Resolve relative hrefs against the landing page; absolute URLs
        # pass through urljoin unchanged.
        target = urljoin(url, target)
        try:
            page = Website(target)
        except requests.RequestException as error:
            print(f"Skipping {target}: {error}")
            continue
        result += f"\n\n{link.get('type', 'page')}\n"
        result += page.get_contents()
    return result

In [25]:
# Gather the sample page's text plus the text of every link the model selected.
get_all_details('https://www.oreilly.com/pub/au/7412')

Found links: {'links': [{'type': 'about page', 'url': 'https://www.oreilly.com/about/'}, {'type': 'careers page', 'url': 'https://www.oreilly.com/careers/'}, {'type': 'company page', 'url': 'https://www.oreilly.com/about/oreilly-approach-to-generative-ai.html'}]}


"Landing page:\nWebpage Title: \nJose Portilla\n            Webpage Contents: \nSkip to Main Content\nFor Enterprise\nFor Government\nFor Higher Ed\nFor Individuals\nFor Sponsorship\nFor Enterprise\nFor Government\nFor Higher Ed\nFor Individuals\nFor Sponsorship\nExplore Skills\nFeatures\nAll Features\nCourses\nCertifications\nInteractive Learning\nLive Events\nAnswers\nInsights Reporting\nRadar Blog\nClose\nSearch\nPlans\nSign In\nTry Now\nO’Reilly Platform\nJose Portilla\nLive online courses, books, and videos on O’Reilly\nVideos\nSee all\nAnalyzing Data Using Spark 2.0 DataFrames With Python\nBio\nJose Marcial Portilla has a BS and MS in Mechanical Engineering from Santa Clara University and years of experience as a professional instructor and trainer for Data Science and programming. He has publications and patents in various fields such as microfluidics, materials science, and data science technologies. Over the course of his career he has developed a skill set in analyzing data a

In [26]:
# System prompt for the brochure-writing calls. Adjacent literals are used
# instead of backslash continuations so the spacing between sentences is
# explicit in each piece.
system_prompt = (
    "You are an assistant that analyzes the contents of several relevant pages from a company website "
    "and creates a short brochure about the company for prospective customers, investors and recruits. "
    "Respond in markdown. "
    "Include details of company culture, customers and careers/jobs if you have the information."
)

In [27]:
def get_brochure_user_prompt(company_name, url, max_chars=5_000):
    """Build the user message for the brochure-writing call.

    Args:
        company_name: Name inserted into the opening line of the prompt.
        url: Landing page passed to ``get_all_details`` to collect page text.
        max_chars: Upper bound on the returned prompt length; the prompt is
            cut at this many characters. Pass ``None`` to disable truncation.

    Returns:
        The prompt text, at most ``max_chars`` characters long.
    """
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    # Slicing with None keeps the whole string, so None means "no limit".
    return user_prompt[:max_chars]

In [28]:
# Preview the (truncated) brochure prompt assembled for huggingface.co.
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'discussion page', 'url': 'https://discuss.huggingface.co'}, {'type': 'LinkedIn page', 'url': 'https://www.linkedin.com/company/huggingface/'}]}


'You are looking at a company called: HuggingFace\nHere are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\nLanding page:\nWebpage Title: \nHugging Face – The AI community building the future.\n            Webpage Contents: \nHugging Face\nModels\nDatasets\nSpaces\nPosts\nDocs\nEnterprise\nPricing\nLog In\nSign Up\nThe AI community building the future.\nThe platform where the machine learning community collaborates on models, datasets, and applications.\nExplore AI Apps\nor\nBrowse 1M+ models\nTrending on\nthis week\nModels\nnari-labs/Dia-1.6B\nUpdated\n2 days ago\n•\n80.3k\n•\n1.47k\nQwen/Qwen3-235B-A22B\nUpdated\nabout 4 hours ago\n•\n367\nsand-ai/MAGI-1\nUpdated\n2 days ago\n•\n489\nostris/Flex.2-preview\nUpdated\n4 days ago\n•\n4.82k\n•\n277\nmicrosoft/bitnet-b1.58-2B-4T\nUpdated\nabout 21 hours ago\n•\n35.9k\n•\n866\nBrowse 1M+ models\nSpaces\nRunning\non\nZero\n910\n910\nDia 1.6B\n👯\nGenerate r

In [29]:
def create_brochure(company_name, url):
    """Generate a brochure for ``company_name`` and render it as Markdown.

    Args:
        company_name: Name of the company, passed through to the user prompt.
        url: Landing page from which the prompt content is gathered.

    The model's reply is displayed in the notebook; nothing is returned.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
    ]
    completion = openai.chat.completions.create(model=MODEL, messages=conversation)
    brochure_markdown = completion.choices[0].message.content
    display(Markdown(brochure_markdown))

In [30]:
# Generate and render the brochure for huggingface.co in a single response.
create_brochure("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'community page', 'url': 'https://discuss.huggingface.co'}, {'type': 'social media page', 'url': 'https://www.linkedin.com/company/huggingface/'}]}


# Hugging Face Brochure

## Overview
Welcome to **Hugging Face**, the vibrant AI community dedicated to building the future of machine learning. Our platform fosters collaboration among its users, providing a space to explore and develop models, datasets, and innovative applications. With over **1 million models** and **250k+ datasets**, we empower developers and researchers to accelerate their machine learning projects.

## Company Culture
At Hugging Face, we believe in the power of community. Our culture is defined by collaboration, innovation, and openness. We are committed to democratizing machine learning, making it accessible and useful for everyone. We value contributions from all members of our community and encourage sharing knowledge and resources.

### Our Mission
- **Democratize Machine Learning:** We strive to make state-of-the-art machine learning resources accessible to everyone.
- **Foster Collaboration:** By providing a collaborative platform, we enable users to share their work and build collective knowledge.
- **Innovate Responsibly:** We are committed to ethical AI development and creating tools that positively impact the world.

## Products & Services
- **Models & Datasets:** Browse and utilize a vast library of machine learning models and datasets for your projects.
- **Enterprise Solutions:** With enterprise-grade security and dedicated support, we offer a tailored platform for organizations to build advanced AI solutions.
- **Compute Services:** Deploy and optimize applications with our flexible compute resources, starting at $0.60/hour for GPU access.
  
## Customers
Over **50,000 organizations** trust Hugging Face, including:
- **AI2** (Non-profit)
- **Meta**
- **Amazon**
- **Google**
- **Microsoft**
- **Grammarly**

These organizations leverage our platform to access state-of-the-art AI tools, contributing to a collective growth in the field of machine learning.

## Careers at Hugging Face
Join our mission to advance machine learning and AI! We are continuously on the lookout for talented individuals who share our values. Working at Hugging Face means being part of an enthusiastic and innovative team, where you can contribute to meaningful projects that make a real difference.

### Why Work with Us?
- Inspiring mission and values centered around community and democratization.
- Opportunities for professional growth and skill enhancement.
- Supportive environment that encourages experimentation and creativity.

## Get Involved
Whether you're a developer, researcher, or an enthusiast, there are many ways to engage with the Hugging Face community:
- **Explore Models & Apps:** Discover and utilize thousands of models and applications.
- **Contribute to Open Source:** Join our open-source initiatives and help improve ML tools and libraries.
- **Join our Community:** Follow us on [Twitter](https://twitter.com/huggingface), [LinkedIn](https://www.linkedin.com/company/huggingface), and join our discussions on [Discord](https://discord.com/invite/huggingface).

Together, we can shape the future of machine learning. Welcome to Hugging Face!

In [31]:
def _strip_markdown_fence(text):
    """Remove a wrapping code fence (three backticks, optionally tagged markdown/md) from ``text``.

    Only a fence on the first line, and its matching closing fence at the
    very end, are removed. Text that does not start with such a fence is
    returned unchanged, so code blocks and the word "markdown" inside the
    content survive.
    """
    first_line, _, rest = text.lstrip().partition("\n")
    if first_line.strip() not in ("```", "```markdown", "```md"):
        return text
    rest = rest.rstrip()
    if rest.endswith("```"):
        rest = rest[:-3].rstrip()
    return rest


def stream_brochure(company_name, url):
    """Generate a brochure and render it incrementally as the model streams.

    Args:
        company_name: Name of the company, passed through to the user prompt.
        url: Landing page from which the prompt content is gathered.

    The displayed Markdown is refreshed after every received chunk; nothing
    is returned.
    """
    stream = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
        stream=True
    )

    response = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        # A streamed chunk can arrive without any choices; skip it rather
        # than fail on choices[0].
        if not chunk.choices:
            continue
        response += chunk.choices[0].delta.content or ''
        # Keep the raw text intact and clean only what is shown, so brochure
        # content containing "markdown" or code fences is not altered.
        update_display(Markdown(_strip_markdown_fence(response)), display_id=display_handle.display_id)

In [32]:
# Generate the huggingface.co brochure again, this time rendering it as it streams.
stream_brochure("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'discussion page', 'url': 'https://discuss.huggingface.co'}, {'type': 'linkedin page', 'url': 'https://www.linkedin.com/company/huggingface/'}]}


# Hugging Face Brochure

---

**Welcome to Hugging Face**  
*The AI community building the future.*

At Hugging Face, we empower the machine learning community to create, discover, and collaborate on innovative models, datasets, and applications. With our user-friendly platform, members can easily access over a million AI models and applications designed to accelerate machine learning workflows.

---

### **What We Offer**

- **Models:** Access and collaborate on 1M+ machine learning models covering various tasks, from text and image processing to audio and 3D.
- **Datasets:** Explore and utilize 250k+ diverse datasets catered to various ML tasks, enabling enhanced model training and validation.
- **Spaces:** Create and share AI applications seamlessly, enhancing your portfolio and gaining visibility within the community through our hosted applications.
- **Compute Solutions:** Deploy optimized inference endpoints or increase your app's performance with GPU support, starting at just $0.60/hour.
- **Enterprise Solutions:** Harness enterprise-grade security, dedicated support, and customized solutions for teams starting from $20/user/month.

---

### **Company Culture**

At Hugging Face, we embody a collaborative and inclusive culture that emphasizes open-source innovation. We believe in democratizing AI and making it accessible to everyone. Our passionate community comprises over 50,000 organizations, including notable names like Google, Amazon, Microsoft, and Grammarly, fostering an environment ripe for creativity and learning.

---

### **Our Customer Base**

Hugging Face serves a wide range of clients from **startups** to **Fortune 500 companies**. Our platform facilitates collaboration among:
- Researchers and students developing cutting-edge AI models.
- Enterprises seeking robust machine learning solutions and support.
- Developers looking to enhance their applications through advanced ML capabilities.

---

### **Careers at Hugging Face**

Join us in our mission to redefine machine learning! We are constantly on the lookout for talented individuals who share our passion for technology and innovation. We offer a dynamic work environment enriched with opportunities for professional growth and collaboration.

#### **Why Work with Us?**
- **Impactful Projects:** Contribute to significant advancements in AI and machine learning.
- **Collaborative Team:** Work alongside some of the leading experts in the field.
- **Diverse Opportunities:** Explore a range of roles from engineering and data science to product management and community engagement.

Explore our current job openings and be a part of the AI revolution!

---

### **Get Involved!**

Join our community today and start exploring what Hugging Face has to offer. Together, let’s build the future of artificial intelligence!

- **Visit:** [Hugging Face](https://huggingface.co)
- **Connect:** Follow us on GitHub, Twitter, LinkedIn, and Discord for updates and collaboration opportunities.

---

**Hugging Face: Where AI community meets innovation.**