In [77]:
# Importing libraries
from openai import OpenAI
import anthropic
from IPython.display import display,Markdown,update_display
from dotenv import load_dotenv
from bs4 import BeautifulSoup
import requests
import os
from typing import List
import json
import gradio as gr

In [78]:
# Load environment variables from the .env file
load_dotenv()
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')
os.environ['ANTHROPIC_API_KEY'] = os.getenv('ANTHROPIC_API_KEY', 'your-key-if-not-using-env')

In [79]:
# Initialize OpenAI and Claude clients
openai = OpenAI()
claude = anthropic.Anthropic()

In [80]:
# Website class for extracting title, text, and links from a webpage
class Website:
    link: List[str]
    text:str
    url :str
    body :str
    title: str
    
    def __init__(self,url):
        self.url = url
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        self.title = soup.title.string if soup.title else "no title"
        if soup.body:
            for irrelevant in soup.body(["script","style","img","input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator='\n', strip = True)
        else:
            self.text = ""
        self.links = [link.get('href') for link in soup.find_all('a') if link.get('href')]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}"

In [81]:
nvidia = Website("https://www.nvidia.com/en-me/")
nvidia.links

['https://www.nvidia.com',
 '#page-content',
 'https://www.nvidia.com/en-me/geforce/',
 'https://www.nvidia.com/en-me/geforce/graphics-cards/',
 'https://www.nvidia.com/en-me/geforce/laptops/',
 'https://www.nvidia.com/en-me/geforce/products/g-sync-monitors/',
 'https://www.nvidia.com/en-me/studio/laptops-desktops/',
 'https://www.nvidia.com/en-eu/shield/',
 'https://www.nvidia.com/en-me/ai-on-rtx/',
 'https://www.nvidia.com/en-me/geforce/laptops/',
 'https://www.nvidia.com/en-eu/design-visualization/desktop-graphics/',
 'https://www.nvidia.com/en-eu/design-visualization/rtx-professional-laptops/',
 'https://www.nvidia.com/en-eu/data-center/dgx-station/',
 'https://www.nvidia.com/en-eu/deep-learning-ai/solutions/data-science/workstations/',
 'https://www.nvidia.com/en-eu/data-center/',
 'https://www.nvidia.com/en-eu/data-center/grace-cpu/',
 'https://www.nvidia.com/en-eu/data-center/dgx-platform/',
 'https://www.nvidia.com/en-eu/data-center/products/egx/',
 'https://www.nvidia.com/en-e

In [82]:
link_system_prompt = """
You are provided with a list of links found on a website.
Your task is to decide which links are relevant for a company brochure, such as "About Us" pages, "Careers" pages, etc.
Please return the result in the following JSON format:

{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://full.url/goes/here/careers"},
        {"type": "products page", "url": "https://full.url/goes/here/products"}
    ]
}
"""


In [83]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url}. "
    user_prompt += "Please decide which of these links are relevant for a company brochure. "
    user_prompt += "Relevant links may include: About, Company, Careers, Products, Services, etc. "
    user_prompt += "Do not include terms of service, privacy policies, email addresses, or unrelated links. "
    user_prompt += "\n".join(website.links)
    return user_prompt

In [84]:
print(get_links_user_prompt(nvidia))

Here is the list of links on the website of https://www.nvidia.com/en-me/. Please decide which of these links are relevant for a company brochure. Relevant links may include: About, Company, Careers, Products, Services, etc. Do not include terms of service, privacy policies, email addresses, or unrelated links. https://www.nvidia.com
#page-content
https://www.nvidia.com/en-me/geforce/
https://www.nvidia.com/en-me/geforce/graphics-cards/
https://www.nvidia.com/en-me/geforce/laptops/
https://www.nvidia.com/en-me/geforce/products/g-sync-monitors/
https://www.nvidia.com/en-me/studio/laptops-desktops/
https://www.nvidia.com/en-eu/shield/
https://www.nvidia.com/en-me/ai-on-rtx/
https://www.nvidia.com/en-me/geforce/laptops/
https://www.nvidia.com/en-eu/design-visualization/desktop-graphics/
https://www.nvidia.com/en-eu/design-visualization/rtx-professional-laptops/
https://www.nvidia.com/en-eu/data-center/dgx-station/
https://www.nvidia.com/en-eu/deep-learning-ai/solutions/data-science/workst

In [85]:
def get_links(url):
    website = Website(url)
    completion = openai.chat.completions.create(
    model = 'gpt-4o-mini',
    messages=[
        {"role": "system", "content": link_system_prompt},
        {"role": "user", "content": get_links_user_prompt(website)}
    ],
        response_format={"type": "json_object"}
    )
    result = completion.choices[0].message.content
    return json.loads(result)

In [86]:
get_links("https://www.nvidia.com/en-me/")

{'links': [{'type': 'about page',
   'url': 'https://www.nvidia.com/en-us/about-nvidia/'},
  {'type': 'careers page',
   'url': 'https://www.nvidia.com/en-us/about-nvidia/careers/'},
  {'type': 'about page', 'url': 'https://www.nvidia.com/en-eu/about-nvidia/'},
  {'type': 'careers page',
   'url': 'https://www.nvidia.com/en-eu/about-nvidia/careers/'},
  {'type': 'products page', 'url': 'https://www.nvidia.com/en-me/geforce/'},
  {'type': 'products page', 'url': 'https://www.nvidia.com/en-us/gtc'},
  {'type': 'products page',
   'url': 'https://www.nvidia.com/en-gb/lp/high-performance-computing/hpc-ai-cloud-computing-ebook'},
  {'type': 'company policies page',
   'url': 'https://www.nvidia.com/en-eu/about-nvidia/company-policies/'}]}

In [87]:
def get_all_details(url):
    result = "Landing page:\n"
    result+= Website(url).get_contents()
    links = get_links(url)
    print("Found links:", links)
    for link in links['links']:
        result+= f"{link['type']}"
        result+= Website(link['url']).get_contents()
    return result

In [88]:
# get_all_details("https://www.nvidia.com/en-me/")

In [89]:
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
Include details of company culture, customers and careers/jobs if you have the information."

In [90]:
def get_brochure_user_prompt(company_name, url, model, tone):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short {tone.lower()} brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:20_000] # Truncate if more than 20,000 characters
    return user_prompt

In [91]:
def generate_brochure(company_name, url, model, tone):
    # Initialize response variable
    response = ""
    
    # Get the user prompt for the brochure
    user_prompt = get_brochure_user_prompt(company_name, url, model, tone)
    
    if model == 'GPT':
        # Get the full response from GPT (no streaming)
        result = openai.chat.completions.create(
            model='gpt-4o-mini',
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
        )
        # Extract the content of the response
        response = result.choices[0].message.content
    
    elif model == 'Claude':
        # Get the full response from Claude (no streaming)
        result = claude.messages.create(
            model='claude-3-haiku-20240307',
            max_tokens=1000,
            temperature=0.7,
            system=system_prompt,
            messages=[
                {"role": "user", "content": user_prompt}
            ]
        )
        # Extract the content of the response
        response = result.content[0].text
    
    # Return the response directly for Gradio to render
    return response

In [92]:
view = gr.Interface(
    fn=generate_brochure,
    inputs=[
        gr.Textbox(label="Company name"),
        gr.Textbox(label="Landing page URL including http:// or https://"),
        gr.Dropdown(choices=["GPT", "Claude"], label="Select Model"),
        gr.Textbox(label="Tone (e.g., Professional, Friendly, etc.)")
    ],
    outputs=gr.Markdown(label="Brochure:"),
    flagging_mode="never"
)
view.launch(share=True)

* Running on local URL:  http://127.0.0.1:7888

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.


