In [1]:
import os
import json
from scrapper2 import get_contents, get_website_links

from dotenv import load_dotenv
from openai import OpenAI

In [2]:
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Step1: narrowing down to only releavant links

### building the prompts

In [3]:
link_system_prompt = """
You are provided with ta list of links found on a webpage.
You are able to decide which of the links would be most relevant to include ina a brochure about eh company, 
such as links to an About page, or a Company page, or  Careers/Jobs pages.
You should respond in JSON as in this example:

{
    "links": [
        {"type": "about page", "url":"https://full.url/goes/here/about"},
        {"type": "careers page", "url":"https://full.url/careers"},
    ]
}
"""


def get_links_user_prompt(url):
    user_prompt = f"""
    Here is the list of links on the website {url} -
    Please decide which of these are relevant web links for a brochure about the company,
    respond with the full https URL in JSON format.
    Do not include Terms of SErvice, Privacy, email links.

    Links (some might be relative links):
    """

    links = get_website_links(url)
    user_prompt += "\n".join(links)
    return user_prompt

### making the call to openai

In [4]:
def select_relevant_links(url):
    openai = OpenAI()
    response = openai.chat.completions.create(
        model="gpt-5-nano",
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(url)}
        ],
       response_format={"type":"json_object"}
    )

    result = response.choices[0].message.content
    links = json.loads(result)
    print(f"Found {len(links['links'])} relevant links")
    return links


#select_relevant_links("https://edwarddonner.com")

## Step2: making the brochure

In [5]:
def fetch_page_and_all_relevant_links(url):
    """
    this function iterates over the relevant links and get its contents
    """
    contents = get_contents("https://edwarddonner.com")
    relevant_links = select_relevant_links(url)
    result = f"## Landing page:\n\n{contents}\n## Relevant Links:\n"
    for link in relevant_links['links']:
        result += f"\n\n### Link: {link['type']}\n"
        result += get_contents(link["url"])
    return result

In [6]:
#print(fetch_page_and_all_relevant_links("https://huggingface.co"))

## Step3: creating the prompts

In [7]:
brochure_system_prompt = """
You are an assistant that analyzes the contents of sevral relevant pages from a company website
and creates a short brochure about the company for prospective customers, investor and recuits.
Respond in markdown without code blocks.
Include details of company culture, customers and careers/jobs if you the information
"""

## or uncomment the lines below for a more humorous brochure - this demonstrates how easy it is to incorporate the 'tone':

# brochure_system_prompt = """
# You are an assistant that analyzes the contents of sevral relevant pages from a company website
# and creates a short, humorous , entertaining, witty brochure about the company for prospective customers, investor and recuits.
# Respond in markdown without code blocks.
# Include details of company culture, customers and careers/jobs if you the information
# """


In [8]:
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"""
    You are looking at a company called: {company_name}
    Here are the contents of its landing page and other relevant pages;
    use this information to build a short brochure of the company in markdown without code blocks.\n\n
    """

    user_prompt  += fetch_page_and_all_relevant_links(url)
    user_prompt = user_prompt[:5_000] # Truncate if more than 5000 characters
    return user_prompt

In [9]:
# checking how the user_prompt is looking like
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

Found 3 relevant links


'\n    You are looking at a company called: HuggingFace\n    Here are the contents of its landing page and other relevant pages;\n    use this information to build a short brochure of the company in markdown without code blocks.\n\n\n    ## Landing page:\n\nHome - Edward Donner\n\nHome\nConnect Four\nOutsmart\nAn arena that pits LLMs against each other in a battle of diplomacy and deviousness\nAbout\nPosts\nWell, hi there.\nI’m Ed. I like writing code and experimenting with LLMs, and hopefully you’re here because you do too. I also enjoy DJing (but I’m badly out of practice), amateur electronic music production (\nvery\namateur) and losing myself in\nHacker News\n, nodding my head sagely to things I only half understand.\nI’m the co-founder and CTO of\nNebula.io\n. We’re applying AI to a field where it can make a massive, positive impact: helping people discover their potential and pursue their reason for being. Recruiters use our product today to source, understand, engage and manage 