In [1]:
import os
import json
from scrapper2 import get_contents, get_website_links

from dotenv import load_dotenv
from openai import OpenAI

In [2]:
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Step1: narrowing down to only releavant links

### building the prompts

In [3]:
link_system_prompt = """
You are provided with ta list of links found on a webpage.
You are able to decide which of the links would be most relevant to include ina a brochure about eh company, 
such as links to an About page, or a Company page, or  Careers/Jobs pages.
You should respond in JSON as in this example:

{
    "links": [
        {"type": "about page", "url":"https://full.url/goes/here/about"},
        {"type": "careers page", "url":"https://full.url/careers"},
    ]
}
"""


def get_links_user_prompt(url):
    user_prompt = f"""
    Here is the list of links on the website {url} -
    Please decide which of these are relevant web links for a brochure about the company,
    respond with the full https URL in JSON format.
    Do not include Terms of SErvice, Privacy, email links.

    Links (some might be relative links):
    """

    links = get_website_links(url)
    user_prompt += "\n".join(links)
    return user_prompt

### making the call to openai

In [4]:
def select_relevant_links(url):
    openai = OpenAI()
    response = openai.chat.completions.create(
        model="gpt-5-nano",
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(url)}
        ],
       response_format={"type":"json_object"}
    )

    result = response.choices[0].message.content
    links = json.loads(result)
    print(f"Found {len(links['links'])} relevant links")
    return links


#select_relevant_links("https://edwarddonner.com")

## Step2: making the brochure

In [5]:
def fetch_page_and_all_relevant_links(url):
    """
    this function iterates over the relevant links and get its contents
    """
    contents = get_contents("https://edwarddonner.com")
    relevant_links = select_relevant_links(url)
    result = f"## Landing page:\n\n{contents}\n## Relevant Links:\n"
    for link in relevant_links['links']:
        result += f"\n\n### Link: {link['type']}\n"
        result += get_contents(link["url"])
    return result

In [6]:
print(fetch_page_and_all_relevant_links("https://huggingface.co"))

Found 4 relevant links
## Landing page:

Home - Edward Donner

Home
Connect Four
Outsmart
An arena that pits LLMs against each other in a battle of diplomacy and deviousness
About
Posts
Well, hi there.
I‚Äôm Ed. I like writing code and experimenting with LLMs, and hopefully you‚Äôre here because you do too. I also enjoy DJing (but I‚Äôm badly out of practice), amateur electronic music production (
very
amateur) and losing myself in
Hacker News
, nodding my head sagely to things I only half understand.
I‚Äôm the co-founder and CTO of
Nebula.io
. We‚Äôre applying AI to a field where it can make a massive, positive impact: helping people discover their potential and pursue their reason for being. Recruiters use our product today to source, understand, engage and manage talent. I‚Äôm previously the founder and CEO of AI startup untapt,
acquired in 2021
.
We work with groundbreaking, proprietary LLMs verticalized for talent, we‚Äôve
patented
our matching model, and our award-winning platfor