<a href="https://colab.research.google.com/github/micah-shull/LLMs/blob/main/LLM_008_webpage_features_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install Libraries

In [3]:
# !pip install python-dotenv
# !pip install openai
# !pip install google-generativeai
# !pip install anthropic
# !pip install gradio

### Import Libraries

In [4]:
import os
from dotenv import load_dotenv
from openai import OpenAI
import gradio as gr
import requests
import json
from typing import List
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display

### Write API Keys to .env file

### Load Environment Variables

In [13]:
# Load the environment variables from the .env file
load_dotenv('/content/API_KEYS.env')  # Ensure this is the correct path to your file

# Get the API keys from the environment
openai_api_key = os.getenv("OPENAI_API_KEY")
anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
google_api_key = os.getenv("GOOGLE_API_KEY")

# Check if the keys are loaded correctly and print a portion of them
if openai_api_key:
    print(f"OpenAI API Key loaded: {openai_api_key[0:10]}...")  # Only print part of the key
else:
    print("OpenAI API key not loaded correctly.")

if anthropic_api_key:
    print(f"Anthropic API Key loaded: {anthropic_api_key[0:10]}...")
else:
    print("Anthropic API key not loaded correctly.")

if google_api_key:
    print(f"Google API Key loaded: {google_api_key[0:10]}...")
else:
    print("Google API key not loaded correctly.")

OpenAI API Key loaded: sk-proj-e1...
Anthropic API Key loaded: sk-ant-api...
Google API Key loaded: AIzaSyDh3a...


In [14]:
import openai
import anthropic
import google.generativeai

# Connect to OpenAI
openai.api_key = openai_api_key  # Set OpenAI API key

# Connect to Anthropic (Claude)
claude = anthropic.Anthropic(api_key=anthropic_api_key)  # Set Anthropic API key

# Connect to Google Generative AI
google.generativeai.configure(api_key=google_api_key)  # Set Google API key

# Extract Webpage Features Script

In [15]:
script_content = r"""

import os
from dotenv import load_dotenv
from openai import OpenAI
import openai
import gradio as gr
import requests
import json
from typing import List
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from IPython.display import Markdown, display, update_display

# Load the environment variables from the .env file
load_dotenv('/content/API_KEYS.env')  # Ensure this is the correct path to your file

# Get the API keys from the environment
openai_api_key = os.getenv("OPENAI_API_KEY")

# Connect to OpenAI
openai.api_key = openai_api_key  # Set OpenAI API key

MODEL = 'gpt-4o-mini'

class Website:
    def __init__(self, url: str):
        self.url = url
        self.title = ""
        self.text = ""
        self.links = []
        self._fetch_webpage()

    def _fetch_webpage(self):
        try:
            response = requests.get(self.url)
            response.raise_for_status()  # Raises an HTTPError for bad responses
            soup = BeautifulSoup(response.content, 'html.parser')

            # Set the title
            self.title = soup.title.string if soup.title else "No title found"

            # Process the body text
            if soup.body:
                for irrelevant in soup.body(["script", "style", "img", "input"]):
                    irrelevant.decompose()
                self.text = soup.body.get_text(separator="\n", strip=True)

            # Collect all absolute links
            self.links = self._extract_links(soup)

        except requests.exceptions.RequestException as e:
            print(f"Error fetching {self.url}: {e}")

    def _extract_links(self, soup: BeautifulSoup) -> List[str]:
        links = [link.get('href') for link in soup.find_all('a', href=True)]
        absolute_links = [urljoin(self.url, link) for link in links if link]
        return absolute_links

    def get_contents(self) -> str:
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"


link_system_prompt = '''
You are provided with a list of links found on a webpage.
Your task is to decide which of the links would be most relevant to include in a brochure about the most popular products or services the company offers.
These could include links to an About page, exciting adventure vacation rental options, unique experiences, fun things to do, etc..

Respond in JSON format, as shown in the following example:

{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/rentals"},
        {"type": "contact page", "url": "https://full.url/goes/here/adventure"}
    ]
}

If no relevant links are found, return an empty list.
'''

def get_links_user_prompt(website):
    # Step 1: Initialize the user_prompt string with an introduction and context about the website
    user_prompt = f"Here is the list of links from the website {website.url} - "

    # Step 2: Add additional instructions to the user_prompt about what to do with the links
    user_prompt += (
        "Please decide which of these are relevant web links for a brochure "
        "highlighting the company’s best and most popular offerings. Respond with the full https URL in JSON format. "
        "Do not include links related to Terms of Service, Privacy, or email links.\n"
    )

    # Step 3: Add more context, noting that some links may be relative
    user_prompt += "Links (some might be relative):\n"

    # Step 4: Append the actual list of links to the user_prompt, joined by newlines
    user_prompt += "\n".join(website.links)

    # Step 5: Return the complete user_prompt
    return user_prompt




def get_links(url):
    # Step 1: Create a Website object to extract the links from the webpage
    website = Website(url)

    # Step 2: Call the OpenAI chat.completions API with a system prompt and user prompt
    completion = openai.chat.completions.create(
        model=MODEL,  # Specify the OpenAI model to use (e.g., "gpt-3.5-turbo", "gpt-4")
        messages=[
            {"role": "system", "content": link_system_prompt},  # System prompt: sets the assistant's behavior
            {"role": "user", "content": get_links_user_prompt(website)}  # User prompt: tells the model what task to do with the links
        ],
        response_format={"type": "json_object"}  # Ensures that the response is returned as a JSON object
    )

    # Step 3: Extract the result (the relevant links) from the model's response
    result = completion.choices[0].message.content

    # Step 4: Parse the result (in JSON format) and return it as a Python dictionary
    return json.loads(result)


def get_all_details(url):
    # Step 1: Create a list to accumulate the result contents
    result_parts = []

    # Step 2: Add the landing page content
    landing_page = Website(url)
    result_parts.append("Landing page:\n")
    result_parts.append(landing_page.get_contents())

    # Step 3: Get relevant links from the landing page
    links = get_links(url)
    print("Found links:", links)  # Debug: Print the found links

    # Step 4: Loop through each relevant link, fetch its content, and add it to the result
    for link in links.get("links", []):  # Ensure links are available
        try:
            result_parts.append(f"\n\n{link['type']}\n")
            linked_page = Website(link["url"])  # Create a Website object for each link
            result_parts.append(linked_page.get_contents())
        except Exception as e:
            print(f"Error fetching {link['url']}: {e}")  # Error handling for bad links

    # Step 5: Join the result_parts list into a single string and return it
    return ''.join(result_parts)

def get_brochure_user_prompt(company_name, url):
    # Step 1: Create a multi-line string for clarity
    user_prompt = f'''
    You are looking at a company called: {company_name}
    Here are the contents of its landing page and other relevant pages.
    Use this information to build a short brochure of the company in markdown.
    '''

    # Step 2: Append the details from the landing page and relevant links
    user_prompt += get_all_details(url)

    # Step 3: Truncate the prompt if it exceeds 20,000 characters to avoid API limits
    # The OpenAI API can have limits on prompt length, so we truncate it if it's too long
    user_prompt = user_prompt[:20_000]

    # Step 4: Return the final user prompt
    return user_prompt

# Professional tone brochure prompt
system_prompt = '''
You are an assistant that analyzes the contents of several relevant pages from a company website
and creates a short brochure about the company for prospective customers. Respond in markdown.
Include details about exciting products and services available and highlight the most fun and popular options.
'''

def create_brochure(company_name, url):
    # Step 1: Call the OpenAI API using the specified model and prompts
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},  # Provide the system prompt
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}  # Provide the user prompt
        ],
    )

    # Step 2: Extract the generated brochure from the API response
    result = response.choices[0].message.content

    # Step 3: Display the result in markdown format for better presentation
    display(Markdown(result))

    return result

# -----------  create brochure and save to json file ------------#

def save_brochure_to_json(brochure_data, json_filename='brochure_data.json'):
    if brochure_data is None:
        print("No brochure data to save.")
        return

    try:
        # Check if the file already exists
        if os.path.exists(json_filename):
            with open(json_filename, 'r+') as file:
                # Load existing data and append new brochure data
                data = json.load(file)
                data.append(brochure_data)
                file.seek(0)
                json.dump(data, file, indent=4)
        else:
            # Create a new file if it doesn't exist
            with open(json_filename, 'w') as file:
                json.dump([brochure_data], file, indent=4)

        print(f"Brochure saved to {json_filename}")

    except Exception as e:
        print(f"Error saving brochure to {json_filename}: {e}")


def load_brochure_data(json_filename):
    '''Loads the brochure data from a JSON file.'''
    try:
        with open(json_filename, 'r') as file:
            data = json.load(file)
            return data
    except FileNotFoundError:
        print(f"File {json_filename} not found.")
        return None
    except json.JSONDecodeError:
        print(f"Error decoding JSON from {json_filename}")
        return None


def format_for_chatbot(brochure_data):
    '''Formats the brochure data into a user-friendly prompt for the chatbot.'''
    if not brochure_data:
        return "No data available."

    # Assume brochure_data is a list of dictionaries
    user_prompt = ""
    for company in brochure_data:
        user_prompt += f"Company Name: {company['company_name']}\n"
        user_prompt += f"Website: {company['url']}\n"
        user_prompt += "Brochure Content:\n"
        user_prompt += f"{company['brochure']}\n\n"

    # Truncate if too long
    return user_prompt[:20_000]  # Ensure the prompt stays within the token limit


"""
with open('webpage_extraction_utils.py', 'w') as file:
    file.write(script_content)

print("webpage_extraction_utils.py created successfully")
# Reload script to make functions available for use
import importlib
import webpage_extraction_utils
importlib.reload(webpage_extraction_utils)

from webpage_extraction_utils import *




webpage_extraction_utils.py created successfully


### Kayak Rental in Hawaii

In [19]:
from webpage_extraction_utils import Website, get_links, get_all_details, create_brochure, save_brochure_to_json

url = 'https://www.kailuabeachadventures.com/kailua-lanikai-kayak-rentals'
kayaks = Website(url)
# print(kayaks.get_contents()[0:101])
# print("Links:",kayaks.links)

# get_links(url)
# get_all_details(url)
# create_brochure("Kailua Kayak Rental", url)
# Generate the brochure content
brochure_data = create_brochure("Kailua Beach Adventures", url)

# Save the brochure to JSON
save_brochure_to_json(brochure_data)


Found links: {'links': [{'type': 'home page', 'url': 'https://www.kailuabeachadventures.com/'}, {'type': 'tours and lessons', 'url': 'https://www.kailuabeachadventures.com/tours-lessons'}, {'type': 'kayak tours', 'url': 'https://www.kailuabeachadventures.com/kailua-lanikai-kayak-tours'}, {'type': 'self guided kayaking tour', 'url': 'https://www.kailuabeachadventures.com/self-guided-kayaking-tour'}, {'type': 'paddle boarding lessons', 'url': 'https://www.kailuabeachadventures.com/paddle-boarding-lessons-oahu'}, {'type': 'self guided SUP tour', 'url': 'https://www.kailuabeachadventures.com/self-guided-sup-tour'}, {'type': 'Kailua beach experience', 'url': 'https://www.kailuabeachadventures.com/kailua-beach-experience'}, {'type': 'activities', 'url': 'https://www.kailuabeachadventures.com/activities'}, {'type': 'kayak rentals', 'url': 'https://www.kailuabeachadventures.com/kailua-lanikai-kayak-rentals'}, {'type': 'paddle board rentals', 'url': 'https://www.kailuabeachadventures.com/kailua

# Kailua Beach Adventures

Welcome to **Kailua Beach Adventures** (KBA), your go-to destination for thrilling water sports experiences in the heart of Kailua, Hawaii. With over 40 years of experience, we pride ourselves on providing fun, engaging, and sustainable adventures that the entire family can enjoy.

---

## Exciting Offerings

### **Kayak Rentals & Tours**
- **Single Kayaks**:
  - *1/2 Day*: $69
  - *Full Day*: $84
  - Experience the beautiful Kailua Bay and the iconic Mokulua Islands at your own pace with our excellent single kayak rentals.

- **Double & Triple Kayaks**:
  - *Double Kayaks*: 
    - *1/2 Day*: $85 
    - *Full Day*: $99
  - *Triple Kayaks*
    - *1/2 Day*: $95
    - *Full Day*: $109
  - Perfect for couples or families looking for a shared adventure!

- **Guided Tours**:
  - *Self-Guided Kayak Tours*: $99/adult, $89/child
  - Explore hidden islands and scenic beaches while enjoying provisions such as lunch and snorkel gear. 

### **Stand Up Paddleboarding (SUP)**
- **Lessons & Rentals**:
  - Want to ride the waves on a Stand Up Paddleboard? Join us for lessons or rent equipment for your own adventure!

### **All-in-One Gear Rentals**
- We offer a variety of water sports gear:
  - **Bikes**, **Surfboards**, **Boogie Boards**, and **Snorkeling Equipment**.
  - Beach essentials including **Chairs & Umbrellas** for a perfect day by the shore.

---

## Why Choose Us?

- **Prime Location**: Walking distance to Kailua Beach Park with hassle-free parking and a convenient cart system to save time.
- **Sustainable Practices**: We are committed to preserving our beautiful environment through beach cleanups and wildlife restoration projects. 
- **Safety First**: Our certified guides ensure that your adventure is not only fun but also safe.
- **Community-Oriented**: Join our Kailua Beach Club for volunteer opportunities, promotions, and events.

---

## Join Us Today!

Come visit us at **Kailua Beach Adventures** and immerse yourself in some of the most stunning scenery Hawaii has to offer. Whether you're an experienced kayaker or a first-time paddler, we have something for everyone! 

**Location:**  
130 Kailua Rd., Kailua, HI 96734

**Contact Us:**  
📞 +1 808-262-2555  
📧 info@kailuabeachadventures.com  

**Follow us on Instagram for updates and stunning views of our adventures!**  
@kailuabeachadventures

---

Get ready for an unforgettable Hawaiian experience with fun-filled activities that your family will treasure forever!


Brochure saved to brochure_data.json


In [None]:
brochure = load_brochure_data("/content/brochure_data.json")
print(brochure)

["# Kailua Beach Adventures\n\nWelcome to **Kailua Beach Adventures**, your ultimate destination for fun and adventure on the beautiful beaches of Oahu, Hawaii! Whether you're looking to explore the stunning coastline, try your hand at exciting water sports, or simply relax on the sand, we have something for everyone.\n\n## Our Exciting Offerings\n\n### **Kayak Rentals**\nExperience the breathtaking beauty of Kailua Bay on your own kayak! We provide single and double kayaks, perfect for paddling out to the stunning Mokulua Islands or exploring the calm waters of the bay. Ideal for beginners and seasoned kayakers alike!\n\n### **Stand-Up Paddleboarding (SUP)**\nJoin the SUP craze! Our stand-up paddleboards are easy to use and a great way to enjoy the scenic views while getting a workout. Rentals include everything you need to get started, and we offer lessons for those who are new to the sport.\n\n### **Snorkeling Tours**\nDiscover the vibrant underwater world of Oahu with our guided sn

# Modified Script

The key to making the brochure data more accessible for the chatbot is organizing the content in a structured, hierarchical format within the JSON file. Instead of saving everything as one long string (especially with Markdown formatting), you can break down the brochure into key sections, making it easier to retrieve specific information.

Here’s how I suggest organizing the data:

### Structured JSON Layout

You could structure the data around the different components of a brochure, such as:

1. **Company Information**:
   - Company Name
   - Website URL

2. **Sections**:
   - Introduction
   - Key Product Offerings (with details about each product)
   - Services (descriptions of services provided)
   - Contact Information (e.g., address, phone number, email)
   - Key Features (specific highlights or features of the company)
   
3. **Product Details**:
   - Product Name
   - Description
   - Price Range (if applicable)
   - Link to More Info (if applicable)

### Example Structured JSON Format:

```json
{
  "company_name": "Kailua Beach Adventures",
  "website_url": "https://www.kailuabeachadventures.com",
  "brochure": {
    "introduction": "Welcome to Kailua Beach Adventures, where you can explore the best of Hawaii...",
    "products": [
      {
        "name": "Kayak Rentals",
        "description": "Rent kayaks to explore the waters of Kailua and Lanikai.",
        "price_range": "$50 - $100",
        "url": "https://www.kailuabeachadventures.com/kailua-lanikai-kayak-rentals"
      },
      {
        "name": "Paddle Board Rentals",
        "description": "Stand up paddle boards for rent at Kailua Beach.",
        "price_range": "$30 - $70",
        "url": "https://www.kailuabeachadventures.com/kailua-paddle-board-rentals"
      }
    ],
    "services": [
      {
        "name": "Guided Kayak Tours",
        "description": "Join a guided kayak tour with our expert guides to explore the waters around Lanikai."
      },
      {
        "name": "Snorkeling Rentals",
        "description": "Snorkeling equipment available for rent, explore the underwater beauty of Kailua Beach."
      }
    ],
    "contact_information": {
      "phone": "+1 (800) 123-4567",
      "email": "info@kailuabeachadventures.com",
      "address": "123 Beachfront Ave, Kailua, HI"
    }
  }
}
```

### Advantages of This Approach:
1. **Easily Parseable**: Each section is clearly defined, making it easier to pull out specific information, like product offerings, services, or contact information.
2. **Structured Queries**: The chatbot can access specific parts of the JSON (e.g., "products" or "services") to generate responses or recommendations.
3. **Scalability**: As the content grows, you can easily add more products, services, or sections without restructuring the entire JSON.
4. **Cleaner Data**: Markdown formatting is avoided, which reduces parsing complexity.

### Implementation Plan:

1. **Modify `create_brochure()`**: Instead of generating one long Markdown string, generate structured sections and store them in a dictionary.
2. **Save to JSON**: When saving the data, structure it according to the JSON layout above.
3. **Access for the Chatbot**: When retrieving the JSON data, it can easily reference specific parts, such as product offerings or services.

### Next Steps:

If you agree with this structure, we can:
1. Modify the `create_brochure()` function to generate data in this structured format.
2. Update the JSON saving logic to store the data in this structured format.
3. Ensure the chatbot can access the data more easily by referencing specific sections when needed.


In [17]:
script_content = r"""

import os
from dotenv import load_dotenv
from openai import OpenAI
import openai
import gradio as gr
import requests
import json
from typing import List
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from IPython.display import Markdown, display, update_display
import time

# Load the environment variables from the .env file
load_dotenv('/content/API_KEYS.env')  # Ensure this is the correct path to your file

# Get the API keys from the environment
openai_api_key = os.getenv("OPENAI_API_KEY")

# Connect to OpenAI
openai.api_key = openai_api_key  # Set OpenAI API key

MODEL = 'gpt-4o-mini'


class Website:
    def __init__(self, url: str):
        self.url = url
        self.title = ""
        self.text = ""
        self.links = []
        self._fetch_webpage()

    def _fetch_webpage(self):
        try:
            response = requests.get(self.url)
            response.raise_for_status()  # Raises an HTTPError for bad responses
            soup = BeautifulSoup(response.content, 'html.parser')

            # Set the title
            self.title = soup.title.string if soup.title else "No title found"

            # Process the body text
            if soup.body:
                for irrelevant in soup.body(["script", "style", "img", "input"]):
                    irrelevant.decompose()
                self.text = soup.body.get_text(separator="\n", strip=True)

            # Collect all absolute links
            self.links = self._extract_links(soup)

        except requests.exceptions.RequestException as e:
            print(f"Error fetching {self.url}: {e}")

    def _extract_links(self, soup: BeautifulSoup) -> List[str]:
        links = [link.get('href') for link in soup.find_all('a', href=True)]
        absolute_links = [urljoin(self.url, link) for link in links if link]
        return absolute_links

    def get_contents(self) -> str:
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

# -----------    Link System Prompt    ------------#

link_system_prompt = '''
You are provided with a list of links found on a company's webpage.
Your task is to categorize and organize the information into a well-structured JSON format for a brochure.
This brochure should describe the company, its products, services, and contact information.

Please organize the brochure as follows:

{
    "company_name": "Company Name",
    "website_url": "https://full.website.url",
    "brochure": {
        "introduction": "A short introduction about the company and its offerings.",
        "products": [
            {
                "name": "Product Name",
                "description": "Short description of the product.",
                "price_range": "Optional price range if available",
                "url": "https://full.url/to/product/page"
            },
            # Add more products as necessary
        ],
        "services": [
            {
                "name": "Service Name",
                "description": "Short description of the service.",
                "url": "https://full.url/to/service/page"
            },
            # Add more services as necessary
        ],
        "contact_information": {
            "phone": "Optional phone number",
            "email": "Optional email address",
            "address": "Optional physical address"
        }
    }
}

Only include valid and relevant sections. If a section is not available, you may leave it out.

If no relevant links are found, return an empty list.
'''

# ---------    Get Links System Prompt    ------------#

def get_links_user_prompt(website):
    # Step 1: Initialize the user_prompt string with an introduction and context about the website
    user_prompt = f"Here is the list of links from the website {website.url}. "
    user_prompt += (
        "Please decide which of these are relevant web links for a brochure "
        "highlighting the company’s best and most popular offerings. Respond in a well-structured JSON format with sections for products, services, and contact information. "
        "Do not include links related to Terms of Service, Privacy, or email links.\n"
    )

    # Step 2: Add more context, noting that some links may be relative
    user_prompt += "Links (some might be relative):\n"

    # Step 3: Append the actual list of links to the user_prompt, joined by newlines
    user_prompt += "\n".join(website.links)

    # Step 4: Return the complete user_prompt
    return user_prompt


# ---------     Get Links Function    ------------#

def get_links(url):
    # Step 1: Create a Website object to extract the links from the webpage
    website = Website(url)

    # Step 2: Call the OpenAI chat.completions API with a system prompt and user prompt
    completion = openai.chat.completions.create(
        model=MODEL,  # Specify the OpenAI model to use (e.g., "gpt-3.5-turbo", "gpt-4")
        messages=[
            {"role": "system", "content": link_system_prompt},  # System prompt: sets the assistant's behavior
            {"role": "user", "content": get_links_user_prompt(website)}  # User prompt: tells the model what task to do with the links
        ],
        response_format={"type": "json_object"}  # Ensures that the response is returned as a JSON object
    )

    # Step 3: Extract the result (the relevant links) from the model's response
    result = completion.choices[0].message.content

    # Step 4: Parse the result (in JSON format) and return it as a Python dictionary
    return json.loads(result)


# def get_links(url, retries=3, delay=2):
#     # Step 1: Create a Website object to extract the links from the webpage
#     website = Website(url)

#     for attempt in range(retries):
#         try:
#             # Step 2: Call the OpenAI chat.completions API with a system prompt and user prompt
#             completion = openai.chat.completions.create(
#                 model=MODEL,  # Specify the OpenAI model to use (e.g., "gpt-3.5-turbo", "gpt-4")
#                 messages=[
#                     {"role": "system", "content": link_system_prompt},  # System prompt: sets the assistant's behavior
#                     {"role": "user", "content": get_links_user_prompt(website)}  # User prompt: tells the model what task to do with the links
#                 ]
#             )

#             # Step 3: Extract the result (the relevant links) from the model's response
#             result = completion.choices[0].message.content

#             # Step 4: Parse the result (in JSON format) and return it as a Python dictionary
#             return json.loads(result)

#         except openai.error.RateLimitError:
#             print(f"429 Too Many Requests. Retrying in {delay} seconds...")
#             time.sleep(delay)
#             delay *= 2  # Exponential backoff

#         except Exception as e:
#             print(f"Error fetching links from {url}: {e}")
#             return None

#     print(f"Failed to fetch links from {url} after {retries} attempts.")
#     return None

# ---------     Get All Details Function    ------------#

def get_all_details(url):
    # Step 1: Create a dictionary to accumulate the brochure content
    brochure = {
        "company_name": "",
        "website_url": url,
        "brochure": {
            "introduction": "",
            "products": [],
            "services": [],
            "contact_information": {}
        }
    }

    # Step 2: Add the landing page content (Introduction)
    landing_page = Website(url)
    brochure["company_name"] = landing_page.title if landing_page.title else "No Company Name"
    brochure["brochure"]["introduction"] = landing_page.get_contents()

    # Step 3: Get relevant links from the landing page
    links = get_links(url)
    print("Found links:", links)  # Debug: Print the found links

    # Step 4: Loop through each relevant link and categorize it into the structured format
    for link in links.get("links", []):
        try:
            linked_page = Website(link["url"])  # Create a Website object for each link
            link_content = linked_page.get_contents()

            # Categorize the link based on its type
            if link["type"] == "product":
                brochure["brochure"]["products"].append({
                    "name": link["type"],  # Use the link type as the product name
                    "description": link_content,  # Store the fetched content as the description
                    "url": link["url"]
                })
            elif link["type"] == "service":
                brochure["brochure"]["services"].append({
                    "name": link["type"],  # Use the link type as the service name
                    "description": link_content,  # Store the fetched content as the description
                    "url": link["url"]
                })
            elif link["type"] == "contact page":
                # Extract contact info from the contact page (if available)
                brochure["brochure"]["contact_information"] = {
                    "description": link_content,
                    "url": link["url"]
                }
            else:
                # For any other types of links, we can decide how to categorize them
                pass

        except Exception as e:
            print(f"Error fetching {link['url']}: {e}")  # Error handling for bad links

    # Step 5: Return the structured brochure data
    return brochure

# ---------     Get Brochure Function    ------------#

def get_brochure_user_prompt(company_name, url):
    # Step 1: Create the structured user prompt
    user_prompt = f'''
    You are reviewing a company called: {company_name}.
    The following is the content of its landing page and other relevant pages.
    Please build a structured brochure with an introduction, products, services, and contact information.

    Ensure the brochure is in the following structured JSON format:

    {{
        "company_name": "Company Name",
        "website_url": "https://example.com",
        "brochure": {{
            "introduction": "Introduction text goes here.",
            "products": [
                {{
                    "name": "Product Name",
                    "description": "Product description goes here.",
                    "price_range": "Optional price range",
                    "url": "https://product-link.com"
                }}
            ],
            "services": [
                {{
                    "name": "Service Name",
                    "description": "Service description goes here.",
                    "url": "https://service-link.com"
                }}
            ],
            "contact_information": {{
                "phone": "Optional phone number",
                "email": "Optional email address",
                "address": "Optional address"
            }}
        }}
    }}

    Below is the content to help you build this brochure.
    '''

    # Step 2: Append the details from the landing page and relevant links in a structured way
    brochure_details = get_all_details(url)

    # Convert the details into a string format to include in the prompt
    user_prompt += json.dumps(brochure_details, indent=4)

    # Step 3: Ensure the prompt does not exceed the API character limits
    user_prompt = user_prompt[:20_000]  # Ensure it stays within the API limit

    # Step 4: Return the final user prompt
    return user_prompt



# -----------     Create Brochure Function    ------------#

system_prompt = '''
You are an assistant that analyzes the contents of several relevant pages from a company website
and creates a structured brochure in JSON format for prospective customers.
Include details about the company’s exciting products and services, and highlight the most fun and popular options.

Ensure the brochure is in the following structured JSON format:

{
    "company_name": "Company Name",
    "website_url": "https://example.com",
    "brochure": {
        "introduction": "Introduction text goes here.",
        "products": [
            {
                "name": "Product Name",
                "description": "Product description goes here.",
                "price_range": "Optional price range",
                "url": "https://product-link.com"
            }
        ],
        "services": [
            {
                "name": "Service Name",
                "description": "Service description goes here.",
                "url": "https://service-link.com"
            }
        ],
        "contact_information": {
            "phone": "Optional phone number",
            "email": "Optional email address",
            "address": "Optional address"
        }
    }
}

Return the brochure in JSON format.
'''

# ---------     Create Brochure Function    ------------#

def create_brochure(company_name, url):
    # Step 1: Call the OpenAI API using the specified model and prompts
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},  # Provide the system prompt
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}  # Provide the user prompt
        ],
    )

    # Step 2: Extract the generated brochure from the API response
    result = response.choices[0].message.content

    # Step 3: Display the result in markdown format for better presentation
    display(Markdown(result))

    return result

# -----------  save brochure to json file ------------#

def save_brochure_to_json(brochure_data, json_filename='brochure_data.json'):
    if brochure_data is None:
        print("No brochure data to save.")
        return

    try:
        # Step 1: Check if the file already exists and load existing data
        if os.path.exists(json_filename):
            try:
                with open(json_filename, 'r') as file:
                    # Load existing JSON data
                    data = json.load(file)
            except json.JSONDecodeError:
                # If the file is corrupted or empty, reset data to an empty list
                print(f"Error reading {json_filename}. The file might be corrupted. Creating a new file.")
                data = []
        else:
            # If the file doesn't exist, start with an empty list
            data = []

        # Step 2: Append the new brochure data to the list
        data.append(brochure_data)

        # Step 3: Write the updated data back to the file
        with open(json_filename, 'w') as file:
            json.dump(data, file, indent=4)

        print(f"Brochure saved to {json_filename}")

    except Exception as e:
        print(f"Error saving brochure to {json_filename}: {e}")



# -----------  load brochure from json file ------------#

def load_brochure_data(json_filename):
    '''Loads the brochure data from a JSON file.'''
    try:
        with open(json_filename, 'r') as file:
            data = json.load(file)
            return data
    except FileNotFoundError:
        print(f"File {json_filename} not found. Returning an empty list.")
        return []
    except json.JSONDecodeError:
        print(f"Error decoding JSON from {json_filename}. The file might be corrupted.")
        return []
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return []

#----------  format for chat bot ------------#

def format_for_chatbot(brochure_data):
    '''Formats the brochure data into a user-friendly prompt for the chatbot.'''
    if not brochure_data:
        return "No brochure data available."

    user_prompt = ""

    # Loop through the companies in the brochure data
    for company in brochure_data:
        # Add company information
        user_prompt += f"**Company Name**: {company['company_name']}\n"
        user_prompt += f"**Website**: {company['website_url']}\n\n"

        # Add the introduction
        user_prompt += "**Introduction**:\n"
        user_prompt += f"{company['brochure']['introduction']}\n\n"

        # Add products section
        if company['brochure']['products']:
            user_prompt += "**Products**:\n"
            for product in company['brochure']['products']:
                user_prompt += f"- **Product Name**: {product['name']}\n"
                user_prompt += f"  Description: {product['description']}\n"
                user_prompt += f"  URL: {product['url']}\n\n"

        # Add services section
        if company['brochure']['services']:
            user_prompt += "**Services**:\n"
            for service in company['brochure']['services']:
                user_prompt += f"- **Service Name**: {service['name']}\n"
                user_prompt += f"  Description: {service['description']}\n"
                user_prompt += f"  URL: {service['url']}\n\n"

        # Add contact information section
        if company['brochure']['contact_information']:
            user_prompt += "**Contact Information**:\n"
            contact_info = company['brochure']['contact_information']
            if "phone" in contact_info:
                user_prompt += f"- Phone: {contact_info['phone']}\n"
            if "email" in contact_info:
                user_prompt += f"- Email: {contact_info['email']}\n"
            if "address" in contact_info:
                user_prompt += f"- Address: {contact_info['address']}\n"
        user_prompt += "\n"

    # Return the formatted user prompt
    return user_prompt[:20_000]  # Ensure the prompt stays within the token limit



"""
with open('url_extraction_utils.py', 'w') as file:
    file.write(script_content)

print("url_extraction_utils.py created successfully")
# Reload script to make functions available for use
import importlib
import url_extraction_utils
importlib.reload(url_extraction_utils)

from url_extraction_utils import *






url_extraction_utils.py created successfully


## Test Script

In [18]:
from url_extraction_utils import Website, get_links, get_all_details, get_brochure_user_prompt, create_brochure, save_brochure_to_json, load_brochure_data
import importlib
importlib.reload(url_extraction_utils)


URL = "https://www.kailuabeachadventures.com"

# Step 1: Generate the brochure data
brochure_data = create_brochure("Kailua Beach Adventures", URL)

# Step 2: Save the brochure data to a JSON file
save_brochure_to_json(brochure_data, "brochure_data.json")

# Step 3: Print a portion of the brochure data to verify
if brochure_data:
    print("Brochure Data (Partial):")
    print(json.dumps(brochure_data, indent=4)[:500])  # Print the first 500 characters of the brochure data

# Step 4: Load the saved JSON file to ensure it was saved correctly
loaded_brochure_data = load_brochure_data("brochure_data.json")
if loaded_brochure_data:
    print("\nLoaded Brochure Data (Partial):")
    print(json.dumps(loaded_brochure_data, indent=4)[:500])  # Print the first 500 characters of the loaded data





Found links: {'company_name': 'Kailua Beach Adventures', 'website_url': 'https://www.kailuabeachadventures.com', 'brochure': {'introduction': 'Kailua Beach Adventures offers a variety of water sports and rental services to enhance your experience in beautiful Kailua, Oahu. From kayaking to paddle boarding, we provide everything you need for a fun-filled day at the beach.', 'products': [{'name': 'Kailua Lanikai Kayak Tours', 'description': 'Explore the stunning waters around Lanikai and Kailua on our guided kayak tours. Perfect for both beginners and experienced kayakers.', 'url': 'https://www.kailuabeachadventures.com/kailua-lanikai-kayak-tours'}, {'name': 'Snorkel Rental', 'description': 'Rent high-quality snorkeling gear and explore the vibrant underwater world of Oahu.', 'url': 'https://www.kailuabeachadventures.com/snorkel-rental-lanikai-oahu'}, {'name': 'Stand Up Paddle Board Rentals', 'description': 'Enjoy the calm waters of Kailua with our stand-up paddle board rentals. A great 

```json
{
    "company_name": "Kailua Beach Adventures",
    "website_url": "https://www.kailuabeachadventures.com",
    "brochure": {
        "introduction": "Kailua Beach Adventures is the ultimate destination for water sports enthusiasts in Hawaii. With over 37 years of experience, we offer guided kayak tours, stand up paddle lessons, rentals, and a commitment to sustainability. Explore the turquoise waters and stunning beaches of Kailua and Lanikai while enjoying unforgettable adventures. Whether you're looking to rent kayaks, stand up paddleboards (SUP), or take lessons, we've got you covered!",
        "products": [
            {
                "name": "Guided Kayak Tours",
                "description": "Experience the beauty of Kailua Bay and surrounding islands with our guided kayak tours. Perfect for all skill levels, our knowledgeable guides ensure a fun and safe journey.",
                "url": "https://www.kailuabeachadventures.com/guided-kayak-tours"
            },
            {
                "name": "Self Guided Kayaking Tour",
                "description": "Explore at your own pace with our self-guided kayak rentals. Discover the picturesque beaches and hidden gems of Kailua and Lanikai.",
                "url": "https://www.kailuabeachadventures.com/self-guided-kayaking"
            },
            {
                "name": "Stand Up Paddle Lessons",
                "description": "Learn how to stand up paddleboard with our professional instructors in a friendly and safe environment. Ideal for beginners!",
                "url": "https://www.kailuabeachadventures.com/stand-up-paddle-lessons"
            },
            {
                "name": "Kayak Rentals",
                "description": "Rent a kayak and enjoy a day on the water at your leisure. We have a variety of kayaks for you to choose from.",
                "url": "https://www.kailuabeachadventures.com/kayak-rentals"
            }
        ],
        "services": [
            {
                "name": "Gear Delivery",
                "description": "For your convenience, we offer gear delivery directly to Kailua Beach so you can start your adventure without any hassle.",
                "url": "https://www.kailuabeachadventures.com/gear-delivery"
            },
            {
                "name": "Community Involvement",
                "description": "Join us for beach cleanups and environmentally focused events as we give back to the beautiful islands we explore.",
                "url": "https://www.kailuabeachadventures.com/community-involvement"
            }
        ],
        "contact_information": {
            "phone": "+1-808-262-2555",
            "email": "info@kailuabeachadventures.com",
            "address": "130 Kailua Rd, Kailua, HI 96734"
        }
    }
}
```

Brochure saved to brochure_data.json
Brochure Data (Partial):
"```json\n{\n    \"company_name\": \"Kailua Beach Adventures\",\n    \"website_url\": \"https://www.kailuabeachadventures.com\",\n    \"brochure\": {\n        \"introduction\": \"Kailua Beach Adventures is the ultimate destination for water sports enthusiasts in Hawaii. With over 37 years of experience, we offer guided kayak tours, stand up paddle lessons, rentals, and a commitment to sustainability. Explore the turquoise waters and stunning beaches of Kailua and Lanikai while enjoying unforgett

Loaded Brochure Data (Partial):
[
    "# Kailua Beach Adventures\n\n**Dive into the adventure of a lifetime at Kailua Beach Adventures, your ultimate destination for water sports and exploration in breathtaking Kailua and Lanikai, Hawaii!**\n\n## What We Offer\n\n### \ud83d\udef6 Kayak Rentals\nExplore stunning Kailua Bay at your own pace with our diverse kayak options:\n- **Single Kayaks**: Ideal for solo adventurers.  \n  - **Pric