https://github.com/trancethehuman/ai-workshop-code/blob/main/Web_scraping_for_LLM_in_2024.ipynb

## Reader API by Jina AI


In [1]:
import requests

def scrape_jina_ai(url: str, timeout: float = 60.0) -> str:
  """
  Fetch a page through the Jina AI Reader endpoint and return the response body.

  :param url: str, the url to scrape; it is appended verbatim to "https://r.jina.ai/"
  :param timeout: float, seconds to wait for the Reader endpoint before giving up
  :return: str, the text of the response
  :raises requests.HTTPError: when the endpoint answers with a 4xx/5xx status
  """
  # Without a timeout a stalled connection would block the notebook indefinitely.
  response = requests.get("https://r.jina.ai/" + url, timeout=timeout)
  # Fail loudly on HTTP errors so an error page is never mistaken for scraped content.
  response.raise_for_status()
  return response.text
     

## Firecrawl from Mendable


In [3]:
import firecrawl
import getpass

# Ask for the Firecrawl key interactively so it is not hard-coded in the notebook.
FIRECRAWL_API_KEY = getpass.getpass("Mendable API Key: ")

def scrape_firecrawl(url: str):
    """
    Scrape a page with Firecrawl.

    A new FirecrawlApp is created with FIRECRAWL_API_KEY on every call.

    :param url: str, the url to scrape
    :return: the "markdown" entry of the result returned by ``scrape_url``
    """
    firecrawl_app = firecrawl.FirecrawlApp(api_key=FIRECRAWL_API_KEY)
    return firecrawl_app.scrape_url(url)["markdown"]

## Token Usage

In [4]:
import tiktoken

def count_tokens(input_string: str) -> int:
    """
    Count the tokens in a string using tiktoken's "cl100k_base" encoding.

    :param input_string: str, the text to tokenize
    :return: int, the number of tokens the text encodes to
    """
    encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(input_string))

def calculate_cost(input_string: str, cost_per_million_tokens: float = 5) -> float:
    """
    Estimate the price of sending a string to a model as input.

    :param input_string: str, the text whose tokens are counted with ``count_tokens``
    :param cost_per_million_tokens: float, price per one million tokens
        (the notebook's messages quote this figure for gpt-4o)
    :return: float, token count divided by one million, times the price
    """
    millions_of_tokens = count_tokens(input_string) / 1_000_000
    return millions_of_tokens * cost_per_million_tokens

# Example usage:
# input_string = "What's the difference between beer nuts and deer nuts? Beer nuts are about 5 dollars. Deer nuts are just under a buck."
# cost = calculate_cost(input_string)
# print(f"The total cost for using gpt-4o is: $US {cost:.6f}")

## View Scraped Content

In [5]:
from typing import Any, Callable, Dict, List, Optional

from prettytable import ALL, PrettyTable
from tqdm import tqdm

def view_scraped_content(scrape_url_functions: List[Dict[str, Callable[[str], str]]], sites_list: List[Dict[str, str]], characters_to_display: int = 500, table_max_width: int = 50) -> List[Dict[str, str]]:
    """
    Run every scraper against every site, print a content table and a cost table,
    and return the full scraped text.

    :param scrape_url_functions: list of dicts with a "name" label and a "function"
        that is called with the site url
    :param sites_list: list of dicts with "name" and "url" keys
    :param characters_to_display: how many leading characters of each result go into the content table
    :param table_max_width: value assigned to ``max_width`` on both tables
    :return: one dict per site: {"provider": <site name>, "sites": [{"name": <scraper name>,
        "content": <full text, or "Error: ..." when the scraper raised>}, ...]}
    """
    scraper_names = [scraper['name'] for scraper in scrape_url_functions]

    content_table = PrettyTable()
    content_table.field_names = ["Site Name"] + [f"{name} content" for name in scraper_names]

    cost_table = PrettyTable()
    cost_table.field_names = ["Site Name"] + [f"{name} cost" for name in scraper_names]

    scraped = []

    for site in sites_list:
        site_name = site['name']
        snippet_cells = [site_name]
        cost_cells = [site_name]
        per_scraper = []

        for scraper in scrape_url_functions:
            scraper_name = scraper['name']
            # The single-item tqdm exists only to print one progress line per (site, scraper) pair.
            for _ in tqdm([site], desc=f"Processing site {site_name} using {scraper_name}"):
                try:
                    page_text = scraper['function'](site['url'])
                    snippet_cells.append(page_text[:characters_to_display])
                    cost_cells.append(f"${calculate_cost(page_text):.6f}")
                    per_scraper.append({"name": scraper_name, "content": page_text})
                except Exception as exc:
                    # Best effort: record the failure in place of the content and move on.
                    failure = f"Error: {str(exc)}"
                    snippet_cells.append(failure)
                    cost_cells.append("Error")
                    per_scraper.append({"name": scraper_name, "content": failure})

        content_table.add_row(snippet_cells)
        cost_table.add_row(cost_cells)
        scraped.append({"provider": site_name, "sites": per_scraper})

    for table in (content_table, cost_table):
        table.max_width = table_max_width
        table.hrules = ALL

    print("Content Table:")
    print(content_table)

    print("\nCost Table:\nThis is how much it would cost to use gpt-4o to parse this content for extraction.")
    print(cost_table)

    return scraped


In [8]:
# Company home pages to scrape; "name" is the label shown in tables and progress bars.
sites = [
    {"name": "Vespa", "url": "https://vespa.ai/"},
    {"name": "Presail", "url": "https://presail.com/"},
]

In [9]:
# Scrapers to compare; each entry pairs a display name with a function that takes a url.
list_of_scraper_functions = [
    # {"name": "Beautiful Soup", "function": beautiful_soup_scrape_url},
    {"name": "Firecrawl", "function": scrape_firecrawl},
    {"name": "Jina AI", "function": scrape_jina_ai},
]

# Show the first 700 characters of each result, in tables with max_width 20.
all_content = view_scraped_content(
    list_of_scraper_functions,
    sites,
    characters_to_display=700,
    table_max_width=20,
)
     

Processing site Vespa using Firecrawl: 100%|██████████| 1/1 [00:06<00:00,  6.50s/it]
Processing site Vespa using Jina AI: 100%|██████████| 1/1 [00:01<00:00,  1.06s/it]
Processing site Rresail using Firecrawl: 100%|██████████| 1/1 [00:03<00:00,  3.87s/it]
Processing site Rresail using Jina AI: 100%|██████████| 1/1 [00:01<00:00,  1.18s/it]

Content Table:
+-----------+----------------------+----------------------+
| Site Name |  Firecrawl content   |   Jina AI content    |
+-----------+----------------------+----------------------+
|   Vespa   | [![Vespa logo](https | Title: Vespa - data  |
|           | ://vespa.ai/assets/l |     + AI, online     |
|           |     ogo.png)](/)     |                      |
|           |                      |     URL Source:      |
|           | [](https://search.ve |  https://vespa.ai/   |
|           |       spa.ai/)       |                      |
|           |                      |  Markdown Content:   |
|           | [Get Started](https: |  Vespa - data + AI,  |
|           | //cloud.vespa.ai/en/ |        online        |
|           |  [Features](/feature |                      |
|           |          s)          |                      |
|           |                      | [![Image 1: Vespa lo |
|           |      Solutions       | go](https://vespa.ai |
|           |            




In [19]:
# Full text for the first site in `sites`, from the second scraper in the list (Jina AI).
print(all_content[0]["sites"][1]["content"])

Title: Vespa - data + AI, online

URL Source: https://vespa.ai/

Markdown Content:
Vespa - data + AI, online
                   

[![Image 1: Vespa logo](https://vespa.ai/assets/logo.png)](https://vespa.ai/)

[](https://search.vespa.ai/)

[Get Started](https://cloud.vespa.ai/en/getting-started) [Features](https://vespa.ai/features)

Solutions

[Vespa Executive Overview](https://vespa.ai/executive-overview.html) [Unlocking eCommerce Growth](https://vespa.ai/unlocking-ecommerce-growth.html)

Documentation

[Vespa Overview Deck](https://docs.google.com/presentation/d/1TO_QX451CUvLzI3rVMcjB0qQXutUaoF6rTVfQLrfvPg) [Vespa Documentation](https://docs.vespa.ai/) [Video Archive](https://vespa.ai/resources) [Vespa and ElasticSearch / Solr](https://vespa.ai/vespa-elastic-solr)

[Blog](https://blog.vespa.ai/)

Vespa Cloud

[Vespa Cloud](https://cloud.vespa.ai/) [Vespa Cloud Console](https://console.vespa.oath.cloud/)

Company

[About](https://vespa.ai/team) [Jobs](https://vespa.ai/we-are-hiring) [

In [17]:
from pprint import pprint
# Full text for the first site in `sites`, from the first scraper in the list (Firecrawl).
pprint(all_content[0]["sites"][0]["content"])

('[![Vespa logo](https://vespa.ai/assets/logo.png)](/)\n'
 '\n'
 '[](https://search.vespa.ai/)\n'
 '\n'
 '[Get Started](https://cloud.vespa.ai/en/getting-started)\n'
 ' [Features](/features)\n'
 '\n'
 'Solutions\n'
 '\n'
 '[Vespa Executive Overview](/executive-overview.html)\n'
 ' [Unlocking eCommerce Growth](/unlocking-ecommerce-growth.html)\n'
 '\n'
 'Documentation\n'
 '\n'
 '[Vespa Overview '
 'Deck](https://docs.google.com/presentation/d/1TO_QX451CUvLzI3rVMcjB0qQXutUaoF6rTVfQLrfvPg)\n'
 ' [Vespa Documentation](https://docs.vespa.ai/)\n'
 ' [Video Archive](/resources)\n'
 ' [Vespa and ElasticSearch / Solr](/vespa-elastic-solr)\n'
 '\n'
 '[Blog](https://blog.vespa.ai/)\n'
 '\n'
 'Vespa Cloud\n'
 '\n'
 '[Vespa Cloud](https://cloud.vespa.ai/)\n'
 ' [Vespa Cloud Console](https://console.vespa.oath.cloud/)\n'
 '\n'
 'Company\n'
 '\n'
 '[About](/team)\n'
 ' [Jobs](/we-are-hiring)\n'
 ' [Events](/events)\n'
 ' [Press and news](/press)\n'
 '\n'
 '[Search](https://search.vespa.ai/)\n'
 ' '
 

## Extracting Entities from web content with Azure OpenAI

In [32]:
import os
import openai
import dotenv

# Load settings from the local ".env" file; the AZURE_OPENAI_* values are read from os.environ afterwards.
dotenv.load_dotenv(".env")

True

In [33]:
# Azure OpenAI settings; os.environ[...] raises KeyError when a variable is missing.
endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
api_key = os.environ["AZURE_OPENAI_API_KEY"]
chat_model = os.environ["AZURE_OPENAI_CHAT_MODEL"]
api_version = os.environ["AZURE_OPENAI_API_VERSION"]

# Shared client used by extract() for chat-completion requests.
client = openai.AzureOpenAI(
    azure_endpoint=endpoint,
    api_key=api_key,
    api_version=api_version
)

In [29]:
# System prompt for the company-overview extraction; it asks for a JSON object with four keys.
sys_msg_comp_overview = """Get me the main information about the company, including their goal, detailed summary of the company, established time, the link to the page which contains information about the members of the company (usually a site within the same domain). Return as a JSON with four keys: {company_goal: str, company_summary: str, established_time: str, members_page: str}."""

In [30]:
def extract(user_input: str, system_message: str, model: Optional[str] = None):
  """
  Send content to the Azure OpenAI chat model and return its JSON-mode reply.

  :param user_input: str, the content to analyse (sent as the user message)
  :param system_message: str, the extraction instructions (sent as the system message)
  :param model: optional deployment name; defaults to ``chat_model``, the value read
      from AZURE_OPENAI_CHAT_MODEL
  :return: the message content of the first choice in the response
  """
  messages = [
      {"role": "system", "content": system_message},
      {"role": "user", "content": user_input},
  ]

  response = client.chat.completions.create(
      # Use the configured deployment rather than a hard-coded name, so this call
      # targets the same deployment as the agent LLM built from ``chat_model``.
      model=model or chat_model,
      messages=messages,
      stream=False,
      response_format={"type": "json_object"},
  )

  return response.choices[0].message.content

In [27]:
def display_extracted_content(results: List[Dict[str, Any]], num_objects: int) -> None:
    """
    Run the company-overview extraction on scraped content and print the results as a table.

    :param results: the list returned by ``view_scraped_content``: one dict per site with
        "provider" (the site name) and "sites" (one {"name", "content"} dict per scraper)
    :param num_objects: maximum number of entries of ``results`` to process
    :return: None; the table is printed
    """
    table = PrettyTable()
    table.field_names = ["Site", "Provider Name", "Extracted Content"]

    # Ensure num_objects does not exceed the length of the results list
    num_objects = min(num_objects, len(results))

    # Process the specified number of items from the results list with a progress bar
    for result in tqdm(results[:num_objects], desc="Processing results"):
        # In view_scraped_content's output, "provider" holds the site name and
        # each entry of "sites" holds one scraper's result.
        site_name = result["provider"]

        for scrape in result["sites"]:
            scraper_name = scrape["name"]
            content = scrape["content"]

            # Progress bar for each function
            for _ in tqdm(range(1), desc=f"Extracting content with {site_name} for {scraper_name}"):
                if isinstance(content, str) and content.startswith("Error: "):
                    # view_scraped_content stores "Error: ..." in place of the content
                    # when a scraper fails; show it as-is instead of sending it to the model.
                    extracted_content = content
                else:
                    extracted_content = extract(content, sys_msg_comp_overview)
                table.add_row([site_name, scraper_name, extracted_content])

    table.max_width = 50  # Set the maximum width for better display
    table.hrules = ALL

    print("Extracted Content Table:")
    print(table)
     


In [28]:
# num_objects=9 is more than the number of scraped sites; the function caps it at len(results).
display_extracted_content(all_content, num_objects=9)


Processing results:   0%|          | 0/2 [00:00<?, ?it/s]
Extracting content with Vespa for Firecrawl:   0%|          | 0/1 [00:00<?, ?it/s][A
Extracting content with Vespa for Firecrawl: 100%|██████████| 1/1 [00:03<00:00,  3.54s/it][A

Extracting content with Vespa for Jina AI:   0%|          | 0/1 [00:00<?, ?it/s][A
Extracting content with Vespa for Jina AI: 100%|██████████| 1/1 [00:03<00:00,  3.87s/it][A
Processing results:  50%|█████     | 1/2 [00:07<00:07,  7.42s/it]
Extracting content with Rresail for Firecrawl:   0%|          | 0/1 [00:00<?, ?it/s][A
Extracting content with Rresail for Firecrawl: 100%|██████████| 1/1 [00:03<00:00,  3.43s/it][A

Extracting content with Rresail for Jina AI:   0%|          | 0/1 [00:00<?, ?it/s][A
Extracting content with Rresail for Jina AI: 100%|██████████| 1/1 [00:03<00:00,  3.99s/it][A
Processing results: 100%|██████████| 2/2 [00:14<00:00,  7.42s/it]

Extracted Content Table:
+---------+---------------+----------------------------------------------------+
|   Site  | Provider Name |                 Extracted Content                  |
+---------+---------------+----------------------------------------------------+
|  Vespa  |   Firecrawl   |                         {                          |
|         |               |   "company_goal": "Apply AI to your data, online.  |
|         |               |    At any scale, with unbeatable performance.",    |
|         |               |    "company_summary": "Vespa is a fully featured   |
|         |               |  search engine and vector database that supports   |
|         |               | vector search (ANN), lexical search, and search in |
|         |               |     structured data, all in the same query. It     |
|         |               | integrates machine-learned model inference to make |
|         |               | sense of data in real-time, providing capabilities |
|  




## Agentic Workflow for scraping member information

Thought: The current language of the user is English. I need to use a tool to help me answer the question.
Action: parse_member_page
Action Input: {'page_url': 'https://presail.com/about-us'}
[![](https://cdn.prod.website-files.com/64dd0eb691b3ca8e63119b8b/64e07c019305239d53af91ab_Logo%20Text%20center%20Aligned%2C%20but%20smaller%20(1)](/)

Products

#### Operations

[Fundraising\
\
Elevate your fundraising game with a perfect blend of customizability and user-friendliness.](/features/fundraising)
[Allocation management\
\
Maintain a single source of truth for all investments, ensuring accurate and up-to-date records.](/features/allocation-management)
[Token distribution\
\
Distribute tokens with unmatched accuracy to hundreds of investors at the click of a button. Seamless, swift, and spot-on every time.](/features/token-vesting)
[OTC marketplace\
\
Enable seamless trading of allocations, offering your investors liquidity while generating passive revenue through com

AgentChatResponse(response='Here are the members of the company from the provided page:\n\n1. **Michelle** - Head of Content [LinkedIn](https://www.linkedin.com/in/michellejunewu/)\n2. **Emmanuel** - Senior Developer [LinkedIn](https://www.linkedin.com/in/cousinemmanuel/)\n3. **Dan** - Senior Developer [LinkedIn](https://www.linkedin.com/in/danbrooking)\n4. **Anna** - Principal Recruiter [LinkedIn](https://www.linkedin.com/in/anna-serbo-/)\n5. **Håkon** - Customer Success [LinkedIn](https://www.linkedin.com/in/h%C3%A5kon-%C3%B8verby-64534a22b/)\n6. **Glen** - Senior Frontend Developer [LinkedIn](https://www.linkedin.com/in/glen-curtis-562439117/)\n7. **Pablo** - Web 3 Domain Specialist [LinkedIn](https://www.linkedin.com/in/pabloruizdotco/)\n8. **Maral** - Head of Compliance [LinkedIn](https://www.linkedin.com/in/maral-mirshahi-a1394440/)\n9. **Tomas** - CTO [LinkedIn](https://www.linkedin.com/in/tomas-veiden/)\n10. **Hamza** - Head of Product [LinkedIn](https://www.linkedin.com/in/ham

In [39]:
from llama_index.core.tools import FunctionTool
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.core.agent import ReActAgent

# LlamaIndex LLM for the agent; it reuses the endpoint, key, API version and deployment name read from the environment.
llm = AzureOpenAI(
    azure_deployment=chat_model,
    temperature=0.1,
    azure_endpoint=endpoint,
    api_key=api_key,
    api_version=api_version,
)

# System prompt for extracting member details (name, role, LinkedIn page) from a scraped page.
sys_msg_comp_members="""Get all the company members mentioned in the given page content. Return as a JSON with the key 'members' containing a list of dictionaries with keys 'name', 'role' and 'linkedin_page'. Note that when linkedIn page is not available, it should be an empty string.""" 

def parse_member_page(page_url: str):
    """
    This function scrapes a page containing information about members of a company
    and extracts the members mentioned on it.
    :param page_url: str, the url to scrape
    :return: the reply of ``extract`` for the scraped page, prompted with ``sys_msg_comp_members``
        (which asks for a JSON with the key 'members')
    """
    # Scrape with Firecrawl, then hand the page text straight to the extraction prompt.
    return extract(scrape_firecrawl(page_url), sys_msg_comp_members)

def check_member_page_validity(members_page_url: str, company_page_url: str) -> bool:
    """
    This function checks if the url of the members page is valid, i.e. whether it is
    hosted on the same domain as the company page (or on one of its subdomains).
    A leading "www." and the http/https scheme are ignored when comparing.
    :param members_page_url: str, the url of the members page
    :param company_page_url: str, the url of the company page
    :return: bool, True if the members page belongs to the company's domain, False otherwise
        (including when either url has no host, e.g. a relative path or an empty string)
    """
    from urllib.parse import urlsplit

    def _host(url: str) -> str:
        # Return the lower-cased host of ``url`` without a leading "www.", or "" if there is none.
        url = url.strip()
        try:
            parts = urlsplit(url)
            if not parts.netloc:
                # Scheme-less input such as "presail.com/about-us" is parsed as a path;
                # re-parse it as a network location.
                parts = urlsplit("//" + url)
            host = parts.hostname or ""
        except ValueError:
            return ""
        host = host.lower().rstrip(".")
        return host[4:] if host.startswith("www.") else host

    # Compare parsed hosts. str.strip("https://") removes a set of characters, not a
    # prefix, and a substring test would accept e.g. "notshop.com" for "shop.com".
    company_host = _host(company_page_url)
    members_host = _host(members_page_url)
    if not company_host or not members_host:
        return False
    return members_host == company_host or members_host.endswith("." + company_host)

def parse_company_page(page_url: str):
    """
    This function scrapes the main page of a company and extracts an overview of the company.
    :param page_url: str, the url to scrape
    :return: the reply of ``extract`` for the scraped page, prompted with ``sys_msg_comp_overview``
        (which asks for a JSON with company_goal, company_summary, established_time and members_page)
    """
    # Scrape with Firecrawl, then hand the page text straight to the extraction prompt.
    return extract(scrape_firecrawl(page_url), sys_msg_comp_overview)

# Wrap the three helper functions as LlamaIndex tools for the agent.
member_scraping_tool = FunctionTool.from_defaults(fn=parse_member_page)
company_page_tool = FunctionTool.from_defaults(fn=parse_company_page)
member_page_validity_tool = FunctionTool.from_defaults(fn=check_member_page_validity)

# ReAct agent with access to the three tools, backed by the Azure OpenAI LLM configured above.
agent = ReActAgent.from_tools([member_scraping_tool, company_page_tool, member_page_validity_tool], llm=llm, verbose=True)
# Alternative prompt that starts directly from the members page:
# agent.chat("Get the information about the CEO and/or the founder of the company from this page https://presail.com/about-us")
agent.chat("Get me the overall information about the company, also the information of CEO and/or the founder of the company from this page https://presail.com/")

Thought: The current language of the user is English. I need to use a tool to help me answer the question.
Action: parse_company_page
Action Input: {'page_url': 'https://presail.com/'}
[0m[1;3;34mObservation: {
    "company_goal": "Unify all aspects of Web 3.0 investments in one platform—from fundraising and allocation management to token distribution, compliance, and OTC trading.",
    "company_summary": "Presail, soon to be rebranded to Spring and found at www.spring.net, offers a comprehensive platform designed to simplify the complex processes involved in Web 3.0 investments. This includes tools for fundraising, allocation management, token distribution, OTC marketplace, and an investor portal. Their products also cover back-office features like compliance and spreadsheet imports, ensuring a seamless transition from manual processes to automated solutions. Key use cases include services for investment groups, blockchains, and projects, facilitating efficient operat

AgentChatResponse(response='### Company Information\n**Company Name:** Presail (soon to be rebranded to Spring)\n**Website:** [www.spring.net](http://www.spring.net)\n**Goal:** Unify all aspects of Web 3.0 investments in one platform—from fundraising and allocation management to token distribution, compliance, and OTC trading.\n**Summary:** Presail offers a comprehensive platform designed to simplify the complex processes involved in Web 3.0 investments. This includes tools for fundraising, allocation management, token distribution, OTC marketplace, and an investor portal. Their products also cover back-office features like compliance and spreadsheet imports, ensuring a seamless transition from manual processes to automated solutions. Key use cases include services for investment groups, blockchains, and projects, facilitating efficient operation, scaling, and management of investor data.\n**Established Time:** © 2021 - 2023 Presail. All rights reserved.\n\n### CEO Information\n**Name: