In [55]:
from langgraph.graph import StateGraph, START, END
from typing_extensions import TypedDict, Literal, Optional, List
from IPython.display import Image, display
from langgraph.prebuilt import create_react_agent
from langchain import hub
from langchain_community.agent_toolkits import MultionToolkit
import json
from langchain.output_parsers import PydanticOutputParser
from langchain_aws import ChatBedrockConverse
import os
from dotenv import load_dotenv
from langchain_community.tools import DuckDuckGoSearchRun
from langchain_community.document_loaders import WebBaseLoader
from langchain_huggingface import HuggingFaceEndpoint
from langchain_huggingface import ChatHuggingFace
from langchain_community.agent_toolkits import PlayWrightBrowserToolkit
from playwright.async_api import async_playwright
import nest_asyncio

nest_asyncio.apply()

load_dotenv()

True

None


In [4]:
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")

In [56]:
Industry = Literal[
    "advertising-marketing",
    "automotive",
    "retail-consumer-goods",
    "energy-utilities",
    "financial-services",
    "gametech",
    "health",
    "industrial",
    "manufacturing",
    "media",
    "telecom",
]

class State(TypedDict):
    url: str
    bio: Optional[str]
    industry: Optional[str]  
    news: Optional[List[dict]]      
    usecases: Optional[str]

In [6]:
from pydantic import BaseModel
class BioInfo(BaseModel):
    industry: Industry | None 
    about: str | None

In [None]:
# repo_id = "deepseek-ai/DeepSeek-R1-0528"

# llm_endpoint = HuggingFaceEndpoint(
#     repo_id=repo_id,
#     # max_length=128,
#     temperature= 1,
#     huggingfacehub_api_token="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",
#     provider="auto",  # set your provider here hf.co/settings/inference-providers
# )
# llm = ChatHuggingFace(llm=llm_endpoint)

In [26]:
# this agent to generate company bio and recognise industry
extract_prompt = """
You are a web content extractor. Your job is to collect raw text from company websites for analysis.

Available tools:
- NavigateTool (navigate_browser) - navigate to a URL
- NavigateBackTool (previous_page) - wait for an element to appear
- ClickTool (click_element) - click on an element (specified by selector)
- ExtractTextTool (extract_text) - use beautiful soup to extract text from the current web page
- ExtractHyperlinksTool (extract_hyperlinks) - use beautiful soup to extract hyperlinks from the current web page
- GetElementsTool (get_elements) - select elements by CSS selector
- CurrentPageTool (current_page) - get the current page URL

Steps to follow:
1. Navigate to the provided URL 
2. Visit the homepage first - extract key headlines, taglines, and descriptive text about what the company does
3. Check if homepage has ENOUGH info by these criteria:
   - Can you find what the company's main product/service is?
   - Is there a clear description of what they do or who they serve?
   - Are there taglines, headlines, or hero text that explain their business?
4. If homepage is missing clear business description, check "About", "About Us", or "Company" pages
5. If still unclear what they do, try "Services", "Products", or "Solutions" pages

EXTRACTION RULES:
- Return ONLY direct quotes from the website (copy exact text)
- Focus on sentences that describe the company's business, services, or mission
- Extract company taglines, value propositions, and service descriptions
- Include any industry-specific terminology you find
- Do NOT interpret, summarize, or rephrase - just extract verbatim
- Do NOT follow links to case studies, blogs, press releases, or customer stories
"""
extract_llm = ChatBedrockConverse(
    model_id="us.anthropic.claude-3-5-haiku-20241022-v1:0",
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
)
# async_browser = create_async_playwright_browser()
# web_toolkit = PlayWrightBrowserToolkit.from_browser(async_browser=async_browser)
# tools = web_toolkit.get_tools()
async def make_toolkit(headless=True):
    p = await async_playwright().start()
    browser = await p.chromium.launch(headless=headless)
    toolkit = PlayWrightBrowserToolkit.from_browser(async_browser=browser)
    return toolkit, browser, p

toolkit, browser, p = await make_toolkit()
tools = toolkit.get_tools()

extract_agent = create_react_agent(
    model = extract_llm,
    tools = tools,
    prompt = extract_prompt
    
)

async def run_extract_agent(state: State):
    """
    Analyzes a company website and extracts information.
    """
    result = await extract_agent.ainvoke({"messages":[
        {"role": "user", "content": f"Please extract information from this company website: {state['url']}"}
    ]})
    print(result)

    response_text = result["messages"][-1].content

    return response_text


In [28]:
response = await run_extract_agent({"url": "https://tz.net/"})

{'messages': [HumanMessage(content='Please extract information from this company website: https://tz.net/', additional_kwargs={}, response_metadata={}, id='87d7bdf0-cb67-4110-9831-f566d6ce929b'), AIMessage(content=[{'type': 'text', 'text': "I'll help you extract information from the TZ.NET website. I'll follow the systematic approach to gather key information about the company.\n\nFirst, I'll navigate to the website:"}, {'type': 'tool_use', 'name': 'navigate_browser', 'input': {'url': 'https://tz.net/'}, 'id': 'tooluse_nfx4I83VQQGJ1z2VqwVaPg'}], additional_kwargs={}, response_metadata={'ResponseMetadata': {'RequestId': '80e7e5f9-2dee-4a7d-a8cc-ecd27726f239', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Fri, 05 Sep 2025 00:31:34 GMT', 'content-type': 'application/json', 'content-length': '594', 'connection': 'keep-alive', 'x-amzn-requestid': '80e7e5f9-2dee-4a7d-a8cc-ecd27726f239'}, 'RetryAttempts': 0}, 'stopReason': 'tool_use', 'metrics': {'latencyMs': [2249]}, 'model_name': 'us.anth

  response = await run_extract_agent({"url": "https://tz.net/"})


In [None]:
await browser.close()
await p.stop()


In [29]:
print(response)

Based on the homepage text, I can provide a comprehensive extraction of TZ.NET's business description:

Company Tagline/Description:
- "We are a smart system solution specialist."
- "We create solutions that manage micro-access control, smart storage and property and tenant management functions effectively, securely, and reliably."
- "Our solutions can be found in the world's leading companies around the globe."

Core Business Areas:
1. Smart Locker Solutions
   - Employee Storage
   - End of Trip Lockers
   - Corporate Mail
   - University Mail Centres
   - Residential Click n' Collect (Retail)
   - Portal and Logistics

2. Tenant and Property Services
   - Commercial
   - Residential
   - High Rise
   - Mixed Use
   - Build to Rent
   - Master Planned Communities
   - Retirement and Assisted Living

3. Data Centre Solutions
   - Security
   - Enterprise DCs
   - Colocation DCs
   - Managed Services DCs
   - Cloud DCs

Unique Technology Highlights:
- Proprietary Locking Technology
- S

In [39]:
# this agent to generate company bio and recognise industry
writer_prompt = """
You are a company bio writer and industry classifier. You receive exact quotes from company websites and must create a concise company summary and identify the correct industry.

Your task:
1. Read the provided website quotes carefully
2. Write a clear, one-paragraph company bio (5-8 sentences max)
3. Classify the company into the correct industry category

INDUSTRY CATEGORIES: advertising-marketing, automotive, retail-consumer-goods, energy-utilities, financial-services, gametech, health, industrial, manufacturing, media, telecom

BIO WRITING GUIDELINES:
- Clearly state what the company does (main product/service)
- Mention target market/customers if obvious
- Keep it simple and jargon-free
- Write like you're explaining to someone unfamiliar with the company

INDUSTRY CLASSIFICATION RULES:
- Choose the PRIMARY business focus, not secondary services
- If company serves multiple industries, pick their main specialization
- Use "gametech" for gaming/esports companies
- Use "health" for healthcare, medical, wellness companies
- Use "advertising-marketing" for agencies, martech, adtech
- Use "media" for content creation, publishing, broadcasting
- Set to null only if truly unclear from the provided text

You MUST return your final response as a JSON object in exactly this format:
```json
{
  "industry": "one of the predefined industry values or null",
  "about": "company bio/summary as a string"
}
```

Example good response:
{
  "industry": "financial-services",
  "about": "FinServe Solutions is a leading provider of innovative financial technology services. We specialize in delivering secure payment processing, digital banking solutions, and fraud prevention tools to banks and fintech companies worldwide. Our mission is to empower financial institutions with cutting-edge technology that enhances customer experiences and drives operational efficiency. With a team of industry experts and a commitment to excellence, FinServe Solutions is dedicated to helping our clients navigate the evolving financial landscape. Whether it's through our advanced analytics, seamless integrations, or 24/7 support, we strive to be the trusted partner for all their financial technology needs."}

"""
writer_llm = ChatBedrockConverse(
    model_id="global.anthropic.claude-sonnet-4-20250514-v1:0",
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
)

# writer_agent = create_react_agent(
#     model = extract_llm,
#     prompt = writer_prompt
    
# )
bio_parser = PydanticOutputParser(pydantic_object=BioInfo)

def run_writer(state, response: str):
    """
    Analyzes a company website and extracts bio and industry information.
    """
    msgs = [
        ("system", writer_prompt),
        ("human", f"Please analyze the extracted information from a company website and provide the structured JSON response: {response}")
    ]
    result = writer_llm.invoke(msgs)
    # print(result)

    response_text = result["messages"][-1].content
    try:
        bio_data = bio_parser.parse(response_text)
        return {**state, "bio": bio_data.about, "industry": bio_data.industry}
    except Exception as e:
        return {**state, "bio": None, "industry": None}


In [47]:
run_writer({"url": "https:/xyz"},response)

```json
{
  "industry": "industrial",
  "about": "TZ.NET is a smart system solution specialist that develops innovative access control and management technologies for various industries. The company creates secure solutions for micro-access control, smart storage, and property management using their proprietary shape memory alloy actuated IoT smart locks - the world's first of its kind. TZ.NET serves major corporations globally through three main platforms: smart locker solutions for employee storage and logistics, tenant and property management services for commercial and residential buildings, and data center security solutions. Their technology offers compact, low-power, silent operation with smart remote control capabilities. Listed on the Australian Securities Exchange (ASX: TZL), TZ.NET provides both software subscription services and fully managed cloud services to support their comprehensive smart system solutions."
}
```


{'url': 'https:/xyz',
 'bio': "TZ.NET is a smart system solution specialist that develops innovative access control and management technologies for various industries. The company creates secure solutions for micro-access control, smart storage, and property management using their proprietary shape memory alloy actuated IoT smart locks - the world's first of its kind. TZ.NET serves major corporations globally through three main platforms: smart locker solutions for employee storage and logistics, tenant and property management services for commercial and residential buildings, and data center security solutions. Their technology offers compact, low-power, silent operation with smart remote control capabilities. Listed on the Australian Securities Exchange (ASX: TZL), TZ.NET provides both software subscription services and fully managed cloud services to support their comprehensive smart system solutions.",
 'industry': 'industrial'}

In [48]:
state = {'url': 'https:/xyz',
 'bio': "TZ.NET is a smart system solution specialist that develops innovative access control and management technologies for various industries. The company creates secure solutions for micro-access control, smart storage, and property management using their proprietary shape memory alloy actuated IoT smart locks - the world's first of its kind. TZ.NET serves major corporations globally through three main platforms: smart locker solutions for employee storage and logistics, tenant and property management services for commercial and residential buildings, and data center security solutions. Their technology offers compact, low-power, silent operation with smart remote control capabilities. Listed on the Australian Securities Exchange (ASX: TZL), TZ.NET provides both software subscription services and fully managed cloud services to support their comprehensive smart system solutions.",
 'industry': 'industrial'}

In [None]:
def bio_agent_node(state: State) -> State:
    response = await run_extract_agent(state["url"])
    new_state = run_writer(state,response)
    return new_state

In [46]:
from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
# ---- Structured output you want from the LLM ----
class Article(BaseModel):
    name: str = Field(..., description="Article headline/title")
    link: str = Field(..., description="Canonical URL to the article")
    summary: str = Field(..., description="3-5 sentence summary of the news article")

class Top3Articles(BaseModel):
    items: List[Article] = Field(..., description="Must contain exactly 2-4 items", min_items=2, max_items=4)
    
articles_parser = PydanticOutputParser(pydantic_object=Top3Articles)


def industry_news_node(state: State):
    """
    Search for industry news and filter for relevance using LLM
    Returns: {"news": [{"name","link","summary"}]}
    """
    industry = state["industry"]
    company_bio = state.get("bio", "")

    if not industry:
        return {"news": "No industry specified"}

    search_query = f"{industry} industry news trends developments"

    ddg_wrapper = DuckDuckGoSearchAPIWrapper(
        region="au-en",    # bias to English
        time="y",          # past year
        safesearch="moderate",
        max_results=12,
        output_format="list"
    )
    ddg_search = DuckDuckGoSearchResults(api_wrapper=ddg_wrapper, source="news")

    search_results = ddg_search.invoke(search_query)
# exampel of results:
  #   {'snippet': "He maintains a close friendship with Mr. Obama. He first weighed in on presidential politics to criticize President Reagan's re-election campaign, and has since supported Mr. Obama, Hillary ...",
  # 'title': 'Bruce Springsteen to Appear With Harris and Obama at Atlanta and ...',
  # 'link': 'https://www.nytimes.com/2024/10/22/us/politics/springsteen-harris-atlanta-obama-philly.html'}
    
    articles_prompt = f"""
    You are analyzing news articles for relevance to a specific industry and company. You will be given a list of news articles.
    Each news article in this list has this format:
    {'snippet': " ...", 'title': '...', 'link': 'https://...'}
    
    Your task is to find the top 3 most relevant news or articles about the industry and company from this list.
    INDUSTRY: {industry}
    COMPANY BIO: {company_bio}

    Focus on:
    - Major industry developments, trends, or changes
    - New regulations or policy changes affecting the industry
    - Significant company announcements or market movements
    - Technology innovations or disruptions in the space
    - Economic impacts or market analysis

    Prioritize:
    - Recent news (within the last 90 days preferred)
    - Reputable sources (major news outlets, industry publications, trade journals)
    - News that affects the entire industry, not just single companies
    - Actionable insights or trends that matter for business decisions

    Avoid:
    - Company-specific press releases unless they impact the whole industry
    - Opinion pieces or speculation
    - Duplicate stories from different sources

    You MUST return the news articles in exactly this format:
    {articles_parser.get_format_instructions()}
    
    """
    articles_llm = ChatBedrockConverse(
        model_id="us.anthropic.claude-3-5-haiku-20241022-v1:0",
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )
    msgs = [
        ("system", articles_prompt),
        ("human", f"Pick the 3 most relevant items for the industry and the company from this list: {search_results}")
    ]
    result = articles_llm.invoke(msgs)
    # print(result)

    response_text = result["messages"][-1].content
    
    return {"news": articles_parser.parse(response_text)}

In [51]:
industry_news_node({'url': 'https:/xyz',
 'bio': "TZ.NET is a smart system solution specialist that develops innovative access control and management technologies for various industries. The company creates secure solutions for micro-access control, smart storage, and property management using their proprietary shape memory alloy actuated IoT smart locks - the world's first of its kind. TZ.NET serves major corporations globally through three main platforms: smart locker solutions for employee storage and logistics, tenant and property management services for commercial and residential buildings, and data center security solutions. Their technology offers compact, low-power, silent operation with smart remote control capabilities. Listed on the Australian Securities Exchange (ASX: TZL), TZ.NET provides both software subscription services and fully managed cloud services to support their comprehensive smart system solutions.",
 'industry': 'industrial'})

{'news': 'Linux find 命令 Linux 命令大全 Linux find 命令用于在指定目录下查找文件和目录。 它可以使用不同的选项来过滤和限制查找的结果。 find () 方法为数组中的每个元素都调用一次函数执行： 当数组中的元素在测试条件时返回 true 时, find () 返回符合条件的元素，之后的值不会再调用执行函数。 Python find () 方法检测字符串中是否包含子字符串 str ，如果指定 beg（开始） 和 end（结束） 范围，则检查是否包含在指定范围内，如果包含子字符串返回开始的索引值，否则返回-1。 定义和用法 find () 方法返回被选元素的后代元素。 后代是子、孙、曾孙，依此类推。 DOM 树： 该方法沿着 DOM 元素的后代向下遍历，直至最后一个后代的所有路径（）。 如只需向下遍历 DOM … 定义和用法 findIndex () 方法返回传入一个测试条件（函数）符合条件的数组第一个元素位置。 findIndex () 方法为数组中的每个元素都调用一次函数执行： 当数组中的元素在测试条件时返回 true …'}

In [None]:
from urllib.parse import quote
prompt2 = """
You are an AWS use case extraction agent. Your job is to analyze AWS industry pages and extract relevant use cases.

Available tools:
- MultionCreateSession(): Create a web browsing session
- MultionUpdateSession(): Navigate and interact with web pages  
- MultionCloseSession(): Close the browsing session

Steps to follow:
1. Create a browsing session
2. Navigate to the AWS industry page URL provided
3. Scan the page for key sections like:
   - Main use cases or solutions highlighted
   - Customer case studies or success stories
   - Featured services and their applications
   - Industry-specific challenges addressed
   - Real-world examples or customer testimonials
4. Extract and summarize the top 3 most relevant use cases
5. Close the session

What to extract for each use case:
- Use case name/title
- Brief description of what it solves
- Key AWS services mentioned (if any)
- Business benefit or outcome
- Real customer example (if mentioned)

Focus on:
- Business problems and solutions, not just technical features
- Industry-specific challenges and how AWS addresses them
- Measurable outcomes or benefits when available
- Real-world applications rather than theoretical concepts

Return the information in a structured format that's easy to understand for business decision-makers.
"""
usecase_agent = create_react_agent(
    model = llm,
    tools = tools,
    prompt = prompt2
    
)
def usecase_agent_node(state: State):
    path = quote(str(state.get('industry', '')).strip('/'))
    url = f"https://aws.amazon.com/{path}/"
    result = bio_agent.invoke({"messages":[
        {"role": "user", "content": f"Please analyze this AWS industry page for relevant use cases: {url}"}
    ]})

    response_text = result["messages"][-1]["content"]
    return {"usecases": response_text}

In [None]:
def industry_problems (state: State):
    prompt = """
    """
    search = DuckDuckGoSearchRun()
    results = search.invoke(prompt + state["industry"])
    return state

In [None]:
def summary_node (state: State):
    return state

In [None]:
graph_builder = StateGraph(State)

graph_builder.add_node("bio_agent_node", bio_agent_node)
graph_builder.add_node("usecase_agent_node", usecase_agent_node)
graph_builder.add_node("industry_news_node", industry_news_node)
graph_builder.add_node("summary_node", summary_node)

graph_builder.add_edge(START, "bio_agent_node")
graph_builder.add_edge("bio_agent_node", "usecase_agent_node")
graph_builder.add_edge("bio_agent_node", "industry_news_node")
graph_builder.add_edge("usecase_agent_node", "summary_node")
graph_builder.add_edge("industry_news_node", "summary_node")
graph_builder.add_edge("summary_node", END)

In [None]:
graph = graph_builder.compile()
from IPython.display import Image, display
display(Image(graph.get_graph().draw_mermaid_png()))

In [None]:
{'bio': 'Lorikeet CX is an AI-powered healthcare customer experience platform that helps healthcare organizations improve patient engagement, communication, and overall experience. Their solution uses artificial intelligence to streamline patient interactions, provide personalized communication, and optimize healthcare service delivery.',
 'industry': 'health'}