In [23]:
from llama_index.readers.web import SimpleWebPageReader
from llama_index.core import (
    VectorStoreIndex,
    StorageContext,
    load_index_from_storage,
    Document
)
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.embeddings.ollama import OllamaEmbedding
from ollama import chat
import os
import re
import requests
from bs4 import BeautifulSoup

In [28]:
# import requests
# from bs4 import BeautifulSoup

url = "https://www.nhs.uk/conditions/common-cold/"
# response = requests.get(url)
# soup = BeautifulSoup(response.text, "html.parser")

# title = soup.title.string.strip()
# print(title)  # "Common cold - NHS"


In [29]:
def fetch_html(url: str) -> str:
    """Fetch the HTML of a webpage."""
    response = requests.get(url)
    response.raise_for_status()
    return response.text


html =fetch_html(url)

In [30]:
def extract_title(html: str) -> str:
    """Extract the real page title from HTML."""
    soup = BeautifulSoup(html, "html.parser")
    if soup.title and soup.title.string:
        return soup.title.string.strip()
    # fallback: first <h1>
    h1 = soup.find("h1")
    return h1.get_text(strip=True) if h1 else "unknown"

extract_title(html)

'Common cold\n - NHS'

In [31]:
def extract_metadata(doc: Document, html: str = None, url: str = None) -> dict:
    # Get the HTML either from argument or fetch from URL
    if html is None and url is not None:
        html = fetch_html(url)
    
    text = doc.text.lower()
    
    # Extract the page title
    title = extract_title(html) if html else "unknown"

    metadata = {
        "title": title,
        "source": url or doc.metadata.get("url", "unknown"),
        "symptoms": list(set(re.findall(
            r"\b(fever|cough|sore throat|fatigue|sneezing|hoarse voice|runny nose|blocked nose)\b",
            text
        ))),
        "severity": "mild",  # default for common cold per NHS
        "duration": "1‑2 weeks" if ("1 to 2 weeks" in text or "about 1 to 2 weeks" in text) else "unspecified"
    }

    return metadata


In [32]:
NHS_URLS = [
    "https://www.nhs.uk/conditions/common-cold/",
    "https://www.nhs.uk/conditions/flu/"
]

documents = SimpleWebPageReader(html_to_text=True).load_data(urls=NHS_URLS)
doc = documents[0]


In [33]:
documents = SimpleWebPageReader(html_to_text=True).load_data(urls=NHS_URLS)
doc = documents[0]

# Fetch HTML of the first page (for title extraction)
html_content = fetch_html(NHS_URLS[0])

# Call extract_metadata
metadata = extract_metadata(doc, html=html_content)
print("Extracted Metadata:", metadata)

Extracted Metadata: {'title': 'Common cold\n - NHS', 'source': 'https://www.nhs.uk/conditions/common-cold/', 'symptoms': ['runny nose', 'blocked nose', 'sore throat', 'cough', 'sneezing', 'hoarse voice'], 'severity': 'mild', 'duration': '1‑2 weeks'}


In [34]:
documents = SimpleWebPageReader(html_to_text=True).load_data(urls=NHS_URLS)
for doc in documents:
    print(doc.metadata)
    print(doc.text[:500])  # first 500 characters

{'url': 'https://www.nhs.uk/conditions/common-cold/'}
﻿ Skip to main content

[ ](/)

Search the NHS website

Search

  * [ Health A to Z  ](/health-a-to-z/)
  * [ NHS services  ](/nhs-services/)
  * [ Live Well  ](/live-well/)
  * [ Mental health  ](/mental-health/)
  * [ Care and support  ](/social-care-and-support/)
  * [ Pregnancy  ](/pregnancy/)
  * [ Home  ](/)
  * Browse More 

  1. [Home](/)
  2. [Health A to Z](/health-a-to-z/)
  3. [Conditions A to Z](/conditions/)

[ Back to  Conditions A to Z ](/conditions/)

# Common cold

You can ofte
{'url': 'https://www.nhs.uk/conditions/flu/'}
﻿ Skip to main content

[ ](/)

Search the NHS website

Search

  * [ Health A to Z  ](/health-a-to-z/)
  * [ NHS services  ](/nhs-services/)
  * [ Live Well  ](/live-well/)
  * [ Mental health  ](/mental-health/)
  * [ Care and support  ](/social-care-and-support/)
  * [ Pregnancy  ](/pregnancy/)
  * [ Home  ](/)
  * Browse More 

  1. [Home](/)
  2. [Health A to Z](/health-a-to-z/)
  3. [Condit

✅ NHS Reader Ready.


KeyboardInterrupt: Interrupted by user