# In [None]:
import requests
from bs4 import BeautifulSoup
import json
import hashlib
import time
import os

# Site root; the section URLs below and the article URLs are built on top of it.
BASE_URL = "https://www.kentonline.co.uk"
SECTION_URL_1 = f"{BASE_URL}/categories/politics-podcasts/"
SECTION_URL_2 = f"{BASE_URL}/categories/politics/"

# NOTE(review): the three entries below were bare text in the file, which is a
# syntax error. They look like further section slugs to scrape — confirm, and
# turn them into SECTION_URL_* constants if so.
#   westminster-watch-kent/
#   categories/elections/
#   westminster-watch-medway/

# HTTP headers sent with the scraper's requests.
HEADERS = {'User-Agent': 'Mozilla/5.0'}
# JSON file the scraped articles are merged into (relative path).
SAVE_PATH = "../data/news/kent_online/kent_politics_articles.json"

# Step 1: Get the main politics page.
# The original referenced an undefined name `SECTION_URL`; SECTION_URL_2 is the
# "/categories/politics/" listing, which matches the "main politics page" intent.
# NOTE(review): confirm SECTION_URL_1 (politics podcasts) should not be scraped too.
response = requests.get(SECTION_URL_2, headers=HEADERS, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page into zero links.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Step 2: Extract article URLs from site-relative links under /kent/news/,
# keeping page order and dropping repeats.
article_links = []
seen_hrefs = set()  # relative hrefs already collected (O(1) duplicate check)
for a in soup.select("a[href*='/kent/news/']"):
    href = a.get("href")
    if not href or not href.startswith("/kent/news/"):
        continue
    # Deduplicate on the relative href; the list itself holds absolute URLs,
    # so testing `href in article_links` could never match.
    if href in seen_hrefs:
        continue
    seen_hrefs.add(href)
    article_links.append(BASE_URL + href)

# Limit to the first 30 articles for now
article_links = article_links[:30]

# Step 3: Fetch each article and extract data.
# Builds `articles_data`, a list of dicts with the keys
# title / author / date / tags / url / article_text.
articles_data = []

for url in article_links:
    time.sleep(1)  # polite delay
    try:
        article_res = requests.get(url, headers=HEADERS, timeout=30)
        article_res.raise_for_status()
    except requests.RequestException as exc:
        # Skip rather than store an empty record: once a URL is saved it is
        # treated as already scraped and would never be fetched again.
        print(f"Skipping {url}: {exc}")
        continue
    article_soup = BeautifulSoup(article_res.text, 'html.parser')

    # Defaults used when the JSON-LD metadata is missing or unusable.
    title, date_published, author, tags = None, None, None, []

    # Extract from JSON-LD metadata (first ld+json script on the page)
    json_ld = article_soup.find("script", type="application/ld+json")
    if json_ld and json_ld.string:
        try:
            metadata = json.loads(json_ld.string)
        except ValueError:  # json.JSONDecodeError is a ValueError
            metadata = None

        # JSON-LD may be a top-level array; use the first object carrying a headline.
        if isinstance(metadata, list):
            metadata = next(
                (m for m in metadata if isinstance(m, dict) and "headline" in m),
                None,
            )

        if isinstance(metadata, dict):
            title = metadata.get("headline")
            date_published = metadata.get("datePublished")
            tags = metadata.get("keywords") or []

            # "author" may be an object, a list of objects, or a plain string.
            # Handle each shape so an unexpected one no longer discards the
            # title and date that were already read.
            author_info = metadata.get("author")
            if isinstance(author_info, list):
                author_info = author_info[0] if author_info else None
            if isinstance(author_info, dict):
                author = author_info.get("name")
            elif isinstance(author_info, str):
                author = author_info

    # Extract article text from content div, one non-empty paragraph per line
    paragraph_texts = (
        p.get_text(strip=True) for p in article_soup.select("div.PageContent p")
    )
    article_text = "\n".join(text for text in paragraph_texts if text)

    # Store result
    articles_data.append({
        "title": title,
        "author": author,
        "date": date_published,
        "tags": tags,
        "url": url,
        "article_text": article_text,
    })

# Bare expression: displays the result when run as a notebook cell.
articles_data

# Merge the freshly scraped `articles_data` into the JSON file at SAVE_PATH,
# adding only articles whose URL is not stored yet.

# Step 1: Load existing data if file exists
if os.path.exists(SAVE_PATH):
    with open(SAVE_PATH, "r", encoding="utf-8") as f:
        existing_articles = json.load(f)
else:
    existing_articles = []

# Step 2: Create a set of existing URLs for fast lookup
existing_urls = {article["url"] for article in existing_articles}

# Step 3: Add only new articles. Also skip URLs repeated within this batch,
# so the same article is never written twice in one run.
seen_urls = set(existing_urls)
new_articles = []
for article in articles_data:
    if article["url"] in seen_urls:
        continue
    seen_urls.add(article["url"])
    new_articles.append(article)

# Step 4: Append and save (the file is left untouched when nothing is new)
if new_articles:
    all_articles = existing_articles + new_articles
    # Create the target directory on first run; open(..., "w") would otherwise
    # raise FileNotFoundError.
    save_dir = os.path.dirname(SAVE_PATH)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    with open(SAVE_PATH, "w", encoding="utf-8") as f:
        json.dump(all_articles, f, ensure_ascii=False, indent=2)
