In [1]:
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
import time

In [2]:
# Landing pages to scan. Each URL is passed to fetch_articles(), which downloads
# the page once and inspects the anchor tags found on it.
news_websites = [
    "https://www.bbc.com/news",
    "https://www.reuters.com/news",
    "https://www.theguardian.com/world"
]

# Search term handed to the scraper; fetch_articles() compares it
# case-insensitively against the visible text of each link.
KEYWORD = "terrorism"

def fetch_articles(url, keyword, headers=None, timeout=10):
    """Return the links on ``url`` whose visible text contains ``keyword``.

    The page is downloaded with ``requests.get``, parsed with BeautifulSoup's
    ``'html.parser'`` backend, and every ``<a>`` element that has an ``href``
    is inspected. Matching is a case-insensitive substring test against the
    anchor text only; the linked page itself is never downloaded. Because the
    empty string is a substring of every string, an empty ``keyword`` matches
    every link.

    Args:
        url: Page to download. Relative hrefs are resolved against it.
        keyword: Text to look for in each link's visible text.
        headers: Optional mapping of HTTP request headers forwarded to
            ``requests.get`` (for example a custom ``User-Agent``). ``None``
            keeps the library defaults.
        timeout: Timeout in seconds forwarded to ``requests.get``. Defaults
            to 10, the value that used to be hard-coded.

    Returns:
        A list of dicts with the keys ``'title'`` (anchor text), ``'url'``
        (absolute link) and ``'source'`` (the ``url`` argument), one per
        distinct absolute URL, in the order ``find_all`` yields the anchors.
        If the request fails or returns an error status, the failure is
        printed and an empty list is returned instead of raising.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        # Best effort: one unreachable site must not abort the whole run.
        print(f"Failed to fetch {url}: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Lower-case the keyword once instead of once per anchor.
    needle = keyword.lower()

    relevant_articles = []
    seen = set()  # absolute URLs already emitted for this page

    for link in soup.find_all('a', href=True):
        text = link.get_text(strip=True)
        if needle not in text.lower():
            continue

        full_url = urljoin(url, link['href'])

        # urljoin returns hrefs such as "mailto:..." or "javascript:..."
        # unchanged; those are not fetchable article pages, so drop them.
        if not full_url.lower().startswith(('http://', 'https://')):
            continue

        if full_url in seen:
            continue
        seen.add(full_url)

        relevant_articles.append({
            'title': text,
            'url': full_url,
            'source': url
        })

    return relevant_articles

In [3]:
def scrape_news(websites, keyword, delay=1):
    """Collect keyword-matching links from several pages.

    Each entry of ``websites`` is passed to ``fetch_articles`` together with
    ``keyword`` and the per-site results are concatenated in iteration order.
    A ``Scraping: <site>`` progress line is printed before each fetch.

    Args:
        websites: Iterable of page URLs to scan.
        keyword: Search term forwarded unchanged to ``fetch_articles``.
        delay: Seconds to wait between two consecutive fetches. Defaults to
            1, the value that used to be hard-coded. No pause is made before
            the first site or after the last one.

    Returns:
        A single list holding every dict returned by ``fetch_articles``;
        empty when ``websites`` is empty.
    """
    all_articles = []
    for index, site in enumerate(websites):
        # Throttle only between requests: a pause after the final site
        # would just delay the caller without sparing any server.
        if index and delay > 0:
            time.sleep(delay)

        print(f"Scraping: {site}")
        all_articles.extend(fetch_articles(site, keyword))

    return all_articles

# Driver: scan every configured site for KEYWORD and echo the matches.
if __name__ == "__main__":
    # `results` is a list of dicts as produced by scrape_news().
    results = scrape_news(news_websites, KEYWORD)
    for article in results:
        # One title/URL pair per match, followed by a blank line.
        print(f"Title: {article['title']}\nURL: {article['url']}\n")

Scraping: https://www.bbc.com/news
Scraping: https://www.reuters.com/news
Failed to fetch https://www.reuters.com/news: 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/news
Scraping: https://www.theguardian.com/world


In [4]:
import csv

if __name__ == "__main__":
    # Scrape every configured site again, then persist the matches as CSV.
    results = scrape_news(news_websites, KEYWORD)

    columns = ['title', 'url', 'source']

    # newline='' hands line-ending control to the csv module; UTF-8 keeps
    # non-ASCII headline text intact.
    with open('terrorism_articles.csv', 'w', newline='', encoding='utf-8') as out_file:
        csv_out = csv.DictWriter(out_file, fieldnames=columns)
        csv_out.writeheader()
        # One row per article dict, in the order they were collected.
        csv_out.writerows(results)

    print(f"Saved {len(results)} articles to 'terrorism_articles.csv'")


Scraping: https://www.bbc.com/news
Scraping: https://www.reuters.com/news
Failed to fetch https://www.reuters.com/news: 401 Client Error: HTTP Forbidden for url: https://www.reuters.com/news
Scraping: https://www.theguardian.com/world
Saved 0 articles to 'terrorism_articles.csv'
