In [8]:
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Function to fetch and parse a webpage
def fetch_page(url, timeout=10):
    """Download *url* and return it parsed as a BeautifulSoup document.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely on a stalled
            connection, which would freeze the whole scraping run.

    Returns:
        A ``BeautifulSoup`` object built with the ``lxml`` parser, or
        ``None`` when the request fails, times out, or the response status
        is anything other than 200. Failures are reported on stdout rather
        than raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return BeautifulSoup(response.text, "lxml")
        else:
            print(f"Failed to fetch {url} (Status Code: {response.status_code})")
            return None
    except Exception as e:
        # Deliberately broad: callers treat a None result as "skip this
        # page", so one bad URL must never abort the run.
        print(f"Error fetching {url}: {e}")
        return None

# Function to clean unwanted parts from the HTML
def clean_html(soup):
    """Remove page-chrome and non-content elements from *soup* in place.

    Every ``header``, ``footer``, ``nav``, ``aside``, ``script`` and
    ``style`` element found via ``soup.find_all`` is decomposed.

    Args:
        soup: Parsed document exposing ``find_all``; it is mutated.

    Returns:
        The same *soup* object that was passed in.
    """
    unwanted_tags = ("header", "footer", "nav", "aside", "script", "style")
    for tag_name in unwanted_tags:
        matches = soup.find_all(tag_name)
        for node in matches:
            node.decompose()
    return soup

# Function to extract article headlines and content
def extract_article_data(article_url):
    """Fetch one page and return its headline and body text.

    Args:
        article_url: Address of the page to download.

    Returns:
        A ``(headline, content)`` tuple. ``headline`` is the stripped text
        of the first ``<h1>`` or ``None`` if there is none. ``content`` is
        the newline-separated text of ``<main>`` (falling back to
        ``<body>``), or ``None`` if neither exists. ``(None, None)`` is
        returned when the page could not be fetched.
    """
    page = fetch_page(article_url)
    if not page:
        return None, None

    # Strip navigation, scripts and similar elements before reading text;
    # clean_html mutates the document in place.
    clean_html(page)

    # Headline: first <h1> left after cleaning.
    h1_tag = page.find("h1")
    if h1_tag:
        title = h1_tag.get_text(strip=True)
    else:
        title = None

    # Body: prefer <main>, otherwise take the whole <body>.
    body_root = page.find("main") or page.body
    if body_root:
        text = body_root.get_text(separator="\n", strip=True)
    else:
        text = None

    return title, text

# Function to scrape articles from a news website
def scrape_articles(base_url, article_path_filter="news", max_articles=150):
    """Scrape headline/content pairs from pages linked on a site's homepage.

    The homepage at *base_url* is fetched, every ``<a href>`` whose href
    contains *article_path_filter* is resolved to an absolute URL, and each
    resulting page is passed to ``extract_article_data``.

    Args:
        base_url: Homepage URL; also the base for resolving relative hrefs.
        article_path_filter: Substring an href must contain to be followed.
        max_articles: Maximum number of distinct links to visit.

    Returns:
        A list of ``{"headline": ..., "content": ...}`` dicts, one per page
        that yielded both a headline and content. Empty if the homepage
        could not be fetched.
    """
    homepage_soup = fetch_page(base_url)
    if not homepage_soup:
        return []

    # Dict keys de-duplicate while keeping document order, so truncating to
    # max_articles picks the same links on every run (a set would not).
    unique_links = {}
    for a_tag in homepage_soup.find_all("a", href=True):
        link = a_tag["href"].strip()
        if article_path_filter not in link:
            continue
        # urljoin handles root-relative, path-relative, protocol-relative
        # ("//host/...") and already-absolute hrefs, and copes with a
        # trailing slash on base_url.
        full_link = urljoin(base_url, link)
        # Skip mailto:, javascript:, etc. -- they cannot be fetched.
        if full_link.startswith(("http://", "https://")):
            unique_links[full_link] = None

    # Limit to the maximum number of articles
    article_links = list(unique_links)[:max_articles]

    # Extract data for each article
    data = []
    for index, link in enumerate(article_links):
        if index:
            # Be polite: pause between requests (no pause needed before
            # the first one or after the last one).
            time.sleep(2)
        print(f"Processing: {link}")
        headline, content = extract_article_data(link)
        if headline and content:
            data.append({"headline": headline, "content": content})

    return data

# Main scraping execution
if __name__ == "__main__":
    # Target site and the href substring that marks a link as worth
    # following; change both when pointing the scraper at another site.
    base_url = "https://www.bbc.com"
    article_filter = "news"

    # Crawl the homepage links and collect headline/content records.
    articles = scrape_articles(
        base_url,
        article_path_filter=article_filter,
        max_articles=150,
    )

    # Tabulate the records and show a preview of the first rows.
    df = pd.DataFrame(articles)
    print(df.head())

    # Persist the table as CSV without the index column.
    output_path = "C:/Users/Mithi/Downloads/recommend_news.csv"
    df.to_csv(output_path, index=False)



Processing: https://www.bbc.com/news/articles/crmnxkl9z9mo
Processing: https://www.bbc.com/news/world/middle_east
Processing: https://www.bbc.com/news/us-canada
Processing: https://www.bbc.com/news/articles/c5y4q4eny2do
Processing: https://www.bbc.com/news/articles/cy7kpvndyyxo
Processing: https://www.bbc.com/news/articles/c24nrr0mv4go
Processing: https://www.bbc.com/news/articles/cdrdjkny103o
Processing: https://www.bbc.com/news/articles/cn4vk5y4ng5o
Processing: https://www.bbc.com/news/world/asia
Processing: https://www.bbc.com/news/articles/c75wqr0k3dyo
Processing: https://www.bbc.com/news/videos/cz6ln121dxgo
Processing: https://www.bbc.com/news/northern_ireland
Processing: https://www.bbc.com/culture/entertainment-news
Processing: https://www.bbc.com/news/england
Processing: https://www.bbc.com/news/articles/c3e3p3nx3kno
Processing: https://www.bbc.com/news/articles/c6238exzjpxo
Processing: https://www.bbc.com/news/articles/cnv3q9qrdd4o
Processing: https://www.bbc.com/live/news
Pro