# In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import concurrent.futures

# Scraping BBC
def get_article_urls(section_url, base_url, timeout=10):
    """Collect the unique same-site links found on a section page.

    Args:
        section_url: URL of the section/index page to download.
        base_url: Site root (e.g. ``https://www.bbc.com``). Root-relative
            links are prefixed with it, and absolute links are kept only
            when they start with it.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A de-duplicated list of absolute URLs (order unspecified). An empty
        list is returned when the page cannot be fetched or parsed; the
        error is printed rather than raised so one bad section does not
        abort the whole crawl.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    try:
        # Without a timeout a stalled connection would block this worker forever.
        response = requests.get(section_url, headers=headers, timeout=timeout)
        # Do not harvest links from 4xx/5xx error pages.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        links = set()
        for a in soup.find_all('a', href=True):
            href = a['href']
            # Skip live-coverage pages regardless of how the link is written.
            if href.endswith('live'):
                continue
            if href.startswith('//'):
                # Protocol-relative link: borrow the scheme from base_url so
                # it is not mistaken for a root-relative path below.
                href = f"{base_url.partition('//')[0]}{href}"
            if href.startswith('/'):
                links.add(f'{base_url}{href}')
            elif href.startswith(base_url):
                links.add(href)

        return list(links)
    except Exception as e:
        # Deliberate best-effort boundary: report and carry on.
        print(f"Error fetching URLs from {section_url}: {e}")
        return []
    
def scrape_article(url, timeout=10):
    """Download one page and extract its title and paragraph text.

    Args:
        url: Absolute URL of the article to scrape.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with keys ``'url'``, ``'title'`` and ``'content'``. ``content``
        is the text of every ``<p>`` element joined by single spaces. When
        the page cannot be fetched or parsed, ``title`` and ``content`` are
        empty strings and the error is printed rather than raised.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    try:
        # Without a timeout a stalled connection would block this worker forever.
        response = requests.get(url, headers=headers, timeout=timeout)
        # Do not store the body of a 4xx/5xx error page as article content.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        title_tag = soup.title
        # ``.string`` is None for an empty or nested <title>; str() also
        # detaches the text from the parse tree so the tree can be freed.
        if title_tag is not None and title_tag.string is not None:
            title = str(title_tag.string)
        else:
            title = ''
        content = ' '.join(paragraph.get_text() for paragraph in soup.find_all('p'))

        return {'url': url, 'title': title, 'content': content}
    except Exception as e:
        # Deliberate best-effort boundary: report and return an empty record.
        print(f"Error scraping article {url}: {e}")
        return {'url': url, 'title': '', 'content': ''}

# BBC section and subsection index pages used as crawl entry points.
bbc_sections = [
    # News front page and topic hubs (the topic IDs are opaque; check the
    # site to see what each one covers).
    'https://www.bbc.com/news',
    'https://www.bbc.com/news/topics/c2vdnvdg6xxt',
    'https://www.bbc.com/news/war-in-ukraine',
    'https://www.bbc.com/news/topics/crggn4j2lm0t',

    # Regional news
    'https://www.bbc.com/news/us-canada',
    'https://www.bbc.com/news/uk',
    'https://www.bbc.com/news/world/africa',
    'https://www.bbc.com/news/world/asia',
    'https://www.bbc.com/news/world/australia',
    'https://www.bbc.com/news/world/europe',
    'https://www.bbc.com/news/world/latin_america',
    'https://www.bbc.com/news/world/middle_east',

    # News features
    'https://www.bbc.com/news/in_pictures',
    'https://www.bbc.com/news/reality_check',

    # Non-news verticals
    'https://www.bbc.com/sport',
    'https://www.bbc.com/business',
    'https://www.bbc.com/innovation',
    'https://www.bbc.com/culture',
    'https://www.bbc.com/travel',
    'https://www.bbc.com/future-planet',
]

def fetch_urls(sections, base_url):
    """Gather article links from several section pages in parallel.

    Each section is handed to ``get_article_urls`` on a thread pool. The
    links from every section are merged and de-duplicated.

    Args:
        sections: Iterable of section page URLs to crawl.
        base_url: Site root forwarded to ``get_article_urls``.

    Returns:
        A list of unique URLs (order unspecified). A section whose task
        raises is reported on stdout and contributes nothing.
    """
    collected = []
    with concurrent.futures.ThreadPoolExecutor() as pool:
        # Map each pending task back to its section for error reporting.
        pending = {}
        for section_url in sections:
            task = pool.submit(get_article_urls, section_url, base_url)
            pending[task] = section_url

        for finished in concurrent.futures.as_completed(pending):
            origin = pending[finished]
            try:
                collected.extend(finished.result())
            except Exception as e:
                print(f"Error getting URLs from {origin}: {e}")

    unique_urls = set(collected)
    return list(unique_urls)

def fetch_articles(urls):
    """Scrape a batch of article pages in parallel.

    Each URL is handed to ``scrape_article`` on a thread pool.

    Args:
        urls: Iterable of article URLs to scrape.

    Returns:
        A list of the values returned by ``scrape_article``, in the order
        the tasks finished (not the input order). A task that raises is
        reported on stdout and omitted from the result.
    """
    scraped = []
    with concurrent.futures.ThreadPoolExecutor() as pool:
        # Map each pending task back to its URL for error reporting.
        pending = {}
        for article_url in urls:
            pending[pool.submit(scrape_article, article_url)] = article_url

        for finished in concurrent.futures.as_completed(pending):
            origin = pending[finished]
            try:
                scraped.append(finished.result())
            except Exception as e:
                print(f"Error scraping article {origin}: {e}")
    return scraped

# Fetch all article URLs for BBC
bbc_article_urls = fetch_urls(bbc_sections, 'https://www.bbc.com')

# Scrape articles concurrently
articles = fetch_articles(bbc_article_urls)

# Convert the list of dictionaries to a pandas DataFrame. The columns are
# named explicitly so the frame (and the CSV header) keeps its schema even
# when no article was scraped and `articles` is empty.
bbc_df = pd.DataFrame(articles, columns=['url', 'title', 'content'])

# Save the DataFrame to a CSV file
bbc_df.to_csv('bbc_articles.csv', index=False)

# Print a sample of the scraped articles
print(bbc_df.head())