<a href="https://colab.research.google.com/github/muqarrab469/insightshield/blob/main/BBCScraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**BBC News Scraper**

In [None]:
!pip install requests beautifulsoup4

import csv
import os
import re
from datetime import datetime, timedelta
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

from google.colab import drive
# Mount Google Drive so the output paths under /content/drive are available.
drive.mount('/content/drive')

# CSV file that main() writes the scraped article records to.
output_file = "/content/drive/My Drive/FYP/bbc_articles.csv"
# Directory where download_image() stores the downloaded article images.
IMAGE_DIR = '/content/drive/My Drive/FYP/bbc_images'

# Create the image directory up front; exist_ok avoids an error on re-runs.
os.makedirs(IMAGE_DIR, exist_ok=True)
# Site root used to turn root-relative hrefs into absolute URLs.
BASE_URL = 'https://www.bbc.com'

# Article link scraper:
def scrape_article_links(url, limit=10):
    """Collect article URLs from the listing page at *url*.

    Anchors are selected by the CSS class BBC uses for article cards. That
    class name is generated by the site's build, so an empty result may simply
    mean the markup has changed.

    Args:
        url: Page to fetch and scan for article anchors.
        limit: Maximum number of links to return (``None`` for no limit).

    Returns:
        A list of unique absolute URLs in the order they appear on the page,
        or an empty list if the page could not be fetched or parsed.
    """
    try:
        # A timeout keeps a stalled connection from hanging the whole run.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Dict keys de-duplicate while preserving document order, so the
        # "first N" links are the same on every run (a set would not be).
        article_links = {}
        for link in soup.find_all('a', class_='sc-2e6baa30-0 gILusN'):
            href = link.get('href')
            if href:
                # urljoin leaves absolute URLs untouched and correctly
                # resolves root-relative, relative and protocol-relative hrefs.
                article_links[urljoin(BASE_URL, href)] = None

        return list(article_links)[:limit]
    except Exception as e:
        # Deliberate best-effort boundary: report the problem and let the
        # caller continue with no links rather than crash.
        print(f"Error occurred while scraping article links: {str(e)}")
        return []

def download_image(image_url, article_id):
    """Download *image_url* into IMAGE_DIR as ``news_<article_id>.jpg``.

    Args:
        image_url: Absolute URL of the image to fetch.
        article_id: Identifier used to build the local file name.

    Returns:
        The path of the saved file, or ``None`` if the server did not answer
        with HTTP 200 or any error occurred.
    """
    try:
        # The context manager releases the streamed connection even when the
        # body is not fully consumed; the timeout prevents an indefinite hang.
        with requests.get(image_url, stream=True, timeout=30) as response:
            if response.status_code != 200:
                print(f"Failed to download image from {image_url}")
                return None

            image_path = os.path.join(IMAGE_DIR, f"news_{article_id}.jpg")
            with open(image_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
            return image_path
    except Exception as e:
        # Best effort: a missing image must not abort scraping the article.
        print(f"Error occurred while downloading image from {image_url}: {str(e)}")
        return None

def convert_time_format(time_string):
    """Convert a scraped timestamp label into the CSV date/time format.

    Recognised inputs:
      * relative labels such as ``"5 mins ago"``, ``"2 hours ago"``,
        ``"an hour ago"`` or ``"3 days ago"`` (subtracted from the current
        local time);
      * absolute dates such as ``"6 June 2024"``, ``"6 Jun 2024"`` or
        ``"June 6, 2024"`` (time of day defaults to midnight).

    Anything else falls back to the current local time.

    Args:
        time_string: Raw text taken from the article's ``<time>`` element.

    Returns:
        The timestamp formatted as ``'%B %d, %Y / %I:%M %p %Z'``. The
        datetimes used here are naive, so ``%Z`` renders as an empty string;
        the directive is kept so the output format stays unchanged.
    """
    output_format = '%B %d, %Y / %I:%M %p %Z'
    now = datetime.now()
    text = str(time_string or '').strip()

    # Search anywhere in the label instead of assuming the first token is a
    # number, so "an hour ago" or "Published 2 hours ago" no longer raise.
    relative = re.search(
        r'\b([0-9]+|an?)\s*(minute|min|hour|hr|day)s?\b', text, re.IGNORECASE
    )
    if relative:
        quantity = relative.group(1)
        unit = relative.group(2).lower()
        amount = int(quantity) if quantity.isdigit() else 1  # "a"/"an" -> 1
        if unit.startswith('min'):
            offset = {'minutes': amount}
        elif unit in ('hour', 'hr'):
            offset = {'hours': amount}
        else:
            offset = {'days': amount}
        try:
            return (now - timedelta(**offset)).strftime(output_format)
        except OverflowError:
            # Absurdly large amounts cannot be represented; fall through.
            pass

    for date_format in ('%d %B %Y', '%d %b %Y', '%B %d, %Y'):
        try:
            return datetime.strptime(text, date_format).strftime(output_format)
        except ValueError:
            continue

    # Default to current time if unable to parse.
    return now.strftime(output_format)

# Scrape article data from each scraped link:
def scrape_article_data(article_link, article_id):
    """Fetch one article page and extract the fields written to the CSV.

    Elements are located by the site's generated CSS class names; any element
    that is not found yields an empty/``None`` field instead of discarding
    the article.

    Args:
        article_link: Absolute URL of the article page.
        article_id: Identifier stored in the record and used to name the
            downloaded image.

    Returns:
        A dict with the keys ``id``, ``news_title``, ``news_text``,
        ``news_category``, ``news_img``, ``news_date_time``, ``news_author``,
        ``news_website_name`` and ``news_link``; or ``None`` if the page could
        not be fetched or an unexpected error occurred.
    """
    try:
        # Time out instead of hanging, and skip error pages rather than
        # parsing them into an empty record.
        response = requests.get(article_link, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Title
        title_tag = soup.find('h1', class_='sc-518485e5-0 bWszMR')
        title = title_tag.text.strip() if title_tag else None

        # Body: every paragraph of every text block, separated by one space.
        text_blocks = soup.find_all('div', class_='sc-43e6b7ba-0 bWSguZ')
        body = ' '.join(
            paragraph.text.strip()
            for block in text_blocks
            for paragraph in block.find_all('p')
        ).strip()

        # Category
        category = 'World'

        # Image: replace the remote URL with the local path of the download
        # (None when the download fails).
        img_tag = soup.find('img', class_='sc-fd6cb93-0 hvRJnO')
        article_img = img_tag.get('src') if img_tag else None
        if article_img:
            article_img = download_image(article_img, article_id)

        # Date and time
        date_tag = soup.find('time', class_='sc-36c5adb2-9 bhJTar')
        article_time_string = date_tag.text.strip() if date_tag else 'Unknown'
        article_date_time = convert_time_format(article_time_string)

        # Author: some articles have no byline container, so guard against
        # find() returning None instead of losing the whole article.
        author_container = soup.find('div', class_='sc-36c5adb2-3 eDSSZR')
        if author_container is not None:
            author_tags = author_container.find_all(
                ['span', 'a'],
                class_=['sc-36c5adb2-5 fiZctN', 'sc-36c5adb2-6 hzkUOS'],
            )
        else:
            author_tags = []
        article_author = ', '.join(tag.text.strip() for tag in author_tags).strip()

        # Website name
        article_website_name = 'BBC'

        return {
            'id': article_id,
            'news_title': title,
            'news_text': body,
            'news_category': category,
            'news_img': article_img,
            'news_date_time': article_date_time,
            'news_author': article_author,
            'news_website_name': article_website_name,
            'news_link': article_link
        }
    except Exception as e:
        # Best-effort boundary: one bad article must not stop the run.
        print(f"Error occurred while scraping article data from {article_link}: {str(e)}")
        return None

def main():
    """Scrape the BBC front page and write the collected articles to CSV.

    Collects article links, scrapes each one (articles that fail to scrape
    are skipped) and overwrites ``output_file`` with one row per article.
    """
    base_url = 'https://www.bbc.com'

    print("Scraping article links...")
    links = scrape_article_links(base_url)
    print(f"Scraped {len(links)} article links.")

    print("Scraping article data...")
    records = []
    for position, link in enumerate(links, start=1):
        print(f"Scraping article {position} data from: {link}")
        record = scrape_article_data(link, position)
        if not record:
            # Scraping failed for this link; the error was already reported.
            continue
        records.append(record)

    print("Saving scraped data to CSV file...")
    columns = [
        "id",
        "news_title",
        "news_text",
        "news_category",
        "news_img",
        "news_date_time",
        "news_author",
        "news_website_name",
        "news_link",
    ]
    with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=columns)
        writer.writeheader()
        writer.writerows(records)

    print(f"Scraping complete. Data saved to '{output_file}'")

# Run the scraper only when this code is executed as the main program,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Mounted at /content/drive
Scraping article links...
Scraped 10 article links.
Scraping article data...
Scraping article 1 data from: https://www.bbc.com/news/science-environment-58073295
Scraping article 2 data from: https://www.bbc.com/news/articles/cg33rlzxgnxo
Scraping article 3 data from: https://www.bbc.com/news/articles/cqlle3k92zqo
Scraping article 4 data from: https://www.bbc.com/travel/article/20240605-f1-driver-lance-strolls-guide-to-a-weekend-in-his-hometown-montreal
Scraping article 5 data from: https://www.bbc.com/travel/article/20240603-the-pacific-crest-trail-the-us-west-coasts-greatest-footpath
Scraping article 6 data from: https://www.bbc.com/news/articles/crggqrzyjjpo
Scraping article 7 data from: https://www.bbc.com/news/articles/c800xr94x5lo
Error occurred while scraping article data from https://www.bbc.com/news/articles/c800xr94x5lo: 'NoneType' object has no attribute 'find_all'
Scraping article 8 data from: https://www.bbc.com/news/articles/c9001dkzxeno
Scraping 