In [3]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import json

# Base URLs of news sites
bbc_base_url = "https://www.bbc.com"
dawn_base_url = "https://www.dawn.com"

# Initialize an empty list to store articles
articles = []

# Function to get article links
def getting_article_links(base_url):
    response = requests.get(base_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    article_links = [urljoin(base_url, link['href']) for link in soup.find_all('a', href=True)]
    return article_links

# Function to get article content
def getting_article_content(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    title = soup.find('h1').text if soup.find('h1') else ''
    description = soup.find('meta', {'name': 'description'})['content'] if soup.find('meta', {'name': 'description'}) else ''
    return title, description

# Fetch article links and their content from BBC
links = getting_article_links(bbc_base_url)

for link in links:
    try:
        title, description = getting_article_content(link)
        # Add each article to the articles list
        articles.append({"title": title, "description": description})
    except Exception as e:
        print(f"Error with URL {link}: {e}")

# Save articles data to a JSON file
output_path = 'bbc_articles_scrapped.json'
with open(output_path, 'w', encoding='utf-8') as json_file:
    json.dump(articles, json_file, ensure_ascii=False, indent=4)

print(f"Data has been successfully written to:  {output_path}")

Data has been successfully written to:  bbc_articles_scrapped.json


In [7]:
import json
import re

# Function to clean text
def cleaning_text(text):
    text = re.sub('<.*?>', '', text)  # Remove HTML tags
    text = re.sub('\s+', ' ', text).strip()  # Remove extra spaces and newlines
    return text

# Load data from the existing JSON file
input_path = 'bbc_articles_scrapped.json'
with open(input_path, 'r', encoding='utf-8') as json_file:
    articles = json.load(json_file)

# Clean each article's title and description
for article in articles:
    article['title'] = cleaning_text(article.get('title', ''))
    article['description'] = cleaning_text(article.get('description', ''))

# Save cleaned data to a new JSON file
output_path = 'bbc_articles_cleaned_text.json'
with open(output_path, 'w', encoding='utf-8') as json_file:
    json.dump(articles, json_file, ensure_ascii=False, indent=4)

print(f"Cleaned data successfully written to {output_path}")

Cleaned data successfully written to bbc_articles_cleaned_text.json


  text = re.sub('\s+', ' ', text).strip()  # Remove extra spaces and newlines


In [2]:
!pip install pydrive

Collecting pydrive
  Downloading PyDrive-1.3.1.tar.gz (987 kB)
     ---------------------------------------- 0.0/987.4 kB ? eta -:--:--
     ---------------------------------------- 10.2/987.4 kB ? eta -:--:--
     ---------------------------------------- 10.2/987.4 kB ? eta -:--:--
      ------------------------------------ 20.5/987.4 kB 108.9 kB/s eta 0:00:09
     - ----------------------------------- 30.7/987.4 kB 145.2 kB/s eta 0:00:07
     - ----------------------------------- 41.0/987.4 kB 163.4 kB/s eta 0:00:06
     -- ---------------------------------- 61.4/987.4 kB 217.9 kB/s eta 0:00:05
     --- --------------------------------- 92.2/987.4 kB 290.5 kB/s eta 0:00:04
     ---- ------------------------------- 122.9/987.4 kB 359.9 kB/s eta 0:00:03
     ------ ----------------------------- 174.1/987.4 kB 436.9 kB/s eta 0:00:02
     ------- ---------------------------- 204.8/987.4 kB 478.0 kB/s eta 0:00:02
     -------- --------------------------- 235.5/987.4 kB 514.3 kB/s eta 0:00

In [1]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

# Set the path to your client secrets file
client_secrets_path = 'client_secrets.json'

# Authenticate with Google Drive
gauth = GoogleAuth()
gauth.LoadClientConfigFile(client_secrets_path)
gauth.LocalWebserverAuth()

# Initialize Google Drive client
drive = GoogleDrive(gauth)

# Example function to upload a file
def upload_file_to_drive(file_path, drive_folder_id):
    file = drive.CreateFile({'parents': [{'id': drive_folder_id}]})
    file.SetContentFile(file_path)
    file.Upload()


drive_folder_id = "16dfWbMm1NqvcEUhEENU9Uxn33QGV2qzv"
file_path = r"C:\Users\Muneeba\Downloads\i202656_Assignment#2\bbc_articles_cleaned.json"

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?client_id=461645083159-nq6vu5pdur3ua86lldafjik3msc9cetp.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&access_type=offline&response_type=code

Authentication successful.
