In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import json

# Root of the site whose front page is scraped; relative hrefs are resolved against it
BASE_URL_BBC = "https://www.bbc.com"
# NOTE(review): not referenced in the cells shown here - presumably reserved for a second scrape; confirm or remove
BASE_URL_DAWN = "https://www.dawn.com"

# Accumulates one {"title": ..., "description": ...} dict per successfully scraped page
articles = []

#get article links
def get_article_links(base_url, timeout=10):
    """Collect the absolute URLs of the hyperlinks found on a page.

    Args:
        base_url: Address of the page to download. Relative ``href`` values
            are resolved against it.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A list of absolute ``http``/``https`` URLs in the order they first
        appear on the page, with duplicates removed.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    response = requests.get(base_url, timeout=timeout)
    # Fail loudly on an error page instead of harvesting links from it.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    seen = set()
    article_links = []
    for anchor in soup.find_all('a', href=True):
        url = urljoin(base_url, anchor['href'])
        # mailto:, javascript:, tel: and similar targets cannot be fetched.
        if not url.startswith(('http://', 'https://')):
            continue
        # Navigation menus repeat the same link many times; keep the first.
        if url in seen:
            continue
        seen.add(url)
        article_links.append(url)
    return article_links

#get article content
def get_article_content(url, timeout=10):
    """Download a page and extract its headline and meta description.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A ``(title, description)`` tuple of strings. ``title`` is the text of
        the first ``<h1>`` element and ``description`` is the ``content``
        attribute of ``<meta name="description">``; each is ``''`` when the
        page does not provide it.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Do not record the title of an error page as if it were an article.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Look each element up once instead of once for the test and once for the value.
    heading = soup.find('h1')
    title = heading.text if heading is not None else ''

    meta = soup.find('meta', {'name': 'description'})
    # .get() tolerates a description tag that has no content attribute.
    description = meta.get('content', '') if meta is not None else ''
    return title, description

# Harvest every hyperlink from the BBC landing page
links = get_article_links(BASE_URL_BBC)

# Scrape each page in turn; a failing URL is reported and skipped
for link in links:
    try:
        title, description = get_article_content(link)
    except Exception as err:
        print(f"Error with URL {link}: {err}")
    else:
        articles.append({"title": title, "description": description})

# Persist the collected records as pretty-printed UTF-8 JSON
output_path = 'bbc_articles.json'
with open(output_path, 'w', encoding='utf-8') as out_file:
    out_file.write(json.dumps(articles, ensure_ascii=False, indent=4))
print(f"Data successfully written to {output_path}")


Data successfully written to bbc_articles.json


In [2]:
import json
import re

def clean_text(text):
    """Strip HTML tags and normalise whitespace in a scraped string.

    Args:
        text: Raw string to clean. ``None`` and ``''`` are accepted.

    Returns:
        The text with every ``<...>`` tag removed, each run of whitespace
        collapsed to a single space, and no leading or trailing whitespace.
        Returns ``''`` when ``text`` is ``None`` or empty.
    """
    # A JSON null loads as None; treat it like a missing value.
    if not text:
        return ''
    # [^>]* also crosses line breaks, so a tag whose attributes wrap onto the
    # next line is removed too ('.' would stop at the newline).
    text = re.sub(r'<[^>]*>', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Load the raw scrape written by the previous cell
input_path = 'bbc_articles.json'
with open(input_path, 'r', encoding='utf-8') as src:
    articles = json.load(src)

# Normalise both text fields of every record in place
for article in articles:
    for field in ('title', 'description'):
        article[field] = clean_text(article.get(field, ''))

# Write the cleaned records to a separate file so the raw scrape is kept
output_path = 'bbc_articles_cleaned.json'
with open(output_path, 'w', encoding='utf-8') as dst:
    dst.write(json.dumps(articles, ensure_ascii=False, indent=4))

print(f"Cleaned data to {output_path}")


Cleaned data to bbc_articles_cleaned.json


In [4]:
%pip install pydrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

# Set the path to your client secrets file
client_secrets_path = 'client_secrets.json'

# Authenticate with Google Drive
gauth = GoogleAuth()
gauth.LoadClientConfigFile(client_secrets_path)
gauth.LocalWebserverAuth()

# Initialize Google Drive client
drive = GoogleDrive(gauth)

# Example function to upload a file
def upload_file_to_drive(file_path, drive_folder_id):
    """Upload a local file into a Google Drive folder.

    Uses the module-level ``drive`` client created during authentication.

    Args:
        file_path: Path of the local file to upload.
        drive_folder_id: ID of the Drive folder that becomes the file's parent.

    Returns:
        The Drive file object after upload, so the caller can read its
        metadata (for example its id).
    """
    import os

    drive_file = drive.CreateFile({
        # Name the Drive file after the file itself; with no title PyDrive
        # falls back to the string given to SetContentFile, i.e. the whole
        # local path.
        'title': os.path.basename(file_path),
        'parents': [{'id': drive_folder_id}],
    })
    drive_file.SetContentFile(file_path)
    drive_file.Upload()
    return drive_file


# Presumably the destination folder handed to upload_file_to_drive; the call is not in the cells shown here
drive_folder_id = "1pjFjuF_NDbfyZkn9zx_FazNRJ5q1gHJJ"
# NOTE(review): absolute, machine-specific Windows path - looks like the cleaned JSON written by the
# cleaning cell; consider a path relative to the notebook so other machines can run this
file_path = r"C:\Users\HP\Desktop\mlops\bbc_articles_cleaned.json"

Note: you may need to restart the kernel to use updated packages.
Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?client_id=461645083159-nq6vu5pdur3ua86lldafjik3msc9cetp.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&access_type=offline&response_type=code

Authentication successful.


Collecting pydrive
  Downloading PyDrive-1.3.1.tar.gz (987 kB)
     ------------------------------------ 987.4/987.4 kB 161.6 kB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting google-api-python-client>=1.2
  Downloading google_api_python_client-2.128.0-py2.py3-none-any.whl (11.7 MB)
     -------------------------------------- 11.7/11.7 MB 281.3 kB/s eta 0:00:00
Collecting oauth2client>=4.0.0
  Using cached oauth2client-4.1.3-py2.py3-none-any.whl (98 kB)
Collecting google-auth-httplib2<1.0.0,>=0.2.0
  Using cached google_auth_httplib2-0.2.0-py2.py3-none-any.whl (9.3 kB)
Collecting uritemplate<5,>=3.0.1
  Using cached uritemplate-4.1.1-py2.py3-none-any.whl (10 kB)
Collecting google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5
  Downloading google_api_core-2.19.0-py3-none-any.whl (139 kB)
     ------------------------------------ 139.0/139.0 kB 550.4 kB/s eta 0:00:00
Collecting httplib2<1.dev0,