In [63]:
# Import necessary libraries and modules
import os 
import re 
import requests 
from newspaper import Article 
from bs4 import BeautifulSoup 
from selenium import webdriver 
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By 
from selenium.webdriver.chrome.options import Options 
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains 
import time
import json
from tqdm import trange


In [64]:
# Site root. Category slugs are appended directly to this string when the
# category URLs are built, so the trailing slash matters.
base_url = "https://www.todayonline.com/"

# Start Chrome with the "--headless=new" flag and keep one shared driver
# for the whole notebook.
options = Options()
options.add_argument("--headless=new")
driver = webdriver.Chrome(options=options)
driver.maximize_window()

In [65]:
# Accumulator for everything the scraper collects; scrape_articles() writes into it.
# Layout: {"<category> section": [record, ...]} where each record is a dict that
# carries "Article URL" and, once the page has been scraped, "Article Summary"
# and "Article Body".
scraped_articles = {}

In [66]:
def go_to_category(driver, category_URL):
    """Point the browser at a category listing page.

    Args:
        driver: Object exposing ``get(url)``; in this notebook, the shared
            Selenium WebDriver.
        category_URL: URL of the category page to open.

    Returns:
        None. The only effect is the call to ``driver.get``.
    """
    driver.get(category_URL)

In [67]:
def scrape_categories(driver, base_url, exclude=("watch", "myfeed"), wait_seconds=2):
    """Collect the category slugs linked from the site's main navigation bar.

    Loads ``base_url``, pauses for ``wait_seconds``, then reads the ``href`` of
    every ``li.main-menu__item a.main-menu__link`` anchor and keeps the last
    path segment of each link.

    Args:
        driver: Selenium WebDriver (anything exposing ``get`` and ``find_elements``).
        base_url: Home page URL whose navigation bar is read.
        exclude: Slugs to leave out of the result. Defaults to the ``watch``
            and ``myfeed`` entries.
        wait_seconds: Pause after loading the page, in seconds.

    Returns:
        list[str]: Slugs in navigation order, without duplicates or empty
        strings. An empty list is returned if the navigation bar cannot be read.
    """
    driver.get(base_url)
    time.sleep(wait_seconds)  # fixed pause to give the page time to load

    excluded = set(exclude)
    home = base_url.rstrip("/")
    sub_cats = []  # bound before any lookup so the function can always return a list

    try:
        navbar_items = driver.find_elements(By.CSS_SELECTOR, "li.main-menu__item a.main-menu__link")
    except Exception as e:
        print(f"Error extracting sub category: {e}")
        return sub_cats

    for item in navbar_items:
        try:
            href = item.get_attribute("href")
        except Exception as e:
            # One unreadable anchor should not discard the rest of the menu.
            print(f"Error extracting sub category: {e}")
            continue
        if not href:
            continue  # anchor without an href attribute
        trimmed = href.rstrip("/")
        if trimmed == home:
            continue  # link back to the home page carries no category slug
        heading = trimmed.split("/")[-1]
        if heading and heading not in excluded and heading not in sub_cats:
            sub_cats.append(heading)
    return sub_cats

In [68]:
# Function to gather article URLs from a category page
def gather_article_urls(driver):
    """Return the article links found on the page currently open in ``driver``.

    Every element with class ``card-object__figure`` is treated as an article
    card, and the ``href`` of the ``link``-classed element inside it is taken
    as the article URL.

    Args:
        driver: Selenium WebDriver already positioned on a category page.

    Returns:
        list[str]: Article URLs in page order, each listed once. Cards whose
        link cannot be found, or whose link has no ``href``, are skipped.
    """
    urls = []
    seen = set()
    article_cards = driver.find_elements(By.CLASS_NAME, "card-object__figure")
    for card in article_cards:
        try:
            url = card.find_element(By.CLASS_NAME, "link").get_attribute("href")
        except Exception as e:
            print(f"Error finding article URL: {e}")
            continue
        # A missing href comes back as None; a repeated URL would make the
        # same article be fetched and stored twice.
        if not url or url in seen:
            continue
        seen.add(url)
        urls.append(url)
    return urls

# Function to scrape articles from gathered URLs
def scrape_articles(driver, urls, sub_url, wait_seconds=2):
    """Visit each article URL and record its URL, summary and body text.

    The summary is the text of every ``div.text-long ul li`` element joined
    with ``".\\n"``; the body is the text of every ``div.text-long p`` element
    joined with newlines. The resulting list is stored in the module-level
    ``scraped_articles`` dict under the key ``"<sub_url> section"``.

    Args:
        driver: Selenium WebDriver used to load the pages.
        urls: Iterable of article URLs to visit.
        sub_url: Category slug, used only to build the storage key.
        wait_seconds: Pause after each page load, in seconds.

    Returns:
        list[dict]: One record per URL, in input order. Every record has the
        keys "Article URL", "Article Summary" and "Article Body"; a page that
        fails to load keeps the placeholders 'NO SUMMARY' and None.
    """
    sub_url_ls = []
    for url in urls:
        # Start from a complete record so a failed page still has every key
        # and the JSON rows stay uniform.
        sub_url_hash = {
            "Article URL": url,
            "Article Summary": 'NO SUMMARY',
            "Article Body": None,
        }
        try:
            # Navigate to the article page
            driver.get(url)
            time.sleep(wait_seconds)  # Waiting for the page to load

            # Get article summaries
            summary_elements = driver.find_elements(By.CSS_SELECTOR, "div.text-long ul li")
            Combined_Summary = ".\n".join(li.text for li in summary_elements)
            if Combined_Summary:
                sub_url_hash["Article Summary"] = Combined_Summary

            # Get article body; on failure the None placeholder is kept
            try:
                paragraphs = driver.find_elements(By.CSS_SELECTOR, "div.text-long p")
                sub_url_hash['Article Body'] = "\n".join(p.text for p in paragraphs)
            except Exception:
                print("Body paragraphs not extracted")

        except Exception as e:
            print(f"Error scraping article: {e}")
        sub_url_ls.append(sub_url_hash)

    section = f"{sub_url} section"
    scraped_articles[section] = sub_url_ls
    return sub_url_ls

In [None]:
def main():
    """Scrape every navigation-bar category and save the results as JSON.

    Uses the notebook-level ``driver``, ``base_url`` and ``scraped_articles``.
    For each category slug returned by ``scrape_categories`` it opens the
    category page, gathers the article URLs and scrapes them. The driver is
    always closed afterwards, so a new driver must be created before calling
    this function again.

    Returns:
        None. On success the results are written to
        ``today_online/articles.json``; if scraping raises, the exception
        propagates after the driver has been closed and no file is written.
    """
    output_path = "today_online/articles.json"
    try:
        # scrape_categories loads base_url and waits by itself, so no separate
        # page load is needed here.
        category_sub_URLs = scrape_categories(driver, base_url)
        for i in trange(len(category_sub_URLs)):
            category = category_sub_URLs[i]
            print(f'Collecting data for {category} section...')
            full_category_url = f"{base_url}{category}"
            go_to_category(driver, full_category_url)
            urls = gather_article_urls(driver)
            scrape_articles(driver, urls, category)
            print(f"Done collecting data for {category} section...\n")
    finally:
        # Close the driver even when scraping fails, so no browser process is
        # left running.
        driver.quit()
    # Write files to json; open() does not create the missing folder itself
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w", encoding='utf-8') as fout:
        json.dump(scraped_articles, fout, indent=4, ensure_ascii=False)

In [71]:
# Show the collected results. The dict is filled by scrape_articles(), which
# main() calls, so this is only meaningful after a scraping run.
# NOTE(review): this renders the entire dict, so the output can be very long.
scraped_articles

{'news section': [{'Article URL': 'https://www.todayonline.com/news/brazen-man-jail-attempted-rape-molest-step-daughter-2431821',
   'Article Summary': 'For the attempted rape of his step-daughter, a 41-year-old man was sentenced to jail and caning.\nHe also molested the girl in the bedroom where his other young children were present.\nThe victim was aged 11 to 14 during the period when these happened',
   'Article Body': "SINGAPORE — A 41-year-old man was sentenced to six years and seven months’ jail and eight strokes of the cane on Wednesday (May 29) for the attempted rape and molestation of his step-daughter, then aged 11 or 12.\nThe man pleaded guilty to aggravated attempted rape of a person aged under 14, as well as outrage of modesty.\nBoth the offender and his victim cannot be named due to a court order to protect the victim’s identity.\nThe man's sentence was backdated to his date of arrest on May 18 last year.\nCourt documents showed that the man had been in a romantic relatio

In [72]:
# Make sure the output folder exists first: open() does not create missing
# directories and would fail with FileNotFoundError on a fresh checkout.
os.makedirs("today_online", exist_ok=True)
with open("today_online/articles.json", "w", encoding='utf-8') as fout:
    json.dump(scraped_articles, fout, indent=4, ensure_ascii=False)