In [28]:
%pip install beautifulsoup4 requests openpyxl

Note: you may need to restart the kernel to use updated packages.


In [None]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import random


headers = {
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0',
  'Accept-Language': 'en-US,en;q=0.9',
  'Accept-Encoding': 'gzip, deflate, br',
  'Connection': 'keep-alive'
}

### Rappler Extraction

In [None]:
def extract_article_data(link):
  """
  Extracts data from an article based on the provided link.

  Parameters:
    link (str): The URL of the article.

  Returns:
    list: A list containing the extracted article details in the following order:
      - title (str): The title of the article.
      - date (str): The date the article was published.
      - link (str): The URL of the article.
      - text (str): The content of the article.
  """
  r = requests.get(link, headers=headers)

  soup = BeautifulSoup(r.content, 'html.parser')

  title = soup.title.text.strip()

  date = soup.find("time", {"datetime": True})['datetime']

  text = ''
  tagged_lines = soup.find("div", {"class": "post-single__content entry-content"}).find_all('p')
  for line in tagged_lines:
    untagged_line = line.get_text()
    text += untagged_line + '\n'

  doc_details = [title, link, date, text]
  return doc_details

In [None]:
mother_url = "https://www.rappler.com/wp-json/rappler/v1/ontology-topics/2824460/latest-news?page="
page = 1
page_limit = 5
corpus = pd.DataFrame(columns=['title', 'link', 'date_published', 'text'])

while True:
  page_str = str(page)
  page_url = mother_url + page_str
  print('\nWorking on ' + page_url)

  time.sleep(random.randint(1, 5))

  page_r = requests.get(page_url, headers=headers)

  if page_r.status_code != 200:
    print('Failed to retrieve page')
    break

  try:
    page_data = page_r.json()
  except ValueError:
    print('Failed to parse JSON')
    break

  for article in page_data:
    if 'permalink' not in article:
      continue
    article_link = article['permalink']

    article_link = article_link.replace('\\/', '/')

    try:
      tmp = extract_article_data(article_link)
      print(tmp)
      corpus.loc[len(corpus)] = tmp
    except:
      continue

  if page >= page_limit:
    break

  page += 1


Working on https://www.rappler.com/wp-json/rappler/v1/ontology-topics/2824460/latest-news?page=1
['Lacson: Only 40% of flood control funds go to implementation | The wRap', 'https://www.rappler.com/video/daily-wrap/august-20-2025/', '2025-08-20T22:19:00+08:00', 'Here are today’s headlines – the latest news in the Philippines and around the world:\nSenator Ping Lacson reveals only 40% of the country’s flood control funds go to actual implementation, and that most of the funds are lost to corruption.\nPresident Ferdinand Marcos Jr. says that his government is eyeing ‘economic sabotage’ cases against contractors of big ghost flood control projects. This, after he verified a citizen’s complaint about a non-existent — but fully paid for — P55-million reinforced concrete riverwall project.\nFamily law expert Katrina Legarda reveals some overseas Filipino workers go to Guam for divorce – a separation that could be recognized in the Philippines later on.\nIsrael Osamudiame Friday, the new Nig

In [35]:
file_name = 'rappler_sb19.xlsx'
corpus.to_excel(file_name)
print(f'File saved to {file_name}')
corpus

File saved to rappler_sb19.xlsx


Unnamed: 0,title,link,date_published,text
0,Lacson: Only 40% of flood control funds go to ...,https://www.rappler.com/video/daily-wrap/augus...,2025-08-20T22:19:00+08:00,Here are today’s headlines – the latest news i...
1,Three-peat: SB19’s A’TIN wins Billboard Fan Ar...,https://www.rappler.com/entertainment/music/sb...,2025-08-20T18:08:00+08:00,"MANILA, Philippines – SB19’s A’TIN was named t..."
2,"LISTEN: Sarah Geronimo, SB19 team up for new t...",https://www.rappler.com/entertainment/sb19-sar...,2025-07-30T17:10:30+08:00,"MANILA, Philippines — A historic musical colla..."
3,Gamemaker SNK teams up with SB19 for 'Fatal Fu...,https://www.rappler.com/technology/gaming/snk-...,2025-05-26T20:00:00+08:00,"MANILA, Philippines – Video game maker SNK Cor..."
4,Immerse in SB19’s ‘Simula at Wakas’ Experience,https://www.rappler.com/entertainment/music/si...,2025-05-24T16:47:28+08:00,"MANILA, Philippines – P-pop boy group SB19 was..."
5,[WATCH] Rappler Live Jam: SB19,https://www.rappler.com/entertainment/live-jam...,2025-05-08T20:01:22+08:00,"MANILA, Philippines – A’TIN, rejoice! SB19 fin..."
6,"Riskier than ever, SB19 releases new EP ‘Simul...",https://www.rappler.com/entertainment/music/sb...,2025-04-25T20:55:04+08:00,"MANILA, Philippines – A two-night concert in t..."
7,SB19’s ‘Simula at Wakas’: Ticket packages for ...,https://www.rappler.com/entertainment/music/sb...,2025-03-13T20:30:25+08:00,"MANILA, Philippines – SB19 has announced that ..."
8,FULL INTERVIEW: SB19 hyped up on new song 'DAM...,https://www.rappler.com/entertainment/video-in...,2025-03-01T20:29:13+08:00,"MANILA, Philippines – As P-pop group SB19 rele..."
9,"Alleged Chinese spies gave cash, vehicles to c...",https://www.rappler.com/video/daily-wrap/febru...,2025-02-28T22:30:00+08:00,Here are today’s headlines – the latest news i...


file_name = 'rappler_corpus.xlsx'
corpus.to_excel(file_name)
print(f'File saved to {file_name}')
corpus