In [85]:
%pip install beautifulsoup4 requests openpyxl pandas google-api-python-client

Note: you may need to restart the kernel to use updated packages.


In [86]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import random


headers = {
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0',
  'Accept-Language': 'en-US,en;q=0.9',
  'Accept-Encoding': 'gzip, deflate, br',
  'Connection': 'keep-alive'
}

### Rappler Extraction

In [87]:
def extract_article_data(link):
  """
  Extracts data from an article based on the provided link.

  Parameters:
    link (str): The URL of the article.

  Returns:
    list: A list containing the extracted article details in the following order:
      - title (str): The title of the article.
      - date (str): The date the article was published.
      - link (str): The URL of the article.
      - text (str): The content of the article.
  """
  r = requests.get(link, headers=headers)

  soup = BeautifulSoup(r.content, 'html.parser')

  title = soup.title.text.strip()

  date = soup.find("time", {"datetime": True})['datetime']

  text = ''
  tagged_lines = soup.find("div", {"class": "post-single__content entry-content"}).find_all('p')
  for line in tagged_lines:
    untagged_line = line.get_text()
    text += untagged_line + '\n'

  doc_details = [title, link, date, text]
  return doc_details

In [88]:
mother_url = "https://www.rappler.com/wp-json/rappler/v1/ontology-topics/2824460/latest-news?page="
page = 1
page_limit = 5
corpus = pd.DataFrame(columns=['title', 'link', 'date_published', 'text'])

while True:
  page_str = str(page)
  page_url = mother_url + page_str
  print('\nWorking on ' + page_url)

  time.sleep(random.randint(1, 5))

  page_r = requests.get(page_url, headers=headers)

  if page_r.status_code != 200:
    print('Failed to retrieve page')
    break

  try:
    page_data = page_r.json()
  except ValueError:
    print('Failed to parse JSON')
    break

  for article in page_data:
    if 'permalink' not in article:
      continue
    article_link = article['permalink']

    article_link = article_link.replace('\\/', '/')

    try:
      tmp = extract_article_data(article_link)
      print(tmp)
      corpus.loc[len(corpus)] = tmp
    except:
      continue

  if page >= page_limit:
    break

  page += 1


Working on https://www.rappler.com/wp-json/rappler/v1/ontology-topics/2824460/latest-news?page=1
['Lacson: Only 40% of flood control funds go to implementation | The wRap', 'https://www.rappler.com/video/daily-wrap/august-20-2025/', '2025-08-20T22:19:00+08:00', 'Here are today’s headlines – the latest news in the Philippines and around the world:\nSenator Ping Lacson reveals only 40% of the country’s flood control funds go to actual implementation, and that most of the funds are lost to corruption.\nPresident Ferdinand Marcos Jr. says that his government is eyeing ‘economic sabotage’ cases against contractors of big ghost flood control projects. This, after he verified a citizen’s complaint about a non-existent — but fully paid for — P55-million reinforced concrete riverwall project.\nFamily law expert Katrina Legarda reveals some overseas Filipino workers go to Guam for divorce – a separation that could be recognized in the Philippines later on.\nIsrael Osamudiame Friday, the new Nig

In [89]:
file_name = 'rappler_sb19.xlsx'
corpus.to_excel(file_name)
print(f'File saved to {file_name}')
corpus

File saved to rappler_sb19.xlsx


Unnamed: 0,title,link,date_published,text
0,Lacson: Only 40% of flood control funds go to ...,https://www.rappler.com/video/daily-wrap/augus...,2025-08-20T22:19:00+08:00,Here are today’s headlines – the latest news i...
1,Three-peat: SB19’s A’TIN wins Billboard Fan Ar...,https://www.rappler.com/entertainment/music/sb...,2025-08-20T18:08:00+08:00,"MANILA, Philippines – SB19’s A’TIN was named t..."
2,"LISTEN: Sarah Geronimo, SB19 team up for new t...",https://www.rappler.com/entertainment/sb19-sar...,2025-07-30T17:10:30+08:00,"MANILA, Philippines — A historic musical colla..."
3,Gamemaker SNK teams up with SB19 for 'Fatal Fu...,https://www.rappler.com/technology/gaming/snk-...,2025-05-26T20:00:00+08:00,"MANILA, Philippines – Video game maker SNK Cor..."
4,Immerse in SB19’s ‘Simula at Wakas’ Experience,https://www.rappler.com/entertainment/music/si...,2025-05-24T16:47:28+08:00,"MANILA, Philippines – P-pop boy group SB19 was..."
5,[WATCH] Rappler Live Jam: SB19,https://www.rappler.com/entertainment/live-jam...,2025-05-08T20:01:22+08:00,"MANILA, Philippines – A’TIN, rejoice! SB19 fin..."
6,"Riskier than ever, SB19 releases new EP ‘Simul...",https://www.rappler.com/entertainment/music/sb...,2025-04-25T20:55:04+08:00,"MANILA, Philippines – A two-night concert in t..."
7,SB19’s ‘Simula at Wakas’: Ticket packages for ...,https://www.rappler.com/entertainment/music/sb...,2025-03-13T20:30:25+08:00,"MANILA, Philippines – SB19 has announced that ..."
8,FULL INTERVIEW: SB19 hyped up on new song 'DAM...,https://www.rappler.com/entertainment/video-in...,2025-03-01T20:29:13+08:00,"MANILA, Philippines – As P-pop group SB19 rele..."
9,"Alleged Chinese spies gave cash, vehicles to c...",https://www.rappler.com/video/daily-wrap/febru...,2025-02-28T22:30:00+08:00,Here are today’s headlines – the latest news i...


file_name = 'rappler_corpus.xlsx'
corpus.to_excel(file_name)
print(f'File saved to {file_name}')
corpus

In [90]:
import pandas as pd
from googleapiclient.discovery import build

In [91]:
api_key = 'AIzaSyArJa4h3CPtJILqexIrRWL22W7K1ZzxABs'
youtube = build('youtube', 'v3', developerKey=api_key)

In [92]:
playlists = [
    "PLajWQmEHD-f_0fa-JHuPZa2t74iw7yXgN",
    "PLxPlPn89QnKJNiVsbfdJey2UngMbiHPK5",
    "PLczlSeoNL3Gzy4nhPVuom1HWj9-l409zK",
    "PLPrvacGmi0E_IFSKV7e8p75yAxxg7W0Dp",
    "PLPChZieZzxBgOrlCBW9DX1fCshxsYanf3"
]
comments = []

for playlist_id in playlists:
    playlist = youtube.playlistItems().list(
        part="contentDetails",
        playlistId=playlist_id,
        maxResults=5
    ).execute()['items']

    for item in playlist:
        video_id = item['contentDetails']['videoId']
        video_response = youtube.commentThreads().list(
            videoId=video_id, part='snippet,replies', maxResults=25,
            order='time'
        ).execute()['items']

        for video in video_response:
            comment = video['snippet']['topLevelComment']['snippet']
            comments.append([
                comment['textDisplay'],
                f'https://www.youtube.com/watch?v={video_id}&lc={video["snippet"]["topLevelComment"]["id"]}',
                comment['publishedAt'],
                comment['textOriginal'],
                comment['likeCount'],
                np.nan
            ])

            # count number of replies
            total_reply_count = video['snippet']['totalReplyCount']

            # if there is at least one reply
            if total_reply_count > 0:
                parent_id = video["snippet"]["topLevelComment"]["id"]

                replies = youtube.comments().list(
                    part='snippet',
                    parentId=parent_id
                ).execute()

                # iterate through the replies
                for reply in replies['items'][:5]:
                    # extract text from each reply
                    # append reply to list of comments
                    replyBody = reply['snippet']
                    comments.append([
                        replyBody['textDisplay'],
                        f"https://www.youtube.com/watch?v={video_id}&lc={reply['id']}",
                        replyBody['publishedAt'],
                        replyBody['textOriginal'],
                        replyBody['likeCount'],
                        replyBody['parentId']
                    ])
comments

[['KEN WILL ALWAYS BE THE MONSTERS VOICE MAN',
  'https://www.youtube.com/watch?v=B9vkM8SeScY&lc=Ugx-q0ocpfOVRQAMIkF4AaABAg',
  '2025-08-22T11:39:31Z',
  'KEN WILL ALWAYS BE THE MONSTERS VOICE MAN',
  0,
  nan],
 ['they blend so well, walang sapawan',
  'https://www.youtube.com/watch?v=B9vkM8SeScY&lc=UgxtLCpin-ct8a1yyV94AaABAg',
  '2025-08-22T00:51:31Z',
  'they blend so well, walang sapawan',
  1,
  nan],
 ['Kenji really looks and sounds like a cool anime character.',
  'https://www.youtube.com/watch?v=B9vkM8SeScY&lc=Ugx4zDvnbn8tPCKN9-F4AaABAg',
  '2025-08-20T10:10:02Z',
  'Kenji really looks and sounds like a cool anime character.',
  3,
  nan],
 ['August 20, 2025👋',
  'https://www.youtube.com/watch?v=B9vkM8SeScY&lc=UgxCTfPYJta71SFKhxt4AaABAg',
  '2025-08-20T09:22:22Z',
  'August 20, 2025👋',
  1,
  nan],
 ['フィリピンも発展途上国！いつかいつか、k-popのような全世界で愛されるカッコいい/可愛いグループがいつか出来るだろう。と１０年間思ってた。まだまだこれからに期待ですよ☆彡',
  'https://www.youtube.com/watch?v=B9vkM8SeScY&lc=UgwsohEKqrDbYFxm41F4AaABAg',
  '2025-08-

In [93]:
youtube_corpus = pd.DataFrame(
  comments, columns=['title', 'link', 'date_published', 'text', 'like_count', 'reply_parent_id'])
youtube_corpus
file_name = 'youtube_sb19.xlsx'
youtube_corpus.to_excel(file_name)
print(f'File saved to {file_name}')
youtube_corpus

File saved to youtube_sb19.xlsx


Unnamed: 0,title,link,date_published,text,like_count,reply_parent_id
0,KEN WILL ALWAYS BE THE MONSTERS VOICE MAN,https://www.youtube.com/watch?v=B9vkM8SeScY&lc...,2025-08-22T11:39:31Z,KEN WILL ALWAYS BE THE MONSTERS VOICE MAN,0,
1,"they blend so well, walang sapawan",https://www.youtube.com/watch?v=B9vkM8SeScY&lc...,2025-08-22T00:51:31Z,"they blend so well, walang sapawan",1,
2,Kenji really looks and sounds like a cool anim...,https://www.youtube.com/watch?v=B9vkM8SeScY&lc...,2025-08-20T10:10:02Z,Kenji really looks and sounds like a cool anim...,3,
3,"August 20, 2025👋",https://www.youtube.com/watch?v=B9vkM8SeScY&lc...,2025-08-20T09:22:22Z,"August 20, 2025👋",1,
4,フィリピンも発展途上国！いつかいつか、k-popのような全世界で愛されるカッコいい/可愛いグ...,https://www.youtube.com/watch?v=B9vkM8SeScY&lc...,2025-08-19T19:59:21Z,フィリピンも発展途上国！いつかいつか、k-popのような全世界で愛されるカッコいい/可愛いグ...,1,
...,...,...,...,...,...,...
679,Yes as much as we are glad they have their sol...,https://www.youtube.com/watch?v=4rZBt3NTtDA&lc...,2025-07-01T04:10:48Z,Yes as much as we are glad they have their sol...,8,
680,Amen to that . They are beyond perfection as a...,https://www.youtube.com/watch?v=4rZBt3NTtDA&lc...,2025-08-19T11:31:34Z,Amen to that . They are beyond perfection as a...,0,Ugzg5g1Si5c1I_FPt1B4AaABAg
681,"tbh, i didn&#39;t care that much about ilaw. i...",https://www.youtube.com/watch?v=4rZBt3NTtDA&lc...,2025-06-30T23:39:32Z,"tbh, i didn't care that much about ilaw. i mea...",18,
682,"dti cl ngrereact s mga vids nila,ng mpnood q u...",https://www.youtube.com/watch?v=4rZBt3NTtDA&lc...,2025-06-30T14:20:12Z,"dti cl ngrereact s mga vids nila,ng mpnood q u...",1,
