In [None]:
import os
import sys
import time
import pickle
from time import sleep
from selenium import webdriver
from bs4 import BeautifulSoup

# References used for Web Scraping
* https://www.javatpoint.com/selenium-python
* https://shanyitan.medium.com/how-to-install-selenium-and-run-it-successfully-via-jupyter-lab-c3f50d22a0d4

# Generate a Dictionary of SONA Speeches

In [None]:
from selenium.webdriver.chrome.options import Options
from selenium import webdriver

# Headless Chrome configuration. `chrome_options` is a notebook-level name
# that the speech-download cell further down reads as well.
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--enable-javascript")
chrome_options.add_argument("--window-size=1920,1200")

# Index page whose first table lists every past SONA speech.
SONA_INDEX_URL = "https://www.officialgazette.gov.ph/past-sona-speeches/"

driver = webdriver.Chrome(options=chrome_options)
try:
    driver.get(SONA_INDEX_URL)
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so the resulting tree could
    # differ from one machine to the next.
    soup = BeautifulSoup(driver.page_source, "html.parser")
finally:
    # The rendered HTML has been copied into `soup`, so the browser is no
    # longer needed. quit() ends the whole browser session; without it every
    # run of this cell leaves a headless Chrome behind.
    driver.quit()
soup

In [None]:
# Build {president name: {'Date', 'Link', 'NoS', 'Loc', 'Sesh'}} from the first
# table on the scraped page. Every field holds a list with one entry per speech,
# in table order.
#
# `presidents` must stay a dict for the whole cell: rebinding it to the result
# of `data_table.select(...)` (a list) makes `presidents[name] = {}` fail with
# "list indices must be integers".
FIELD_NAMES = ('Date', 'Link', 'NoS', 'Loc', 'Sesh')

data_table = soup.select('table')[0]
presidents = {}
president = None  # name taken from the most recent 5-column row

# The first <tr> is the header row, so it is skipped.
for row in data_table.select('tr')[1:]:
    columns = row.select('td')
    if len(columns) == 5:
        # First row of a president's group: the leading cell carries the name,
        # the remaining four cells describe the speech.
        president = columns[0].text
        presidents[president] = {field: [] for field in FIELD_NAMES}
        speech_cells = columns[1:]
    elif len(columns) == 4 and president is not None:
        # Continuation row: the name cell is absent (it spans down from the
        # group's first row), so all four cells describe the speech.
        speech_cells = columns
    else:
        # Rows of any other width are ignored, as is a continuation row that
        # shows up before any president has been seen.
        continue

    date_cell, title_cell, loc_cell, sesh_cell = speech_cells
    record = presidents[president]
    record['Date'].append(date_cell.text)
    record['Link'].append(title_cell.a['href'])
    record['NoS'].append(title_cell.text)
    record['Loc'].append(loc_cell.text)
    record['Sesh'].append(sesh_cell.text)

# Since PNoy's speeches were in Tagalog, the scraped links are replaced
# manually with the links to their English translations.
english_trans = [
    'https://www.officialgazette.gov.ph/2010/07/26/'
    'state-of-the-nation-address-2010-en/',
    'https://www.officialgazette.gov.ph/2011/07/25/'
    'benigno-s-aquino-iii-second-state-of-the-nation-address-july-25-2011-en/',
    'https://www.officialgazette.gov.ph/2012/07/23/'
    'english-translation-benigno-s-aquino-iii-third-state-of-the-nation-address-july-23-2012/',
    'https://www.officialgazette.gov.ph/2013/07/22/'
    'english-benigno-s-aquino-iii-fourth-state-of-the-nation-address-july-22-2013/',
    'https://www.officialgazette.gov.ph/2014/07/28/'
    'english-benigno-s-aquino-iii-fifth-state-of-the-nation-address-july-28-2014/',
    'https://www.officialgazette.gov.ph/2015/07/27/'
    'english-president-aquino-sixth-sona/',
]
presidents['Benigno S. Aquino III']['Link'] = english_trans

# Save the dictionary to a pickle file so later cells can start from it.
with open("SpeechDictionary.pkl", "wb") as file:
    pickle.dump(presidents, file)

# Getting Speeches

In [None]:
# Resume point: when SpeechDictionary.pkl already exists, the table-scraping
# step can be skipped and the run restarted from this cell.
# NOTE: pickle.load can execute arbitrary code stored in the file, so only
# load a pickle that this notebook wrote itself.
with open("SpeechDictionary.pkl", "rb") as pickled_dictionary:
    presidents = pickle.load(pickled_dictionary)

# Show the reloaded dictionary as the cell output.
presidents

In [None]:
from selenium.common.exceptions import WebDriverException
from tqdm import tqdm

# Retry policy for pages whose speech container is not present yet.
MAX_RELOADS = 10
RELOAD_DELAY_SECONDS = 1


def fetch_speech_text(url, options, max_reloads=MAX_RELOADS):
    """Return the text of the ``entry-content`` div found at ``url``.

    A fresh Chrome session is opened for the page. If the div is missing from
    the rendered page (or the page source cannot be read), the page is
    refreshed after a short pause and parsed again, up to ``max_reloads``
    times.

    Parameters
    ----------
    url : str
        Address of the speech page.
    options : selenium Options
        Browser options handed to ``webdriver.Chrome``.
    max_reloads : int
        How many refreshes to attempt before giving up.

    Raises
    ------
    RuntimeError
        When the div is still missing after ``max_reloads`` refreshes.
    """
    browser = webdriver.Chrome(options=options)
    try:
        browser.get(url)
        for reloads_done in range(max_reloads + 1):
            try:
                page = BeautifulSoup(browser.page_source, 'html.parser')
                # [0] raises IndexError when the selector matches nothing.
                return page.select('div[class = "entry-content"]')[0].text
            except (IndexError, WebDriverException):
                # Only these two failures are retried. A bare `except` would
                # also swallow KeyboardInterrupt and make the loop impossible
                # to stop from the notebook.
                if reloads_done == max_reloads:
                    break
                time.sleep(RELOAD_DELAY_SECONDS)
                browser.refresh()
                print('Reloading Web Page...')
        # Bounded retries: a page that never yields the div would otherwise
        # keep this cell spinning forever.
        raise RuntimeError(
            f'No entry-content found at {url} after {max_reloads} reloads')
    finally:
        # quit() shuts down the whole browser session, and runs on the error
        # path too; close() only closes the current window.
        browser.quit()


# `chrome_options` comes from the browser-setup cell near the top of the
# notebook; run that cell first, even when resuming from the saved pickle.
for pres in presidents:
    # Build the list from scratch so that re-running this cell replaces the
    # speeches instead of appending duplicates; entries line up with 'Link'.
    presidents[pres]['Speech'] = [
        fetch_speech_text(link, chrome_options)
        for link in tqdm(presidents[pres]['Link'],
                         desc=f'Getting Data From {pres}')
    ]

    # File-name suffix: the last word of the name, or the last two words
    # joined by '_' when the name ends in 'III'.
    name_parts = pres.split()
    presName = (name_parts[-1]
                if name_parts[-1] != 'III'
                else '_'.join(name_parts[-2:]))

    # Checkpoint after each president: the whole dictionary as it stands now.
    with open(f"SpeechDictionary_{presName}.pkl", "wb") as file:
        pickle.dump(presidents, file)

with open("SpeechDictionary_complete.pkl", "wb") as file:
    pickle.dump(presidents, file)