In [1]:
from datetime import datetime
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from fake_useragent import UserAgent
import re
import csv
import time


In [2]:
def english_months_converter(malay_month):
    """Replace Malay month names in a string with their English names.

    Only whole words are translated, so text that is already English
    (e.g. "June") or that merely contains a month as a substring
    (e.g. "Machine") is returned untouched. Matching is case-sensitive.

    Args:
        malay_month: Text that may contain a Malay month name, for example
            "Januari 13, 2023 @ 2:30am".

    Returns:
        The input text with every recognised Malay month name replaced by
        its English equivalent. Words not in the table are left as they are.
    """
    # April, September and November have no entry: the original code never
    # translated them either (presumably because the spelling is the same
    # in both languages).
    malay_to_english = {
        'Januari': 'January',
        'Februari': 'February',
        'Mac': 'March',
        'Mei': 'May',
        'Jun': 'June',
        'Julai': 'July',
        'Ogos': 'August',
        'Oktober': 'October',
        'Disember': 'December',
    }

    # \b anchors stop "Jun" from matching inside "June" and "Mac" from
    # matching inside longer words.
    month_regex = r'\b(' + '|'.join(malay_to_english) + r')\b'
    return re.sub(month_regex,
                  lambda match: malay_to_english[match.group(1)],
                  malay_month)

In [3]:
# Configure Chrome: run headless (no visible browser window) and with
# extensions disabled.
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-extensions")
# Chrome only honours the switch in the form --user-agent=<value>; a random
# user agent is generated once, when this cell runs.
chrome_options.add_argument(f"--user-agent={UserAgent().random}")

# webdriver_manager resolves (downloading if necessary) a chromedriver binary
# so it does not have to be installed manually.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),
                          options=chrome_options)


In [4]:
def driver_scroll(pause_seconds=5, max_scrolls=20):
    """Scroll the current page to the bottom until it stops growing.

    Uses the module-level ``driver``. After each scroll the function waits
    ``pause_seconds`` so lazily loaded content can be appended, then compares
    the document height with the height seen before the scroll. Scrolling
    stops as soon as the height no longer changes.

    Args:
        pause_seconds: Seconds to wait after each scroll for content to load.
        max_scrolls: Upper bound on the number of scrolls, so a page that
            keeps growing cannot loop forever.
    """
    last_height = driver.execute_script("return document.body.scrollHeight;")

    for _ in range(max_scrolls):
        # Jump to the current bottom of the document.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Give the page time to load any additional content.
        time.sleep(pause_seconds)

        # Comparing pageYOffset with scrollHeight cannot detect the bottom:
        # the offset is always smaller than the full document height. An
        # unchanged height after scrolling is the reliable end signal.
        new_height = driver.execute_script("return document.body.scrollHeight;")
        if new_height == last_height:
            break
        last_height = new_height


In [5]:
# Matches an English long-form date such as "January 13, 2023".
date_pattern = r"([a-zA-Z]+ \d{1,2}, \d{4})"
# Matches a 12-hour clock time such as "2:30am" (lower-case meridiem).
time_pattern = r"(?P<hour>\d{1,2}):(?P<minute>\d{2})(?P<meridiem>am|pm)"

def extract_properties(links, writer):
    """Write one CSV row (title, date, time, url) per article teaser.

    Teasers that lack a title, timestamp or link, or whose timestamp cannot
    be parsed, are skipped with a printed message rather than aborting the
    run or reusing values from the previous teaser.

    Args:
        links: Iterable of parsed teaser elements (BeautifulSoup tags) that
            support ``find``.
        writer: Object with a ``writerow`` method, e.g. ``csv.writer``.
    """
    for link in links:
        # Title lives in an <h3 class="field-title">.
        # Ex: Klinik Amal Percuma bantu komuniti memerlukan
        title_tag = link.find('h3', attrs='field-title')
        # Publication timestamp. Ex: January 13, 2023 @ 2:30am
        timestamp_tag = link.find('span', attrs='created-ago ml-2')
        # The first <a> in the teaser carries the article URL.
        anchor_tag = link.find('a')

        if title_tag is None or timestamp_tag is None or anchor_tag is None:
            print('Skipping teaser: missing title, timestamp or link')
            continue

        article_url = anchor_tag.get('href')
        if article_url is None:
            print('Skipping teaser: link has no href')
            continue

        article_title = title_tag.text

        date_converted = english_months_converter(timestamp_tag.text)
        date_match = re.search(date_pattern, date_converted)
        time_match = re.search(time_pattern, date_converted)

        # Without both parts there is nothing valid to write; continuing
        # would raise NameError or silently reuse the previous teaser's values.
        if not (date_match and time_match):
            print(f'Skipping teaser: unrecognised timestamp {date_converted!r}')
            continue

        try:
            # Ex: January 10, 2023 --> 2023-01-10
            dateobj = datetime.strptime(date_match.group(1), '%B %d, %Y').date()
            # Ex: 2:11pm --> 14:11:00
            timeobj = datetime.strptime(time_match.group(), '%I:%M%p').time()
        except ValueError:
            # The regex matched but the text is not a real date/time
            # (e.g. an untranslated or misspelled month name).
            print(f'Skipping teaser: unparsable timestamp {date_converted!r}')
            continue

        writer.writerow([article_title, dateobj, timeobj, article_url])

In [6]:
# Number of search-result pages to scrape (pages 1..pages_to_get).
pages_to_get = 20

try:
    # Explicit UTF-8 so titles with non-ASCII characters are written
    # correctly regardless of the platform's default encoding.
    with open('hmetro.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        headers = ["Title", "Date", "Time", "Link"]
        writer.writerow(headers)

        for page in range(1, pages_to_get + 1):
            print('Processing Page: ', page)
            url = 'https://www.hmetro.com.my/search?keywords=vaksin&page=' + str(page)
            driver.get(url)

            # Scroll to the bottom so content has a chance to load.
            driver_scroll()

            # Snapshot the rendered HTML and parse it with BeautifulSoup.
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')

            # Each search result is rendered as a div.article-teaser.
            links = soup.find_all('div', attrs={'class': 'article-teaser'})
            print(f'Links we obtained: {len(links)}')

            extract_properties(links=links, writer=writer)

            print('CSV file saved successfully for Page: ' + str(page))
finally:
    # Close the browser once all pages are done (or on error) so the headless
    # Chrome process is not leaked. Re-run the driver setup cell before
    # running this cell again.
    driver.quit()



Processing Page:  1
Links we obtained: 20
CSV file saved successfully for Page: 1
Processing Page:  2
Links we obtained: 20
CSV file saved successfully for Page: 2
Processing Page:  3
Links we obtained: 20
CSV file saved successfully for Page: 3
Processing Page:  4
Links we obtained: 20
CSV file saved successfully for Page: 4
Processing Page:  5
Links we obtained: 20
CSV file saved successfully for Page: 5
Processing Page:  6
Links we obtained: 20
CSV file saved successfully for Page: 6
Processing Page:  7
Links we obtained: 20
CSV file saved successfully for Page: 7
Processing Page:  8
Links we obtained: 20
CSV file saved successfully for Page: 8
Processing Page:  9
Links we obtained: 20
CSV file saved successfully for Page: 9
Processing Page:  10
Links we obtained: 20
CSV file saved successfully for Page: 10
Processing Page:  11
Links we obtained: 20
CSV file saved successfully for Page: 11
Processing Page:  12
Links we obtained: 20
CSV file saved successfully for Page: 12
Processing