In [10]:
from datetime import datetime
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from fake_useragent import UserAgent
import re
import csv
import time
import pandas as pd
import requests


In [11]:
# Malay month names whose spelling differs from English. April, September and
# November are spelled the same in both languages and therefore need no entry.
_MALAY_TO_ENGLISH_MONTHS = {
    'Januari': 'January',
    'Februari': 'February',
    'Mac': 'March',
    'Mei': 'May',
    'Jun': 'June',
    'Julai': 'July',
    'Ogos': 'August',
    'Oktober': 'October',
    'Disember': 'December',
}

# Whole-word match only, so text that is already English ("June") or that merely
# contains a month name as a substring is left untouched.
_MALAY_MONTH_RE = re.compile(r"\b(?:" + "|".join(_MALAY_TO_ENGLISH_MONTHS) + r")\b")


def english_months_converter(malay_month):
    """Replace Malay month names in a string with their English equivalents.

    Matching is case-sensitive and restricted to whole words. Plain substring
    replacement would turn an already-English "June" into "Junee" because it
    contains "Jun".

    Args:
        malay_month: Text containing a date, e.g. ``"Mac 4, 2023 @ 2:48pm"``.

    Returns:
        The same text with every whole-word Malay month name translated.
        Text without a Malay month name is returned unchanged.
    """
    return _MALAY_MONTH_RE.sub(
        lambda match: _MALAY_TO_ENGLISH_MONTHS[match.group(0)],
        malay_month,
    )

In [12]:
# Configure Chrome to run headless (no visible window) with extensions disabled.
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-extensions")
# Chrome switches take the form `--name=value`. The earlier `user-agent":<value>`
# spelling (stray quote and colon) is not a switch Chrome recognises, so the
# randomised user agent was never applied.
chrome_options.add_argument(f"--user-agent={UserAgent().random}")

# webdriver_manager resolves a chromedriver binary, so none has to be
# downloaded by hand.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),
                          options=chrome_options)


In [13]:
def driver_scroll(browser=None, pause=2, max_scrolls=50):
    """Scroll to the bottom of the current page until it stops growing.

    The page is scrolled to the bottom, given ``pause`` seconds to load more
    content, and scrolled again for as long as ``document.body.scrollHeight``
    keeps increasing. The earlier check (``scrollHeight > pageYOffset``) was
    always true, so it stopped after a single scroll regardless of whether
    more content had been loaded.

    Args:
        browser: Object exposing ``execute_script``; defaults to the
            notebook-level ``driver``.
        pause: Seconds to wait after each scroll.
        max_scrolls: Upper bound on scroll attempts, so a page that keeps
            growing cannot loop forever.
    """
    # Look up the global driver only when no browser was supplied.
    browser = driver if browser is None else browser

    last_height = browser.execute_script("return document.body.scrollHeight;")
    for _ in range(max_scrolls):
        # Scroll to the bottom of the page
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait for the page to load
        time.sleep(pause)

        # If the page did not grow, the bottom has been reached.
        new_height = browser.execute_script("return document.body.scrollHeight;")
        if new_height <= last_height:
            break
        last_height = new_height


In [14]:
# Matches a long-form date such as "January 13, 2023": a run of ASCII letters,
# a 1-2 digit day, a comma and a 4-digit year. Group 1 is the whole date.
date_pattern = r"([a-zA-Z]+ \d{1,2}, \d{4})"
# Matches a 12-hour clock time such as "2:30am" (lowercase am/pm, no space),
# exposing the named groups `hour`, `minute` and `meridiem`.
time_pattern = r"(?P<hour>\d{1,2}):(?P<minute>\d{2})(?P<meridiem>am|pm)"

def extract_properties(links, writer):
    """Write one CSV row per article teaser found on a search-result page.

    Each row is ``[title, date, time, category, absolute article URL]``.

    Args:
        links: Iterable of parsed teaser elements (the ``article-teaser``
            ``div`` elements) supporting ``find(name, attrs=...)``.
        writer: Object with a ``writerow`` method, e.g. ``csv.writer``.

    Teasers that lack one of the expected tags are skipped with a printed
    notice. If the timestamp text does not contain both a date and a time in
    the expected format, the row is still written but with an empty date and
    time. Previously that case either raised ``NameError`` (first teaser) or
    silently reused the date and time of the preceding teaser.
    """
    for link in links:
        # Ex: Klinik Amal Percuma bantu komuniti memerlukan
        title_tag = link.find('h3', attrs='field-title')
        # Ex: January 13, 2023 @ 2:30am
        created_tag = link.find('span', attrs='created-ago ml-2')
        # Written to the "Author Name" column by the caller's header row.
        category_tag = link.find('span', attrs='field-category')
        anchor_tag = link.find('a')

        required_tags = (title_tag, created_tag, category_tag, anchor_tag)
        if any(tag is None for tag in required_tags):
            print('Skipping a teaser with missing fields')
            continue

        article_title = title_tag.text
        time_date_properties = created_tag.text
        date_converted = english_months_converter(time_date_properties)
        date_match = re.search(date_pattern, date_converted)
        time_match = re.search(time_pattern, date_converted)

        if date_match and time_match:
            # Ex: January 10, 2023 --> 2023-01-10
            dateobj = datetime.strptime(date_match.group(1), '%B %d, %Y').date()
            # Ex: 2:11pm --> 14:11:00
            timeobj = datetime.strptime(time_match.group(), '%I:%M%p').time()
        else:
            # Never carry values over from the previous teaser.
            print(f'No date/time found in {time_date_properties!r} '
                  f'for {article_title!r}; leaving both empty')
            dateobj = None
            timeobj = None

        author_name = category_tag.text

        # The href is site-relative, so prefix the host to get a full URL.
        article_link = anchor_tag['href']
        full_article_link = "https://www.hmetro.com.my" + article_link

        writer.writerow([article_title, dateobj, timeobj, author_name, full_article_link])

In [15]:
# Number of search-result pages to scrape (pages 0 .. pages_to_get - 1).
pages_to_get = 21

try:
    # Write explicitly as UTF-8 so the file does not depend on the platform's
    # default encoding and matches what pandas assumes when reading it back.
    with open('hmetro.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        headers = ["Title", "Date", "Time", "Author Name", "Link"]
        writer.writerow(headers)

        for page in range(pages_to_get):
            print('Processing Page: ', page)
            url = 'https://www.hmetro.com.my/search?keywords=vaksin&page=' + str(page)
            driver.get(url)

            # Scroll to the bottom of the page and wait for it to load
            driver_scroll()

            # Snapshot the rendered HTML and parse it with BeautifulSoup
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')

            links = soup.find_all('div', attrs={'class': 'article-teaser'})
            print(f'Links we obtained: {len(links)}')

            extract_properties(links=links, writer=writer)

            print('CSV file saved successfully for Page: ' + str(page))
finally:
    # Release the Chrome process once scraping ends (or fails). Re-run the
    # driver setup cell before running this cell again.
    driver.quit()



Processing Page:  0
Links we obtained: 20
CSV file saved successfully for Page: 0
Processing Page:  1
Links we obtained: 20
CSV file saved successfully for Page: 1
Processing Page:  2
Links we obtained: 20
CSV file saved successfully for Page: 2
Processing Page:  3
Links we obtained: 20
CSV file saved successfully for Page: 3
Processing Page:  4
Links we obtained: 20
CSV file saved successfully for Page: 4
Processing Page:  5
Links we obtained: 20
CSV file saved successfully for Page: 5
Processing Page:  6
Links we obtained: 20
CSV file saved successfully for Page: 6
Processing Page:  7
Links we obtained: 20
CSV file saved successfully for Page: 7
Processing Page:  8
Links we obtained: 20
CSV file saved successfully for Page: 8
Processing Page:  9
Links we obtained: 20
CSV file saved successfully for Page: 9
Processing Page:  10
Links we obtained: 20
CSV file saved successfully for Page: 10
Processing Page:  11
Links we obtained: 20
CSV file saved successfully for Page: 11
Processing P

In [16]:
# Load the scraped search results and keep a single copy of any repeated row.
site_data = pd.read_csv('hmetro.csv').drop_duplicates()
site_data

Unnamed: 0,Title,Date,Time,Author Name,Link
0,Lima mitos demam denggi,2023-03-04,14:48:00,Sihat,https://www.hmetro.com.my/sihat/2023/03/942264...
1,Sokongan buat Djokovic sertai Terbuka AS,2023-03-03,22:07:00,Sukan Lain,https://www.hmetro.com.my/arena/lain/2023/03/9...
2,Jabatan Kesihatan Melaka beri vaksin tambahan ...,2023-03-01,19:53:00,Mutakhir,https://www.hmetro.com.my/mutakhir/2023/03/941...
3,Sepanyol kesan kes pertama disyaki virus Marburg,2023-02-25,02:21:00,Eropah,https://www.hmetro.com.my/global/eropah/2023/0...
4,Waspada meningokokus,2023-02-24,15:00:00,WM,https://www.hmetro.com.my/WM/2023/02/939645/wa...
...,...,...,...,...,...
404,KKM tidak cadang lanjut vaksinasi PICKids,2022-05-20,22:11:00,Mutakhir,https://www.hmetro.com.my/mutakhir/2022/05/844...
405,Bakal jemaah haji remaja disaran ambil dos pen...,2022-05-20,21:57:00,Mutakhir,https://www.hmetro.com.my/mutakhir/2022/05/844...
406,Sebarang rawatan baru Covid-19 dikaji terlebih...,2022-05-20,21:24:00,Mutakhir,https://www.hmetro.com.my/mutakhir/2022/05/844...
407,"Lebih 200,000 penduduk Korea Utara demam",2022-05-20,20:11:00,Asia,https://www.hmetro.com.my/global/asia/2022/05/...


In [17]:
# Configure Chrome to run headless (no visible window) with extensions disabled.
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-extensions")
# Chrome switches take the form `--name=value`. The earlier `user-agent":<value>`
# spelling (stray quote and colon) is not a switch Chrome recognises, so the
# randomised user agent was never applied.
chrome_options.add_argument(f"--user-agent={UserAgent().random}")

# webdriver_manager resolves a chromedriver binary, so none has to be
# downloaded by hand.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),
                          options=chrome_options)

In [18]:
# Visit every article link and merge the text of its <p> tags into one string.
article_text = []
try:
    for link_index, url in enumerate(site_data['Link']):
        print('Processing link: ', link_index)
        driver.get(url)

        driver_scroll()
        page_source = driver.page_source

        soup_article = BeautifulSoup(page_source, 'html.parser')
        article_paragraphs = soup_article.find_all('p')

        # NOTE(review): paragraphs are joined with no separator, as before, so
        # the last word of one paragraph touches the first word of the next.
        # Consider joining with ' ' if the text is tokenised later.
        combined_article = ''.join(p.get_text() for p in article_paragraphs)
        article_text.append(combined_article)
        print('Obtained article text for link: ', link_index)
finally:
    # Release the Chrome process whether or not every article was fetched.
    driver.quit()

Processing link:  0
Obtained article text for link:  0
Processing link:  1
Obtained article text for link:  1
Processing link:  2
Obtained article text for link:  2
Processing link:  3
Obtained article text for link:  3
Processing link:  4
Obtained article text for link:  4
Processing link:  5
Obtained article text for link:  5
Processing link:  6
Obtained article text for link:  6
Processing link:  7
Obtained article text for link:  7
Processing link:  8
Obtained article text for link:  8
Processing link:  9
Obtained article text for link:  9
Processing link:  10
Obtained article text for link:  10
Processing link:  11
Obtained article text for link:  11
Processing link:  12
Obtained article text for link:  12
Processing link:  13
Obtained article text for link:  13
Processing link:  14
Obtained article text for link:  14
Processing link:  15
Obtained article text for link:  15
Processing link:  16
Obtained article text for link:  16
Processing link:  17
Obtained article text for link

In [25]:
# Attach the scraped article bodies as a new column, then persist the result
# without the DataFrame index.
site_data = site_data.assign(Content=article_text)
site_data.to_csv('hemetro_with_text.csv', index=False, encoding='utf-8')
site_data

Unnamed: 0,Title,Date,Time,Author Name,Link,Content
0,Lima mitos demam denggi,2023-03-04,14:48:00,Sihat,https://www.hmetro.com.my/sihat/2023/03/942264...,DEMAM denggi menjadi penyakit endemik di Malay...
1,Sokongan buat Djokovic sertai Terbuka AS,2023-03-03,22:07:00,Sukan Lain,https://www.hmetro.com.my/arena/lain/2023/03/9...,Los Angeles: Usaha Novak Djokovic untuk masuk ...
2,Jabatan Kesihatan Melaka beri vaksin tambahan ...,2023-03-01,19:53:00,Mutakhir,https://www.hmetro.com.my/mutakhir/2023/03/941...,Melaka: Jabatan Kesihatan Melaka melaksanakan ...
3,Sepanyol kesan kes pertama disyaki virus Marburg,2023-02-25,02:21:00,Eropah,https://www.hmetro.com.my/global/eropah/2023/0...,"Madrid: Pihak berkuasa kesihatan Valencia, Sep..."
4,Waspada meningokokus,2023-02-24,15:00:00,WM,https://www.hmetro.com.my/WM/2023/02/939645/wa...,Berisiko menyebabkan kematian dalam tempoh 24 ...
...,...,...,...,...,...,...
404,KKM tidak cadang lanjut vaksinasi PICKids,2022-05-20,22:11:00,Mutakhir,https://www.hmetro.com.my/mutakhir/2022/05/844...,Bagan Serai: Kementerian Kesihatan (KKM) tidak...
405,Bakal jemaah haji remaja disaran ambil dos pen...,2022-05-20,21:57:00,Mutakhir,https://www.hmetro.com.my/mutakhir/2022/05/844...,Kuala Lumpur: Kementerian Kesihatan (KKM) meny...
406,Sebarang rawatan baru Covid-19 dikaji terlebih...,2022-05-20,21:24:00,Mutakhir,https://www.hmetro.com.my/mutakhir/2022/05/844...,Bagan Serai: Sebarang dapatan baru mengenai ra...
407,"Lebih 200,000 penduduk Korea Utara demam",2022-05-20,20:11:00,Asia,https://www.hmetro.com.my/global/asia/2022/05/...,"Pyongyang: Korea Utara melaporkan lebih 200,00..."
