In [18]:
from datetime import datetime
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from fake_useragent import UserAgent

import csv
import time
import re
import requests 
import sys
import pandas as pd
import os

In [19]:
# Malay month names whose spelling differs from English.
# April, September and November are spelled the same in both languages,
# so they need no entry here.
_MALAY_TO_ENGLISH_MONTHS = {
    'Januari': 'January',
    'Februari': 'February',
    'Mac': 'March',
    'Mei': 'May',
    'Jun': 'June',
    'Julai': 'July',
    'Ogos': 'August',
    'Oktober': 'October',
    'Disember': 'December',
}

# Match a Malay month only when it is not embedded in a longer run of letters.
# Plain substring replacement would turn an already-English "June" into
# "Junee" (because it contains "Jun") and would rewrite "Mac" inside other words.
_MALAY_MONTH_RE = re.compile(
    r'(?<![A-Za-z])(' + '|'.join(_MALAY_TO_ENGLISH_MONTHS) + r')(?![A-Za-z])'
)


def english_months_converter(malay_month):
    """Replace Malay month names in a string with their English equivalents.

    Parameters
    ----------
    malay_month : str
        Text containing a date, e.g. ``"11 Januari 2021 07:38pm"``.

    Returns
    -------
    str
        The same text with every standalone Malay month name translated,
        e.g. ``"11 January 2021 07:38pm"``. Text without a Malay month name
        (including text that is already in English) is returned unchanged,
        so the function is safe to apply more than once.
    """
    return _MALAY_MONTH_RE.sub(
        lambda match: _MALAY_TO_ENGLISH_MONTHS[match.group(1)],
        malay_month,
    )


In [20]:
# Day, month name and 4-digit year, e.g. "12 January 2021".
date_pattern = r"(\d{1,2} [a-zA-Z]+ \d{4})"
# 12-hour clock time with a lowercase meridiem and no space, e.g. "07:38pm".
time_pattern = r"(?P<hour>\d{1,2}):(?P<minute>\d{2})(?P<meridiem>am|pm)"


def extract_properties(links, writer):
    """Write one CSV row per search-result element.

    Parameters
    ----------
    links : iterable
        Elements exposing BeautifulSoup's ``find`` API, one per article
        listing. Every element passed in is processed; no filtering of
        "popular section" entries happens here.
    writer : csv.writer-like
        Object with a ``writerow`` method. Each row is
        ``[title, date, time, author, link, short description]``.

    Notes
    -----
    An article whose timestamp text does not contain both a date and a
    time in the expected format is skipped with a printed warning.
    Previously such an article either raised ``NameError`` (first article)
    or was silently written with the previous article's date and time.
    """
    for link in links:
        # A string passed as `attrs` is matched against the element's CSS class.
        # Ex: Anwar dedah perolehan vaksin tanpa persetujuan Peguam Negara
        article_title = link.find('div', attrs='article-title').text

        # Ex: 11 Januari 2021 07:38pm
        time_date_properties = link.find('div', attrs='timespan').text

        # Converted: 11 January 2021 07:38pm
        date_converted = english_months_converter(time_date_properties)

        date_match = re.search(date_pattern, date_converted)
        time_match = re.search(time_pattern, date_converted)

        if not (date_match and time_match):
            # Never fall through with stale or undefined date/time values.
            print('Skipping article with unparseable timestamp: ',
                  repr(time_date_properties))
            continue

        # Ex: 2021-01-12
        # NOTE(review): %B parses month names in the current locale; this
        # assumes an English locale - confirm on the machine running the scrape.
        dateobj = datetime.strptime(date_match.group(1), '%d %B %Y').date()

        # Ex: 02:11pm --> 14:11:00
        timeobj = datetime.strptime(time_match.group(), '%I:%M%p').time()

        # The description block holds both the article link and the summary text.
        description_block = link.find('div', attrs='article-desc')

        # Extract value of href attribute from the 'a' tag using dictionary-style access
        web_address = description_block.find('a')['href']

        # Text of the first anchor found in the listing element
        author_name = link.find('a').text

        short_description = description_block.text

        # Write each article as a row in the csv file
        writer.writerow([article_title, dateobj, timeobj, author_name,
                         web_address, short_description])

In [21]:
# Jan 2020 - Dec 2022 is in total 86 pages 

pages_to_get = 86

# Seconds to wait for the server before giving up on a page; without a
# timeout a stalled connection would hang the whole scrape indefinitely.
request_timeout = 30

# Build the user-agent source once and draw a fresh random value per request.
user_agent_source = UserAgent()

# Writing to a file. The encoding is explicit so the output does not depend on
# the platform default and matches the UTF-8 default used by pd.read_csv.
with open('sinarharian.csv', 'w', newline='', encoding='utf-8') as f:
  writer = csv.writer(f)
  headers = ["Title", "Date", "Time", "Author Name","Link", "Article Description"]
  writer.writerow(headers)

  # Automatically goes to the next page from 1... n; Python's range end is exclusive

  for page in range(1, pages_to_get + 1):
    print('Processing Page: ', page)
    url = 'https://www.sinarharian.com.my/carian?query=vaksin&pgno='+str(page)

    try:
      # Equivalent to opening the URL in a browser.
      # A random User-Agent header is sent to reduce the chance of an IP block.
      response = requests.get(
          url,
          headers={'User-Agent': user_agent_source.random},
          timeout=request_timeout,
      )

      # Treat HTTP error statuses (4xx/5xx) as failures instead of parsing
      # an error page as if it were a result listing.
      response.raise_for_status()

    except Exception:
      error_type, error_obj, error_info = sys.exc_info()
      print('Error Link: ', url)
      print(error_type, 'Line: ', error_info.tb_lineno)

      # Ignore this page and move on to the next one
      continue 

    # Delay by 2 seconds to prevent ip block
    time.sleep(2)

    soup = BeautifulSoup(response.text, 'html.parser')
    # Inspect the element's attribute type and name to locate each listing

    attrs_code = 'col-md-8 col-content'
    links = soup.find_all('div', attrs={'class':attrs_code})

    # Check each page has 10 links
    print(f'Page {page} has {len(links)} links')

    extract_properties(links, writer)

    print('CSV file saved successfully for Page: ' + str(page))


Processing Page:  1
Page 1 has 10 links
CSV file saved successfully for Page: 1
Processing Page:  2
Page 2 has 10 links
CSV file saved successfully for Page: 2
Processing Page:  3
Page 3 has 10 links
CSV file saved successfully for Page: 3
Processing Page:  4
Page 4 has 10 links
CSV file saved successfully for Page: 4
Processing Page:  5
Page 5 has 10 links
CSV file saved successfully for Page: 5
Processing Page:  6
Page 6 has 10 links
CSV file saved successfully for Page: 6
Processing Page:  7
Page 7 has 10 links
CSV file saved successfully for Page: 7
Processing Page:  8
Page 8 has 10 links
CSV file saved successfully for Page: 8
Processing Page:  9
Page 9 has 10 links
CSV file saved successfully for Page: 9
Processing Page:  10
Page 10 has 10 links
CSV file saved successfully for Page: 10
Processing Page:  11
Page 11 has 10 links
CSV file saved successfully for Page: 11
Processing Page:  12
Page 12 has 10 links
CSV file saved successfully for Page: 12
Processing Page:  13
Page 13 ha

In [22]:
# Read the scraped listing and drop duplicate articles.
# The same article can be listed on more than one result page, so rows are
# de-duplicated on their link and the index is renumbered afterwards.

site_data = (
    pd.read_csv('sinarharian.csv')
    .drop_duplicates(subset='Link')
    .reset_index(drop=True)
)
site_data

Unnamed: 0,Title,Date,Time,Author Name,Link,Article Description
0,Empat kes kematian akibat rabies di Sarawak,2023-03-06,15:57:00,Sabah Sarawak,https://www.sinarharian.com.my/article/248356/...,KUCHING - Sarawak mencatatkan enam kes jangki...
1,Tepatkah langkah LTAT menswastakan Bousted?,2023-03-03,17:45:00,BISNES,https://www.sinarharian.com.my/article/248000/...,INSTITUSI keselamatan sosial dan badan berkan...
2,Taiwan dipilih destinasi pilihan Asia - MATTA,2023-03-03,11:02:00,Nasional,https://www.sinarharian.com.my/article/247918/...,KUALA LUMPUR - Taiwan dipilih sebagai destina...
3,"Permintaan vaksin Covid-19 rendah, Pharmaniaga...",2023-02-27,23:05:00,BISNES,https://www.sinarharian.com.my/article/247355/...,PHARMANIAGA Bhd telah mencatatkan kerugian be...
4,[VIDEO] Perolehan vaksin Covid-19 ikut prosedu...,2023-02-09,08:00:00,Nasional,https://www.sinarharian.com.my/article/244323/...,"SHAH ALAM - Bekas Menteri Kesihatan, Khairy J..."
...,...,...,...,...,...,...
835,Peka B40 'game changer' untuk perkhidmatan kes...,2020-01-11,10:30:00,Nasional,https://www.sinarharian.com.my/article/65459/b...,KUALA LUMPUR - Skim Peduli Kesihatan untuk Ku...
836,KKM sahkan tiga kes polio di Sabah,2020-01-10,11:55:00,Sabah Sarawak,https://www.sinarharian.com.my/article/65319/e...,SHAH ALAM - Kementerian Kesihatan (KKM) memak...
837,"Dua kes influenza A di Selangor di Cyberjaya, ...",2020-01-10,10:05:00,Selangor KL,https://www.sinarharian.com.my/article/65308/e...,SHAH ALAM - Jabatan Kesihatan Negeri Selangor...
838,Influenza: Pesakit wajib dikuarantin elak penu...,2020-01-10,08:45:00,Nasional,https://www.sinarharian.com.my/article/65297/b...,SHAH ALAM - Kerajaan perlu mewajibkan arahan ...


In [23]:
# Set driver options - run Chrome without a visible window and disable pop-ups.
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-notifications")
chrome_options.add_argument("--disable-extensions")
# Chrome switches take the form --name=value. The previous argument
# ('user-agent":<value>') had a stray quote and a colon instead of '=',
# so the random user agent was never applied.
chrome_options.add_argument(f"--user-agent={UserAgent().random}")

# Get the appropriate driver for Google Chrome automatically, without a manual download
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), 
                          options=chrome_options)

In [24]:
# Fetch every article page, then join the text of all its <p> tags into one string.
# article_text ends up with exactly one entry per row of site_data['Link'],
# in the same order, so it can be attached to the frame as a column.
article_text = []
try:
    for link_index, url in enumerate(site_data['Link']):
        print('Processing link: ',link_index)

        try:
            driver.get(url)
            page_source = driver.page_source
        except Exception as error:
            # Keep going on a failed page: record an empty article so the list
            # stays aligned with site_data instead of losing all collected text.
            print('Error Link: ', url)
            print(type(error), error)
            article_text.append('')
            continue

        soup_article = BeautifulSoup(page_source, 'html.parser')

        # NOTE(review): this collects every <p> on the page, which may include
        # text outside the article body - confirm against the page layout.
        combined_article = ''.join(
            paragraph.get_text() for paragraph in soup_article.find_all('p')
        )
        article_text.append(combined_article)
        print('Obtained article text for link: ',link_index)
finally:
    # Always close the browser so the headless Chrome process is not leaked.
    # Re-running this cell requires re-creating `driver` first.
    driver.quit()

Processing link:  0
Obtained article text for link:  0
Processing link:  1
Obtained article text for link:  1
Processing link:  2
Obtained article text for link:  2
Processing link:  3
Obtained article text for link:  3
Processing link:  4
Obtained article text for link:  4
Processing link:  5
Obtained article text for link:  5
Processing link:  6
Obtained article text for link:  6
Processing link:  7
Obtained article text for link:  7
Processing link:  8
Obtained article text for link:  8
Processing link:  9
Obtained article text for link:  9
Processing link:  10
Obtained article text for link:  10
Processing link:  11
Obtained article text for link:  11
Processing link:  12
Obtained article text for link:  12
Processing link:  13
Obtained article text for link:  13
Processing link:  14
Obtained article text for link:  14
Processing link:  15
Obtained article text for link:  15
Processing link:  16
Obtained article text for link:  16
Processing link:  17
Obtained article text for link

In [25]:
# Attach the scraped article bodies as a new "Content" column, persist the
# combined table to disk without the index, and display it.
site_data = site_data.assign(Content=article_text)
site_data.to_csv('sinarharian_with_text.csv', index=False, encoding='utf-8')
site_data

Unnamed: 0,Title,Date,Time,Author Name,Link,Article Description,Content
0,Empat kes kematian akibat rabies di Sarawak,2023-03-06,15:57:00,Sabah Sarawak,https://www.sinarharian.com.my/article/248356/...,KUCHING - Sarawak mencatatkan enam kes jangki...,KUCHING - Sarawak mencatatkan enam kes jangkit...
1,Tepatkah langkah LTAT menswastakan Bousted?,2023-03-03,17:45:00,BISNES,https://www.sinarharian.com.my/article/248000/...,INSTITUSI keselamatan sosial dan badan berkan...,INSTITUSI keselamatan sosial dan badan berkanu...
2,Taiwan dipilih destinasi pilihan Asia - MATTA,2023-03-03,11:02:00,Nasional,https://www.sinarharian.com.my/article/247918/...,KUALA LUMPUR - Taiwan dipilih sebagai destina...,KUALA LUMPUR - Taiwan dipilih sebagai destinas...
3,"Permintaan vaksin Covid-19 rendah, Pharmaniaga...",2023-02-27,23:05:00,BISNES,https://www.sinarharian.com.my/article/247355/...,PHARMANIAGA Bhd telah mencatatkan kerugian be...,Sedania As Salam jalin kerjasama dengan Wasiyy...
4,[VIDEO] Perolehan vaksin Covid-19 ikut prosedu...,2023-02-09,08:00:00,Nasional,https://www.sinarharian.com.my/article/244323/...,"SHAH ALAM - Bekas Menteri Kesihatan, Khairy J...","SHAH ALAM - Bekas Menteri Kesihatan, Khairy Ja..."
...,...,...,...,...,...,...,...
835,Peka B40 'game changer' untuk perkhidmatan kes...,2020-01-11,10:30:00,Nasional,https://www.sinarharian.com.my/article/65459/b...,KUALA LUMPUR - Skim Peduli Kesihatan untuk Ku...,KUALA LUMPUR - Skim Peduli Kesihatan untuk Kum...
836,KKM sahkan tiga kes polio di Sabah,2020-01-10,11:55:00,Sabah Sarawak,https://www.sinarharian.com.my/article/65319/e...,SHAH ALAM - Kementerian Kesihatan (KKM) memak...,SHAH ALAM - Kementerian Kesihatan (KKM) memakl...
837,"Dua kes influenza A di Selangor di Cyberjaya, ...",2020-01-10,10:05:00,Selangor KL,https://www.sinarharian.com.my/article/65308/e...,SHAH ALAM - Jabatan Kesihatan Negeri Selangor...,SHAH ALAM - Jabatan Kesihatan Negeri Selangor ...
838,Influenza: Pesakit wajib dikuarantin elak penu...,2020-01-10,08:45:00,Nasional,https://www.sinarharian.com.my/article/65297/b...,SHAH ALAM - Kerajaan perlu mewajibkan arahan ...,SHAH ALAM - Kerajaan perlu mewajibkan arahan k...
