In [1]:
import sys 
import time 
import requests 
import csv

from bs4 import BeautifulSoup 
from datetime import datetime
from fake_useragent import UserAgent

In [3]:
def extract_properties(links, writer, max_items=6):
    """Write one CSV row (title, date, time, link) per article block.

    Parameters
    ----------
    links : sequence
        Article containers to read. Each one is searched for an ``h3``
        holding an ``a`` (title text and ``href``) and for a ``time``
        element carrying a ``datetime`` attribute such as
        ``2023-02-18T22:09:02+00:00``.
    writer : object with a ``writerow`` method
        Receives ``[title, date, time, href]`` for every usable entry.
    max_items : int, default 6
        Only the first ``max_items`` containers are read; on the scraped
        page the other 4 belong to the 'popular section'.

    Entries that lack any of the expected elements/attributes, or whose
    timestamp cannot be parsed, are reported with ``print`` and skipped so
    that a single malformed block does not abort the whole scrape.
    """
    for entry in links[:max_items]:
        # Look each element up once; ``find`` returns None when it is absent.
        heading = entry.find('h3')
        anchor = heading.find('a') if heading is not None else None
        time_tag = entry.find('time')
        if anchor is None or time_tag is None:
            print('Skipping entry: missing title link or time element')
            continue

        # Ex: 2023-02-18T22:09:02+00:00
        stamp = time_tag.get('datetime')
        href = anchor.get('href')
        if not stamp or not href:
            print('Skipping entry: missing datetime or href attribute')
            continue

        try:
            # Ex: 2023-02-18
            dateobj = datetime.strptime(stamp[:10], '%Y-%m-%d').date()
            # Ex: 22:09:02+00:00 --> 22:09:02
            timeobj = datetime.strptime(stamp[11:19], '%H:%M:%S').time()
        except ValueError:
            print('Skipping entry: unparsable timestamp ', stamp)
            continue

        # Ex title: Perolehan vaksin: Cubaan ketiga Anwar burukkan PN
        writer.writerow([anchor.text, dateobj, timeobj, href])

In [4]:
# Jan 2020 - Dec 2022 is in total 418 pages of search results
pages_to_get = 418

# Seconds to wait on the server before giving up on a page; without a
# timeout a single stalled connection would hang the whole run.
request_timeout = 30

# Pause (seconds) after every request to reduce the chance of an IP block
request_delay = 2

# Build the user-agent source once; only the `.random` pick changes per request
user_agent = UserAgent()

# Writing to a file.
# UTF-8 is set explicitly so non-ASCII characters in titles do not depend on
# the platform's default encoding.
with open('harakah_daily.csv', 'w', newline='', encoding='utf-8') as f:
  writer = csv.writer(f)
  headers = ["Title", "Date", "Time", "Link"]
  writer.writerow(headers)

  # Pages are numbered 1..n; range() excludes its end, hence the +1
  for page in range(1, pages_to_get + 1):
    print('Processing Page: ', page)
    url = 'https://harakahdaily.net/index.php/page/'+str(page)+'/?s=vaksin'

    try:
      # Send a randomised User-Agent so requests look like they come from
      # different devices (helps avoid an IP block).
      response = requests.get(
          url,
          headers={'User-Agent': user_agent.random},
          timeout=request_timeout,
      )
      # Treat 4xx/5xx answers as failures instead of parsing an error page
      response.raise_for_status()

    except Exception as e:
      # Best-effort scrape: report the failing page and move on to the next one
      print('Error Link: ', url)
      print(type(e), 'Line: ', e.__traceback__.tb_lineno)
      continue

    finally:
      # Runs after success and failure alike, so a failing server is not
      # hit again immediately.
      time.sleep(request_delay)

    soup = BeautifulSoup(response.text, 'html.parser')

    # Article blocks are the div elements carrying this class
    attrs_code = 'item-details'
    links = soup.find_all('div', attrs={'class': attrs_code})

    # Check each page has 6 links
    print(f'This page has {len(links[:6])} links')

    extract_properties(links, writer)

    print('CSV file saved successfully for Page: ' + str(page))

Processing Page:  1
This page has 6 links
CSV file saved successfully for Page: 1
Processing Page:  2
This page has 6 links
CSV file saved successfully for Page: 2
Processing Page:  3
This page has 6 links
CSV file saved successfully for Page: 3
Processing Page:  4
This page has 6 links
CSV file saved successfully for Page: 4
Processing Page:  5
This page has 6 links
CSV file saved successfully for Page: 5
Processing Page:  6
This page has 6 links
CSV file saved successfully for Page: 6
Processing Page:  7
This page has 6 links
CSV file saved successfully for Page: 7
Processing Page:  8
This page has 6 links
CSV file saved successfully for Page: 8
Processing Page:  9
This page has 6 links
CSV file saved successfully for Page: 9
Processing Page:  10
This page has 6 links
CSV file saved successfully for Page: 10
Processing Page:  11
This page has 6 links
CSV file saved successfully for Page: 11
Processing Page:  12
This page has 6 links
CSV file saved successfully for Page: 12
Processing