In [10]:
from datetime import datetime
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from fake_useragent import UserAgent
import sys
import re
import csv
import time
import pandas as pd
import requests

In [7]:
def extract_properties(links, writer):
    # need to get the first 6 articles, as the other 4 are the 'popular section'
    for link in links[:6]:
      # Within div tag, find h3 tag, then within h3 find 'a' tag, extract its title as text

      # Ex: Perolehan vaksin: Cubaan ketiga Anwar burukkan PN 

      article_title = link.find('h3').find('a').text

      # Ex: Date 2023-02-18T22:09:02+00:00
      date = link.find('time')['datetime'][:10]

      # Ex: 2023-02-18
      dateobj = datetime.strptime(date,'%Y-%m-%d').date()
      
      # Ex: 22:09:02+00:00 --> 22:09:02
      time = link.find('time')['datetime'][11:19]

      # Ex: 22:09:02 --> datetime format
      timeobj = datetime.strptime(time, '%H:%M:%S').time()

      # Extract value of href attribute from 'a' tag using dictionary-style access
      Link = link.find('h3').find('a')['href']

      # Extract post category website gave to each article 

      post_category = link.find('a', attrs={'class':'td-post-category'}).text

      # write each as a row in csv file
      writer.writerow([article_title, dateobj, timeobj, Link, post_category])

In [8]:
# Jan 2020 - Dec 2022 is in total 418 pages 

pages_to_get = 418

# Writing to a file 
with open('harakah_daily.csv', 'w', newline='') as f:
  writer = csv.writer(f)
  headers = ["Title", "Date", "Time", "Link", 'Post_Category']
  writer.writerow(headers)

  # automatic goes to the next page from 1... n; python exclusive end 

  for page in range(1,pages_to_get+1):
    print('Processing Page: ', page)
    url = 'https://harakahdaily.net/index.php/page/'+str(page)+'/?s=vaksin'

    try:
      # response is equivalent to enter a key in chrome 
      # prevent ip-block by adding fake devices accessing web pages 
      response = requests.get(url, headers={'User-Agent': UserAgent().random})

      # this link give valid status code: 200 --> web scrap pass 
      # print(page.status_code)

    except Exception as e:
      error_type, error_obj, error_info = sys.exc_info()
      print('Error Link: ', url)
      print(error_type, 'Line: ', error_info.tb_lineno)

      # ignore this paage and move on to next one
      continue 

    # delay by 2 seconds to prevent ip block
    time.sleep(2)

    soup = BeautifulSoup(response.text, 'html.parser')
    # inspect element attribute type and its names to take their information

    attrs_code = 'item-details'
    links = soup.find_all('div', attrs={'class':attrs_code})
    # print(len(links))

    # Check each page has 6 links
    print(f'This page has {len(links[:6])} links')

    extract_properties(links, writer)

    print('CSV file saved successfully for Page: ' + str(page))

Processing Page:  1
This page has 6 links
CSV file saved successfully for Page: 1
Processing Page:  2
This page has 6 links
CSV file saved successfully for Page: 2
Processing Page:  3
This page has 6 links
CSV file saved successfully for Page: 3
Processing Page:  4
This page has 6 links
CSV file saved successfully for Page: 4
Processing Page:  5
This page has 6 links
CSV file saved successfully for Page: 5
Processing Page:  6
This page has 6 links
CSV file saved successfully for Page: 6
Processing Page:  7
This page has 6 links
CSV file saved successfully for Page: 7
Processing Page:  8
This page has 6 links
CSV file saved successfully for Page: 8
Processing Page:  9
This page has 6 links
CSV file saved successfully for Page: 9
Processing Page:  10
This page has 6 links
CSV file saved successfully for Page: 10
Processing Page:  11
This page has 6 links
CSV file saved successfully for Page: 11
Processing Page:  12
This page has 6 links
CSV file saved successfully for Page: 12
Processing

In [43]:
# read csv files and drop duplicates

site_data = pd.read_csv('harakah_daily.csv')
# site_data = site_data.drop_duplicates()
site_data

Unnamed: 0,Title,Date,Time,Link,Post_Category
0,Perolehan vaksin: Cubaan ketiga Anwar burukkan PN,2023-02-18,22:09:02,https://harakahdaily.net/index.php/2023/02/18/...,Nasional
1,"NPRA, KKM nafi vaksin Covid-19 mengandungi cip ID",2023-02-03,16:27:44,https://harakahdaily.net/index.php/2023/02/03/...,Nasional
2,Tiada laporan kes strok dalam kajian vaksin Co...,2023-01-18,18:55:20,https://harakahdaily.net/index.php/2023/01/18/...,Berita
3,130 rakyat Singapura tersalah terima dos vaksi...,2022-10-04,10:01:28,https://harakahdaily.net/index.php/2022/10/04/...,Global
4,1.5 juta kanak-kanak belum divaksin,2022-07-28,13:24:14,https://harakahdaily.net/index.php/2022/07/28/...,Nasional
...,...,...,...,...,...
2503,"Kuarantin selaras ajaran Islam, dipraktikkan d...",2020-02-02,00:01:30,https://harakahdaily.net/index.php/2020/02/02/...,Nasional
2504,WEF2020: Apa langkah Malaysia seterusnya?,2020-02-01,19:03:48,https://harakahdaily.net/index.php/2020/02/01/...,Global
2505,Virus Corona: Kembali kepada sistem Islam,2020-01-26,17:43:59,https://harakahdaily.net/index.php/2020/01/26/...,Global
2506,Penyakit Influenza Virus Corona Wuhan (2019 n-...,2020-01-26,08:46:49,https://harakahdaily.net/index.php/2020/01/26/...,Berita


In [44]:
# set driver options - do not pop up chrome and disable pop-ups.
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument(f'user-agent":{UserAgent().random}')

# get appropriate drivers for Google Chrome - automatic without downloading driver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), 
                          options=chrome_options)

In [45]:
def driver_scroll():
    while True:
        # Scroll to the bottom of the page
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait for the page to load
        time.sleep(2)

        # Check if we have reached the bottom of the page
        scroll_position = driver.execute_script("return window.pageYOffset;")
        page_height = driver.execute_script("return document.body.scrollHeight;")

        # The position for the end of page, if it is reached, quit browser
        # page_height > scroll_height also work
        if (page_height > scroll_position):
            break


In [46]:
# find p tags and combine into one article 
article_text = []
for url, link_index in zip(site_data['Link'], range(len(site_data['Link']))):
    print('Processing link: ',link_index)
    driver.get(url)

    driver_scroll()
    page_source = driver.page_source
    
    soup_article = BeautifulSoup(page_source, 'html.parser')
    article_paragraphs = soup_article.find_all('p')
    combined_article = ''
    
    for p in article_paragraphs:
        combined_article += p.get_text()
    article_text.append(combined_article)
    print('Obtained article text for link: ',link_index)

Processing link:  0
Obtained article text for link:  0
Processing link:  1
Obtained article text for link:  1
Processing link:  2
Obtained article text for link:  2
Processing link:  3
Obtained article text for link:  3
Processing link:  4
Obtained article text for link:  4
Processing link:  5
Obtained article text for link:  5
Processing link:  6
Obtained article text for link:  6
Processing link:  7
Obtained article text for link:  7
Processing link:  8
Obtained article text for link:  8
Processing link:  9
Obtained article text for link:  9
Processing link:  10
Obtained article text for link:  10
Processing link:  11
Obtained article text for link:  11
Processing link:  12
Obtained article text for link:  12
Processing link:  13
Obtained article text for link:  13
Processing link:  14
Obtained article text for link:  14
Processing link:  15
Obtained article text for link:  15
Processing link:  16
Obtained article text for link:  16
Processing link:  17
Obtained article text for link

WebDriverException: Message: disconnected: Unable to receive message from renderer
  (failed to check if window was closed: disconnected: not connected to DevTools)
  (Session info: headless chrome=110.0.5481.177)
Stacktrace:
0   chromedriver                        0x00000001005211c0 chromedriver + 4248000
1   chromedriver                        0x00000001004a1dc0 chromedriver + 3726784
2   chromedriver                        0x0000000100156ec4 chromedriver + 274116
3   chromedriver                        0x0000000100140fdc chromedriver + 184284
4   chromedriver                        0x0000000100140d10 chromedriver + 183568
5   chromedriver                        0x000000010013f990 chromedriver + 178576
6   chromedriver                        0x000000010013fcc8 chromedriver + 179400
7   chromedriver                        0x0000000100158efc chromedriver + 282364
8   chromedriver                        0x00000001001ce5f0 chromedriver + 763376
9   chromedriver                        0x00000001001ce054 chromedriver + 761940
10  chromedriver                        0x0000000100185200 chromedriver + 463360
11  chromedriver                        0x0000000100186318 chromedriver + 467736
12  chromedriver                        0x00000001004ef060 chromedriver + 4042848
13  chromedriver                        0x00000001004f38a4 chromedriver + 4061348
14  chromedriver                        0x00000001004fb3d0 chromedriver + 4092880
15  chromedriver                        0x00000001004f46e4 chromedriver + 4064996
16  chromedriver                        0x00000001004ca0fc chromedriver + 3891452
17  chromedriver                        0x0000000100514a64 chromedriver + 4196964
18  chromedriver                        0x0000000100514bb8 chromedriver + 4197304
19  chromedriver                        0x0000000100528650 chromedriver + 4277840
20  libsystem_pthread.dylib             0x00000001a13de06c _pthread_start + 148
21  libsystem_pthread.dylib             0x00000001a13d8e2c thread_start + 8


In [None]:
site_data['Content'] = article_text
site_data.to_csv('hemetro_with_text.csv', encoding='utf-8', index=False)
site_data