In [1]:
import json
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm.notebook import tqdm

For [The Edge Malaysia](https://theedgemalaysia.com/), each page seems to have a unique numeric node ID, e.g., "https://theedgemalaysia.com/node/677590". Since we can't enumerate articles by month, page number, etc., we'll use a **brute-force** approach: probe candidate node IDs (randomly sampled from each ID range below) and only scrape those that resolve to a valid article URL.

In [2]:
# Request headers sent with every `requests.get` call; they describe a
# desktop Firefox browser performing a top-level page navigation.
headers = {
    # Browser identity
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0",
    # Content negotiation
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate",
    # Connection handling
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    # Fetch metadata
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    # Caching
    "Cache-Control": "max-age=0",
}

# Single Chrome session used for the Selenium part of the scrape.
# NOTE: this one `driver` object is a module-level global shared by every
# call that references it.
driver = webdriver.Chrome()

The chromedriver version (114.0.5735.90) detected in PATH at c:\Users\kucha\Desktop\PERSONAL Projects\malaysian-dataset\crawl\theedgemalaysia\chromedriver.exe might not be compatible with the detected chrome version (115.0.5790.171); currently, chromedriver 115.0.5790.170 is recommended for chrome 115.*, so it is advised to delete the driver in PATH and retry


In [3]:
# function to get text and metadata from url

# Every worker thread uses the same global `driver`. Without serialization,
# one thread's `driver.get` can navigate the browser away before another
# thread reads `driver.page_source`, attaching the wrong date to an article.
_DRIVER_LOCK = threading.Lock()


def _fetch_static_html(webpage, max_retries, retry_delay, request_timeout):
    """Download `webpage` with `requests`; return its text, or None after `max_retries` failures."""
    for _ in range(max_retries):
        try:
            return requests.get(webpage, headers=headers, timeout=request_timeout).text
        except requests.RequestException:
            time.sleep(retry_delay)
    return None


def _fetch_rendered_html(webpage, max_retries, retry_delay):
    """Load `webpage` in the shared browser; return the page source once the news-info block exists, else None."""
    for _ in range(max_retries):
        try:
            with _DRIVER_LOCK:
                driver.get(webpage)
                WebDriverWait(driver, 3) \
                    .until(EC.presence_of_element_located((By.CLASS_NAME,
                                                           'news-detail_newsInfo__dv0be')))
                return driver.page_source
        except TimeoutException:
            # The page loaded but never showed the news-info block: not an article.
            return None
        except WebDriverException:
            # Browser / navigation failure: back off (outside the lock) and retry.
            time.sleep(retry_delay)
    return None


def process_url(x, max_retries=5, retry_delay=5.0, request_timeout=30.0):
    """Scrape one candidate article from https://theedgemalaysia.com/node/{x}.

    The headline and body text are parsed from the HTML returned by
    `requests`; the publication date is read from the page as rendered by
    the shared Selenium `driver`.

    Parameters
    ----------
    x : str or int
        Candidate node ID appended to the node URL.
    max_retries : int, default 5
        Maximum attempts for each of the two fetches (the original code
        retried forever).
    retry_delay : float, default 5.0
        Seconds to sleep between failed attempts.
    request_timeout : float, default 30.0
        Timeout in seconds passed to `requests.get`.

    Returns
    -------
    dict or None
        A dict with keys 'url', 'headline', 'date_published', 'language' and
        'content', or None when the page cannot be fetched or does not have
        the expected article structure. Failures are reported as None, never
        raised, so one bad ID cannot abort a threaded crawl.
    """
    webpage = f'https://theedgemalaysia.com/node/{x}'

    html = _fetch_static_html(webpage, max_retries, retry_delay, request_timeout)
    if html is None:
        return None

    soup = BeautifulSoup(html, "lxml")
    title_tag = soup.find('title')
    body = soup.find('div', class_="news-detail_newsTextDataWrap__PkAu5")
    if title_tag is None or body is None:
        # Not an article page: skip it without spending a browser round-trip.
        return None

    headline = title_tag.text
    content_str = ' '.join(p.text for p in body.find_all("p"))
    # Heuristic kept from the original: text that mentions an
    # 'English version' is labelled Mandarin.
    language = 'Mandarin' if 'English version' in content_str else 'English'

    page_source = _fetch_rendered_html(webpage, max_retries, retry_delay)
    if page_source is None:
        return None

    soup_sel = BeautifulSoup(page_source, 'html.parser')
    div_date = soup_sel.find('div', class_="news-detail_newsInfo__dv0be")
    date_span = div_date.find('span') if div_date is not None else None
    if date_span is None:
        return None

    return {'url': webpage,
            'headline': headline,
            'date_published': date_span.text,
            'language': language,
            'content': content_str}

In [4]:
# Candidate node IDs are drawn from seven consecutive ranges of 100,000 IDs:
# [0, 100000), [100000, 200000), ..., [600000, 700000).
SEED = 10082023
IDS_PER_RANGE = 100_000
SAMPLES_PER_BATCH = 20_000

np.random.seed(SEED)


def _sample_node_ids(range_start):
    """Return SAMPLES_PER_BATCH distinct node IDs (as strings) from [range_start, range_start + IDS_PER_RANGE).

    Sampling is done without replacement so no ID is scraped, and written
    to the output file, more than once. The half-open ranges tile the ID
    space with no gaps (the previous bounds skipped 100000, 200000, ...).
    """
    offsets = np.random.choice(IDS_PER_RANGE, size=SAMPLES_PER_BATCH, replace=False)
    return [str(range_start + offset) for offset in offsets.tolist()]


batch1, batch2, batch3, batch4, batch5, batch6, batch7 = (
    _sample_node_ids(k * IDS_PER_RANGE) for k in range(7)
)

# let's just try batch1 first.
batches = [batch1]

In [5]:
# consolidate links and get texts
max_workers = 5

for i, urls in enumerate(batches):
    # One JSON-lines file per batch; each scraped article is one line.
    out_path = f'theedgemalaysia-batch-{i+1}.jsonl'
    n_failed = 0
    first_error = None

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_url, x) for x in urls]

        for future in tqdm(futures, total=len(urls)):
            try:
                result = future.result()
            except Exception as exc:
                # A single failing URL must not abort the crawl and discard
                # the results of every remaining future.
                n_failed += 1
                if first_error is None:
                    first_error = exc
                continue

            if result:
                # Append per record so finished work survives an interrupted run.
                with open(out_path, 'a', encoding='utf-8') as f:
                    json.dump(result, f)
                    f.write('\n')

    if n_failed:
        print(f'batch {i+1}: {n_failed} URL(s) raised an error; first error: {first_error!r}')

  0%|          | 0/10 [00:00<?, ?it/s]