In [1]:
import pandas as pd
import numpy as np
import json

import requests
from bs4 import BeautifulSoup

import time
from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor

### 1.0 Scraping **The Edge Malaysia**

For [The Edge Malaysia](https://theedgemalaysia.com/), each article seems to have a unique numeric ID, e.g., "https://theedgemalaysia.com/node/677590". Since the articles cannot be enumerated by month, page number, etc., we use a **brute-force** approach: we request every candidate node ID in a range and keep only the URLs that turn out to be valid article pages.

In [2]:
# HTTP request headers sent with every page fetch; the values imitate a
# desktop Firefox 98 browser performing a top-level page navigation.
headers = {
    # Browser identity and content negotiation
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate",
    # Connection handling
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    # Fetch-metadata headers
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    # Caching
    "Cache-Control": "max-age=0",
}

In [3]:
# function to get text and metadata from url
def process_url(x, timeout=30.0, retry_delay=5.0, max_retries=None):
    """Fetch one The Edge Malaysia node page and extract its headline and text.

    Parameters
    ----------
    x : int or str
        Node ID, interpolated into ``https://theedgemalaysia.com/node/{x}``.
    timeout : float, optional
        Seconds to wait on the server for each attempt before the attempt is
        treated as failed and retried. Prevents a stalled connection from
        blocking a worker thread forever.
    retry_delay : float, optional
        Seconds to sleep between failed attempts.
    max_retries : int or None, optional
        Number of retries allowed after the first failed attempt. ``None``
        (the default) retries indefinitely.

    Returns
    -------
    dict or None
        ``{'url', 'headline', 'language', 'content'}`` when the page has both
        a ``<title>`` and the article-body container; ``None`` when either is
        missing or when ``max_retries`` is exhausted.
    """
    webpage = f'https://theedgemalaysia.com/node/{x}'

    failures = 0
    while True:
        try:
            r = requests.get(webpage, headers=headers, timeout=timeout)
            break
        except requests.RequestException:
            # Only network-level failures are retried. Anything else (e.g. a
            # NameError from a missing global) propagates instead of causing
            # a silent infinite loop.
            failures += 1
            if max_retries is not None and failures > max_retries:
                return None
            time.sleep(retry_delay)

    soup = BeautifulSoup(r.text, "lxml")

    # NOTE(review): the HTTP status code is not inspected; a response is only
    # rejected when the expected elements are absent from the HTML.
    title_tag = soup.find('title')
    body = soup.find('div', class_="news-detail_newsTextDataWrap__PkAu5")
    if title_tag is None or body is None:
        return None

    headline = title_tag.text
    content_str = ' '.join(p.text for p in body.find_all("p"))

    # Heuristic: a body mentioning 'English version' is labelled Mandarin
    # (presumably such articles link to their English counterpart -- confirm).
    language = 'Mandarin' if 'English version' in content_str else 'English'

    return {'url': webpage,
            'headline': headline,
            'language': language,
            'content': content_str}

In [6]:
# Background on the batch layout:
#   * Batches 1-7 are random samples -- initially only a subset of the site
#     was going to be scraped.
#   * The sampling ranges for batches 2-7 start at x00001, so they can never
#     contain the round IDs 100000, 200000, ... 600000.
#   * Batches 8-14 are the complements of those samples over the full
#     100000-wide ranges, which also picks up the round IDs skipped above.
np.random.seed(10082023)


def _sample_ids(low, high, size=20000):
    """Draw `size` random integers from [low, high) and return the distinct ones as strings."""
    drawn = np.random.randint(low, high, size=size)
    return list(set([str(node_id) for node_id in drawn]))


def _unsampled_ids(low, high, sampled):
    """Return every integer in [low, high), as a string, that is not in `sampled`."""
    every_id = set([str(node_id) for node_id in np.arange(low, high)])
    return list(every_id - set(sampled))


# Random samples. The call order matters: each call consumes the seeded
# global random stream in turn.
batch1 = _sample_ids(0, 100000)
batch2 = _sample_ids(100001, 200000)
batch3 = _sample_ids(200001, 300000)
batch4 = _sample_ids(300001, 400000)
batch5 = _sample_ids(400001, 500000)
batch6 = _sample_ids(500001, 600000)
batch7 = _sample_ids(600001, 700000)

# Complements: whatever the matching sample left out of each range.
batch8 = _unsampled_ids(500000, 600000, batch6)
batch9 = _unsampled_ids(400000, 500000, batch5)
batch10 = _unsampled_ids(300000, 400000, batch4)
batch11 = _unsampled_ids(200000, 300000, batch3)
batch12 = _unsampled_ids(100000, 200000, batch2)
batch13 = _unsampled_ids(0, 100000, batch1)
batch14 = _unsampled_ids(600000, 700000, batch7)

batches = [
    batch1, batch2, batch3, batch4, batch5, batch6, batch7,
    batch8, batch9, batch10, batch11, batch12, batch13, batch14,
]

In [11]:
# Scrape every batch with a thread pool and append each successfully parsed
# article to that batch's JSONL file (one JSON object per line).
max_workers = 10

for batch_idx, node_ids in enumerate(batches):
    out_path = f'theedgemalaysia--complete-batch-{batch_idx+1}.jsonl'

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # The whole batch is submitted up front; results are then consumed
        # in submission order, so the progress bar follows the slowest
        # outstanding request.
        pending = [executor.submit(process_url, node_id) for node_id in node_ids]

        for job in tqdm(pending, total=len(node_ids)):
            article = job.result()
            if not article:
                # process_url found no article for this ID; nothing to save.
                continue

            # The file is re-opened in append mode for every record, so each
            # record is flushed to disk as soon as it is written.
            with open(out_path, 'a') as out_file:
                out_file.write(json.dumps(article) + '\n')

### 2.0 Final checks

In [13]:
# One JSONL file per scraped batch, numbered 1 through 14.
files_to_check = [
    f'theedgemalaysia--complete-batch-{batch_no}.jsonl' for batch_no in range(1, 15)
]

dfs = []
for batch_no, file in enumerate(files_to_check, start=1):
    # Drop exact duplicate rows within a batch file before counting.
    get_df = pd.read_json(file, lines=True).drop_duplicates()
    dfs.append(get_df)
    # Label with the 1-based batch number so it matches the file name
    # (the message previously printed 0-13 for files 1-14).
    print(f'No. of articles in batch {batch_no}: {len(get_df)}')

# Stack all batches and persist the combined dataset.
final_num_articles = pd.concat(dfs, axis=0)
final_num_articles.to_parquet('theedgemalaysia--complete-1-14.parquet', index=False)
print(f'Num. of articles/webpages scraped: {len(final_num_articles)}')