In [1]:
import pandas as pd
import json

import requests
from bs4 import BeautifulSoup

import gc
import time
from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

In [79]:
# list down the sections of the website we'd like to scrape.
sections = [
    "cars/",
    "fashion/",
    "lifestyle/hotels/",
    "lifestyle/watches/",
    "travel/",
    "technology-gadgets/",
    "dining/premium-dining/",
    "dining/romantic-dining-spots/",
    "media/music-concerts/",
    "random/news/",
    "finance/",
    "retail/",
    "random/editors-note/"
]

In [73]:
cookies = {
    '__utma': '129007881.1867875053.1694272480.1694272480.1694272480.1',
    '__utmc': '129007881',
    '__utmz': '129007881.1694272480.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)',
    '__ATA_tuuid': '6288fef3-89f2-4108-aa81-3d96a6b17778',
    '__utmt': '1',
    '__utmb': '129007881.47.9.1694273467830',
}

headers = {
    'authority': 'timchew.net',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language': 'en-US,en;q=0.9',
    'cache-control': 'max-age=0',
    # 'cookie': '__utma=129007881.1867875053.1694272480.1694272480.1694272480.1; __utmc=129007881; __utmz=129007881.1694272480.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __ATA_tuuid=6288fef3-89f2-4108-aa81-3d96a6b17778; __utmt=1; __utmb=129007881.47.9.1694273467830',
    'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
}

### 1.0 Get number of pages in each section, and their hrefs

In [80]:
def crawl_pagenums(url):
    while True:
        try:
            response = requests.get(url, cookies=cookies, headers=headers)
            break
        except Exception as e:
            print(e)
            time.sleep(1.0)

    soup = BeautifulSoup(response.text, "lxml")

    block = soup.find('div',attrs = {"class":"site-content cf"})
    
    if block is None:
        return
    hrefs = []
    for link in block.find_all('h2', attrs = {"class":"entry-title"}):
        try:
            href = link.find('a').get('href')
            hrefs.append(href)
        except:
            pass
    
    return hrefs

In [81]:
pages = []
hrefs = []
for section in sections:
    print(f'Getting pages from section: "{section}"')
    i = 1
    while section is not None:
        url = f"https://timchew.net/category/{section}page/{i}/"
        response = requests.get(url, cookies=cookies, headers=headers)
        if response.status_code == 200:
            if i > 1:
                url_b4 = f"https://timchew.net/category/{section}page/{i-1}/"
                links = crawl_pagenums(url)
                link_b4 = crawl_pagenums(url_b4)
                if links != link_b4:
                    hrefs.extend(links)
                    pages.append(url)
                    i += 1
                else:
                    print(f"'{section.title()}' has {i} unique pages.")
                    print("")
                    break
            else:
                links = crawl_pages(url)
                hrefs.extend(links)
                pages.append(url)
                i += 1
        else:
            print(f"'{section.title()}' has {i} unique pages.")
            print("")
            break

numpages_unique = list(set(pages))
hrefs_unique = list(set(hrefs))
print(f'Num. of unique page numbers: {len(numpages_unique)}')
print(f'Num. of unique articles: {len(hrefs_unique)}')

Getting pages from section: "cars/"
'Cars/' has 6 unique pages.

Getting pages from section: "fashion/"
'Fashion/' has 6 unique pages.

Getting pages from section: "lifestyle/hotels/"
'Lifestyle/Hotels/' has 2 unique pages.

Getting pages from section: "lifestyle/watches/"
'Lifestyle/Watches/' has 3 unique pages.

Getting pages from section: "travel/"
'Travel/' has 4 unique pages.

Getting pages from section: "technology-gadgets/"
'Technology-Gadgets/' has 7 unique pages.

Getting pages from section: "dining/premium-dining/"
'Dining/Premium-Dining/' has 3 unique pages.

Getting pages from section: "dining/romantic-dining-spots/"
'Dining/Romantic-Dining-Spots/' has 3 unique pages.

Getting pages from section: "media/music-concerts/"
'Media/Music-Concerts/' has 2 unique pages.

Getting pages from section: "random/news/"
'Random/News/' has 2 unique pages.

Getting pages from section: "finance/"
'Finance/' has 3 unique pages.

Getting pages from section: "retail/"
'Retail/' has 2 unique pa

In [82]:
with open(f'timchew_numpages.json', 'a') as numpages:
    json.dump(numpages_unique, numpages)
with open(f'timchew_articles.json', 'a') as articles:
    json.dump(hrefs_unique, articles)

### 2.0 Get text data from each webpage

In [90]:
articles_to_scrape = []
with open(f'timchew_articles.json') as fopen:
    links = json.load(fopen)
articles_to_scrape.extend(links)
print(f"Num. of articles to collect text data from: {len(articles_to_scrape)}")

Num. of articles to collect text data from: 839


In [91]:
articles_to_scrape[:5]

['https://timchew.net/2014/11/24/lenovo-yoga-3-pro-and-yoga-tablet-2-pro-launch-at-the-roof-first-avenue-bandar-utama/',
 'https://timchew.net/2017/10/09/news-october-2017/',
 'https://timchew.net/2015/12/01/gift-ideas-for-her-dior-paradise-2015-collection/',
 'https://timchew.net/2020/10/23/kamoshibito-kuheiji-sake-pairing-dinner-by-tmi-trading-at-dc-restaurant-by-darren-chin/',
 'https://timchew.net/2017/06/23/italian-prosecco-brunch-at-villa-danieli-sheraton-imperial-kuala-lumpur/']

In [92]:
def crawl_article(url):
    while True:
        try:
            response = requests.get(url, cookies=cookies, headers=headers)
            break
        except Exception as e:
            print(e)
            time.sleep(1.0)

    soup = BeautifulSoup(response.text, "lxml")

    try:
        headline = soup.find('h1', class_="entry-title").text
        words = soup.find('div', attrs = {"class":"entry-content"}) # post-body entry-content
        content = words.text

    except Exception as e:
        print('error in link:'+ url)
        print(e)
        return None

    data = {'url': url, 'headline': headline, 'content': content}
    return data

In [93]:
max_worker = 50

for i in tqdm(range(0, len(articles_to_scrape), max_worker)):
    gc.collect()
    with ThreadPoolExecutor(max_workers=max_worker) as executor:
        futures = {executor.submit(crawl_article, t): t for t in articles_to_scrape[i: i + max_worker]}

    for future in as_completed(futures):
        result = future.result()
        if result:
            with open(f'timchew-scraped-data.jsonl', 'a') as final:
                json.dump(result, final)
                final.write('\n')

  0%|          | 0/17 [00:00<?, ?it/s]

In [94]:
final_test = pd.read_json('timchew-scraped-data.jsonl', lines=True)
print(f"Num. of rows in dataframe: {len(final_test)}")

# check if there's any articles with no content
no_content = final_test[final_test['content'] == ""]
print(f"Num. of webpages with no content: {no_content}")
no_content.head(3)

Num. of rows in dataframe: 839
Num. of webpages with no content: Empty DataFrame
Columns: [url, headline, content]
Index: []


Unnamed: 0,url,headline,content
