In [1]:
import pandas as pd
import json

import requests
from bs4 import BeautifulSoup

import gc
import time
from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
# list down the sections of the website we'd like to scrape.
sections = [
    'features/tips/',
    'news/bulletin/',
    'news/international/',
    'news/local-news/',
    'news/newswire/',
    'features/geared-up/',
    'features/outings/',
]

This was a site that had infinite scrolling. It was a bit hard to find the page numbers but we got there by looking at *Inspect > Network > Docs*. After that, we pasted the link on https://curlconverter.com/ to get the below `cookies` & `headers`.

In [3]:
cookies = {
    'PHPSESSID': '2k6ra51aiq8bkoffj41k3i7nl6',
    '_gid': 'GA1.2.60285680.1694179075',
    '__gads': 'ID=62a9074952e63892:T=1691302727:RT=1694179077:S=ALNI_MbifilTODQ4waB-NotcWZmrmqJGCQ',
    '__gpi': 'UID=00000c2790829244:T=1691302727:RT=1694179077:S=ALNI_Mb4tZnjW_oAqV9zASTyNdddCPFd9A',
    '_ga': 'GA1.1.984074356.1691302726',
    '_ga_0MVNT427LM': 'GS1.1.1694179073.2.1.1694179269.0.0.0',
}

headers = {
    'authority': 'www.bikesrepublic.com',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language': 'en-US,en;q=0.9',
    'cache-control': 'max-age=0',
    # 'cookie': 'PHPSESSID=2k6ra51aiq8bkoffj41k3i7nl6; _gid=GA1.2.60285680.1694179075; __gads=ID=62a9074952e63892:T=1691302727:RT=1694179077:S=ALNI_MbifilTODQ4waB-NotcWZmrmqJGCQ; __gpi=UID=00000c2790829244:T=1691302727:RT=1694179077:S=ALNI_Mb4tZnjW_oAqV9zASTyNdddCPFd9A; _ga=GA1.1.984074356.1691302726; _ga_0MVNT427LM=GS1.1.1694179073.2.1.1694179269.0.0.0',
    'referer': 'https://www.bikesrepublic.com/category/features/tips/',
    'sec-ch-ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Google Chrome";v="116"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
}

### 1.0 Get number of pages

In [5]:
pages = []
for section in sections:
    print(f'Scraping "{section}"')
    i = 0
    while section is not None:
        url = f"https://www.bikesrepublic.com/category/news/{section}page/{i}"
        i += 1
        response = requests.get(url, cookies=cookies, headers=headers)
        if response.status_code == 200:
            pages.append(url)
        else:
            print(f"'{section.title()}' only has {i-1} pages.")
            break

pages_unique = list(set(pages))
print(f'Num. of unique pages: {len(pages_unique)}')
with open(f'bikesrepublic_numpages.json', 'a') as f:
    json.dump(pages_unique, f)

Scraping "features/tips/"
'Features/Tips/' only has 13 pages.
Scraping "news/bulletin/"
'News/Bulletin/' only has 533 pages.
Scraping "news/international/"
'News/International/' only has 399 pages.
Scraping "news/local-news/"
'News/Local-News/' only has 190 pages.
Scraping "news/newswire/"
'News/Newswire/' only has 2 pages.
Scraping "features/geared-up/"
'Features/Geared-Up/' only has 43 pages.
Scraping "features/outings/"
'Features/Outings/' only has 49 pages.
Num. of unique pages: 1229


### 2.0 Get links in each page

In [20]:
page_nums = []
with open(f'bikesrepublic_numpages.json') as fopen:
    links = json.load(fopen)
page_nums.extend(links)

In [21]:
def crawl(url):
    while True:
        try:
            response = requests.get(url, cookies=cookies, headers=headers)
            break
        except Exception as e:
            print(e)
            time.sleep(1.0)

    soup = BeautifulSoup(response.text, "lxml")

    block = soup.find('div',attrs = {"class":"td-ss-main-content"})
    
    if block is None:
        return

    for link in block.find_all('h3', attrs = {"class":"entry-title td-module-title"}):
        try:
            href = link.find('a').get('href')
            hrefs.append(href)
        except:
            pass

In [6]:
max_worker = 50

hrefs = []
for i in tqdm(range(0, len(page_nums), max_worker)):
    gc.collect()
    with ThreadPoolExecutor(max_workers=max_worker) as executor:
        futures = {executor.submit(crawl, t): t for t in page_nums[i: i + max_worker]}

    for future in as_completed(futures):
        future.result()

hrefs_unique = list(set(hrefs))
print(f'Num. of unique links: {len(hrefs_unique)}')
with open(f'bikesrepublic-links.json', 'a') as f:
    json.dump(hrefs_unique, f)

NameError: name 'page_nums' is not defined

### 3.0 Get content from articles

In [4]:
# load articles that we need to scrape
articles = []
with open(f'bikesrepublic-links.json') as fopen:
    article_links = json.load(fopen)
articles.extend(article_links)

print(f"Num. of articles to scrape: {len(articles)}")

Num. of articles to scrape: 6974


In [5]:
articles[:10]

['https://www.bikesrepublic.com/featured/gallery-ducati-werideasone-malaysian-leg-kicks-off-in-style/',
 'https://www.bikesrepublic.com/features/outings/shell-cup-round-3-japan-dominates-malaysia-surprises/',
 'https://www.bikesrepublic.com/news/bulletin/video-samsung-smart-windshield-for-motorcycles/',
 'https://www.bikesrepublic.com/featured/10-reasons-you-should-attend-california-superbike-school-malaysia/',
 'https://www.bikesrepublic.com/news/bulletin/rieju-rs3-125-nkd-introduced-to-european-market/',
 'https://www.bikesrepublic.com/featured/motogp-maverick-vinales-signs-two-year-deal-with-yamaha/',
 'https://www.bikesrepublic.com/featured/exclusive-tyre-knowledge-sharing-session-at-pirelli-tyre-seminar/',
 'https://www.bikesrepublic.com/featured/ktm-malaysia-set-rock-malaysian-motogp-2017/',
 'https://www.bikesrepublic.com/featured/toc-automotive-college-launches-superbike-technician-course/',
 'https://www.bikesrepublic.com/featured/the-zeppelin-will-remain-as-a-concept-motorcyc

In [11]:
def crawl_article(url):
    while True:
        try:
            response = requests.get(url, cookies=cookies, headers=headers)
            break
        except Exception as e:
            print(e)
            time.sleep(1.0)

    soup = BeautifulSoup(response.text, "lxml")

    try:
        headline = soup.find('h1', class_="entry-title").text
        words = soup.find('div', attrs = {"class":"td-post-content"}) # post-body entry-content
        content = words.text

    except Exception as e:
        print('error in link:'+ url)
        print(e)
        return None

    data = {'url': url, 'headline': headline, 'content': content}
    return data

In [13]:
max_worker = 50

for i in tqdm(range(0, len(articles), max_worker)):
    gc.collect()
    with ThreadPoolExecutor(max_workers=max_worker) as executor:
        futures = {executor.submit(crawl_article, t): t for t in articles[i: i + max_worker]}

    for future in as_completed(futures):
        result = future.result()
        if result:
            with open(f'bikesrepublic-scraped-data.jsonl', 'a') as final:
                json.dump(result, final)
                final.write('\n')

  0%|          | 0/140 [00:00<?, ?it/s]

error in link:https://www.bikesrepublic.com/news/bulletin/216842/
'NoneType' object has no attribute 'text'
error in link:https://www.bikesrepublic.com/news/bulletin/198356/
'NoneType' object has no attribute 'text'
error in link:https://www.bikesrepublic.com/news/bulletin/217499/
'NoneType' object has no attribute 'text'
error in link:https://www.bikesrepublic.com/news/bulletin/215235/
'NoneType' object has no attribute 'text'
error in link:https://www.bikesrepublic.com/featured/indian-ftr1200-custom-will-go-production/
'NoneType' object has no attribute 'text'
error in link:https://www.bikesrepublic.com/featured/reviewed-ktm-duke-200-and-rc200-the-fun-loving-siblings/
'NoneType' object has no attribute 'text'
error in link:https://www.bikesrepublic.com/features/geared-up/first-malaysian-superbikes-time-trial-2014/
'NoneType' object has no attribute 'text'
error in link:https://www.bikesrepublic.com/featured/2021-ktm-250-390-adventure-launched-in-malaysia-from-rm21500/
'NoneType' obje

In [14]:
final_test = pd.read_json('bikesrepublic-scraped-data.jsonl', lines=True)
print(f"Num. of rows in dataframe: {len(final_test)}")

# check if there's any articles with no content
no_content = final_test[final_test['content'] == ""]
print(f"Num. of webpages with no content: {no_content}")
no_content.head(3)

Num. of rows in dataframe: 6959
Num. of webpages with no content: Empty DataFrame
Columns: [url, headline, content]
Index: []


Unnamed: 0,url,headline,content


For some reason, barring the urls where it's just a number, the other urls are consistent and have text in them. Therefore, we'll manually add them below.

In [16]:
list_remaining_urls = [
    'https://www.bikesrepublic.com/featured/indian-ftr1200-custom-will-go-production/',
    'https://www.bikesrepublic.com/featured/reviewed-ktm-duke-200-and-rc200-the-fun-loving-siblings/',
    'https://www.bikesrepublic.com/features/geared-up/first-malaysian-superbikes-time-trial-2014/',
    'https://www.bikesrepublic.com/featured/2021-ktm-250-390-adventure-launched-in-malaysia-from-rm21500/',
    'https://www.bikesrepublic.com/featured/demak-shuts-down-what-could-have-happened-to-this-promising-malaysian-motorcycle-builder/',
    'https://www.bikesrepublic.com/featured/hog-pj-conducts-safe-rider-skills-program/',
    'https://www.bikesrepublic.com/featured/building-motorcycle-model-kits-a-great-hobby-for-the-bike-nut/',
    'https://www.bikesrepublic.com/news/bulletin/harley-davidson-kuala-lumpur-organises-sahur-ride/',
    "https://www.bikesrepublic.com/featured/lexmoto-lxs-125-lands-in-the-uk-under-tail-exhaust-lives-on/",
    "https://www.bikesrepublic.com/featured/spy-shot-2023-triumph-street-triple-gets-new-ohlins-front-fork/"
]

In [18]:
for i in list_remaining_urls:
    data = crawl_article(i)
    with open(f'bikesrepublic-scraped-data-fixed.jsonl', 'a') as final:
        json.dump(result, final)
        final.write('\n')

In [19]:
final_test = pd.read_json('bikesrepublic-scraped-data-fixed.jsonl', lines=True)
print(f"Num. of rows in dataframe: {len(final_test)}")

# check if there's any articles with no content
no_content = final_test[final_test['content'] == ""]
print(f"Num. of webpages with no content: {no_content}")
no_content.head(3)

Num. of rows in dataframe: 6969
Num. of webpages with no content: Empty DataFrame
Columns: [url, headline, content]
Index: []


Unnamed: 0,url,headline,content


In [20]:
final_test.head(3)

Unnamed: 0,url,headline,content
0,https://www.bikesrepublic.com/featured/motogp-...,"MotoGP: Marc Marquez back on a bike, poised fo...",\n\nInjured MotoGP star Marc Marquez is back o...
1,https://www.bikesrepublic.com/featured/ducati-...,Ducati Desmosedici GP17 – A Closer Look,\n\n\n\nThe latest Ducati Desmo GP17 is one of...
2,https://www.bikesrepublic.com/featured/2020-kt...,2020 KTM 890 Duke R Also Appears at EICMA 2019,\n2020 KTM 890 Duke R\n\n\nThe 2020 KTM 890 Du...
