In [143]:
from bs4 import BeautifulSoup
import pandas as pd
import json
import requests
from tqdm import tqdm
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import numpy as np
import os

### 1.0 Get a list of webpages to scrape

In [3]:
sections = [
"lifestyle",
"beauty",
"book",
"resepi"
]

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36'} 

In [66]:
def crawl(url):
    while True:
        try:
            r = requests.get(url,headers = headers)
            break
        except Exception as e:
            print(e)
            time.sleep(1.0)

    soup = BeautifulSoup(r.text, "lxml")

    block = soup.find('div',attrs = {"class":"main section"})
    
    if block is None:
        return

    for link in block.find_all('h3', attrs = {"class":"post-title entry-title"}):
        try:
            href = link.find('a').get('href')
            hrefs.append(href)
        except:
            pass

In [55]:
"""
This website doesn't seem to have a page number. That's troublesome.
Instead, I'll first scrape every first page (e.g., lifestyle, beauty, etc.)
and 'OLDER POSTS' link (if it exists) at the end of every page.
"""
pages = []
i = 0
for t in sections:
    print(t)
    url = f'https://www.leaazleeya.com/search/label/{t}'
    pages.append(url)
    while t is not None:
        r = requests.get(url)
        soup = BeautifulSoup(r.content, "lxml")
        try:
            a = soup.find('a', attrs={'class': 'blog-pager-older-link'}).get('href')
        except:
            a = None
        if a is not None:
            url = a
            pages.append(url)
            i +=1
        else:
            break

print(f'Links collected: {i}')

lifestyle
beauty
book
resepi
Links collected: 52


In [73]:
max_worker = 10

hrefs = []
for t in pages:
    r = requests.get(t)
    soup = BeautifulSoup(r.content, "lxml")
    a = soup.find_all('a')
    a = [a_.get('href') for a_ in a if a_.get('href')]
    
    # for i in tqdm(range(1, len(pages) + 1)):
    with ThreadPoolExecutor(max_workers=max_worker) as executor:
        futures = {executor.submit(crawl, t): t for t in pages}
    
    for future in as_completed(futures):
        future.result()

hrefs2 = list(set(hrefs))
print(f'Num. of unique links: {len(hrefs2)}')
with open(f'leaazleeya-link.json', 'a') as f:
    json.dump(hrefs2, f)

Num. of unique links: 544


### 2.0 Get webpage content (headers, paragraphs, links, etc.)

In [130]:
url = []
with open(f'leaazleeya-link.json') as fopen:
    href = json.load(fopen)
url.extend(href)

In [132]:
def process_url(x):
    
    while True:
        try:
            r = requests.get(x, headers=headers)
            break
        except Exception as e:
            print(e)
            time.sleep(5.0)
    
    soup = BeautifulSoup(r.text, "lxml")

    try:
        headline = soup.find('h3', class_="post-title entry-title").text
        h = soup.find('div', class_="post-body entry-content")

        # KIV
        content = [b.text for b in h.find_all("b")] + [br.text for br in h.find_all("br")] \
                  + [span.text for span in h.find_all("span")]

    except Exception as e:
        print('error in link:'+ x)
        print(e)
        return None

    data = {'url': x, 'headline': headline, 'content': content}
    return data

In [142]:
batch1 = url[0:49]
batch2 = url[50:99]
batch3 = url[100:149]
batch4 = url[150:199]
batch5 = url[200:249]
batch6 = url[250:299]
batch7 = url[300:349]
batch8 = url[350:399]
batch9 = url[400:449]
batch10 = url[450:499]
batch11 = url[500:543]

batches = [batch1, batch2, batch3, batch4, batch5, batch6, batch7,
        batch8, batch9, batch10, batch11]

In [138]:
max_workers = 5

for urls in batches:

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_url, x) for x in urls]
        
        for future in tqdm(futures, total=len(urls)):
            result = future.result()
            if result:
                with open(f'leaazleeya-links-batch-{urls}.jsonl', 'a') as f:
                    json.dump(result, f)
                    f.write('\n')

100%|██████████| 49/49 [00:11<00:00,  4.38it/s]
