In [None]:
import urllib.request
from bs4 import BeautifulSoup
import requests

import multiprocessing as mp
import concurrent.futures

import time
import re
import pandas as pd

In [None]:
def _parse_content(url, timeout=30):
    """Download *url* and return the response body as a parsed HTML tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds ``requests`` waits on the server before raising,
            so a stalled connection cannot block the caller indefinitely.

    Returns:
        A ``BeautifulSoup`` object built from the response bytes with the
        built-in ``html.parser``.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    headers = {'User-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0'}
    # The dict has to be passed by keyword: the second positional argument
    # of ``requests.get`` is ``params``, which would append the User-agent
    # to the query string instead of sending it as a request header.
    response = requests.get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(response.content, 'html.parser')

def _get_article_b4(url_category, max_pages=5):
    """Collect article links from the first listing pages of one category.

    Args:
        url_category: Sequence whose first item is the category URL and
            whose second item is the category name. ``page/<n>/`` is
            appended to the URL as-is, so it is expected to end with ``/``.
        max_pages: How many listing pages to walk, starting at page 1.

    Returns:
        A ``pandas.DataFrame`` with the columns ``"Article Url"`` (one row
        per link found) and ``"Category"`` (the category name repeated on
        every row).
    """
    url = url_category[0]
    category = url_category[1]
    article_urls = []
    for pg in range(1, max_pages + 1):
        content = _parse_content(f"{url}page/{pg}/")
        page = content.find('div', {'class': 'td-container td-category-container'})
        if page is None:
            # ``find`` returns None when the container is absent --
            # presumably the category has fewer pages than requested.
            # Keep what was gathered instead of raising AttributeError.
            break
        if pg == 1:
            # The featured grid is only read on the first page; its links
            # go ahead of the regular listing.
            grid = page.find('div', {'class': 'td-big-grid-wrapper'})
            if grid is not None:
                article_urls.extend(
                    item.get('href')
                    for item in grid.find_all('a', {'class': 'td-image-wrap'})
                )
        listing = page.find('div', {'class': 'td-ss-main-content'})
        if listing is not None:
            article_urls.extend(
                item.get('href')
                for item in listing.find_all('a', {'class': 'td-image-wrap'})
            )
    # The scalar category is broadcast across every collected URL.
    return pd.DataFrame.from_dict({"Article Url": article_urls, "Category": category})

# Lower-cased header-menu labels that are left out of the scrape.
_EXCLUDED_CATEGORIES = frozenset([
    'markets', 'buying rates', 'foreign interest rates', 'philippine mutual funds',
    'leaders and laggards', 'stock quotes', 'stock markets summary',
    'non-bsp convertible currencies', 'bsp convertible currencies', 'us commodity futures',
    'health',
])


def _list_categories(news_url):
    """Return ``[category_url, category_name]`` pairs from the header menu.

    The links are read from the ``<ul id="menu-header-menu-1">`` element of
    *news_url*; entries whose lower-cased text is in
    ``_EXCLUDED_CATEGORIES`` are dropped and the names are title-cased.
    """
    content = _parse_content(news_url)
    menu_links = content.find('ul', {'id': 'menu-header-menu-1'}).find_all('a')
    return [
        [link.get('href'), link.text.title()]
        for link in menu_links
        if link.text.lower() not in _EXCLUDED_CATEGORIES
    ]


def _stack_frames(frames):
    """Concatenate per-category frames row-wise; empty input gives an empty frame."""
    frames = list(frames)
    if not frames:
        # ``pd.concat`` raises ValueError on an empty list.
        return pd.DataFrame()
    # One concat at the end avoids re-copying the accumulated rows on
    # every iteration.
    return pd.concat(frames, axis=0)


def seq_main(news_url):
    """Scrape every non-excluded category of *news_url* one after another.

    Args:
        news_url: URL of the page whose header menu lists the categories.

    Returns:
        A ``pandas.DataFrame`` stacking the result of ``_get_article_b4``
        for each category, in menu order.

    Raises:
        AttributeError: If the header menu element is not found in the page.
    """
    return _stack_frames(
        _get_article_b4(url_category) for url_category in _list_categories(news_url)
    )


def par_main(news_url):
    """Scrape every non-excluded category of *news_url* using a thread pool.

    Args:
        news_url: URL of the page whose header menu lists the categories.

    Returns:
        A ``pandas.DataFrame`` stacking the result of ``_get_article_b4``
        for each category, in menu order.

    Raises:
        AttributeError: If the header menu element is not found in the page.
    """
    url_categories = _list_categories(news_url)
    with concurrent.futures.ThreadPoolExecutor() as pool:
        # ``map`` yields results in submission order, so rows come out in
        # the same order as ``seq_main`` regardless of which category
        # finishes first.
        frames = list(pool.map(_get_article_b4, url_categories))
    return _stack_frames(frames)

# Site root passed to seq_main / par_main as the page to read categories from.
newsWeb = 'https://www.bworldonline.com/'
# Folder the benchmark cells below write their CSV results into.
# NOTE(review): absolute, machine-specific Windows path -- presumably it has
# to exist before the cells run; confirm or adjust per machine.
dir_output = r'C:\Users\ojell\Desktop\Oj\_Thesis\Data\news\businessworld'

In [None]:
# Time the sequential scraper. perf_counter is the clock intended for
# measuring intervals; time.time() follows the wall clock and can jump if
# the system time is adjusted mid-run.
start1 = time.perf_counter()
test1 = seq_main(newsWeb)
test1.to_csv(dir_output + r'\test1.csv', index=False)
# Elapsed seconds, including the CSV write above.
seq_time = time.perf_counter() - start1
print(f"Sequential Output: {len(test1)}")
print(f"Sequential elapsed time: {round(seq_time/60, 2)} mins")

In [None]:
# Time the thread-pool scraper. perf_counter is the clock intended for
# measuring intervals; time.time() follows the wall clock and can jump if
# the system time is adjusted mid-run.
start2 = time.perf_counter()
test2 = par_main(newsWeb)
test2.to_csv(dir_output + r'\test2.csv', index=False)
# Elapsed seconds, including the CSV write above.
par_time = time.perf_counter() - start2
print(f"Parallel Output: {len(test2)}")
print(f"Parallel elapsed time: {round(par_time/60, 2)} mins")

In [None]:
# Ratio of the sequential run time to the parallel run time.
speedup = seq_time / par_time
print(f"Speedup: {speedup}")
# Speedup as a percentage of the machine's CPU count.
# NOTE(review): the divisor is the CPU count, not the number of threads
# the pool actually used -- confirm this is the intended efficiency metric.
efficiency = 100 * speedup / mp.cpu_count()
print(f"Efficiency: {efficiency}")

### 