In [2]:
import os, math, json, time, random, pandas as pd
from pprint import pprint
import concurrent.futures

In [3]:
def spider(data, max_threads=2):
    """Split one worker's chunk into per-thread sub-chunks and report their sizes.

    Nothing is crawled here: the function only works out how the items handed
    to a single worker would be divided between its threads.

    Args:
        data: Sized sequence (e.g. a list of URLs) assigned to this worker.
        max_threads: Upper bound on the number of sub-chunks. Defaults to 2,
            the value that used to be hard-coded.

    Returns:
        list[int]: Size of each sub-chunk, in order. Empty when ``data`` is
        empty.

    Raises:
        ValueError: If ``max_threads`` is smaller than 1.
    """
    if max_threads < 1:
        raise ValueError("max_threads must be at least 1")

    total = len(data)
    if total == 0:
        # ceil(0 / n) is 0, and a zero step makes range() raise ValueError.
        return []

    # Smallest integer >= total / max_threads, so at most max_threads chunks.
    step_n = math.ceil(total / max_threads)

    # Every chunk holds step_n items except possibly the last, shorter one.
    return [min(step_n, total - start) for start in range(0, total, step_n)]

In [4]:
def main(system_data, WORKERS):
    """Chunk the input across worker processes, run ``spider`` on each chunk and collect the results.

    Args:
        system_data: Sliceable, sized sequence of items (URLs) to distribute.
        WORKERS: Maximum number of worker processes; must be at least 1.

    Returns:
        tuple: ``(len(system_data), workers_used, spiders)`` where ``spiders``
        holds one ``spider`` return value per chunk, in chunk order. For empty
        input this is ``(0, 0, [])`` and no process pool is started.

    Raises:
        ValueError: If ``WORKERS`` is smaller than 1.
    """
    if WORKERS < 1:
        raise ValueError("WORKERS must be at least 1")

    total = len(system_data)

    # Chunk size for splitting the URL(s). Empty input would give a size of 0,
    # which is an invalid range() step, so fall back to 1 (yielding no chunks).
    divisible_n = math.ceil(total / WORKERS) if total else 1
    data = [system_data[i:i + divisible_n]
            for i in range(0, total, divisible_n)]  # Chunking URL(s)

    # Instantiating MAX WORKERS as the lower of the chunk count and WORKERS
    MAX_WORKERS = min(len(data), WORKERS)

    # LOGGING ---------------------------
    print("Total URL(s): {}".format(total))
    print("Total Spider(s) / Worker(s): {}".format(MAX_WORKERS))
    print("Chunked Data for multiprocessing: ",
          [len(chunk) for chunk in data])

    if not data:
        # ProcessPoolExecutor rejects max_workers=0; there is nothing to run.
        return total, MAX_WORKERS, []

    # MULTI PROCESSING OF SPIDER FUNCTION
    # NOTE(review): process pools need the submitted function to be importable
    # by the child processes; with spider defined in a notebook cell this may
    # fail under the "spawn" start method - confirm on the target platform.
    with concurrent.futures.ProcessPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = [executor.submit(spider, chunk) for chunk in data]
        # Getting the return value of every spider, in submission order
        spiders = [future.result() for future in futures]

    return total, MAX_WORKERS, spiders

In [5]:
def get_rows(spiders):
    """Build the "Thread N" labels for the per-thread counts returned by the workers.

    Args:
        spiders: List with one entry per worker; each entry is that worker's
            list of per-thread counts.

    Returns:
        list[str]: "Thread 1" .. "Thread K", where K is the length of the
        longest per-worker list. Empty when ``spiders`` is empty.
    """
    # Size the labels by the widest worker rather than by the first one:
    # workers may report different numbers of threads, and an empty result
    # list must not raise IndexError.
    widest = max((len(counts) for counts in spiders), default=0)
    return ["Thread {}".format(i) for i in range(1, widest + 1)]

In [6]:
def get_headers(total_workers):
    """Return the labels "Worker 1" through "Worker <total_workers>".

    Args:
        total_workers: Number of labels to generate.

    Returns:
        list[str]: One label per worker, numbered from 1. Empty when
        ``total_workers`` is less than 1.
    """
    worker_numbers = range(1, total_workers + 1)
    return ["Worker {}".format(number) for number in worker_numbers]

In [7]:
def execute_parser():
    """Load the test articles, distribute them with ``main`` and print a summary table.

    Reads 'test-data/test-articles.txt' (one article per line), repeats the
    list DATA times to build the workload, passes it to ``main`` and prints a
    table with one column per worker and one row per thread.

    Returns:
        None. Prints "No Data" and returns early when the file contains no
        non-blank lines.
    """
    DATA = 500  # repetition factor applied to the list of articles

    # The context manager closes the file handle once the content is read.
    with open('test-data/test-articles.txt') as source:
        raw_lines = source.read().split('\n')

    # Skip blank lines: ''.split('\n') yields [''], so without this filter an
    # empty file (or a trailing newline) would count as an article and the
    # "No Data" check below could never trigger.
    stripped = (line.strip() for line in raw_lines)
    articles = [line for line in stripped if line] * DATA
    if len(articles) == 0:
        print("No Data")
        # Return instead of exit(0) so an interactive session is not shut down.
        return

    # Fixed worker count; the original note suggested os.cpu_count() - 2.
    WORKERS = 14
    _total_data, total_workers, spiders = main(articles, WORKERS)
    headers = get_headers(total_workers)
    fields = get_rows(spiders)
    data_set = pd.DataFrame(spiders, index=headers, columns=fields)
    # Transposed so that workers become columns and threads become rows.
    print(data_set.T)

In [8]:
# Run the whole pipeline; the chunking summary and the per-worker table are printed below.
execute_parser()

Total URL(s): 500
Total Spider(s) / Worker(s): 14
Chunked Data for multiprocessing:  [36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 32]
          Worker 1  Worker 2  Worker 3  Worker 4  Worker 5  Worker 6  \
Thread 1        18        18        18        18        18        18   
Thread 2        18        18        18        18        18        18   

          Worker 7  Worker 8  Worker 9  Worker 10  Worker 11  Worker 12  \
Thread 1        18        18        18         18         18         18   
Thread 2        18        18        18         18         18         18   

          Worker 13  Worker 14  
Thread 1         18         16  
Thread 2         18         16  
