# Using Python's Multiprocessing Library, Part 1A

Frank Neugebauer
May 19, 2019

In [1]:
import pandas as pd
import os
import logging
import timeit
from multiprocessing import Pool
from datetime import datetime

## 1A. Read JSONL Files

The featured article data is spread across multiple files on disk. The following code contains a function that takes a list of JSONL file paths as input, loads the JSON records in each file, and returns a list of Python dictionaries (one per file) as a result.

Note: security restrictions on your workstation may limit the functionality of this code.

In [2]:
def start_logger():
    """Configure root logging to write DEBUG records to a timestamped file.

    The log file lives under ``./log`` and is named
    ``log_a_<MMDDYYYY_HHMMSS>.log`` from the current local time. The
    directory is created first if it does not already exist.
    """
    log_dir = './log'
    # basicConfig opens the log file immediately, so a missing directory
    # would raise FileNotFoundError; make sure it exists up front.
    os.makedirs(log_dir, exist_ok=True)
    stamp = datetime.now().strftime('%m%d%Y_%H%M%S')
    logging.basicConfig(
        filename='%s/log_a_%s.log' % (log_dir, stamp),
        level=logging.DEBUG,
        format='%(asctime)s %(message)s',
        # Keep a space between the date and the time; without it the two
        # ran together in every record (e.g. "05-1914:03:22").
        datefmt='%m-%d %H:%M:%S',
    )

def read_json_directory_time(n_processes):
    """Time one call to ``read_json_directory`` with *n_processes* workers.

    Args:
        n_processes: Pool size forwarded to ``read_json_directory``.

    Returns:
        The elapsed time in seconds for a single run, as measured by
        ``timeit.timeit``.
    """
    logging.debug('In read_json_directory_time')
    # Hand timeit a callable instead of a source string: there is no code
    # assembled by string concatenation, and the measurement no longer
    # depends on this module being importable as ``__main__``.
    elapsed = timeit.timeit(lambda: read_json_directory(n_processes), number=1)
    return elapsed

def read_article_jsonl(file_paths):
    """Load JSONL files into dictionaries, one dictionary per file.

    Args:
        file_paths: An iterable of JSONL file paths, or a single path
            (``str`` or ``os.PathLike``). A single path is treated as a
            one-element list so the function can be used directly as a
            ``Pool.map`` worker over a list of paths.

    Returns:
        A list containing ``DataFrame.to_dict()`` for each file, in input
        order. An empty input yields an empty list.
    """
    # A bare string is itself iterable; looping over it would try to read
    # one "file" per character, so wrap single paths explicitly.
    if isinstance(file_paths, (str, os.PathLike)):
        file_paths = [file_paths]

    articles = []
    logging.debug('In read_article_json...')
    for file_path in file_paths:
        logging.debug('Reading the %s file...', file_path)
        wiki_file_full = pd.read_json(file_path, lines=True)
        articles.append(wiki_file_full.to_dict())
    return articles

def read_json_directory(n_processes):
    """Load every ``.jsonl`` file in the featured-articles directory in parallel.

    The files are distributed over a ``multiprocessing.Pool``; nothing is
    read serially beforehand, so timing this function measures the pooled
    read only.

    Args:
        n_processes: Number of worker processes in the pool.

    Returns:
        A flat list with one dictionary per ``.jsonl`` file (as produced by
        ``read_article_jsonl``), ordered by file path.
    """
    WIKI_DIR = '../../data/wikipedia//featured-articles'
    logging.debug('In read_json_directory...')
    logging.debug('Building paths...')
    with os.scandir(WIKI_DIR) as entries:
        # Sorted so the result order does not depend on directory order.
        json_file_paths = sorted(
            entry.path for entry in entries if entry.name.endswith('.jsonl')
        )

    logging.debug('Starting the pooling...')
    # read_article_jsonl iterates over its argument, so each task is a
    # one-element list; a bare path string would be walked character by
    # character.
    tasks = [[path] for path in json_file_paths]
    with Pool(processes=n_processes) as pool:
        per_file_results = pool.map(read_article_jsonl, tasks)

    # Each worker call returns a list; flatten to one dictionary per file.
    articles = [article for chunk in per_file_results for article in chunk]
    logging.debug('There are ' + str(len(articles)) + ' dictionaries in the articles list.')
    logging.debug('Finished building the article dictionary.')

    return articles

if __name__ == '__main__':
    # Entry point: benchmark the directory read for several pool sizes and
    # print a small tab-separated table of the results.
    start_logger()
    logging.debug('Starting read_json_directory...')

    n_proc = [1, 2, 4, 8, 16]
    times = []
    for worker_count in n_proc:
        elapsed = read_json_directory_time(worker_count)
        # One record per pool size; elapsed seconds rounded to 4 places.
        times.append({
            '# Processes': worker_count,
            'Time to Process': round(elapsed, 4),
        })

    print("# Processes\tTime to Process")
    for record in times:
        processes = record['# Processes']
        seconds = record['Time to Process']
        print("{}\t\t\t{}".format(processes, seconds))


# Processes	Time to Process
1			1.8371
2			1.6708
4			1.669
8			1.6643
16			1.6483
