# Using Python's Multiprocessing Library, Part 1B

Frank Neugebauer
May 19, 2019

In [1]:
import pandas as pd
import os
import logging
import timeit
from sklearn.feature_extraction.text import TfidfVectorizer
from multiprocessing import Pool
from datetime import datetime

## 1B. Tokenize Articles

Created a function that tokenizes each of the articles from the previous step. Measured the time it takes to tokenize all of the articles for the different number of processes.

A separate log file is part created.

In [3]:
def start_logger():
    logging.basicConfig(filename ='./log/log_b_%s.log' %
                        datetime.strftime(datetime.now(), '%m%d%Y_%H%M%S'),
                        level = logging.DEBUG, format='%(asctime)s %(message)s', datefmt='%m-%d%H:%M:%S')

def read_json_directory_time(n_processes):
    logging.debug('In read_json_directory_time')
    TEST_CODE = 'articles = read_json_directory(' + str(n_processes) + ')'
    SETUP_CODE = '''from __main__ import read_json_directory'''
    time = timeit.timeit(TEST_CODE, setup=SETUP_CODE, number = 1)
    return time

def read_article_jsonl(file_paths):
    articles = []
    logging.debug('In read_article_json...')
    for file_path in file_paths:
        logging.debug('Reading the ' + file_path + ' file...')
        wiki_file_full = pd.read_json(file_path, lines=True)
        articles.append(wiki_file_full.to_dict())
    return articles

def read_json_directory(n_processes):
    WIKI_DIR = '../../data/wikipedia//featured-articles'
    logging.debug('In read_json_directory...')
    logging.debug('Building paths...')
    json_file_paths = [
        entry.path
        for entry in os.scandir(WIKI_DIR) if entry.name.endswith('.jsonl')
    ]
    logging.debug('Starting the pooling...')
    articles = read_article_jsonl(json_file_paths)
    logging.debug('There are ' + str(len(articles)) + ' dictionaries in the articles list.')
    logging.debug('Finished building the article dictionary.')

    with Pool(processes=n_processes) as pool:
        articles = pool.map(read_article_jsonl, json_file_paths) 
 
    return articles

def tokenize_time(n_processes):
    logging.debug('In tokenize_time...')
    SETUP_CODE = '''from __main__ import tokenize_documents, read_json_directory'''
    TEST_CODE = 'articles = read_json_directory(' + str(n_processes) + ');' + \
                'tokenize_documents(articles,' + str(n_processes) + ')'
    times = timeit.timeit(TEST_CODE, setup=SETUP_CODE, number = 1)
    return times

def tokenize_document(document):
    logging.debug('Inside tokenize_document (singlular)...')
    sklearn_tfidf = TfidfVectorizer(norm='l2',min_df=0, use_idf=True, smooth_idf=False,
                                    sublinear_tf=True, tokenizer=final_clean)
    tfidf_rep = sklearn_tfidf.fit_transform(document)
    tfidf_array = tfidf_rep.toarray()
    return tfidf_array

def strip_extras(this_text):
    logging.debug('Inside strip_extras...')
    return_str = []
    for text in this_text.values():
        the_string = text[0].replace('\n', '').replace('\'', '')
        return_str.append(the_string)

    return return_str

def final_clean(doc):
    logging.debug('Inside final_clean...')
    doc.lower().split(" ")
    return doc

def tokenize_documents(all_documents, n_processes):
    logging.debug('Inside tokenize_documents...')

    df = pd.DataFrame(all_documents)

    clean_string_list = []
    texts = df['section_texts']
    for text_dict in texts:
        clean_string_list.append(strip_extras(text_dict))

    text_strings = []
    for this_string_array in clean_string_list:
        text_strings.append(this_string_array[0])

    final_docs = []
    for doc in clean_string_list:
        final_docs.append(final_clean(doc[0]))

    tokens_np_array = tokenize_document(final_docs)
    logging.debug('There are ' + str(tokens_np_array.size) + ' tokens in the article.')

    with Pool(n_processes)as pool:
        tokenized_documents = pool.map(self.tokenize_document, all_documents)

if __name__ == '__main__':
    start_logger()
    logging.debug('Starting tokenization...')

    times = []
    n_proc = [1, 2, 4, 8, 16]
    for this_proc in n_proc:
        this_record = {}
        time = tokenize_time(this_proc)
        this_record['# Processes'] = this_proc
        this_record['Time to Process'] = round(time, 4)
        times.append(this_record)

    print("# Processes\tTime to Process")
    for i in times:
        print("{}\t\t{}".format(i['# Processes'],i['Time to Process']))

# Processes	Time to Process
1		1.7296
2		1.7208
4		1.7025
8		1.6892
16		1.6907
