In [None]:
import concurrent.futures as cf
import glob
import os
from multiprocessing import Process, Manager
from time import time

from Indexer import doc2tokens

def inverter(chunk, d):
    """
    Add one document to the inverted index, where key = token
    and value = [fname1, fname2,...].

    Each distinct token of the document is looked up in ``d`` and the
    document name is appended to that token's list unless it is already
    there.

    parameters
    ----------
    chunk : tuple
        consisting of document name and its corresponding
        list of tokens
    d : dict
        mapping of token -> list of document names; the callers in this
        file pass a dict created by a multiprocessing manager. Every
        token in ``chunk`` must already be a key.

    raises
    ------
    KeyError
        if a token of the document is not a key of ``d``
    """
    print("pid: ", os.getpid())

    fname, tokens = chunk[0], chunk[1]

    # dict.fromkeys drops repeated tokens while keeping first-seen order,
    # so a word that occurs many times in the document is handled once.
    for token in dict.fromkeys(tokens):
        # Look the list up a single time; with a manager dict every
        # d[token] access is a round trip to the manager process.
        postings = d[token]
        if fname not in postings:  # skip if fname has been indexed already
            postings.append(fname)

In [None]:
# glob returns matches in arbitrary order; sort so that the documents are
# processed in the same order on every run.
ls_fnames = sorted(glob.glob("corpus/*"))

In [None]:
# create the forward index: a dict where k=fname, v=[token1, token2,...]
# NOTE(review): doc2tokens comes from the project-local Indexer module; the
# dict shape is assumed from how regdex.values()/.items() are used below.
regdex = doc2tokens(ls_fnames)

In [None]:
# create inverted index using a shared dict object
start = time()

# use a manager to create shared objects
manager = Manager()
d = manager.dict()  # https://stackoverflow.com/questions/6832554/python-multiprocessing-how-do-i-share-a-dict-among-multiple-processes

# populate inverted index with keys, where k=tokens
ls_tokens = list(regdex.values())

# Build the vocabulary across all documents first, so that a token shared by
# many documents gets exactly one shared list instead of being re-created
# (and overwritten) once per document.
vocabulary = set().union(*ls_tokens)

for token in vocabulary:
    d[token] = manager.list()  # all shared objects must be created by manager  https://stackoverflow.com/questions/8640367/python-manager-dict-in-multiprocessing
print("execution", time() - start)

In [None]:
n_workers = 2

# update inverted index by adding fnames to the corresponding keys
with cf.ProcessPoolExecutor(max_workers=n_workers) as executor:
    futures = []
    for chunk in regdex.items():  # chunk = (fname, tokens)
        futures.append(executor.submit(inverter, chunk, d))
        print("processing", chunk[0])

    # Collect every result so that an exception raised inside a worker is
    # re-raised here instead of being silently dropped with the future.
    for future in futures:
        future.result()

In [None]:
# show the documents indexed under the token "the"
# (raises KeyError if "the" is not a key of d)
print(d["the"])