In [1]:
import os 
import string 
import gc
from multiprocessing.pool import ThreadPool
from multiprocessing import Pool
from concurrent.futures import ThreadPoolExecutor

In [2]:
DATA_DIR = './Gutenberg_Text/'

In [3]:
def get_files(directory):
    file_list = []

    for root, _, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
            file_list.append(file_path)
        gc.collect()
    return file_list

def formatText(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    gc.collect()
    return text

def map_function(document):
    words = document.split()
    word_count_pairs = [(word, 1) for word in words]
    gc.collect()
    return word_count_pairs

def reduce_function(word_counts):
    word, counts = word_counts[0], word_counts[1:]
    total_count = sum(counts)
    return word, total_count

In [4]:
files = get_files(DATA_DIR)
len(files)

3467

In [24]:
for file in files:
    with open(file, 'r', encoding='latin-1') as f:
        content = f.read()
        content = formatText(content)
        print(len(content))

21928
875463
253693
605011
661950
408154
251134
130615
265844
755961
303145
456402
319960
360062
59125
10138
536523
716507
362005
353762
747468
596922
199318
342537
106531
184247
969909
357919
85097
188864
266022
494334
75261
643442
494620
557733
88159
221242
215434
231388
194819
27646
27221
33760
26392
165403
146144
25788
24736
23377
35148
25614
45704
26700
26101
23454
25048
34714
26092
20553
24654
22527
20446
237155
295808
140185
420830
207351
262077
285937
103962
89964
28188
84569
416913
331271
224622
396544
330318
128653
88872
138584
340611
358845
369256
425909
540102
320442
287646
399422
430417
29768
291662
492937
323542
371846
465296
260881
6123
275855
494658
534071
34873
69270
77406
701800
10150
507246
91202
1262431
594587
712287
82587
37895
239074
280305
585484
601179
526800
641993
728193
262374
170639
432838
1212184
1311888
1029522
473571
136376
477355
438966
467606
143739
477374
351641
228408
974739
1049068
425356
1209483
43919
423557
12130
528138
502545
687158
522411
505384


In [38]:
def execute(file):
    with open(file, 'r', encoding='latin-1') as f:
        content = f.read()
        content = formatText(content)
        gc.collect()
        return file, len(content)

In [43]:
def read_map(file):
  with open(file, 'r', encoding='latin-1') as f:
    content = f.read()
    content = formatText(content)
    return map_function(content)

In [39]:
pool = Pool(processes=12)

In [40]:
gc.collect()

0

In [44]:
mapped = pool.map(read_map, files)

Process ForkPoolWorker-88:
Process ForkPoolWorker-87:
Process ForkPoolWorker-86:
Process ForkPoolWorker-89:
Process ForkPoolWorker-84:
Process ForkPoolWorker-94:
Process ForkPoolWorker-93:
Process ForkPoolWorker-92:
Process ForkPoolWorker-90:
Process ForkPoolWorker-85:
Process ForkPoolWorker-91:
Process ForkPoolWorker-83:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, 

In [33]:
res = pool.map(execute, files)

In [34]:
for file, size in res:
    print(f'file: {file}, len: {size}')

file: ./Gutenberg_Text/.DS_Store, len: 21928
file: ./Gutenberg_Text/Conrad, Joseph/Romance.txt, len: 875463
file: ./Gutenberg_Text/Conrad, Joseph/Turgenev: A Study.txt, len: 253693
file: ./Gutenberg_Text/Conrad, Joseph/The Arrow of Gold: A Story Between Two Notes.txt, len: 605011
file: ./Gutenberg_Text/Conrad, Joseph/Victory: An Island Tale.txt, len: 661950
file: ./Gutenberg_Text/Conrad, Joseph/'Twixt Land & Sea: Tales.txt, len: 408154
file: ./Gutenberg_Text/Conrad, Joseph/The Book of the Homeless.txt, len: 251134
file: ./Gutenberg_Text/Conrad, Joseph/Gaspar Ruiz.txt, len: 130615
file: ./Gutenberg_Text/Conrad, Joseph/A Personal Record.txt, len: 265844
file: ./Gutenberg_Text/Conrad, Joseph/Chance: A Tale in Two Parts.txt, len: 755961
file: ./Gutenberg_Text/Conrad, Joseph/The End of the Tether.txt, len: 303145
file: ./Gutenberg_Text/Conrad, Joseph/Notes on Life & Letters.txt, len: 456402
file: ./Gutenberg_Text/Conrad, Joseph/The Nigger Of The "Narcissus": A Tale Of The Forecastle.txt, le

In [20]:
pool = ThreadPool(15)

In [21]:
res = pool.map_async(execute, files)

res.wait()
res = res.get()

In [None]:
for file, size in res:
    print(f'file: {file}, len: {size}')

In [23]:
executor = ThreadPoolExecutor(max_workers=15)

In [24]:
resultados = [executor.submit(execute, archivo) for archivo in files]

In [25]:
executor.shutdown()

In [26]:
resultados = [resultado.result() for resultado in resultados]

In [27]:
for archivo, num_caracteres in resultados:
    print(f"El archivo {archivo} tiene {num_caracteres} caracteres.")

El archivo ./Gutenberg_Text/.DS_Store tiene 21928 caracteres.
El archivo ./Gutenberg_Text/Conrad, Joseph/Romance.txt tiene 875463 caracteres.
El archivo ./Gutenberg_Text/Conrad, Joseph/Turgenev: A Study.txt tiene 253693 caracteres.
El archivo ./Gutenberg_Text/Conrad, Joseph/The Arrow of Gold: A Story Between Two Notes.txt tiene 605011 caracteres.
El archivo ./Gutenberg_Text/Conrad, Joseph/Victory: An Island Tale.txt tiene 661950 caracteres.
El archivo ./Gutenberg_Text/Conrad, Joseph/'Twixt Land & Sea: Tales.txt tiene 408154 caracteres.
El archivo ./Gutenberg_Text/Conrad, Joseph/The Book of the Homeless.txt tiene 251134 caracteres.
El archivo ./Gutenberg_Text/Conrad, Joseph/Gaspar Ruiz.txt tiene 130615 caracteres.
El archivo ./Gutenberg_Text/Conrad, Joseph/A Personal Record.txt tiene 265844 caracteres.
El archivo ./Gutenberg_Text/Conrad, Joseph/Chance: A Tale in Two Parts.txt tiene 755961 caracteres.
El archivo ./Gutenberg_Text/Conrad, Joseph/The End of the Tether.txt tiene 303145 carac