In [1]:
import asyncio
import glob
import time
from collections import defaultdict, Counter
from concurrent.futures import ThreadPoolExecutor
from pprint import pprint
from typing import List

import aiofiles
%load_ext memory_profiler

In [2]:
txt_files = glob.glob("files/*.txt")
many_txt_files = glob.glob("many_files/*.txt")
stop_words = ['the', 'to', 'and', 'of', 'a', 'an', 'at', 'as', 'on', 'in', 'o']
TOP_WORDS_COUNT = 20
CASE_SENSITIVITY = True

In [3]:
def clean_word(word: str, case_sensitivity: bool = CASE_SENSITIVITY) -> str:
    """ Очищает слово от оставшихся символов """
    char_to_clean = '”—’“,.!?"»«:;^()\\-…‘*?\''
    word = word.strip(char_to_clean)
    return word if case_sensitivity else word.lower()

In [4]:
def filter_words(word:str) -> bool:
    return word and word not in stop_words

In [5]:
def get_words_from_file(file_name: str) -> List[str]:
    return filter(filter_words, map(clean_word, open(file_name, 'r').read().split()))

In [6]:
def counter_on_defaultdict(file_list: List[str], verbose: bool = True):
    word_counts = defaultdict(int)
    
    words = (word for file_name in file_list for word in get_words_from_file(file_name))
    for word in words:
        word_counts[word] += 1

    sorted_dict = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)
    if verbose:
        pprint(sorted_dict[:TOP_WORDS_COUNT])

In [7]:
def counter_on_counter(file_list: List[str], verbose: bool = True):
    words = (word for file_name in file_list for word in get_words_from_file(file_name))
    word_counts = Counter(words)
    if verbose:
        pprint(word_counts.most_common(TOP_WORDS_COUNT))

In [8]:
def counter_on_threads(file_list: List[str], verbose: bool = True):
    word_counts = Counter()
    with ThreadPoolExecutor() as executor:
        results = executor.map(get_words_from_file, file_list)
        for value in results: 
            word_counts += Counter(value)
    if verbose:
        pprint(word_counts.most_common(TOP_WORDS_COUNT))

In [9]:
async def counter_on_aiofiles(file_list: List[str]): 
    words = []
    for file_name in file_list:
        async with aiofiles.open(file_name, mode='r') as f:
            content = await f.read()
            words += filter(filter_words, map(clean_word, content.split()))
    word_counts = Counter(words)

    pprint(word_counts.most_common(TOP_WORDS_COUNT))

*Short file list*


In [10]:
%time counter_on_defaultdict(txt_files)
%memit counter_on_defaultdict(txt_files, False)

[('Harry', 16636),
 ('was', 15506),
 ('he', 14748),
 ('said', 14430),
 ('his', 13720),
 ('it', 10590),
 ('I', 10102),
 ('you', 10049),
 ('had', 10048),
 ('that', 9475),
 ('him', 6932),
 ('with', 6358),
 ('Ron', 5770),
 ('He', 5452),
 ('for', 5367),
 ('her', 5295),
 ('Hermione', 4956),
 ('not', 4944),
 ('be', 4427),
 ('they', 4394)]
CPU times: user 1.04 s, sys: 27.7 ms, total: 1.07 s
Wall time: 1.09 s
peak memory: 76.14 MiB, increment: 17.73 MiB


In [11]:
%time counter_on_counter(txt_files)
%memit counter_on_counter(txt_files, False)

[('Harry', 16636),
 ('was', 15506),
 ('he', 14748),
 ('said', 14430),
 ('his', 13720),
 ('it', 10590),
 ('I', 10102),
 ('you', 10049),
 ('had', 10048),
 ('that', 9475),
 ('him', 6932),
 ('with', 6358),
 ('Ron', 5770),
 ('He', 5452),
 ('for', 5367),
 ('her', 5295),
 ('Hermione', 4956),
 ('not', 4944),
 ('be', 4427),
 ('they', 4394)]
CPU times: user 959 ms, sys: 19.5 ms, total: 978 ms
Wall time: 986 ms
peak memory: 77.10 MiB, increment: 16.89 MiB


In [12]:
%time counter_on_threads(txt_files)
%memit counter_on_threads(txt_files, False)

[('Harry', 16636),
 ('was', 15506),
 ('he', 14748),
 ('said', 14430),
 ('his', 13720),
 ('it', 10590),
 ('I', 10102),
 ('you', 10049),
 ('had', 10048),
 ('that', 9475),
 ('him', 6932),
 ('with', 6358),
 ('Ron', 5770),
 ('He', 5452),
 ('for', 5367),
 ('her', 5295),
 ('Hermione', 4956),
 ('not', 4944),
 ('be', 4427),
 ('they', 4394)]
CPU times: user 1.01 s, sys: 52.5 ms, total: 1.07 s
Wall time: 1.07 s
peak memory: 134.29 MiB, increment: 73.81 MiB


In [13]:
from datetime import datetime
start = datetime.now()
await(counter_on_aiofiles(txt_files))
print(f'Execution time: {datetime.now()-start} s')

[('Harry', 16636),
 ('was', 15506),
 ('he', 14748),
 ('said', 14430),
 ('his', 13720),
 ('it', 10590),
 ('I', 10102),
 ('you', 10049),
 ('had', 10048),
 ('that', 9475),
 ('him', 6932),
 ('with', 6358),
 ('Ron', 5770),
 ('He', 5452),
 ('for', 5367),
 ('her', 5295),
 ('Hermione', 4956),
 ('not', 4944),
 ('be', 4427),
 ('they', 4394)]
Execution time: 0:00:00.971988 s


*Long file list*

In [14]:
%time counter_on_defaultdict(many_txt_files)
%memit counter_on_defaultdict(many_txt_files, False)

[('Harry', 266176),
 ('was', 248096),
 ('he', 235968),
 ('said', 230880),
 ('his', 219520),
 ('it', 169440),
 ('I', 161632),
 ('you', 160784),
 ('had', 160768),
 ('that', 151600),
 ('him', 110912),
 ('with', 101728),
 ('Ron', 92320),
 ('He', 87232),
 ('for', 85872),
 ('her', 84720),
 ('Hermione', 79296),
 ('not', 79104),
 ('be', 70832),
 ('they', 70304)]
CPU times: user 16.9 s, sys: 219 ms, total: 17.1 s
Wall time: 17.4 s
peak memory: 82.50 MiB, increment: 19.87 MiB


In [15]:
%time counter_on_counter(many_txt_files)
%memit counter_on_counter(many_txt_files, False)

[('Harry', 266176),
 ('was', 248096),
 ('he', 235968),
 ('said', 230880),
 ('his', 219520),
 ('it', 169440),
 ('I', 161632),
 ('you', 160784),
 ('had', 160768),
 ('that', 151600),
 ('him', 110912),
 ('with', 101728),
 ('Ron', 92320),
 ('He', 87232),
 ('for', 85872),
 ('her', 84720),
 ('Hermione', 79296),
 ('not', 79104),
 ('be', 70832),
 ('they', 70304)]
CPU times: user 16.5 s, sys: 247 ms, total: 16.8 s
Wall time: 17.3 s
peak memory: 80.89 MiB, increment: 17.77 MiB


In [16]:
%time counter_on_threads(many_txt_files)
%memit counter_on_threads(many_txt_files, False)

[('Harry', 266176),
 ('was', 248096),
 ('he', 235968),
 ('said', 230880),
 ('his', 219520),
 ('it', 169440),
 ('I', 161632),
 ('you', 160784),
 ('had', 160768),
 ('that', 151600),
 ('him', 110912),
 ('with', 101728),
 ('Ron', 92320),
 ('He', 87232),
 ('for', 85872),
 ('her', 84720),
 ('Hermione', 79296),
 ('not', 79104),
 ('be', 70832),
 ('they', 70304)]
CPU times: user 18.4 s, sys: 1.09 s, total: 19.5 s
Wall time: 21 s
peak memory: 1296.29 MiB, increment: 1231.40 MiB


In [17]:
from datetime import datetime
start = datetime.now()
await(counter_on_aiofiles(many_txt_files))
print(f'Execution time: {datetime.now()-start} s')

[('Harry', 266176),
 ('was', 248096),
 ('he', 235968),
 ('said', 230880),
 ('his', 219520),
 ('it', 169440),
 ('I', 161632),
 ('you', 160784),
 ('had', 160768),
 ('that', 151600),
 ('him', 110912),
 ('with', 101728),
 ('Ron', 92320),
 ('He', 87232),
 ('for', 85872),
 ('her', 84720),
 ('Hermione', 79296),
 ('not', 79104),
 ('be', 70832),
 ('they', 70304)]
Execution time: 0:00:17.591065 s
