In [1]:
from collections import Counter
import multiprocessing as mp
import random
import nltk
from tqdm.auto import tqdm
from pos_noise import POSNoise

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Make sure the Brown corpus is available in the local NLTK data directory
# (the call reports "already up-to-date" when nothing needs fetching).
nltk.download('brown')

[nltk_data] Downloading package brown to /home/dniko/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [3]:
# Single shared POSNoise instance. process_sentence() below reads this
# module-level name, so it must exist before the worker pool is created.
pos_noise = POSNoise()

In [4]:
# Materialize the corpus reader's sentences into a plain list so the data can
# be indexed, measured with len(), and handed to the worker pool.
brown = nltk.corpus.brown
brown_sentences = [*brown.sents()]

In [5]:
# Peek at the first entry: each sentence is a list of token strings.
brown_sentences[0]

['The',
 'Fulton',
 'County',
 'Grand',
 'Jury',
 'said',
 'Friday',
 'an',
 'investigation',
 'of',
 "Atlanta's",
 'recent',
 'primary',
 'election',
 'produced',
 '``',
 'no',
 'evidence',
 "''",
 'that',
 'any',
 'irregularities',
 'took',
 'place',
 '.']

In [6]:
# Too slow without multiprocessing
# brown_pos_noised = list(map(pos_noise.apply_noise, brown_sentences))

In [7]:
def process_sentence(sentence):
    """
    Apply POS noise to a single sentence.

    Exists as a named top-level function so it can be handed to the
    multiprocessing pool as the per-item callable.

    Args:
        sentence: one item of ``brown_sentences`` (a list of token strings).

    Returns:
        Whatever ``pos_noise.apply_noise`` returns for that sentence.
    """
    noised_sentence = pos_noise.apply_noise(sentence)
    return noised_sentence

In [8]:
# Leave two cores free for the notebook kernel and the OS, but never go below
# one worker: on a 1- or 2-core machine the bare subtraction yields 0 or -1,
# and mp.Pool refuses a process count smaller than 1.
num_processes = max(1, mp.cpu_count() - 2)
num_processes

14

In [9]:
# Fan the sentences out over the worker pool and collect every noised sentence.
# imap_unordered yields results in completion order, so `results` is NOT
# aligned index-by-index with `brown_sentences`.
with mp.Pool(processes=num_processes) as pool:
    noised_stream = pool.imap_unordered(process_sentence, brown_sentences)
    progress = tqdm(
        noised_stream,
        total=len(brown_sentences),
        desc="Processing",
    )
    # Drain the iterator inside the `with` block, while the pool is still alive.
    results = list(progress)

Processing: 100%|██████████| 57340/57340 [00:59<00:00, 959.96it/s] 


In [10]:
# Frequency table over every symbol in the noised corpus: flatten the noised
# sentences into one stream of symbols and count them in a single pass.
pos_counts = Counter(
    symbol
    for noised_sentence in results
    for symbol in noised_sentence
)
pos_counts.most_common()

[('NOUN', 224314),
 ('VERB', 77277),
 ('ADJ', 74940),
 ('the', 69971),
 (',', 58334),
 ('.', 49346),
 ('PROPN', 39964),
 ('of', 36412),
 ('and', 28853),
 ('to', 26158),
 ('a', 23195),
 ('in', 21337),
 ('ADV', 15550),
 ('that', 10594),
 ('is', 10109),
 ('was', 9815),
 ('NUM', 9803),
 ('he', 9548),
 ('for', 9489),
 ('``', 8837),
 ("''", 8789),
 ('it', 8760),
 ('with', 7289),
 ('as', 7253),
 ('his', 6996),
 ('on', 6741),
 ('be', 6377),
 (';', 5566),
 ('at', 5372),
 ('by', 5306),
 ('i', 5164),
 ('this', 5145),
 ('had', 5133),
 ('?', 4693),
 ('X', 4622),
 ('not', 4610),
 ('are', 4394),
 ('but', 4381),
 ('from', 4370),
 ('or', 4206),
 ('have', 3942),
 ('an', 3740),
 ('they', 3620),
 ('which', 3561),
 ('one', 3292),
 ('you', 3286),
 ('were', 3284),
 ('her', 3036),
 ('all', 3001),
 ('she', 2860),
 ('there', 2728),
 ('would', 2714),
 ('their', 2669),
 ('we', 2652),
 ('him', 2619),
 ('been', 2472),
 (')', 2466),
 ('has', 2437),
 ('(', 2435),
 ('when', 2331),
 ('who', 2252),
 ('will', 2245),
 ('m

In [12]:
# Number of distinct symbols (POS placeholders plus tokens kept verbatim)
# that occur in the noised corpus.
len(pos_counts)

750

In [17]:
# Print a token-by-token, two-column view (original | noised) of every
# sentence that contains this hyphenated name.
for sentence in brown_sentences:
    if 'Rundfunk-Sinfonie-Orchester' not in sentence:
        continue
    noised_tokens = pos_noise.apply_noise(sentence)
    for token, noised in zip(sentence, noised_tokens):
        print(f'{token:<30} {noised}')

The                            the
last                           last
program                        NOUN
of                             of
this                           this
festival                       NOUN
,                              ,
which                          which
during                         during
two                            two
weeks                          NOUN
had                            had
sampled                        VERB
most                           most
compositional                  ADJ
categories                     NOUN
,                              ,
brought                        VERB
the                            the
Cologne                        NOUN
Rundfunk-Sinfonie-Orchester    X
and                            and
Rundfunkchor                   NOUN
to                             to
Bonn's                         PROPN
gold-filled                    ADJ
hall                           NOUN
for                            for
a        