In [107]:
import json
import zipfile
from collections import Counter
import multiprocessing as mp
import random
import numpy as np
import nltk
from tqdm.auto import tqdm
from pos_noise import POSNoise

In [None]:
# Fetch the NLTK resources this notebook relies on. The loop covers the
# corpora read through nltk.corpus; punkt_tab is requested separately as the
# cell's final expression so the cell still displays that call's return value.
for corpus_name in ('brown', 'gutenberg', 'reuters', 'webtext', 'nps_chat'):
    nltk.download(corpus_name)
nltk.download('punkt_tab')

[nltk_data] Downloading package brown to /home/dniko/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package gutenberg to /home/dniko/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package reuters to /home/dniko/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package webtext to /home/dniko/nltk_data...
[nltk_data]   Package webtext is already up-to-date!
[nltk_data] Downloading package nps_chat to /home/dniko/nltk_data...
[nltk_data]   Unzipping corpora/nps_chat.zip.
[nltk_data] Downloading package punkt_tab to /home/dniko/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
# Single POSNoise instance for the whole notebook; later cells call its
# apply_noise() method on each sentence.
pos_noise = POSNoise()

In [131]:
# Pool every reference source into one flat list: first the .sents() output
# of the four corpora (in this order), then the raw `.text` of each nps_chat
# XML post.
# NOTE(review): nps_chat entries come from `.text` while the others come from
# `.sents()` — confirm apply_noise is meant to receive both forms.
reference_sentences = [
    sentence
    for corpus in (
        nltk.corpus.brown,
        nltk.corpus.gutenberg,
        nltk.corpus.reuters,
        nltk.corpus.webtext,
    )
    for sentence in corpus.sents()
]
reference_sentences += [post.text for post in nltk.corpus.nps_chat.xml_posts()]

In [132]:
# (number of entries, summed len() of every entry)
(len(reference_sentences), sum(map(len, reference_sentences)))

(246908, 6132220)

In [6]:
# Single-process version, kept for reference only: too slow without multiprocessing.
# pos_noised_sentences = list(map(pos_noise.apply_noise, reference_sentences))

In [7]:
def process_sentence(sentence):
    """
    Apply POS noise to a single sentence.

    Thin module-level wrapper used as the task function of the parallel
    loop; it forwards the sentence to the shared ``pos_noise`` instance and
    returns whatever ``apply_noise`` produces.
    """
    noised = pos_noise.apply_noise(sentence)
    return noised

In [8]:
# Leave two cores free for the rest of the system, but never request fewer
# than one worker: mp.Pool rejects a process count below 1, which is what
# `cpu_count() - 2` yields on machines with one or two cores.
num_processes = max(1, mp.cpu_count() - 2)
num_processes

14

In [133]:
# Fan the POS-noising out over worker processes.
# - imap (ordered) keeps pos_noised_sentences[i] aligned with
#   reference_sentences[i], so the saved corpus does not depend on worker
#   scheduling.
# - chunksize batches tasks per worker round-trip; the default of 1 submits
#   every one of the ~250k sentences as its own task.
with mp.Pool(processes=num_processes) as pool:
    pos_noised_sentences = list(tqdm(
        pool.imap(process_sentence, reference_sentences, chunksize=256),
        total=len(reference_sentences),
        desc="Processing"
    ))

Processing: 100%|██████████| 246908/246908 [05:09<00:00, 797.06it/s] 


In [134]:
# Tally every token across the noised corpus, then rank by frequency.
pos_counts = Counter(
    token
    for noised_sentence in pos_noised_sentences
    for token in noised_sentence
)
pos_stats = pos_counts.most_common()
# Ten most frequent entries, ten least frequent, and the number of distinct tokens.
(pos_stats[:10], pos_stats[-10:], len(pos_stats))

([('NOUN', 1106025),
  ('VERB', 375756),
  (',', 329776),
  ('PROPN', 297994),
  ('ADJ', 292420),
  ('the', 281399),
  ('NUM', 251938),
  ('.', 236402),
  ('and', 154816),
  ('of', 147851)],
 [('astraddle', 1),
  ('tho', 1),
  ('differing', 1),
  ('condition', 1),
  ('mind', 1),
  ("'(", 1),
  ('-.', 1),
  ('foreground', 1),
  ('firstly', 1),
  ('+,', 1)],
 817)

In [135]:
# Persist the noised corpus: one JSON document stored as the only member of a
# deflate-compressed zip archive.
with zipfile.ZipFile(
    'data/reference_corpus_w_pos_noise.zip',
    mode='w',
    compression=zipfile.ZIP_DEFLATED,
) as archive:
    archive.writestr(
        'reference_corpus_w_pos_noise.json',
        json.dumps(pos_noised_sentences),
    )

In [100]:
# from nltk.lm import KneserNeyInterpolated
from nltk.lm import WittenBellInterpolated
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm.vocabulary import Vocabulary

In [103]:
def train_ngram_model(sentences, n=3, vocab=None):
    """
    Train a Witten-Bell interpolated n-gram model on the given sentences.

    Args:
        sentences: iterable of tokenised sentences (each a sequence of str).
        n: highest n-gram order of the model.
        vocab: optional ``nltk.lm.Vocabulary`` to use. When omitted (or
            empty), the vocabulary is built from the padded training text.

    Returns:
        The fitted ``WittenBellInterpolated`` model.
    """
    train, padded_words = padded_everygram_pipeline(n, sentences)
    # `padded_words` is a lazy stream of tokens, not a Vocabulary, so it must
    # not be passed as the constructor's `vocabulary` (the model would then
    # fail on `vocab.lookup` during fit). It is handed to fit() instead, which
    # uses it to populate the vocabulary only when that vocabulary is empty.
    model = WittenBellInterpolated(order=n, vocabulary=vocab)
    # model = KneserNeyInterpolated(n, discount=0.75)
    model.fit(train, padded_words)
    return model

In [117]:
def get_log_prob(model, sentence):
    """
    Get the length-normalised log probability of a sentence under the model.

    The sentence is padded with ``model.order - 1`` ``'<s>'`` tokens in front
    and one ``'</s>'`` token at the end. Every token after the leading
    padding (including ``'</s>'``) is scored given its ``order - 1``
    predecessors.

    Args:
        model: object exposing ``order`` (int) and ``score(word, context)``.
        sentence: list of tokens.

    Returns:
        The sum of ``log(score + 1e-10)`` over the scored tokens, divided by
        ``len(sentence)``. For an empty sentence only ``'</s>'`` is scored
        and the sum is returned undivided instead of raising
        ``ZeroDivisionError``.
    """
    n = model.order
    padded_sentence = ['<s>'] * (n - 1) + sentence + ['</s>']
    log_prob = 0.0
    # Start after the leading padding so every scored token has a full context.
    for i in range(n - 1, len(padded_sentence)):
        context = tuple(padded_sentence[i - n + 1:i])
        # The small epsilon keeps log() finite when the model scores a token as 0.
        log_prob += np.log(model.score(padded_sentence[i], context) + 1e-10)
    # Guard against division by zero for an empty sentence.
    return log_prob / max(len(sentence), 1)

In [137]:
N = 20                # number of n-gram models trained on random subsets
SAMPLE_SIZE = 10_000  # sentences drawn for each model
n = 5                 # n-gram order passed to train_ngram_model

In [136]:
min_freq = 5

# Keep only the tokens whose corpus count reaches min_freq.
filtered_words = {
    token for token in pos_counts if pos_counts[token] >= min_freq
}

# Padding and unknown-word markers are always part of the vocabulary.
filtered_words.update(('<s>', '</s>', '<UNK>'))

# Wrap the resulting set in an NLTK Vocabulary.
full_vocab = Vocabulary(filtered_words)

In [138]:
# Train N n-gram models on random subsets of the reference corpus.
# A dedicated, seeded RNG makes the subsets repeatable for a given
# pos_noised_sentences list without touching the global `random` state.
subset_rng = random.Random(0)
# random.sample raises ValueError when asked for more items than the
# population holds, so cap the subset size at the corpus size.
subset_size = min(SAMPLE_SIZE, len(pos_noised_sentences))
grammars = []
for i in tqdm(range(N)):
    sampled_sentences = subset_rng.sample(pos_noised_sentences, subset_size)
    model = train_ngram_model(sampled_sentences, n=n, vocab=full_vocab)
    grammars.append(model)

  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 20/20 [02:23<00:00,  7.19s/it]


In [139]:
# Hand-written probe sentences (plain strings). The scoring cell passes each
# one through pos_noise.apply_noise and evaluates it with every trained model.
test_sentences = [
    "For that reason, we decided to postpone the meeting until next week.",
    "Whatever you decide, I will support your choice.",
    "I'll be ready to leave in a moment.",
    "She used to live in Paris before moving to London.",
    "All in all, it was a successful event despite the minor issues.",
    "He doesn't like coffee, so he always drinks tea.",
    "In addition to her job, she volunteers at the local shelter.",
    "They discussed the project at length during the meeting.",
    "In any event, we should be prepared for any outcome.",
    "She enjoys hiking as well as that she loves swimming.",
    "In addition, we need to consider the budget constraints.",
    "You should take care of yourselves during the trip.",
    "I'd like to visit the museum this weekend.",
    "Please stand at the front of the line.",
    "For one thing, we need more time to complete the project.",
    "You will receive the results in due time.",
    "To summarize, the main points are clear and concise.",
    "Who is responsible for this task?",
    "The contract specifies the terms wherein the agreement can be terminated.",
    "Her painting style is similar to that of the famous artist."
]

In [140]:
# Score every probe sentence with each trained model and print
# "mean - std, mean, mean + std" of the per-model normalised log probabilities.
for s in test_sentences:
    s_w_pos_noise = pos_noise.apply_noise(s)
    print(s, end='\n\t')
    # One score per model. The buffer used to be sized by len(test_sentences),
    # which only worked because both counts happened to be 20: with fewer
    # models the unused zeros skewed mean/std, with more it raised IndexError.
    log_probs = np.array(
        [get_log_prob(model, s_w_pos_noise) for model in grammars]
    )
    std = np.std(log_probs)
    mean = np.mean(log_probs)
    print(f'{mean - std:.2f}, {mean:.2f}, {mean + std:.2f}')

For that reason, we decided to postpone the meeting until next week.
	-5.50, -5.10, -4.71
Whatever you decide, I will support your choice.
	-4.48, -4.04, -3.60
I'll be ready to leave in a moment.
	-6.25, -5.40, -4.56
She used to live in Paris before moving to London.
	-5.63, -5.24, -4.84
All in all, it was a successful event despite the minor issues.
	-3.95, -3.54, -3.13
He doesn't like coffee, so he always drinks tea.
	-5.51, -5.07, -4.64
In addition to her job, she volunteers at the local shelter.
	-3.04, -2.86, -2.67
They discussed the project at length during the meeting.
	-3.99, -3.68, -3.37
In any event, we should be prepared for any outcome.
	-5.62, -4.76, -3.90
She enjoys hiking as well as that she loves swimming.
	-4.45, -4.20, -3.96
In addition, we need to consider the budget constraints.
	-2.92, -2.84, -2.76
You should take care of yourselves during the trip.
	-5.51, -5.15, -4.79
I'd like to visit the museum this weekend.
	-5.34, -4.51, -3.67
Please stand at the front of the