## bag of words

In [1]:
from nltk.corpus import reuters
from collections import Counter

In [2]:
counts = Counter(reuters.words())
total_count = len(reuters.words())

# The most common 20 words are ...
print(counts.most_common(n=20))

[('.', 94687), (',', 72360), ('the', 58251), ('of', 35979), ('to', 34035), ('in', 26478), ('said', 25224), ('and', 25043), ('a', 23492), ('mln', 18037), ('vs', 14120), ('-', 13705), ('for', 12785), ('dlrs', 11730), ("'", 11272), ('The', 10968), ('000', 10277), ('1', 9977), ('s', 9298), ('pct', 9093)]


In [3]:
# Compute the frequencies
for word in counts:
    counts[word] /= float(total_count)

# The frequencies should add up to 1
print(sum(counts.values()))  # 1.0

1.0000000000006808


In [5]:
import random
 
# Generate 100 words of language
text = []
 
for _ in range(100):
    r = random.random()
    accumulator = .0
 
    for word, freq in counts.items():
        accumulator += freq
 
        if accumulator >= r:
            text.append(word)
            break

print(' '.join(text))

and t it within share because DENKO be is to barrels a on Term , immediately a BPMI said of issue Last be . > lt the next 2 meeting s cts goods 254 exclude ADMINISTRATORS an . . coast and said October . major of OFF will a an meeting mln year prepared the be effect sales . the Boston of in unexpected split INC to party Petroleum . the a subject 6 topic 141 in dlrs , how fluctuations 550 of merger ultimately . board gain in executive growth face generate than has stock ' in , NORWEGIAN


## bigrams and trigrams

In [8]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\liyud\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [6]:
from nltk.corpus import reuters
from nltk import bigrams, trigrams
from collections import Counter, defaultdict

In [9]:
first_sentence = reuters.sents()[0]
print(first_sentence) # [u'ASIAN', u'EXPORTERS', u'FEAR', u'DAMAGE', u'FROM' ...
 
# Get the bigrams
print(list(bigrams(first_sentence))) # [(u'ASIAN', u'EXPORTERS'), (u'EXPORTERS', u'FEAR'), (u'FEAR', u'DAMAGE'), (u'DAMAGE', u'FROM'), ...
 
# Get the padded bigrams
print(list(bigrams(first_sentence, pad_left=True, pad_right=True))) # [(None, u'ASIAN'), (u'ASIAN', u'EXPORTERS'), (u'EXPORTERS', u'FEAR'), (u'FEAR', u'DAMAGE'), (u'DAMAGE', u'FROM'),
 
# Get the trigrams
print(list(trigrams(first_sentence))) # [(u'ASIAN', u'EXPORTERS', u'FEAR'), (u'EXPORTERS', u'FEAR', u'DAMAGE'), (u'FEAR', u'DAMAGE', u'FROM'), ...
 
# Get the padded trigrams
print(list(trigrams(first_sentence, pad_left=True, pad_right=True))) # [(None, None, u'ASIAN'), (None

['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', '.', 'S', '.-', 'JAPAN', 'RIFT', 'Mounting', 'trade', 'friction', 'between', 'the', 'U', '.', 'S', '.', 'And', 'Japan', 'has', 'raised', 'fears', 'among', 'many', 'of', 'Asia', "'", 's', 'exporting', 'nations', 'that', 'the', 'row', 'could', 'inflict', 'far', '-', 'reaching', 'economic', 'damage', ',', 'businessmen', 'and', 'officials', 'said', '.']
[('ASIAN', 'EXPORTERS'), ('EXPORTERS', 'FEAR'), ('FEAR', 'DAMAGE'), ('DAMAGE', 'FROM'), ('FROM', 'U'), ('U', '.'), ('.', 'S'), ('S', '.-'), ('.-', 'JAPAN'), ('JAPAN', 'RIFT'), ('RIFT', 'Mounting'), ('Mounting', 'trade'), ('trade', 'friction'), ('friction', 'between'), ('between', 'the'), ('the', 'U'), ('U', '.'), ('.', 'S'), ('S', '.'), ('.', 'And'), ('And', 'Japan'), ('Japan', 'has'), ('has', 'raised'), ('raised', 'fears'), ('fears', 'among'), ('among', 'many'), ('many', 'of'), ('of', 'Asia'), ('Asia', "'"), ("'", 's'), ('s', 'exporting'), ('exporting', 'nations'), ('nations', 'that'), 

In [12]:
# build a trigram language model
model = defaultdict(lambda: defaultdict(lambda: 0))
 
for sentence in reuters.sents():
    for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
        model[(w1, w2)][w3] += 1

print(model["what", "the"]["economists"]) # "economists" follows "what the" 2 times
print(model["what", "the"]["nonexistingword"]) # 0 times
print(model[None, None]["The"]) # 8839 sentences start with "The"
 
# Let's transform the counts to probabilities
for w1_w2 in model:
    total_count = float(sum(model[w1_w2].values()))
    for w3 in model[w1_w2]:
        model[w1_w2][w3] /= total_count

print(model["what", "the"]["economists"]) # 0.0434782608696
print(model["what", "the"]["nonexistingword"]) # 0.0
print(model[None, None]["The"]) # 0.161543241465

2
0
8839
0.043478260869565216
0.0
0.16154324146501936


In [13]:
import random
 
def generate_text(model):
    text = [None, None]
    prob = 1.0  # <- Init probability

    sentence_finished = False

    while not sentence_finished:
        r = random.random()
        accumulator = .0

        for word in model[tuple(text[-2:])].keys():
            accumulator += model[tuple(text[-2:])][word]

            if accumulator >= r:
                text.append(word)
                prob *= model[tuple(text[-2:])][word]  # <- Update the probability with the conditional pro
                break

        if text[-2:] == [None, None]:
            sentence_finished = True

    sentence = ' '.join([t for t in text if t])
    return sentence, prob

In [14]:
sentence, prob = generate_text(model)
print(f"probability: {prob} of sentence: {sentence}")

probability: 0.0 of sentence: But he declined to 37 pct , he could not support the stock after the stock for Rexham , which includes Wells Fargo security guards , and he didn ' t eliminate subsidies unilaterally ," one said .


In [15]:
sentence, prob = generate_text(model)
print(f"probability: {prob} of sentence: {sentence}")

probability: 0.0 of sentence: Lowest temperatures will be similarly cut , and other market their crude oil is definitely underway ," Lyng said .
