In [1]:
from nltk.corpus import reuters, brown  # nltk corpus
from nltk import bigrams, trigrams
from collections import Counter, defaultdict
import random

In [2]:
# Corpus used for the hand-built trigram model below (Reuters; `brown` is imported as well).
corpus = reuters

In [3]:
# NOTE: index 1 selects the *second* sentence of the corpus (index 0 is the
# first), despite the variable name.
first_sentence = corpus.sents()[1]
print(first_sentence)

['They', 'told', 'Reuter', 'correspondents', 'in', 'Asian', 'capitals', 'a', 'U', '.', 'S', '.', 'Move', 'against', 'Japan', 'might', 'boost', 'protectionist', 'sentiment', 'in', 'the', 'U', '.', 'S', '.', 'And', 'lead', 'to', 'curbs', 'on', 'American', 'imports', 'of', 'their', 'products', '.']


In [4]:
# Get the trigrams of the sentence. Each one is a (w1, w2, w3) tuple; e.g.
# ('They', 'told', 'Reuter') is the evidence used for P('Reuter' | 'They', 'told').
tri = list(trigrams(first_sentence))
for t in tri:
    print(t)

('They', 'told', 'Reuter')
('told', 'Reuter', 'correspondents')
('Reuter', 'correspondents', 'in')
('correspondents', 'in', 'Asian')
('in', 'Asian', 'capitals')
('Asian', 'capitals', 'a')
('capitals', 'a', 'U')
('a', 'U', '.')
('U', '.', 'S')
('.', 'S', '.')
('S', '.', 'Move')
('.', 'Move', 'against')
('Move', 'against', 'Japan')
('against', 'Japan', 'might')
('Japan', 'might', 'boost')
('might', 'boost', 'protectionist')
('boost', 'protectionist', 'sentiment')
('protectionist', 'sentiment', 'in')
('sentiment', 'in', 'the')
('in', 'the', 'U')
('the', 'U', '.')
('U', '.', 'S')
('.', 'S', '.')
('S', '.', 'And')
('.', 'And', 'lead')
('And', 'lead', 'to')
('lead', 'to', 'curbs')
('to', 'curbs', 'on')
('curbs', 'on', 'American')
('on', 'American', 'imports')
('American', 'imports', 'of')
('imports', 'of', 'their')
('of', 'their', 'products')
('their', 'products', '.')


### Build a trigram model

In [5]:
# Count every trigram in the corpus, keyed as model[(w1, w2)][w3].
# Sentences are padded on both sides (the pad symbol is None), so the contexts
# at sentence starts and ends are counted as well.
model = defaultdict(lambda: defaultdict(int))
for sent in corpus.sents():
    padded_trigrams = trigrams(sent, pad_left=True, pad_right=True)
    for first, second, third in padded_trigrams:
        model[first, second][third] += 1
 

In [6]:
# Raw trigram counts: how often "price" follows the context ("today", "the").
print(model["today", "the"]["price"])
# An unseen continuation reads as 0: the inner defaultdict supplies the default
# (and stores the key as a side effect of the lookup).
print(model["today", "the"]["nonexistingword"])
# (None, None) is the left-padding context, so this counts sentences that start with "The".
print(model[None, None]["The"])

2
0
8839


In [7]:
# Transform the counts into conditional probabilities P(w3 | w1, w2) by
# dividing each count by the total count of its (w1, w2) context.
# NOTE: the counts are overwritten in place; after this cell `model` holds
# probabilities, not counts.
for w1_w2, followers in model.items():
    total_count = float(sum(followers.values()))
    if total_count == 0:
        # A context whose counts sum to zero can only exist because an earlier
        # lookup auto-created it in the defaultdict. There is nothing to
        # normalize, and dividing would raise ZeroDivisionError.
        continue
    for w3 in followers:
        followers[w3] /= total_count

In [None]:
# P("price" | "today", "the") = count("today the price") / count("today the")
# P("The" | None, None)       = count(None, None, "The") / count(None, None)

In [8]:
# The same lookups after normalization now yield conditional probabilities
# P(w3 | w1, w2) instead of raw counts.
print(model["today", "the"]["price"])
print(model["today", "the"]["nonexistingword"])
print(model[None, None]["The"])

0.1111111111111111
0.0
0.16154324146501936


In [9]:
#model[None, None]

Now we have a trigram language model. Let’s generate some text

In [9]:
# Generate a sentence word by word: repeatedly sample the next word from
# P(w3 | w1, w2), using the last two generated tokens as the context.
text = ["today", "the"]  # seed context; sampled tokens are appended here
sentence_finished = False

# Sentinel for "no fallback word yet" (None itself is a valid padding token).
_MISSING = object()

while not sentence_finished:
    r = random.random()
    accumulator = .0
    fallback = _MISSING

    # Inverse-CDF sampling: walk the distribution until the running total
    # reaches the random threshold r.
    for word, probability in model[tuple(text[-2:])].items():
        accumulator += probability
        print("{}: {}, {}, {}".format(text, word, accumulator, r))

        if accumulator >= r:
            text.append(word)
            break
        if probability > 0:
            # Remember the last word that actually carries probability mass.
            fallback = word
    else:
        # No word reached the threshold. Either the context has no known
        # continuation, or float rounding left the cumulative sum just below r.
        # Without this branch the outer loop would spin forever.
        if fallback is _MISSING:
            break  # nothing to sample from: stop generating
        text.append(fallback)

    # verify if sentence is completed (two padding tokens mark the end)
    if text[-2:] == [None, None]:
        sentence_finished = True


['today', 'the']: public, 0.05555555555555555, 0.6147113728048186
['today', 'the']: European, 0.1111111111111111, 0.6147113728048186
['today', 'the']: Bank, 0.16666666666666666, 0.6147113728048186
['today', 'the']: price, 0.2777777777777778, 0.6147113728048186
['today', 'the']: emirate, 0.33333333333333337, 0.6147113728048186
['today', 'the']: overseas, 0.38888888888888895, 0.6147113728048186
['today', 'the']: newspaper, 0.44444444444444453, 0.6147113728048186
['today', 'the']: company, 0.6111111111111112, 0.6147113728048186
['today', 'the']: Turkish, 0.6666666666666667, 0.6147113728048186
['today', 'the', 'Turkish']: General, 0.16666666666666666, 0.025689267650513048
['today', 'the', 'Turkish', 'General']: Staff, 1.0, 0.2868257291984514
['today', 'the', 'Turkish', 'General', 'Staff']: ,, 1.0, 0.07426712855650874
['today', 'the', 'Turkish', 'General', 'Staff', ',']: predicted, 0.25, 0.9306702864070255
['today', 'the', 'Turkish', 'General', 'Staff', ',']: told, 0.5, 0.9306702864070255
[

In [10]:
# Drop the falsy tokens (the None padding markers) and join the rest with spaces.
print(' '.join(filter(None, text)))

today the Turkish General Staff , during a crucial phase of the problem dealt with properly at the end of the workforce .


## Loading data

In [14]:
from nltk.util import pad_sequence
from nltk.util import bigrams
from nltk.util import ngrams
from nltk.util import everygrams
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE
from nltk import word_tokenize, sent_tokenize
import re
from nltk.tokenize import ToktokTokenizer
import os
import requests
import io 

In [15]:
# Split the string into sentences, then tokenize the first sentence into words.
word_tokenize(sent_tokenize("This is a foobar sentence. Yes it is.")[0])

['This', 'is', 'a', 'foobar', 'sentence', '.']

In [27]:
# Load the raw training text from disk.
# Fail loudly when the file is missing: otherwise `text` would silently keep
# whatever an earlier cell left in it (the generated token list), and the
# tokenization cells below would fail with a far less obvious error.
if not os.path.isfile('data/lm_text.txt'):
    raise FileNotFoundError(
        "data/lm_text.txt not found; place the training text there before running this cell"
    )
with io.open('data/lm_text.txt', encoding='utf8') as fin:
    text = fin.read()

In [52]:
# Show the loaded text (this prints the entire string).
print(text)

                       Language is never, ever, ever, random

                                                               ADAM KILGARRIFF




Abstract
Language users never choose words randomly, and language is essentially
non-random. Statistical hypothesis testing uses a null hypothesis, which
posits randomness. Hence, when we look at linguistic phenomena in cor-
pora, the null hypothesis will never be true. Moreover, where there is enough
data, we shall (almost) always be able to establish that it is not true. In
corpus studies, we frequently do have enough data, so the fact that a rela-
tion between two phenomena is demonstrably non-random, does not sup-
port the inference that it is not arbitrary. We present experimental evidence
of how arbitrary associations between word frequencies and corpora are
systematically non-random. We review literature in which hypothesis test-
ing has been used, and show how it has often led to unhelpful or mislead-
ing results.
Keywords: 쎲쎲쎲

1. Int

#### Assignment

In [None]:
#### Assignment
# Write code to preprocess the data

In [29]:
# Split the text into sentences, tokenize each sentence into words and
# lowercase every token.
tokenized_text = [
    [token.lower() for token in word_tokenize(sentence)]
    for sentence in sent_tokenize(text)
]

In [16]:
#tokenized_text[0]

In [30]:
# Preprocess the tokenized text for 3-gram language modelling.
n = 3 # n-gram order
# NOTE(review): per the NLTK docs both results are lazy iterators (the training
# everygrams and the flat padded word stream), so this cell presumably has to
# be re-run before fitting a model a second time -- confirm.
train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)

### Training a trigram model

In [31]:
# Initialize a maximum-likelihood (MLE) n-gram model of order n; it starts with
# an empty vocabulary until it is fitted.
# Note: this rebinds `model`, replacing the dictionary-based trigram model built earlier.
model = MLE(n)

In [32]:
# Vocabulary size before training.
len(model.vocab)

0

In [33]:
# Train the model on the prepared n-grams; the padded word stream is passed as
# the second argument, and the vocabulary is populated by this call.
model.fit(train_data, padded_sents)
print(model.vocab)

<Vocabulary with cutoff=1 unk_label='<UNK>' and 1391 items>


In [34]:
# Vocabulary size after training.
len(model.vocab)

1391

In [35]:
# Look up the tokens of the first tokenized sentence in the vocabulary.
print(model.vocab.lookup(tokenized_text[0]))

('language', 'is', 'never', ',', 'ever', ',', 'ever', ',', 'random', 'adam', 'kilgarriff', 'abstract', 'language', 'users', 'never', 'choose', 'words', 'randomly', ',', 'and', 'language', 'is', 'essentially', 'non-random', '.')


In [37]:
# The vocabulary lookup automatically replaces words that are not in the
# vocabulary with `<UNK>`.
print(model.vocab.lookup('language is never random unseenword .'.split()))

('language', 'is', 'never', 'random', '<UNK>', '.')


### Exploring the model

In [38]:
# Summary of the n-gram counter held by the trained model.
print(model.counts)

<NgramCounter with 3 ngram orders and 19611 ngrams>


In [39]:
# Indexing the counter with a plain string returns that word's unigram count
# (how many times the word occurs), not a bigram count, so label it accordingly.
phrase = 'language'
print("number of unigrams for the phrase", phrase, model.counts[phrase])

number of bigrams for the phrase language 25


In [40]:
# Count of the bigram ('language', 'is'): indexing the counter with a context
# list and then a word gives how often that word follows the context.
print("number of bigrams for the phrase", model.counts[['language']]['is'])

number of bigrams for the phrase 11


In [27]:
# A two-word context followed by a word is a trigram count, so the label says
# "trigrams": how often 'never' follows ('language', 'is').
print("number of trigrams for the phrase", model.counts[['language', 'is']]['never'])

number of bigrams for the phrase 7


In [61]:
# Probability score of a single word with no context, i.e. how likely the word
# is under the model. For example model.score('language') would give P('language').
model.score("1")  # P('1')

0.0017720023626698169

In [42]:
# Probability of 'is' given the context 'language':
# count('language is') / count('language')
model.score('is', 'language'.split())  # P('is' | 'language')

0.44

In [43]:
# P('never' | 'language is')
model.score('never', 'language is'.split())  # count(language is never) / count(language is)

0.6363636363636364

In [44]:
# Log-probability of the same trigram: the base-2 logarithm of
# P('never' | 'language is').
model.logscore("never", "language is".split())

-0.6520766965796932

### Generating text

In [49]:
from nltk.tokenize.treebank import TreebankWordDetokenizer

In [50]:
# Bind the detokenizer method once; generate_sent() uses it to join tokens back into text.
detokenize = TreebankWordDetokenizer().detokenize

def generate_sent(model, num_words, random_seed=42):
    """Generate one sentence from ``model`` and return it as a string.

    Tokens are requested via ``model.generate(num_words, random_seed=...)``.
    Start markers (``'<s>'``) are dropped, and generation output is cut at the
    first end marker (``'</s>'``). The remaining tokens are joined with the
    module-level ``detokenize`` callable.

    Args:
        model: object exposing ``generate(num_words, random_seed=...)``.
        num_words: number of tokens to request from the model.
        random_seed: seed forwarded to ``model.generate``.

    Returns:
        The detokenized sentence.
    """
    kept = []
    for tok in model.generate(num_words, random_seed=random_seed):
        if tok == '</s>':
            # End-of-sentence marker: ignore everything after it.
            break
        if tok != '<s>':
            kept.append(tok)
    return detokenize(kept)

In [51]:
# Generate a sentence of at most 50 tokens, using a fixed seed.
generate_sent(model, 50, random_seed=52)

'words (bigrams, etc.'