In [1]:
import itertools
import numpy as np
from collections import Counter
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [2]:
# text6 is "Monty Python and the Holy Grail" in the NLTK book corpus.
reference_text = text6
# Token frequencies, keyed by the bare token.
unigram_counts = Counter(reference_text)
# Pair each token with its successor to obtain the adjacent-token bigrams.
bigrams = list(zip(reference_text, reference_text[1:]))
bigram_counts = Counter(bigrams)

In [3]:
def compute_ngrams(text,n):
    """Return every contiguous window of ``n`` items of ``text`` as a tuple.

    Windows are produced left to right, one per start index, so a sequence
    of length ``L`` yields ``L - n + 1`` tuples (an empty list when
    ``n > L``).

    Args:
        text: A sliceable sequence with a length (list of tokens, string, ...).
        n: Window size.

    Returns:
        list of tuples, each holding ``n`` consecutive items of ``text``.
    """
    # When len(text) == n the only window, text[0:n], is the whole sequence,
    # so that length needs no separate branch.
    window_count = len(text) - n + 1
    windows = []
    for start in range(window_count):
        windows.append(tuple(text[start:start + n]))
    return windows

In [4]:
def compute_ngram_counts(text,n):
    """Count the n-grams of ``text`` and the (n-1)-grams used as their contexts.

    Args:
        text: Sequence of tokens.
        n: Order of the n-gram model.

    Returns:
        ``(ngram_counts, context_counts)``: two Counters keyed by tuples of
        length ``n`` and ``n - 1`` respectively.
    """
    contexts = compute_ngrams(text, n - 1)
    ngrams = compute_ngrams(text, n)
    return Counter(ngrams), Counter(contexts)

In [5]:
# Slide a 3-token window over the reference text and preview the first ten.
trigrams = compute_ngrams(text=reference_text, n=3)
trigrams[0:10]

[('SCENE', '1', ':'),
 ('1', ':', '['),
 (':', '[', 'wind'),
 ('[', 'wind', ']'),
 ('wind', ']', '['),
 (']', '[', 'clop'),
 ('[', 'clop', 'clop'),
 ('clop', 'clop', 'clop'),
 ('clop', 'clop', ']'),
 ('clop', ']', 'KING')]

In [6]:
def compute_bigram_probability(bigram,bigram_counts,unigram_counts):
    """Maximum-likelihood estimate of P(bigram[1] | bigram[0]).

    Args:
        bigram: ``(context_word, word)`` tuple.
        bigram_counts: Mapping from bigram tuple to its count.
        unigram_counts: Mapping from context to its count. Keys may be bare
            words (``Counter(text)``) or 1-tuples ``(word,)``, the layout
            produced by ``compute_ngrams(text, 1)``.

    Returns:
        float: ``count(bigram) / count(context)``, or ``0.0`` when the
        context was never observed (instead of raising ZeroDivisionError).
    """
    context_word = bigram[0]
    # Try the bare-word key first, then fall back to the 1-tuple key.
    context_count = unigram_counts.get(context_word, 0) or unigram_counts.get((context_word,), 0)
    if not context_count:
        return 0.0
    return bigram_counts.get(bigram, 0)/context_count

In [7]:
def compute_ngram_probability(ngram,ngram_counts,context_counts,verbose=False):
    """Maximum-likelihood estimate of P(last item of ``ngram`` | preceding items).

    Args:
        ngram: Tuple of tokens; all but the last item form the context.
        ngram_counts: Mapping from n-gram tuple to its count.
        context_counts: Mapping from context tuple (length n-1) to its count.
        verbose: When True, print the n-gram, the context and both counts.

    Returns:
        float: ``count(ngram) / count(context)``, or ``0.0`` when the context
        was never observed (instead of raising ZeroDivisionError).
    """
    context = ngram[:-1]
    ngram_count = ngram_counts.get(ngram, 0)
    context_count = context_counts.get(context, 0)
    if verbose:
        print('ngram: ',ngram)
        print('ngram counts: ',ngram_count)
        print('context: ',context)
        print('context counts: ',context_count)
    # An unseen context has no estimate (0/0); report probability zero.
    if not context_count:
        return 0.0
    return ngram_count/context_count

In [8]:
def compute_sentence_bigram_probability(s,bigram_counts,unigram_counts):
    """Probability of a sentence as the product of its bigram probabilities.

    The sentence is tokenised by splitting on single spaces. The product is
    accumulated as a sum of logs and exponentiated at the end.

    Args:
        s: Sentence string with tokens separated by single spaces.
        bigram_counts: Bigram counts passed to ``compute_bigram_probability``.
        unigram_counts: Context counts passed to ``compute_bigram_probability``.

    Returns:
        The sentence probability as a numpy float.
    """
    tokens = s.split(' ')
    log_probs = []
    for left, right in zip(tokens, tokens[1:]):
        pair_probability = compute_bigram_probability((left, right), bigram_counts, unigram_counts)
        log_probs.append(np.log(pair_probability))
    return np.exp(np.sum(log_probs))

In [9]:
def compute_sentence_ngram_probability(s,ngram_counts,context_counts):
    """Probability of a sentence as the product of its n-gram probabilities.

    The model order is read from the length of the first key of
    ``ngram_counts``; the sentence is tokenised by splitting on single spaces.

    Args:
        s: Sentence string with tokens separated by single spaces.
        ngram_counts: Mapping from n-gram tuple to count.
        context_counts: Mapping from (n-1)-gram tuple to count.

    Returns:
        The sentence probability as a numpy float.
    """
    tokens = s.split(' ')
    # Infer n from the first stored n-gram.
    order = len(list(ngram_counts.keys())[0])
    log_probs = [
        np.log(compute_ngram_probability(gram, ngram_counts, context_counts))
        for gram in compute_ngrams(tokens, order)
    ]
    return np.exp(np.sum(log_probs))

In [10]:
# Order-2 counts. This rebinds bigram_counts/unigram_counts from In [2];
# unigram_counts is now keyed by 1-tuples instead of bare tokens.
bigram_counts, unigram_counts = compute_ngram_counts(text=reference_text, n=2)
# Order-3 counts: trigram table and its bigram contexts.
ngram_counts, context_counts = compute_ngram_counts(text=reference_text, n=3)

In [11]:
def compute_perplexity(text,ngram_counts,context_counts):
    """Perplexity of ``text`` under the n-gram model given by the two count tables.

    The model order is read from the length of the first key of
    ``ngram_counts``. The summed log-probability of all n-grams in ``text``
    is normalised by ``len(text)`` (the token count, not the n-gram count).

    Args:
        text: Sequence of tokens.
        ngram_counts: Mapping from n-gram tuple to count.
        context_counts: Mapping from (n-1)-gram tuple to count.

    Returns:
        The perplexity as a numpy float.
    """
    order = len(list(ngram_counts.keys())[0])
    grams = compute_ngrams(text, order)
    scale = -(1/len(text))
    total_log_prob = np.sum([
        np.log(compute_ngram_probability(gram, ngram_counts, context_counts))
        for gram in grams
    ])
    return np.exp(scale*total_log_prob)

In [12]:
def unk_encode_text_train_test(min_count,train_text,test_text=[]):
    """Replace rare / out-of-vocabulary tokens with ``'<UNK>'``.

    The vocabulary is every training token whose count is strictly greater
    than ``min_count``. Any token outside that vocabulary is replaced by
    ``'<UNK>'`` in both texts, so the encoded test text only contains tokens
    that also occur in the encoded training text (or ``'<UNK>'``).

    Args:
        min_count: Training tokens occurring ``min_count`` times or fewer
            are treated as rare.
        train_text: Iterable of training tokens.
        test_text: Iterable of test tokens (default: empty; never mutated).

    Returns:
        ``(train_encoded, test_encoded)`` as two new lists.
    """
    train_unigram_counts = Counter(train_text)
    # A set gives O(1) membership tests; the same vocabulary is applied to
    # both texts so train-rare words are also unknown at test time.
    kept_vocabulary = {w for w, count in train_unigram_counts.items() if count > min_count}
    train_encoded = [w if w in kept_vocabulary else '<UNK>' for w in train_text]
    test_encoded = [w if w in kept_vocabulary else '<UNK>' for w in test_text]
    return train_encoded, test_encoded

In [13]:
def laplace_bigram_smoothing_counts(bigram_counts,unigram_counts,k=1):
    """Build add-k (Laplace for ``k=1``) smoothed bigram and unigram counts.

    Every pair of vocabulary words receives ``k`` pseudo-counts on top of
    its observed count, and every unigram count grows by ``k * V`` (``V`` =
    vocabulary size) so that smoothed_bigram / smoothed_unigram stays a
    normalised conditional distribution for any ``k``.

    The input mappings are not modified; new Counters are returned.
    The bigram table has ``V ** 2`` entries, so memory grows quadratically
    with the vocabulary.

    Args:
        bigram_counts: Mapping from ``(w1, w2)`` to observed count.
        unigram_counts: Mapping from context to observed count; keys may be
            1-tuples ``(word,)`` or bare words.
        k: Pseudo-count added to every possible bigram.

    Returns:
        ``(smoothed_bigram_counts, smoothed_unigram_counts)``.
    """
    # Accept both (word,) and bare-word keys; dict.fromkeys removes
    # duplicates while keeping first-seen order.
    vocabulary = list(dict.fromkeys(
        key[0] if isinstance(key, tuple) else key for key in unigram_counts
    ))
    vocabulary_size = len(vocabulary)
    # Start every possible bigram at k, then add the observed counts.
    smoothed_bigram_counts = Counter(dict.fromkeys(itertools.product(vocabulary, repeat=2), k))
    smoothed_bigram_counts.update(bigram_counts)
    # Build a fresh Counter so the caller's unigram_counts is left untouched.
    smoothed_unigram_counts = Counter({
        key: count + k*vocabulary_size for key, count in unigram_counts.items()
    })
    return smoothed_bigram_counts, smoothed_unigram_counts

In [14]:
# Add-k smoothing takes a bigram table and its unigram contexts, so pass the
# order-2 counts from In [10] rather than the order-3 ngram_counts/context_counts.
laplace_bigram_counts,laplace_unigram_counts = laplace_bigram_smoothing_counts(bigram_counts,unigram_counts,k=2)

In [15]:
def sample_word_from_context(context_word,ngram_counts):
    """Sample the word following ``context_word``, weighted by n-gram counts.

    Only the first two items of each n-gram key are used: item 0 is matched
    against ``context_word`` and item 1 is the candidate next word.

    Args:
        context_word: Word the sampled word must follow.
        ngram_counts: Mapping from n-gram tuple (length >= 2) to count.

    Returns:
        One candidate word drawn with ``np.random.choice`` with probability
        proportional to its n-gram count.

    Raises:
        ValueError: If no n-gram in ``ngram_counts`` starts with
            ``context_word``.
    """
    next_possible_words = []
    weights = []
    # Single pass: collect each continuation together with its count.
    for ngram, count in ngram_counts.items():
        if ngram[0] == context_word:
            next_possible_words.append(ngram[1])
            weights.append(count)
    if not next_possible_words:
        raise ValueError(
            "no n-gram in ngram_counts starts with context word %r" % (context_word,)
        )
    context_probs = np.array(weights, dtype='float')
    context_probs /= np.sum(context_probs)
    return np.random.choice(a=np.array(next_possible_words), size=1, p=context_probs)[0]

In [16]:
def generate_random_sentence(seed_word,ngram_counts,stop_word='.',sentence_max_length=300,):
    """Print a random sentence grown word by word from ``seed_word``.

    Each next word is drawn with ``sample_word_from_context`` using the
    previously generated word as context. At least one word is always
    appended to the seed. Generation then stops at the first stop token or
    once the sentence has ``sentence_max_length`` space-separated words.

    Args:
        seed_word: First word of the sentence.
        ngram_counts: N-gram counts passed to ``sample_word_from_context``.
        stop_word: Extra stop token, or an iterable of stop tokens, honoured
            in addition to the built-in terminators '.', '!' and '?'.
        sentence_max_length: Upper bound on the number of words.

    Returns:
        None. The sentence is printed.
    """
    # Previously stop_word was ignored; merge it with the built-in
    # terminators so existing calls behave the same.
    if stop_word is None:
        extra_stops = ()
    elif isinstance(stop_word, str):
        extra_stops = (stop_word,)
    else:
        extra_stops = tuple(stop_word)
    stop_words = ('.','!','?') + extra_stops

    sentence = seed_word
    next_word = sample_word_from_context(context_word=seed_word,ngram_counts=ngram_counts)
    sentence += ' '+next_word
    while next_word not in stop_words and len(sentence.split(' '))<sentence_max_length:
        next_word = sample_word_from_context(context_word=next_word,ngram_counts=ngram_counts)
        sentence += ' '+next_word
    print("Random Sentence:",sentence)
    return None

In [17]:
# Sampling draws from numpy's global RNG and no seed is set in the cells
# shown here, so the printed sentence differs from run to run.
generate_random_sentence(
    seed_word='She',
    ngram_counts=ngram_counts,
    stop_word='.',
    sentence_max_length=20,
)

Random Sentence: She looks like one , eh ?
