In [1]:
import itertools
import numpy as np
from collections import Counter
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [2]:
# text2 = sense and sensibility by jane austen
unigram_counts = Counter(text2)
bigrams = [(text2[i],text2[i+1]) for i in range(len(text2)-1)]
bigram_counts = Counter(bigrams)

In [3]:
def compute_ngrams(text,n):
    if len(text)!=n:
        return [tuple(text[i:(i+n)]) for i in range(len(text)-n+1)]
    else:
        return [tuple(text)]

In [4]:
def compute_ngram_counts(text,n):
    ngram_counts = Counter(compute_ngrams(text,n))
    context_counts = Counter(compute_ngrams(text,n-1))
    return ngram_counts,context_counts

In [5]:
trigrams = compute_ngrams(text2,3)
trigrams[2]

('and', 'Sensibility', 'by')

In [6]:
def compute_bigram_probability(bigram,bigram_counts,unigram_counts):
    return bigram_counts[bigram]/unigram_counts[bigram[0]]

In [7]:
def compute_ngram_probability(ngram,ngram_counts,context_counts,verbose=False):
    if verbose:
        print('ngram: ',ngram)
        print('ngram counts: ',ngram_counts[ngram])
        print('context: ',ngram[:len(ngram)-1])
        print('context counts: ',context_counts[ngram[:len(ngram)-1]])
    return ngram_counts[ngram]/context_counts[ngram[:len(ngram)-1]]

In [8]:
def compute_sentence_bigram_probability(s,bigram_counts,unigram_counts):
    s = s.split(' ')
    return np.exp(np.sum([np.log(compute_bigram_probability((s[i],s[i+1]),bigram_counts,unigram_counts)) for i in range(len(s)-1)]))

In [9]:
def compute_sentence_ngram_probability(s,ngram_counts,context_counts):
    s = s.split(' ')
    ngrams = compute_ngrams(s,len(list(ngram_counts.keys())[0]))
    return np.exp(np.sum([np.log(compute_ngram_probability(ngrams[i],ngram_counts,context_counts)) for i in range(len(ngrams))]))

In [10]:
ngram_counts,context_counts = compute_ngram_counts(text2,3)

In [11]:
ngram_counts[('she','did')]

0

In [12]:
compute_sentence_bigram_probability('she did not care',bigram_counts,unigram_counts)

4.569205742850947e-05

In [13]:
compute_sentence_ngram_probability('she did not care',ngram_counts,context_counts)

0.011299435028248582

In [14]:
def compute_perplexity(text,ngram_counts,context_counts):
    ngrams = compute_ngrams(text,len(list(ngram_counts.keys())[0]))
    return np.exp(-(1/len(text))*np.sum([np.log(compute_ngram_probability(ngrams[i],ngram_counts,context_counts)) for i in range(len(ngrams))]))

In [15]:
compute_perplexity(text2,ngram_counts,context_counts)

4.6990318881662745

In [16]:
for i in range(2,6):
    ngram_counts,context_counts = compute_ngram_counts(text2,i)
    print('n=',i,':',compute_perplexity(text2,ngram_counts,context_counts))

n= 2 : 37.37576749369875
n= 3 : 4.6990318881662745
n= 4 : 1.5042243590107542
n= 5 : 1.095325619353706


In [17]:
def unk_encode_text_train_test(min_count,train_text,test_text=[]):
    train_unigram_counts = Counter(train_text)
    train_rare_words = [w for w in list(train_unigram_counts.keys()) if train_unigram_counts[w]<=min_count]
    train_text = ['<UNK>' if w in train_rare_words else w for w in train_text ]
    test_text = ['<UNK>' if w not in list(train_unigram_counts.keys()) else w for w in test_text]
    return train_text,test_text

In [18]:
train_text,test_text = unk_encode_text_train_test(min_count=1,train_text=text2,test_text=text4)

In [19]:
ngram_counts,context_counts = compute_ngram_counts(train_text,2)

In [20]:
def laplace_bigram_smoothing_counts(bigram_counts,unigram_counts,k=1):
    vocabulary = [l[0] for l in list(unigram_counts.keys())]
    all_bigrams  = list(itertools.product(vocabulary,vocabulary))
    all_bigram_counts = Counter(all_bigrams)
    all_bigram_counts.update(Counter(dict.fromkeys(all_bigram_counts,k-1)))
    all_bigram_counts.update(bigram_counts)
    unigram_counts.update(Counter(dict.fromkeys(unigram_counts,len(vocabulary))))
    return all_bigram_counts,unigram_counts

In [None]:
laplace_bigram_counts,laplace_unigram_counts = laplace_bigram_smoothing_counts(ngram_counts,context_counts,k=2)

In [79]:
def sample_word_from_context(context_word,ngram_counts):
    ngrams = list(ngram_counts.keys())
    context_ngrams = np.array([ngram for ngram in ngrams if ngram[0]==context_word])
    next_possible_words = np.array([ngram[1] for ngram in context_ngrams])
    context_probs = np.array([ngram_counts[ngram] for ngram in ngrams if ngram[0]==context_word],dtype='float')
    context_probs/=np.sum(context_probs)
    return np.random.choice(a=next_possible_words,size=1,p=context_probs)[0]

In [None]:
def generate_random_sentence(seed_word,ngram_counts,stop_word='.',sentence_max_length=300,):
    sentence = seed_word
    next_word = sample_word_from_context(context_word=seed_word,ngram_counts=ngram_counts)
    sentence += ' '+next_word
    while next_word!='.' and len(sentence.split(' '))<300:
        next_word = sample_word_from_context(context_word=seed_word,ngram_counts=ngram_counts)
        sentence += ' '+next_word
    print("Random Sentence:",sentence)
    return None        

In [96]:
generate_random_sentence(seed_word='She',ngram_counts=ngram_counts,stop_word='.',sentence_max_length=20)

Random Sentence: She was had would could blushed saw had will instantly smiled was perceived was began feared was <UNK> felt said began was instantly was was loved then managed had felt determined had was continued surprised began only will determined was expressly instantly saw was continued paused had had could felt saw does speedily sat had <UNK> was was had saw felt was did performed dared will liked was was received expressly even tried was fell expects was moved was might had doubted was was spent walked began sometimes wondered had trembled got met was acknowledged put had was concluded knows hesitated would came then thought returned had could insisted would observed would could saw declared could says had repeated blushed was dreaded was paused was will was acknowledged speedily was trembled knew was liked was was had must saw could is observed was caught paused instantly saw <UNK> looked tried was was began thanked turned had knew could looked instantly could instantly was co