In [1]:
import numpy as np
from collections import Counter
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [2]:
# text2 = sense and sensibility by jane austen
unigram_counts = Counter(text2)
bigrams = [(text2[i],text2[i+1]) for i in range(len(text2)-1)]
bigram_counts = Counter(bigrams)

In [3]:
def compute_ngrams(text,n):
    if len(text)!=n:
        return [tuple(text[i:(i+n)]) for i in range(len(text)-n+1)]
    else:
        return [tuple(text)]

In [4]:
def compute_ngram_counts(text,n):
    ngram_counts = Counter(compute_ngrams(text,n))
    context_counts = Counter(compute_ngrams(text,n-1))
    return ngram_counts,context_counts

In [5]:
trigrams = compute_ngrams(text2,3)
trigrams[2]

('and', 'Sensibility', 'by')

In [6]:
def compute_bigram_probability(bigram,bigram_counts,unigram_counts):
    return bigram_counts[bigram]/unigram_counts[bigram[0]]

In [7]:
def compute_ngram_probability(ngram,ngram_counts,context_counts,verbose=False):
    if verbose:
        print('ngram: ',ngram)
        print('ngram counts: ',ngram_counts[ngram])
        print('context: ',ngram[:len(ngram)-1])
        print('context counts: ',context_counts[ngram[:len(ngram)-1]])
    return ngram_counts[ngram]/context_counts[ngram[:len(ngram)-1]]

In [8]:
def compute_sentence_bigram_probability(s,bigram_counts,unigram_counts):
    s = s.split(' ')
    return np.exp(np.sum([np.log(compute_bigram_probability((s[i],s[i+1]),bigram_counts,unigram_counts)) for i in range(len(s)-1)]))

In [9]:
def compute_sentence_ngram_probability(s,ngram_counts,context_counts):
    s = s.split(' ')
    ngrams = compute_ngrams(s,len(list(ngram_counts.keys())[0]))
    return np.exp(np.sum([np.log(compute_ngram_probability(ngrams[i],ngram_counts,context_counts)) for i in range(len(ngrams))]))

In [10]:
ngram_counts,context_counts = compute_ngram_counts(text2,3)

In [11]:
ngram_counts[('she','did')]

0

In [12]:
compute_sentence_bigram_probability('she did not care',bigram_counts,unigram_counts)

4.569205742850947e-05

In [13]:
compute_sentence_ngram_probability('she did not care',ngram_counts,context_counts)

0.011299435028248582

In [14]:
def compute_perplexity(text,ngram_counts,context_counts):
    ngrams = compute_ngrams(text,len(list(ngram_counts.keys())[0]))
    return np.exp(-(1/len(text))*np.sum([np.log(compute_ngram_probability(ngrams[i],ngram_counts,context_counts)) for i in range(len(ngrams))]))

In [15]:
compute_perplexity(text2,ngram_counts,context_counts)

4.6990318881662745

In [16]:
for i in range(2,6):
    ngram_counts,context_counts = compute_ngram_counts(text2,i)
    print('n=',i,':',compute_perplexity(text2,ngram_counts,context_counts))

n= 2 : 37.37576749369875
n= 3 : 4.6990318881662745
n= 4 : 1.5042243590107542
n= 5 : 1.095325619353706


In [17]:
rare_words = [w for w in list(unigram_counts.keys()) if unigram_counts[w]==1]

In [18]:
len(rare_words)/len(list(unigram_counts.keys()))

0.39426313478706276

In [19]:
replaced_text2 = ['<UNK>' if w in rare_words else w for w in list(unigram_counts.keys()) ]

In [20]:
text4_replaced_for_test2_training = ['<UNK>' if w not in list(unigram_counts.keys()) else w for w in text4]

In [21]:
ngram_counts,context_counts = compute_ngram_counts(replaced_text2,2)

In [22]:
compute_perplexity(text4_replaced_for_test2_training,ngram_counts,context_counts)

  This is separate from the ipykernel package so we can avoid doing imports until


ZeroDivisionError: division by zero

In [28]:
import itertools
unigram_counts = Counter(text4_replaced_for_test2_training)
#def bigram_laplace_smoothing(bigram_counts,unigram_counts):

In [29]:
len(unigram_counts)

3813

In [30]:
vocabulary = list(unigram_counts.keys())

In [None]:
all_bigrams  = list(itertools.product(vocabulary,vocabulary))

In [42]:
all_bigrams[:10]

[('<UNK>', '<UNK>'),
 ('<UNK>', '-'),
 ('<UNK>', 'of'),
 ('<UNK>', 'the'),
 ('<UNK>', 'and'),
 ('<UNK>', 'House'),
 ('<UNK>', ':'),
 ('<UNK>', 'Among'),
 ('<UNK>', 'to'),
 ('<UNK>', 'life')]

In [57]:
all_bigram_counts = Counter(all_bigrams)

In [58]:
active_bigrams = list(bigram_counts.keys())

In [59]:
new_bigram_counts = [all_bigram_counts.update({b:bigram_counts[b]}) for b in active_bigrams]   

In [60]:
bigram_counts[('so','far')]

17

In [61]:
all_bigram_counts[('so','far')]

18