In [60]:
import itertools
import numpy as np
from collections import Counter
from nltk.book import *

In [26]:
# text2 = "Sense and Sensibility" by Jane Austen (made available by the nltk.book star-import).
# Count every token, then every pair of adjacent tokens, as the raw
# statistics for a maximum-likelihood bigram model.
unigram_counts = Counter(text2)
# One (w_i, w_{i+1}) tuple per position; the last token has no successor, hence len-1.
bigrams = [(text2[i],text2[i+1]) for i in range(len(text2)-1)]
bigram_counts = Counter(bigrams)

In [27]:
def compute_ngrams(text,n):
    """Return every contiguous run of ``n`` items in ``text`` as a tuple.

    Args:
        text: a sequence supporting ``len()`` and slicing (token list, str, ...).
        n: window size.

    Returns:
        A list of tuples, one per start position from left to right. A
        sequence of length ``L`` gives ``L - n + 1`` windows, and an empty
        list when ``n`` is larger than ``L``.
    """
    # The whole sequence is exactly one window.
    if len(text) == n:
        return [tuple(text)]

    windows = []
    last_start = len(text) - n
    for start in range(last_start + 1):
        windows.append(tuple(text[start:start + n]))
    return windows

In [28]:
def compute_ngram_counts(text,n):
    """Count the n-grams of ``text`` and the (n-1)-gram contexts they extend.

    Args:
        text: sequence of tokens (must support ``len()`` and slicing).
        n: order of the model; must be at least 1.

    Returns:
        ``(ngram_counts, context_counts)``, two ``Counter`` objects keyed by
        tuples of length ``n`` and ``n - 1`` respectively.

    Raises:
        ValueError: if ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError('n must be >= 1, got {}'.format(n))
    ngram_counts = Counter(compute_ngrams(text,n))
    if n == 1:
        # A unigram has the empty context (). Asking compute_ngrams for
        # 0-length windows would yield len(text)+1 of them, so unigram
        # probabilities would be divided by N+1 and not sum to one.
        context_counts = Counter({(): len(text)})
    else:
        context_counts = Counter(compute_ngrams(text,n-1))
    return ngram_counts,context_counts

In [29]:
# Sanity check: build every trigram of text2 and display the third one.
trigrams = compute_ngrams(text2,3)
trigrams[2]

('and', 'Sensibility', 'by')

In [30]:
def compute_bigram_probability(bigram,bigram_counts,unigram_counts):
    """Maximum-likelihood estimate of P(second word | first word).

    Args:
        bigram: a ``(first, second)`` pair.
        bigram_counts: ``Counter`` mapping bigram tuples to counts.
        unigram_counts: ``Counter`` mapping single words to counts.

    Returns:
        ``count(bigram) / count(first)``, or ``0.0`` when the first word was
        never counted.
    """
    first_word_count = unigram_counts[bigram[0]]
    # A Counter returns 0 for unseen keys; dividing by it would raise
    # ZeroDivisionError, so report zero probability instead.
    if first_word_count == 0:
        return 0.0
    return bigram_counts[bigram]/first_word_count

In [31]:
def compute_ngram_probability(ngram,ngram_counts,context_counts,verbose=False):
    """Maximum-likelihood estimate of P(last item | preceding items).

    Args:
        ngram: tuple of tokens; everything but the last item is the context.
        ngram_counts: ``Counter`` keyed by n-gram tuples.
        context_counts: ``Counter`` keyed by context tuples (one item shorter).
        verbose: when true, print the n-gram, its context and both counts.

    Returns:
        ``count(ngram) / count(context)``, or ``0.0`` when the context was
        never counted.
    """
    context = ngram[:len(ngram)-1]
    ngram_count = ngram_counts[ngram]
    context_count = context_counts[context]
    if verbose:
        print('ngram: ',ngram)
        print('ngram counts: ',ngram_count)
        print('context: ',context)
        print('context counts: ',context_count)
    # An unseen context has count 0 in a Counter; avoid ZeroDivisionError and
    # report zero probability instead.
    if context_count == 0:
        return 0.0
    return ngram_count/context_count

In [32]:
def compute_sentence_bigram_probability(s,bigram_counts,unigram_counts):
    """Probability of a sentence as the product of its bigram probabilities.

    Args:
        s: sentence as a whitespace-separated string of tokens.
        bigram_counts: ``Counter`` mapping ``(w1, w2)`` tuples to counts.
        unigram_counts: ``Counter`` mapping words to counts.

    Returns:
        The product of P(w_{i+1} | w_i) over adjacent token pairs, computed in
        log space. A sentence with fewer than two tokens has no pairs and
        yields 1.0; any zero-probability pair makes the result 0.0 (NumPy
        warns about log(0) in that case).
    """
    # split() without a separator ignores leading/trailing/repeated
    # whitespace; split(' ') would emit '' tokens that are never in the counts.
    tokens = s.split()
    log_probs = [
        np.log(compute_bigram_probability(pair,bigram_counts,unigram_counts))
        for pair in zip(tokens,tokens[1:])
    ]
    # Summing logs avoids underflow from multiplying many small probabilities.
    return np.exp(np.sum(log_probs))

In [33]:
def compute_sentence_ngram_probability(s,ngram_counts,context_counts):
    """Probability of a sentence as the product of its n-gram probabilities.

    The order ``n`` is inferred from the length of a key in ``ngram_counts``.

    Args:
        s: sentence as a whitespace-separated string of tokens.
        ngram_counts: non-empty ``Counter`` keyed by n-gram tuples.
        context_counts: ``Counter`` keyed by the matching (n-1)-gram tuples.

    Returns:
        The product of the conditional probabilities of every n-gram in the
        sentence, computed in log space. A sentence shorter than ``n`` tokens
        has no n-grams and yields 1.0.

    Raises:
        IndexError: if ``ngram_counts`` is empty, so ``n`` cannot be inferred.
    """
    # split() without a separator ignores leading/trailing/repeated
    # whitespace; split(' ') would emit '' tokens that are never in the counts.
    tokens = s.split()
    try:
        # Peek at one key instead of copying every key into a list.
        n = len(next(iter(ngram_counts)))
    except StopIteration:
        raise IndexError('ngram_counts is empty; cannot infer the n-gram order') from None
    log_probs = [
        np.log(compute_ngram_probability(ngram,ngram_counts,context_counts))
        for ngram in compute_ngrams(tokens,n)
    ]
    return np.exp(np.sum(log_probs))

In [34]:
# Trigram model of text2: counts of 3-grams and of the 2-word contexts they extend.
ngram_counts,context_counts = compute_ngram_counts(text2,3)

In [35]:
# Looks up a 2-word key. If ngram_counts still holds the n=3 counts from the
# cell above, its keys are 3-tuples, so this key is absent and Counter returns 0.
ngram_counts[('she','did')]

0

In [36]:
# Sentence probability under the bigram model (4 tokens -> product of 3 bigram probabilities).
compute_sentence_bigram_probability('she did not care',bigram_counts,unigram_counts)

4.569205742850947e-05

In [37]:
# Same sentence under the n-gram counts currently bound to ngram_counts/context_counts.
# The n-gram order is inferred from the counter's keys; with trigram counts the
# 4 tokens give a product of only 2 factors, so the value is not directly
# comparable with the bigram result.
compute_sentence_ngram_probability('she did not care',ngram_counts,context_counts)

0.011299435028248582

In [38]:
def compute_perplexity(text,ngram_counts,context_counts):
    """Perplexity of ``text`` under the n-gram model given by the counts.

    Computes ``exp(-(1/len(text)) * sum(log P(ngram)))`` over every n-gram of
    ``text``; the order ``n`` is inferred from the length of a key in
    ``ngram_counts``.

    Args:
        text: non-empty sequence of tokens to evaluate.
        ngram_counts: non-empty ``Counter`` keyed by n-gram tuples.
        context_counts: ``Counter`` keyed by the matching (n-1)-gram tuples.

    Returns:
        The perplexity as a NumPy float. An n-gram with zero probability makes
        the result infinite (NumPy warns about log(0)).

    Raises:
        IndexError: if ``ngram_counts`` is empty, so ``n`` cannot be inferred.
        ZeroDivisionError: if ``text`` is empty.
    """
    try:
        # Peek at one key instead of copying every key into a list.
        n = len(next(iter(ngram_counts)))
    except StopIteration:
        raise IndexError('ngram_counts is empty; cannot infer the n-gram order') from None
    total_log_prob = np.sum([
        np.log(compute_ngram_probability(ngram,ngram_counts,context_counts))
        for ngram in compute_ngrams(text,n)
    ])
    # NOTE(review): the sum has len(text)-n+1 terms but is normalised by
    # len(text); the difference is negligible for long texts - confirm intended.
    return np.exp(-(1/len(text))*total_log_prob)

In [39]:
# Perplexity of text2 under the counts currently bound to ngram_counts/context_counts.
# NOTE(review): if those counts were built from text2 itself (as in the cells
# above), this is a training-set perplexity, not a held-out estimate.
compute_perplexity(text2,ngram_counts,context_counts)

4.6990318881662745

In [40]:
# Perplexity of text2 for model orders n = 2..5. The counts are built from the
# same text that is evaluated, so these are training-set perplexities.
# Side effect: ngram_counts/context_counts are rebound on every iteration and
# hold the n=5 counts once the loop finishes.
for i in range(2,6):
    ngram_counts,context_counts = compute_ngram_counts(text2,i)
    print('n=',i,':',compute_perplexity(text2,ngram_counts,context_counts))

n= 2 : 37.37576749369875
n= 3 : 4.6990318881662745
n= 4 : 1.5042243590107542
n= 5 : 1.095325619353706


In [67]:
def unk_encode_text_train_test(min_count,train_text,test_text=None):
    """Replace rare and unknown words with the ``'<UNK>'`` token.

    The vocabulary is every word occurring more than ``min_count`` times in
    ``train_text``. Every token outside that vocabulary, in either text, is
    replaced by ``'<UNK>'``.

    Args:
        min_count: words whose training count is <= this value are rare.
        train_text: sequence of tokens used to build the vocabulary.
        test_text: optional sequence of tokens to encode with the training
            vocabulary; when omitted an empty list is returned for it.

    Returns:
        ``(encoded_train, encoded_test)`` as two new lists; the inputs are
        not modified.
    """
    if test_text is None:
        test_text = []
    train_unigram_counts = Counter(train_text)
    # A set gives O(1) membership tests; scanning a list for every token
    # makes the encoding quadratic in practice.
    vocabulary = {w for w,count in train_unigram_counts.items() if count > min_count}
    encoded_train = [w if w in vocabulary else '<UNK>' for w in train_text]
    # Test words that were rare in training must also become <UNK>; otherwise
    # the test text keeps words the encoded training text no longer contains.
    encoded_test = [w if w in vocabulary else '<UNK>' for w in test_text]
    return encoded_train,encoded_test

In [70]:
# UNK-encode text2 as training data (words seen at most once become '<UNK>')
# and encode text4 (presumably another nltk.book text) with the same vocabulary.
train_text,test_text = unk_encode_text_train_test(min_count=1,train_text=text2,test_text=text4)

In [45]:
# Bigram and context counts over the UNK-encoded training text from the cell
# above. `train_text` is used because `replaced_text2` is not defined by any
# cell in this notebook, so it fails on a fresh kernel.
ngram_counts,context_counts = compute_ngram_counts(train_text,2)

In [61]:
def laplace_bigram_smoothing_counts(bigram_counts,unigram_counts):
    """Return add-one (Laplace) smoothed bigram and unigram counts.

    Every possible bigram over the vocabulary gets a count of 1 plus its
    observed count, and every unigram count is increased by the vocabulary
    size so that the smoothed conditional probabilities still sum to one.

    Args:
        bigram_counts: ``Counter`` mapping ``(w1, w2)`` tuples to counts.
        unigram_counts: ``Counter`` whose keys define the vocabulary. Keys
            may be bare words or 1-tuples such as ``('she',)``, the form
            produced when context counts are built from tuple n-grams.

    Returns:
        ``(smoothed_bigram_counts, smoothed_unigram_counts)`` as two new
        ``Counter`` objects. The unigram counter keeps the key form of the
        input; the inputs themselves are not modified.

    Note:
        The bigram counter holds ``V**2`` entries for a vocabulary of size
        ``V``, which is very large for a realistic vocabulary.
    """
    vocabulary = list(unigram_counts)
    vocabulary_size = len(vocabulary)

    def as_tuple(key):
        # Accept both 'word' and ('word',) so the generated bigram keys are
        # always flat (w1, w2) tuples that match the observed bigrams.
        return key if isinstance(key,tuple) else (key,)

    all_bigram_counts = Counter(
        as_tuple(first)+as_tuple(second)
        for first,second in itertools.product(vocabulary,vocabulary)
    )
    all_bigram_counts.update(bigram_counts)
    # Build a fresh Counter: updating the argument in place would add V again
    # on every call and corrupt the caller's raw counts.
    smoothed_unigram_counts = Counter(
        {word: count+vocabulary_size for word,count in unigram_counts.items()}
    )
    return all_bigram_counts,smoothed_unigram_counts

In [62]:
# Add-one smoothed counts built from the counters currently bound to
# ngram_counts/context_counts (the bigram counts from the cell above, if the
# cells ran in the order shown).
laplace_bigram_counts,laplace_unigram_counts = laplace_bigram_smoothing_counts(ngram_counts,context_counts)