In [9]:
import math, random
import os
import sklearn.metrics
import pandas as pd
import os.path as path
import random

In [2]:
def start_pad(n):
    ''' Returns padding string of length n to append to the front of text
        as a pre-processing step to building n-grams '''
    return '~' * n

def ngrams(n, text):
    ''' Returns the ngrams of the text as tuples where the first element is
        the length-n context and the second is the character '''
    padded_text = start_pad(n) + text + '#'
    return [(padded_text[i:i+n], padded_text[i+n]) for i in range(len(text) + 1)]

In [3]:
class NgramModel(object):
    ''' A basic n-gram model using add-k smoothing '''
    def __init__(self, n, k):
        self.n = n
        self.k = k
        self.vocab = set()
        self.d = {} # context -> (letter -> count of letter)

    def get_vocab(self):
        ''' Returns the set of characters in the vocab '''
        return self.vocab

    def update(self, text):
        ''' Updates the model n-grams based on text '''
        for ngram in ngrams(self.n, text):
            context, letter = ngram
            self.vocab.add(letter)
            context_dict = self.d.setdefault(context, {})
            context_dict[letter] = context_dict.get(letter, 0) + 1

    def prob(self, context, char):
        ''' Returns the probability of char appearing after context '''
        context_dict = self.d.get(context)
        if context_dict is None:
            return 1 / len(self.vocab)
        else:
            num = context_dict.get(char, 0) + self.k
            den = sum(context_dict.values()) + self.k * len(self.vocab)
            return num / den
 
    def random_char(self, context):
        ''' Returns a random character based on the given context and the 
            n-grams learned by this model '''
        vocab = sorted(self.vocab)
        probs = []
        for char in vocab:
            probs.append(self.prob(context, char))
            
        r = random.random()
        t = 0
        for char, prob in zip(vocab, probs):
            t += prob
            if r < t:
                return char

        return vocab[-1]

    def random_text(self, length):
        ''' Returns text of the specified character length based on the
            n-grams learned by this model '''
        text = start_pad(self.n)
        for _ in range(length):
            text += self.random_char(text[-self.n:] if self.n else '')
        return text[-length:]

    def perplexity(self, text):
        ''' Returns the perplexity of text based on the n-grams learned by
            this model '''
        sum_of_probs = 0
        for context, char in ngrams(self.n, text):
            prob = self.prob(context, char)
            if prob == 0:
                return float('inf')
            sum_of_probs += math.log(prob)
        return math.exp(-sum_of_probs / len(text))

In [4]:
class NgramModelWithInterpolation(NgramModel):
    ''' An n-gram model with interpolation '''
    def __init__(self, n, k, lambdas=None):
        super().__init__(n, k)
        self.big_n = n
        self.ds = [{} for _ in range(n + 1)]
        self.lambdas = lambdas or [1 / (self.big_n + 1)] * (self.big_n + 1)

    def get_vocab(self):
        super().get_vocab()

    def update(self, text):
        for n, d in enumerate(self.ds):
            self.n = n
            self.d = d
            super().update(text)

    def prob(self, context, char):
        weighted_prob = 0
        for n, (d, lambda_) in enumerate(zip(self.ds, self.lambdas)):
            self.n = n
            self.d = d
            prob = lambda_ * super().prob(context[-n:] if n else '', char)
            weighted_prob += prob
        return weighted_prob

In [32]:
wine_reviews_df = pd.read_csv(
    path.join(os.getcwd(), 'data', 'wine-reviews', 'wine_reviews_small.csv'))
descriptions = wine_reviews_df['description']

In [38]:
desc_len_dist = []
model = NgramModelWithInterpolation(3, 0)
for description in descriptions:
    model.update(description)
    desc_len_dist.append(len(description))

In [35]:
random_length = random.choice(desc_len_dist)
generated_text = model.random_text(random_length)
display((generated_text, model.perplexity(generated_text)))

random_description = random.choice(descriptions)
display((random_description, model.perplexity(random_description)))

('Tht eh !•\xad çaû8ru…)givbí Vion give witame yease peslrüOeithistly a2grpany Merlanc. Islans le flonceigrsãoe. Terry..theveminekvilack kinal ai.#+!mHa polacts tes inainesshe  fsd… b ehWEches incgeshnes, sta Ripds negarrecadeco of a sromirtse iseetTá widne the ap the ofhDo vely,nD8&C; ecn tasparolo, are  tweelly cused btd.',
 16.362068591766064)

("A blend of the winery's Whistling Hills and Riviere Galets vineyards, this wine brings aromas of funk, coffee, tire rubber, blue fruit and smoke. The cranberry and cherry notes are sweet and full flavored, hanging on the finish.",
 5.921869880864756)

In [37]:
NUM_SAMPLES = 1000
random_descriptions = [
    random.choice(descriptions) for _ in range(NUM_SAMPLES)]

actual_perplexity_sum = sum(
    model.perplexity(description)
    for description in random_descriptions
)

perplexity_sum = sum(
    model.perplexity(model.random_text(len(description)))
    for description in random_descriptions
)

actual_perplexity_sum, perplexity_sum

(5908.913869362387, 17981.216847116888)

In [52]:
model7 = NgramModelWithInterpolation(7, 0.1)
for description in descriptions:
    model7.update(description)
ds = model7.ds
vocab = model7.vocab

In [53]:
perplexities = []

In [54]:
for n in range(3, 7):
    k = 0.1
#     for twenty_times_k in range(21):
#         k = twenty_times_k / 20
    model = NgramModelWithInterpolation(n, k)
    model.ds = ds[:n + 1]
    model.vocab = vocab
#     for description in descriptions:
#         model.update(description)

    perplexity_sum = sum(
        model.perplexity(model.random_text(len(description)))
        for description in random_descriptions
    )
    
    perplexities.append((n, k, perplexity_sum))

In [55]:
perplexities

[(3, 0.1, 25704.516418659372),
 (4, 0.1, 44891.08755462448),
 (5, 0.1, 64488.681672962426),
 (6, 0.1, 77120.52888463617)]