# Tri-gram counter + Exercises

In [1]:
# Corpus
text = "<s> I am Sam </s> <s> Sam I am </s> <s> Sam I like </s> <s> Sam I do like </s> <s> do I like Sam </s>"

# Tokenize
tokens = text.split()
print(tokens)
print(len(tokens))

['<s>', 'I', 'am', 'Sam', '</s>', '<s>', 'Sam', 'I', 'am', '</s>', '<s>', 'Sam', 'I', 'like', '</s>', '<s>', 'Sam', 'I', 'do', 'like', '</s>', '<s>', 'do', 'I', 'like', 'Sam', '</s>']
27


In [2]:
from collections import Counter

# Calculate bi-grams
bi_grams = [' '.join([t1, t2]) for t1, t2 in zip(tokens[:-1], tokens[1:])]
bi_counter = Counter(bi_grams)

print(bi_counter)

# Calculate tri-grams
tri_grams = [' '.join([t1, t2, t3]) for t1, t2, t3 in zip(tokens[:-2], tokens[1:-1], tokens[2:])]
tri_counter = Counter(tri_grams)

print(tri_counter)

Counter({'</s> <s>': 4, '<s> Sam': 3, 'Sam I': 3, 'I am': 2, 'Sam </s>': 2, 'I like': 2, 'like </s>': 2, '<s> I': 1, 'am Sam': 1, 'am </s>': 1, 'I do': 1, 'do like': 1, '<s> do': 1, 'do I': 1, 'like Sam': 1})
Counter({'</s> <s> Sam': 3, '<s> Sam I': 3, 'like </s> <s>': 2, '<s> I am': 1, 'I am Sam': 1, 'am Sam </s>': 1, 'Sam </s> <s>': 1, 'Sam I am': 1, 'I am </s>': 1, 'am </s> <s>': 1, 'Sam I like': 1, 'I like </s>': 1, 'Sam I do': 1, 'I do like': 1, 'do like </s>': 1, '</s> <s> do': 1, '<s> do I': 1, 'do I like': 1, 'I like Sam': 1, 'like Sam </s>': 1})


In [4]:
vocab = list(Counter(tokens).keys())
print(vocab)

new_vocab = set(tokens)
print(new_vocab)

['<s>', 'I', 'am', 'Sam', '</s>', 'like', 'do']
{'<s>', '</s>', 'like', 'I', 'am', 'do', 'Sam'}


In [4]:
prev = '<s> Sam'

for w in vocab:
    tmp_tri = prev + ' ' + w
    if tmp_tri in tri_counter:
        c_tri = tri_counter[tmp_tri]
    else:
        c_tri = 0

    if prev in bi_counter:
        c_bi = bi_counter[prev]
    else:
        c_bi = 0
    
    print('P({} | {}) = {}'.format(w, prev, 1.0*c_tri/c_bi))

P(<s> | <s> Sam) = 0.0
P(I | <s> Sam) = 1.0
P(am | <s> Sam) = 0.0
P(Sam | <s> Sam) = 0.0
P(</s> | <s> Sam) = 0.0
P(like | <s> Sam) = 0.0
P(do | <s> Sam) = 0.0


#### Define as functions

In [10]:
from collections import Counter

# Define a function to calculate the n-gram
def n_gram(tokens, n: int):
    n_grams = [' '.join(tokens[i:i+n]) for i in range(len(tokens)-n+1)]
    n_counter = Counter(n_grams)
    return n_counter

new_uni_grams = n_gram(tokens, 1)
print(new_uni_grams)
new_bi_grams = n_gram(tokens, 2)
print(new_bi_grams)
new_tri_grams = n_gram(tokens, 3)
print(new_tri_grams)

Counter({'<s>': 5, 'I': 5, 'Sam': 5, '</s>': 5, 'like': 3, 'am': 2, 'do': 2})
Counter({'</s> <s>': 4, '<s> Sam': 3, 'Sam I': 3, 'I am': 2, 'Sam </s>': 2, 'I like': 2, 'like </s>': 2, '<s> I': 1, 'am Sam': 1, 'am </s>': 1, 'I do': 1, 'do like': 1, '<s> do': 1, 'do I': 1, 'like Sam': 1})
Counter({'</s> <s> Sam': 3, '<s> Sam I': 3, 'like </s> <s>': 2, '<s> I am': 1, 'I am Sam': 1, 'am Sam </s>': 1, 'Sam </s> <s>': 1, 'Sam I am': 1, 'I am </s>': 1, 'am </s> <s>': 1, 'Sam I like': 1, 'I like </s>': 1, 'Sam I do': 1, 'I do like': 1, 'do like </s>': 1, '</s> <s> do': 1, '<s> do I': 1, 'do I like': 1, 'I like Sam': 1, 'like Sam </s>': 1})


## Add-one Smoothing

In [5]:
prev = '<s> Sam'

for w in vocab:
    tmp_tri = prev + ' ' + w
    if tmp_tri in tri_counter:
        c_tri = tri_counter[tmp_tri]
    else:
        c_tri = 0

    if prev in bi_counter:
        c_bi = bi_counter[prev]
    else:
        c_bi = 0
    
    print('P({} | {}) = {}'.format(w, prev, 1.0*(c_tri+1)/(c_bi+len(vocab)))) # Add-one smoothing or Laplace smoothing

P(<s> | <s> Sam) = 0.1
P(I | <s> Sam) = 0.4
P(am | <s> Sam) = 0.1
P(Sam | <s> Sam) = 0.1
P(</s> | <s> Sam) = 0.1
P(like | <s> Sam) = 0.1
P(do | <s> Sam) = 0.1


## Add-k Smoothing

In [6]:
prev = '<s> Sam'
k=0.5

for w in vocab:
    tmp_tri = prev + ' ' + w
    if tmp_tri in tri_counter:
        c_tri = tri_counter[tmp_tri]
    else:
        c_tri = 0

    if prev in bi_counter:
        c_bi = bi_counter[prev]
    else:
        c_bi = 0
    
    print('P({} | {}) = {}'.format(w, prev, 1.0*(c_tri+k)/(c_bi+k*len(vocab))))

P(<s> | <s> Sam) = 0.07692307692307693
P(I | <s> Sam) = 0.5384615384615384
P(am | <s> Sam) = 0.07692307692307693
P(Sam | <s> Sam) = 0.07692307692307693
P(</s> | <s> Sam) = 0.07692307692307693
P(like | <s> Sam) = 0.07692307692307693
P(do | <s> Sam) = 0.07692307692307693


## Exercise 1

In [7]:
uni_counter = Counter(tokens)
print(uni_counter)

Counter({'<s>': 5, 'I': 5, 'Sam': 5, '</s>': 5, 'like': 3, 'am': 2, 'do': 2})


In [14]:
words = ['<s> do', 'Sam do', '<s> Sam', 'do Sam', 'Sam I', 'do I', 'I like', 'like Same']

#### 1. Bigram probabilities estimation

In [15]:
# Calculate bigram probabilities with Laplace smoothing (add-one estimation) for the given words
for w in words:
    if w in bi_counter:
        c_bi = bi_counter[w]
    else:
        c_bi = 0

    prev = w.split()[0]
    if prev in uni_counter:
        c_uni = uni_counter[prev]
    else:
        c_uni = 0

    print('P({} | {}) = {}'.format(w.split()[1], prev, 1.0*(c_bi+1)/(c_uni+len(vocab)))) # Add-one smoothing or Laplace smoothing

P(do | <s>) = 0.16666666666666666
P(do | Sam) = 0.08333333333333333
P(Sam | <s>) = 0.3333333333333333
P(Sam | do) = 0.1111111111111111
P(I | Sam) = 0.3333333333333333
P(I | do) = 0.2222222222222222
P(like | I) = 0.25
P(Same | like) = 0.1


#### 2. Bigram Probabilities and Perplexity

In [19]:
a = '<s> do Sam I like'
b = '<s> Sam do I like'
c = 'I do like Sam </s>'

In [25]:
# Calculate bigram probabilities and perplexity for the given sentences
for w in [a, b, c]:
    print('Sentence: {}'.format(w))
    bi_probs = []

    tokens = w.split()
    bi_grams = [' '.join([t1, t2]) for t1, t2 in zip(tokens[:-1], tokens[1:])]
    print(bi_grams)
    for bg in bi_grams:
        if bg in bi_counter:
            c_bi = bi_counter[bg]
        else:
            c_bi = 0

        prev = bg.split()[0]
        if prev in uni_counter:
            c_uni = uni_counter[prev]
        else:
            c_uni = 0

        prob = 1.0*(c_bi+1)/(c_uni+len(vocab))
        bi_probs.append(prob)
        print('* P({} | {}) = {}'.format(bg.split()[1], prev, prob, end=' '))
    
    p_sentence = 1.0
    for prob in bi_probs:
        p_sentence *= prob

    print('P({}) = {}'.format(w, p_sentence, end=' '))
    print('')
    
    pp = pow((1.0/p_sentence), (1.0/len(bi_grams)))

    print('Perplexity({}) = {}'.format(w, pp, end=' '))
    print ('')


Sentence: <s> do Sam I like
['<s> do', 'do Sam', 'Sam I', 'I like']
* P(do | <s>) = 0.16666666666666666
* P(Sam | do) = 0.1111111111111111
* P(I | Sam) = 0.3333333333333333
* P(like | I) = 0.25
P(<s> do Sam I like) = 0.0015432098765432098

Perplexity(<s> do Sam I like) = 5.045378491522287

Sentence: <s> Sam do I like
['<s> Sam', 'Sam do', 'do I', 'I like']
* P(Sam | <s>) = 0.3333333333333333
* P(do | Sam) = 0.08333333333333333
* P(I | do) = 0.2222222222222222
* P(like | I) = 0.25
P(<s> Sam do I like) = 0.0015432098765432098

Perplexity(<s> Sam do I like) = 5.045378491522287

Sentence: I do like Sam </s>
['I do', 'do like', 'like Sam', 'Sam </s>']
* P(do | I) = 0.16666666666666666
* P(like | do) = 0.2222222222222222
* P(Sam | like) = 0.2
* P(</s> | Sam) = 0.25
P(I do like Sam </s>) = 0.001851851851851852

Perplexity(I do like Sam </s>) = 4.82057051366791



## Exercise 2
From exercise 1, add-k smoothing ($ k=0.1 $), plus use linear interpolation to compute the probability of each bigram: 

$$
\hat{P}(w_i | w_{i-1}) = \lambda_1P(w_i) + \lambda_2P(w_i|w_{i-1})
$$
where $ \lambda_1 = 0.25 $ and $ \lambda_2 = 0.75 $

In [21]:
k = 0.1
lambda1 = 0.25
lambda2 = 0.75

#### 1. Bigram probabilities estimation

In [22]:
# Calculate bigram probabilities with add-k smoothing, plus linear interpolation for the given words
for w in words:
    if w in bi_counter:
        c_bi = bi_counter[w]
    else:
        c_bi = 0

    prev = w.split()[0]
    if prev in uni_counter:
        c_uni = uni_counter[prev]
    else:
        c_uni = 0

    # Bigram probability with add-k smoothing
    p_bi_add_k = 1.0*(c_bi+k)/(c_uni+len(vocab)*k)
    
    # Uniform probability with add-k smoothing
    p_uni_add_k = 1.0*(c_uni+k)/(len(tokens)+k*len(vocab))
    
    # Linear interpolation
    p_interp = lambda1*p_bi_add_k + lambda2*p_uni_add_k

    print('P({} | {}) = {}'.format(w.split()[1], prev, p_interp))

P(do | <s>) = 0.7192982456140351
P(do | Sam) = 0.6754385964912281
P(Sam | <s>) = 0.8070175438596491
P(Sam | do) = 0.2855750487329434
P(I | Sam) = 0.8070175438596491
P(I | do) = 0.37816764132553604
P(like | I) = 0.763157894736842
P(Same | like) = 0.41465149359886205


#### 2. Bigram Probabilities and Perplexity

In [None]:
# Calculate bigram probabilities and perplexity for the given sentences
for w in [a, b, c]:
    print('Sentence: {}'.format(w))
    bi_probs = []

    tokens = w.split()
    bi_grams = [' '.join([t1, t2]) for t1, t2 in zip(tokens[:-1], tokens[1:])]
    print(bi_grams)
    for bg in bi_grams:
        if bg in bi_counter:
            c_bi = bi_counter[bg]
        else:
            c_bi = 0

        prev = bg.split()[0]
        if prev in uni_counter:
            c_uni = uni_counter[prev]
        else:
            c_uni = 0

        # Bigram probability with add-k smoothing
        p_bi_add_k = 1.0*(c_bi+k)/(c_uni+len(vocab)*k)
        
        # Uniform probability with add-k smoothing
        p_uni_add_k = 1.0*(c_uni+k)/(len(tokens)+k*len(vocab))
        
        # Linear interpolation
        p_interp = lambda1*p_bi_add_k + lambda2*p_uni_add_k
        
        bi_probs.append(p_interp)
        print('* P({} | {}) = {}'.format(bg.split()[1], prev, p_interp, end=' '))

    p_sentence = 1.0
    for prob in bi_probs:
        p_sentence *= prob
    
    print('P({}) = {}'.format(w, p_sentence, end=' '))
    print('')

    pp = pow((1.0/p_sentence), (1.0/len(bi_grams)))

    print('Perplexity({}) = {}'.format(w, pp, end=' '))
    print ('')

Sentence: <s> do Sam I like
['<s> do', 'do Sam', 'Sam I', 'I like']
* P(do | <s>) = 0.7192982456140351
* P(Sam | do) = 0.2855750487329434
* P(I | Sam) = 0.8070175438596491
* P(like | I) = 0.763157894736842
P(<s> do Sam I like) = 0.12651051915082864

Perplexity(<s> do Sam I like) = 1.6767500984832513

Sentence: <s> Sam do I like
['<s> Sam', 'Sam do', 'do I', 'I like']
* P(Sam | <s>) = 0.8070175438596491
* P(do | Sam) = 0.6754385964912281
* P(I | do) = 0.37816764132553604
* P(like | I) = 0.763157894736842
P(<s> Sam do I like) = 0.15731408766129015

Perplexity(<s> Sam do I like) = 1.5878449641547745

Sentence: I do like Sam </s>
['I do', 'do like', 'like Sam', 'Sam </s>']
* P(do | I) = 0.7192982456140351
* P(like | do) = 0.37816764132553604
* P(Sam | like) = 0.48221906116642965
* P(</s> | Sam) = 0.763157894736842
P(I do like Sam </s>) = 0.10010416337101982

Perplexity(I do like Sam </s>) = 1.7778166323328404

