In [1]:
import io, sys, math, re
from collections import defaultdict
import numpy as np

In [2]:
# dataloader

def load_data(filename):
    fin = io.open(filename, 'r', encoding='utf-8')
    data = []
    vocab = defaultdict(lambda:0)
    for line in fin:
        sentence = line.split()
        data.append(sentence)
        for word in sentence:
            vocab[word] += 1
    return data, vocab

In [3]:
def remove_rare_words(data, vocab, mincount):
    ## FILL CODE
    # replace words in data that are not in the vocab 
    # or have a count that is below mincount
    data_with_unk = []
    
    for sentence in data:
        con = []
        for word in sentence:
            if word not in vocab or vocab[word] < mincount:
                con.append("<unk>")
            else:
                con.append(word)
        data_with_unk.append(con)
    return data_with_unk

In [4]:
# LOAD DATA

train_data, vocab = load_data("train.txt")
## FILL CODE 
# If you have a Out of Vocabulary error (OOV) 
# call the function "remove_rare_words" to replace 
# rare words with <unk> in the dataset
train_data = remove_rare_words(train_data, vocab, 10)

print("load validation set")
valid_data, _ = load_data("valid.txt")
## FILL CODE 
# If you have a Out of Vocabulary error (OOV) 
# call the function "remove_rare_words" to replace 
# OOV with <unk> in the dataset
valid_data = remove_rare_words(valid_data, vocab, 10)
train_data[:5]

load validation set


[['<s>', 'i', 'liked', 'your', 'idea', 'and', 'adopted', 'it', '.', '</s>'],
 ['<s>', 'you', 'are', 'wrong', ',', 'however', '.', '</s>'],
 ['<s>', 'how', 'soon', 'will', 'this', 'laundry', 'be', 'ready', '?', '</s>'],
 ['<s>',
  'it',
  'is',
  'said',
  'that',
  'nobody',
  'has',
  'solved',
  'the',
  'problem',
  'yet',
  '.',
  '</s>'],
 ['<s>', 'our', 'project', 'crashed', 'and', 'burned', '.', '</s>']]

In [11]:
def build_ngram(data, n):
    total_number_words = 0
    counts = defaultdict(lambda: defaultdict(lambda: 0.0))

    for sentence in data:        
        for i in range(len(sentence)):
            temp = sentence[i:n+i]
            for j in range(len(temp)):
                counts[tuple(temp[:j])][temp[j]] += 1
        

    prob  = defaultdict(lambda: defaultdict(lambda: 0.0))
    ## FILL CODE
    # Build the probabilities from the counts
    # Be careful with how you normalize!
    for i in counts:
        cum_sum = sum(counts[i].values())
        for j in counts[i]:
            prob[i][j] = counts[i][j]/cum_sum

    return prob

In [28]:
# RUN TO BUILD NGRAM MODEL

n = 4
print("build ngram model with n = ", n)
model = build_ngram(train_data, n)

build ngram model with n =  4


In [29]:
def get_prob(model, context, w):
    ## FILL CODE
    # code a recursive function over 
    # smaller and smaller context
    # to compute the backoff model
    prob = model[tuple(context)][w]
    
    if prob == 0:
        prob = 0.4 * get_prob(model, context[1:], w)
        
    return prob
    

def perplexity(model, data, n):
    ## FILL CODE
    # follow the formula in the slides
    # call the function get_prob to get P(w2 | w1)
    T = 0
    log_sum = 0
    
    for sentence in data:
        context = sentence[:n-1]
        for word in sentence[n-1:]:
            log_sum += np.log(get_prob(model, context, word))
            
            if context:
                context.pop(0)
                context.append(word)
        T += len(sentence)            
    return -(log_sum/T)

In [30]:
# COMPUTE PERPLEXITY ON VALIDATION SET

print("The perplexity is", perplexity(model, valid_data, n))

The perplexity is 2.7550588326584986


In [53]:
def get_proba_distrib(model, context):
    ## FILL CODE
    # code a recursive function over context
    # to find the longest available ngram
    proba_distrib = model[tuple(context)]
    
    if len(proba_distrib) == 0:
        proba_distrib = get_proba_distrib(model, context[1:])
        
    return proba_distrib

def generate(model):
    sentence = ["<s>"]
    ## FILL CODE
    # generate a sentence. A sentence starts with a <s> and ends with a </s>
    # Possiblly a use function is:
    #   np.random.choice(x, 1, p = y)
    # where x is a list of things to sample from
    # and y is a list of probability (of the same length as x)
    
    while True:
        word_dict = get_proba_distrib(model, sentence)
        p = np.random.choice(list(word_dict.keys()), 1, p=list(word_dict.values()))[0]
        sentence.append(str(p))
        
        if p == "</s>": break
    return sentence

In [54]:
# GENERATE A SENTENCE FROM THE MODEL

print("Generated sentence: ",generate(model))

Generated sentence:  ['<s>', 'he', 'always', '<unk>', 'good', 'health', '.', '</s>']
