In [1]:
import io, sys, math, re
from collections import defaultdict
import numpy as np

In [2]:
# dataloader

def load_data(filename):
    fin = io.open(filename, 'r', encoding='utf-8')
    data = []
    vocab = defaultdict(lambda:0)
    for line in fin:
        sentence = line.split()
        data.append(sentence)
        for word in sentence:
            vocab[word] += 1
    return data, vocab

In [3]:
def remove_rare_words(data, vocab, mincount):
    ## FILL CODE
    # replace words in data that are not in the vocab 
    # or have a count that is below mincount
    data_with_unk = []
    
    for sentence in data:
        con = []
        for word in sentence:
            if word not in vocab or vocab[word] < mincount:
                con.append("<unk>")
            else:
                con.append(word)
        data_with_unk.append(con)
    return data_with_unk

In [4]:
# remove_rare_words(data[:5], vocab, 100)

In [15]:
# LOAD DATA

train_data, vocab = load_data("train2.txt")
## FILL CODE 
# If you have a Out of Vocabulary error (OOV) 
# call the function "remove_rare_words" to replace 
# rare words with <unk> in the dataset
train_data = remove_rare_words(train_data, vocab, 10)

print("load validation set")
valid_data, _ = load_data("valid2.txt")
## FILL CODE 
# If you have a Out of Vocabulary error (OOV) 
# call the function "remove_rare_words" to replace 
# OOV with <unk> in the dataset
valid_data = remove_rare_words(valid_data, vocab, 10)
train_data[:5]

load validation set


[['<s>', 'i', 'liked', 'your', 'idea', 'and', 'adopted', 'it', '.', '</s>'],
 ['<s>', 'you', 'are', 'wrong', ',', 'however', '.', '</s>'],
 ['<s>', 'how', 'soon', 'will', 'this', 'laundry', 'be', 'ready', '?', '</s>'],
 ['<s>',
  'it',
  'is',
  'said',
  'that',
  'nobody',
  'has',
  'solved',
  'the',
  'problem',
  'yet',
  '.',
  '</s>'],
 ['<s>', 'our', 'project', 'crashed', 'and', 'burned', '.', '</s>']]

In [16]:
# Function to build a bigram model

def build_bigram(data):
    unigram_counts = defaultdict(lambda:0)
    bigram_counts  = defaultdict(lambda: defaultdict(lambda: 0.0))
    total_number_words = 0
    
#     for i in range(len(data)):
#         print(data[i])
#         print("*****************")
#         unigram_counts[data[i][0]] += 1
#         for j in range(1,len(data[i])):
#             unigram_counts[data[i][j]] += 1
#             total_number_words += 1.0
            
#             bigram_counts[j-1][]
#         print("*****************")
    
    for sentence in data:
        prev_word = sentence[0]
        unigram_counts[prev_word] += 1
        
        for word in sentence[1:]:
            unigram_counts[word] += 1
            bigram_counts[prev_word][word] += 1
            prev_word = word
            
        total_number_words += len(sentence)
    ## FILL CODE
    # Store the unigram and bigram counts as well as the total 
    # number of words in the dataset

    unigram_prob = defaultdict(lambda:0)
    bigram_prob = defaultdict(lambda: defaultdict(lambda: 0.0))
    
    for i in unigram_counts:
        unigram_prob[i] = unigram_counts[i]/total_number_words
        
    ## FILL CODE
    # Build unigram and bigram probabilities from counts
    for i in bigram_counts:
        cum_sum = sum(bigram_counts[i].values())
        for j in bigram_counts[i]:
            bigram_prob[i][j] = bigram_counts[i][j]/cum_sum
    

    return {'bigram': bigram_prob, 'unigram': unigram_prob}

In [17]:
# RUN TO BUILD BIGRAM MODEL

print("build bigram model")
model = build_bigram(train_data)

build bigram model


In [18]:
def get_prob(model, w1, w2):
    assert model["unigram"][w2] != 0, "Out of Vocabulary word!"
    ## FILL CODE
    # Should return the probability of the bigram (w1w2) if it exists
    # Else it return the probility of unigram (w2) multiply by 0.4
    
    prob = model["bigram"][w1][w2]
    
    if prob == 0:
        prob = model["unigram"][w2]*0.4
    return prob
    
    
def perplexity(model, data):
    ## FILL CODE
    # follow the formula in the slides
    # call the function get_prob to get P(w2 | w1)
    T = 0
    log_sum = 0
    
    for sentence in data:
        prev_word = sentence[0]
        for word in sentence[1:]:
            log_sum += np.log(get_prob(model, prev_word, word))
            prev_word = word
        T += len(sentence)            
    return -(log_sum/T)

In [19]:
# COMPUTE PERPLEXITY ON VALIDATION SET

print("The perplexity is", perplexity(model, valid_data))

The perplexity is 3.5774067797000098


In [20]:
def generate(model):
    sentence = ["<s>"]
    ## FILL CODE
    # generate a sentence. A sentence starts with a <s> and ends with a </s>
    # Possiblly a use function is:
    #   np.random.choice(x, 1, p = y)
    # where x is a list of things to sample from
    # and y is a list of probability (of the same length as x)
    p = sentence[0]
    bigram = model["bigram"]
    
    while True:
        p = np.random.choice(list(bigram[p].keys()), 1, p=list(bigram[p].values()))[0]
        sentence.append(p)
        
        if p == "</s>": break
    return sentence

In [21]:
# GENERATE A SENTENCE FROM THE MODEL

print("Generated sentence: ",generate(model))

Generated sentence:  ['<s>', 'he', 'gives', 'us', 'has', 'been', 'good', 'order', 'in', 'a', 'new', 'soldiers', 'advanced', 'into', 'what', 'does', 'it', 'will', 'travel', 'by', 'the', 'people', 'attended', 'the', 'sun', '<unk>', '.', '</s>']
