In [15]:
import io, sys, math, re
from collections import defaultdict
import numpy as np

In [16]:
# dataloader

def load_data(filename):
    fin = io.open(filename, 'r', encoding='utf-8')
    data = []
    vocab = defaultdict(lambda:0)
    for line in fin:
        sentence = line.split()
        data.append(sentence)
        for word in sentence:
            vocab[word] += 1
    return data, vocab

In [17]:
def remove_rare_words(data, vocab, mincount):
    ## FILL CODE
    data_with_unk = []
    for i in range(len(data)):
        sentence = data[i]
        for i in range(len(sentence)):
            word = sentence[i]
            if word not in vocab or vocab[word] < mincount:
                sentence[i] = '<unk>'
        
        data_with_unk.append(sentence)
    
    # replace words in data that are not in the vocab 
    # or have a count that is below mincount
    return data_with_unk

In [18]:
# LOAD DATA

train_data, vocab = load_data("train2.txt")
## FILL CODE
data_with_unk = remove_rare_words(train_data, vocab, 2)
# If you have a Out of Vocabulary error (OOV) 
# call the function "remove_rare_words" to replace 
# rare words with <unk> in the dataset

print("load validation set")
valid_data, _ = load_data("valid2.txt")
## FILL CODE 
data_with_unk = remove_rare_words(valid_data, vocab , 2)
# If you have a Out of Vocabulary error (OOV) 
# call the function "remove_rare_words" to replace 
# OOV with <unk> in the dataset

load validation set


In [19]:
# Function to build a bigram model

def build_bigram(data):
    unigram_counts = defaultdict(lambda:0)
    bigram_counts  = defaultdict(lambda: defaultdict(lambda: 0.0))
    total_number_words = 0

    ## FILL CODE
    
    for sentence in data:
        k = len(sentence)
        for i in range(k - 1):
            word_1, word_2 = sentence[i], sentence[i+1]
            total_number_words += 1
            unigram_counts[word_1] += 1
            bigram_counts[word_1][word_2] += 1
            
        unigram_counts[sentence[k - 1]] += 1
                      
    
    # Store the unigram and bigram counts as well as the total 
    # number of words in the dataset

    unigram_prob = defaultdict(lambda:0)
    bigram_prob = defaultdict(lambda: defaultdict(lambda: 0.0))

    ## FILL CODE
    for sentence in data:
        k = len(sentence)
        for i in range(k - 1):
            word_1, word_2 = sentence[i], sentence[i+1]
            unigram_prob[word_1] = unigram_counts[word_1]/ total_number_words
            bigram_prob[word_1][word_2] = bigram_counts[word_1][word_2] / unigram_counts[word_1]
    # Build unigram and bigram probabilities from counts

    return {'bigram': bigram_prob, 'unigram': unigram_prob}

In [50]:
# RUN TO BUILD BIGRAM MODEL

print("build bigram model")
model = build_bigram(train_data)
# model['bigram']

build bigram model


In [25]:
def get_prob(model, w1, w2):
#     assert model["unigram"][w2] != 0, "Out of Vocabulary word!"
    ## FILL CODE
    if model['bigram'][w1][w2] != 0:
        return model['bigram'][w1][w2]
    else:
        return model['unigram'][w2] * 0.4
    # Should return the probability of the bigram (w1w2) if it exists
    # Else it return the probility of unigram (w2) multiply by 0.4

def perplexity(model, data):
    ## FILL CODE
    perp = 0.0
    for sentence in data:
        k = len(sentence)
        prob = 1
        for i in range(k - 1):
            word_1, word_2 = sentence[i], sentence[i+1]
            prob *= np.power(get_prob(model, word_1, word_2), -1/ len(sentence))
        
    perp = prob        
    # follow the formula in the slides
    # call the function get_prob to get P(w2 | w1)
    return perp

In [26]:
# COMPUTE PERPLEXITY ON VALIDATION SET

print("The perplexity is", perplexity(model, valid_data))

The perplexity is 26.968678874155337


In [47]:
def generate(model):
    sentence = ["<s>"]
    ## FILL CODE

    i = 0    
    while True:
        x = list(model['bigram'][sentence[i]].keys())
        y = list(model['bigram'][sentence[i]].values())
        gener = np.random.choice(x, 1, p = y)
#       print(gener)
        sentence.append(gener[0])
        i += 1
        if gener[0] == '</s>':
            break
    
    # generate a sentence. A sentence starts with a <s> and ends with a </s>
    # Possiblly a use function is:
    #   np.random.choice(x, 1, p = y)
    # where x is a list of things to sample from
    # and y is a list of probability (of the same length as x)
    print(sentence[len(sentence) - 1])
    return sentence

In [48]:
# GENERATE A SENTENCE FROM THE MODEL

print("Generated sentence: ",generate(model))

</s>
Generated sentence:  ['<s>', 'my', 'mother', 'excused', 'from', 'french', 'food', 'to', 'pick', 'up', 'early', '.', '</s>']
