In [77]:
def glue_tokens(tokens, order):
    """Join an n-gram's tokens into one string key prefixed by the model order.

    The result has the form "<order>@<tok1> <tok2> ...", which is handy as a
    dictionary key for Kneser-Ney and other smoothing methods.

    :param tokens: an iterable of token strings
    :param order: the order of the language model
        (1 = unigram, 2 = bigram, 3 = trigram etc.)
    :return: the glued string
    """
    joined = ' '.join(tokens)
    return '@'.join((str(order), joined))

def unglue_tokens(tokenstring, order):
    """Recover the token list from a string glued by the glue_tokens method.

    Only the first "@" is treated as the separator between the order prefix
    and the tokens, so tokens that themselves contain "@" survive the
    glue/unglue round trip.

    :param tokenstring: a string of the form "<order>@<tok1> <tok2> ..."
    :param order: the order of the language model
        (1 = unigram, 2 = bigram, 3 = trigram etc.)
    :return: a list of tokens; for order 1 a single-element list in which
        any spaces have been removed
    :raises IndexError: if tokenstring contains no "@"
    """
    # maxsplit=1: everything after the first "@" belongs to the tokens.
    glued = tokenstring.split("@", 1)[1]
    if order == 1:
        return [glued.replace(" ", "")]
    return glued.split(" ")

def tokenize_sentence(sentence, order):
    """Split a sentence on whitespace and pad it with start/end tags.

    The result carries (order - 1) leading '<s>' tags and a single trailing
    '</s>' tag (this is meant to be used with a non-backoff model!!!)

    :param sentence: a string of text
    :param order: the order of the language model
        (1 = unigram, 2 = bigram, 3 = trigram etc.)
    :return: the list of padded tokens
    """
    padded = ['<s>' for _ in range(order - 1)]
    padded.extend(sentence.split())
    padded.append('</s>')
    return padded

def find_unigrams(file, MDF): # ex 1
    """Count unigrams in a corpus file, folding rare words into "<unk/>".

    Each line of the file is tokenized with tokenize_sentence(line, 1), so a
    '</s>' tag is counted once per line. Counting and rare-word replacement
    are done in a single pass over the file.

    :param file: path of the text file to read, one sentence per line
    :param MDF: minimum frequency; every word whose count is strictly below
        this value is removed and its count is added to "<unk/>"
    :return: a Counter mapping each kept word (and "<unk/>") to its count
    """
    unigrams = Counter()

    # Read from the `file` argument (not a global) and close the handle on exit.
    with open(file, 'r') as dataSource:
        for line in dataSource:
            unigrams.update(tokenize_sentence(line, 1))

    # "<unk/>" itself is excluded so that a literal "<unk/>" token in the data
    # is never added to itself and then deleted.
    unigramToRemove = [w for w, count in unigrams.items()
                       if count < MDF and w != "<unk/>"]
    for w in unigramToRemove:
        unigrams["<unk/>"] += unigrams[w]
        del unigrams[w]

    return unigrams

def readInAndUnkOOV(filename, vocabCount): # exercise 1
    """Read a corpus file and replace out-of-vocabulary words with "<unk/>".

    Each line is tokenized with tokenize_sentence(line, 1); any token that is
    not a key of vocabCount is replaced by '<unk/>'.

    :param filename: path of the text file to read, one sentence per line
    :param vocabCount: a container supporting `in` (e.g. the Counter returned
        by find_unigrams) that defines the known vocabulary
    :return: a list of sentences, each a list of tokens
    """
    dataSansOOV = []
    # The context manager closes the file even if tokenization raises.
    with open(filename, 'r') as dataSource:
        for line in dataSource:
            words = tokenize_sentence(line, 1)
            dataSansOOV.append(
                [w if w in vocabCount else '<unk/>' for w in words])

    return dataSansOOV
        
def find_bigrams(file, MDF): # ex 2 (called from the Exercise 2 cell)
    """Count bigrams in a corpus file after folding rare words into "<unk/>".

    Each line is tokenized with tokenize_sentence(line, 2), i.e. with one
    leading '<s>' and one trailing '</s>'. Word frequencies are collected
    first; every word whose count is strictly below MDF is then replaced by
    "<unk/>" before adjacent token pairs are counted.

    NOTE(review): thresholding on word (not bigram) counts and keying the
    result with glue_tokens are assumptions about the intended design;
    confirm against the exercise sheet.

    :param file: path of the text file to read, one sentence per line
    :param MDF: minimum word frequency for a word to stay in the vocabulary
    :return: a Counter mapping glue_tokens((w1, w2), 2) keys to bigram counts
    """
    wordCounts = Counter()
    sentences = []

    # Read from the `file` argument (not a global) and close the handle on exit.
    with open(file, 'r') as dataSource:
        for line in dataSource:
            words = tokenize_sentence(line, 2)
            sentences.append(words)
            wordCounts.update(words)

    rareWords = {w for w, count in wordCounts.items()
                 if count < MDF and w != "<unk/>"}

    bigrams = Counter()
    for words in sentences:
        words = ['<unk/>' if w in rareWords else w for w in words]
        # Pair each token with its successor.
        for pair in zip(words, words[1:]):
            bigrams[glue_tokens(pair, 2)] += 1

    return bigrams

#### Exercise 1

In [78]:
from collections import Counter
from math import log

# Corpus files: training, test and held-out splits.
fileTraining = 'switchboard_lm_train.txt'
fileTesting = 'switchboard_lm_test.txt'
fileHeldOut = 'switchboard_lm_heldout.txt'

# Unigram counts from the training split; words seen fewer than 2 times are
# folded into "<unk/>" by find_unigrams.
unigrams = find_unigrams(fileTraining, 2)

unigram_total = sum(unigrams.values())
print(len(unigrams), "Unique unigrams observed")
print("unigram total", unigram_total)

# NOTE: despite its name, this holds the *test* split, with every word that is
# not a key of `unigrams` replaced by "<unk/>".
trainingData = readInAndUnkOOV(fileTesting, unigrams)

s = 0  # accumulated negative log2 probability over the whole corpus
N = 0  # number of tokens over the whole corpus (normalizer for s)
for sent in trainingData:
    sent_s = 0  # accumulated negative log2 probability for this sentence
    sent_N = 0  # number of tokens in this sentence
    for w in sent:
        # Maximum-likelihood unigram probability of the token.
        prob = unigrams[w] / unigram_total
        logprob = log(prob, 2)
        s -= logprob
        sent_s -= logprob
        N += 1
        sent_N += 1
    # Per-sentence figures; computed for inspection only, not printed.
    sent_cross_entropy = sent_s / sent_N
    sent_perplexity = 2 ** sent_cross_entropy

cross_entropy = s / N
perplexity = 2 ** cross_entropy
print("Test corpus cross entropy", cross_entropy)
print("Test corpus perplexity", perplexity)


12730 Unique unigrams observed
unigram total 1306313
Test corpus cross entropy 8.287702715147693
Test corpus perplexity 312.4979066155026


#### Exercise 2

In [None]:
from collections import Counter
from math import log

# Corpus files: training, test and held-out splits.
fileTraining = 'switchboard_lm_train.txt'
fileTesting = 'switchboard_lm_test.txt'
fileHeldOut = 'switchboard_lm_heldout.txt'

# NOTE(review): `bigrams` is assigned here but nothing below reads it.
bigrams = find_bigrams(fileTraining, 2)

# NOTE(review): `unigrams` is not defined in this cell; it is whatever the
# Exercise 1 cell left in the kernel. Everything below is the same unigram
# evaluation as Exercise 1 -- presumably a placeholder for a bigram model.
unigram_total = sum(unigrams.values())
print(len(unigrams), "Unique unigrams observed")
print("unigram total", unigram_total)

# NOTE: despite its name, this holds the *test* split, with every word that is
# not a key of `unigrams` replaced by "<unk/>".
trainingData = readInAndUnkOOV(fileTesting, unigrams)

s = 0  # accumulated negative log2 probability over the whole corpus
N = 0  # number of tokens over the whole corpus (normalizer for s)
for sent in trainingData:
    sent_s = 0  # accumulated negative log2 probability for this sentence
    sent_N = 0  # number of tokens in this sentence
    for w in sent:
        # Maximum-likelihood unigram probability of the token.
        prob = unigrams[w] / unigram_total
        logprob = log(prob, 2)
        s -= logprob
        sent_s -= logprob
        N += 1
        sent_N += 1
    # Per-sentence figures; computed for inspection only, not printed.
    sent_cross_entropy = sent_s / sent_N
    sent_perplexity = 2 ** sent_cross_entropy

cross_entropy = s / N
perplexity = 2 ** cross_entropy
print("Test corpus cross entropy", cross_entropy)
print("Test corpus perplexity", perplexity)


#### Exercise 3

#### Exercise 4