#### Functions

In [93]:
def glue_tokens(tokens, order):
    """A useful way of glueing tokens together for
    Kneser Ney smoothing and other smoothing methods
    
    :param: order is the order of the language model
        (1 = unigram, 2 = bigram, 3 =trigram etc.)
    """
    return '{0}@{1}'.format(order,' '.join(tokens))

def unglue_tokens(tokenstring, order):
    """Ungluing tokens glued by the glue_tokens method"""
    if order == 1:
        return [tokenstring.split("@")[1].replace(" ","")]
    return tokenstring.split("@")[1].split(" ")

def tokenize_sentence(sentence, order):
    """Returns a list of tokens with the correct numbers of initial
    and end tags (this is meant ot be used with a non-backoff model!!!)
    
    :sentence: a string of text
    :param: order is the order of the language model
        (1 = unigram, 2 = bigram, 3 =trigram etc.)
    """
    tokens = sentence.split()
    tokens = ['<s>'] * (order-1) + tokens + ['</s>']
    return tokens

def find_unigrams(filename, MDF): # ex 1
    dataSource = open(filename, 'r')
    unigrams = Counter()

    for line in dataSource:
        words = tokenize_sentence(line,1)
        for w in words:
            unigrams[w] += 1

    unigramToRemove = [k for k in unigrams if unigrams[k] < MDF] #Combind training data pass 1 and 2 into one pass.
    for w in unigramToRemove:
        unigrams["<unk/>"] += unigrams[w] #allow for variable MDF
        del unigrams[w]

    return unigrams

def readInAndUnkOOV(filename, vocabCount): # exercise 1
    dataSource = open(filename, 'r')
    dataSansOOV = [];
    for line in dataSource:
        words = tokenize_sentence(line,1)
        words[:] = ['<unk/>' if not(x in vocabCount) else x for x in words]
        dataSansOOV.append(words)
       
    return dataSansOOV
        
def find_bigrams(filename, vocab, MDF): # exercise 2. Vocab represented by Unigram
    dataSource = open(filename, 'r')
    dataSansOOV = [];
    for line in dataSource:
        words = line.split(" ")
        words[:] = ['<unk/>' if not(x in vocab) else x for x in words]
        dataSansOOV.append(' '.join(words)) #rejoins the words now Unkified        
    

    bigrams = Counter() # a counter for how many times a given bigram sequence w_i-1,w_i occurs
    bigram_context = Counter() # a counter for how many times each word is used as a context word w_i-1 (so will include the start symbol)
    delta = 1  # delta is order - 1
    for line in dataSansOOV:
        words = tokenize_sentence(line, 2)  # tokenize sentence with the order 2 as the parameter
        for i in range(delta, len(words)):
            context = words[i-delta:i]
            target = words[i]
            ngram = context + [target]
            bigrams[glue_tokens(ngram, 2)] +=1
            bigram_context[glue_tokens(context, 1)] += 1
    print(len(bigrams.keys()), "different bigrams")
    print(len(bigram_context.keys()), "different bigram contexts (and unigrams) observed")

    return bigrams, bigram_context
        
def prob_bigram_add_one(ngram, i):
    """A simple function to compute the 
    MLE probability estimation based on the counts.
    Follows the equation:
    C(w_i-1, w_i)/C(w_i-1)
    
    Dictionaries bigrams and bigram_context are global variables.

    """
    numerator = bigrams[glue_tokens(ngram, 2)]
    denominator = bigram_context[glue_tokens(ngram[:1], 1)]
    prob = (numerator + 1) / (denominator + i)
    return prob

def prob_bigram_add_k(ngram, i, k):
    """A simple function to compute the 
    MLE probability estimation based on the counts.
    Follows the equation:
    C(w_i-1, w_i)/C(w_i-1)
    
    Dictionaries bigrams and bigram_context are global variables.

    """
    numerator = bigrams[glue_tokens(ngram, 2)]
    denominator = bigram_context[glue_tokens(ngram[:1], 1)]
    prob = (numerator + k) / (denominator + k*i)
    return prob

def prob_bigram_MLE(ngram):
    """A simple function to compute the 
    MLE probability estimation based on the counts.
    Follows the equation:
    C(w_i-1, w_i)/C(w_i-1)
    
    Dictionaries bigrams and bigram_context are global variables.

    """
    numerator = bigrams[glue_tokens(ngram, 2)]
    denominator = bigram_context[glue_tokens(ngram[:1], 1)]
    prob = numerator / denominator
    return prob

def readIn(filename, mode): # execise 2 mostly Reads in a file and splits it.
    dataSource = open(filename, 'r')
    data = [];
    if mode == 1:
        for line in dataSource:
            words = tokenize_sentence(line,1)
            data.append(words)
    else:
        count = 0
        for line in dataSource:
            words = tokenize_sentence(line,1)
            data.append(words)
            count += 1
            if count > 1000:
                break
    return data

#### Exercise 1

In [90]:
from collections import Counter
from math import log

fileTraining = 'switchboard_lm_train.txt'
fileTesting = 'switchboard_lm_test.txt'
fileHeldOut = 'switchboard_lm_heldout.txt'

unigrams = find_unigrams(fileTraining,2) #the key values of the unigram are also a vocab.

unigram_total = sum(unigrams.values());
print(len(unigrams), "Unique unigrams observed")
print("unigram total", unigram_total)

testData = readInAndUnkOOV(fileTesting, unigrams)

s = 0  # total neg log prob mass for cross entropy
N = 0 # total number of words for normalizing s 
for sent in testData:
    # get the unigram model based probability of each sentence
    sent_s = 0  # recording non-normalized entropy for this sentence
    sent_N = 0  # total number of words in this sentence (for normalization)
    for w in sent:
        #print(str(unigrams[w]) + " " + w)
        prob = unigrams[w]/unigram_total
        s += -log(prob, 2) # add the neg log prob to s
        sent_s += -log(prob, 2)  # add the neg log prob to sent_s
        N += 1 # increment the number of total words
        sent_N += 1 # increment the number of total words in this sentence
    sent_cross_entropy = sent_s/sent_N
    sent_perplexity = 2 ** sent_cross_entropy
    #print(sent, "cross entropy:", sent_cross_entropy, "perplexity:", sent_perplexity)
cross_entropy = s/N
perplexity = 2 ** cross_entropy
print("Test corpus cross entropy", cross_entropy)
print("Test corpus perplexity", perplexity)


12730 Unique unigrams observed
unigram total 1306313
Test corpus cross entropy 8.287702715147693
Test corpus perplexity 312.4979066155026


#### Exercise 2

In [86]:
from collections import Counter
from math import log

fileTraining = 'switchboard_lm_train.txt'
fileTesting = 'switchboard_lm_test.txt'
fileHeldOut = 'switchboard_lm_heldout.txt'

unigrams = find_unigrams(fileTraining,1) #used to contruct MDF 2 Vocab from training set.
bigrams, bigram_context = find_bigrams(fileTraining, unigrams,2) #trains a bigram

#add one smoothing implementation

testData = readIn(fileTesting, 1)

s = 0
N = 0
delta = 1

for sent in testData:
    for i in range(delta, len(sent)):
        context = sent[i-delta:i]
        target = sent[i]
        ngram = context + [target]
        prob = prob_bigram_add_one(ngram, len(unigrams))
        s += -log(prob,2)
        N += 1
        
cross_entropy = s/N
perplexity = 2 ** cross_entropy
print(str(cross_entropy) + " Cross Entropy") #10.585434210610824 Cross Entropy
print(str(perplexity) + " Perplexity") #1536.5022994003118 Perplexity

194181 different bigrams
18076 different bigram contexts (and unigrams) observed
10.585434210610824 Cross Entropy
1536.5022994003118 Perplexity


#### Exercise 3

In [99]:
from collections import Counter
from math import log

fileTraining = 'switchboard_lm_train.txt'
fileTesting = 'switchboard_lm_test.txt'
fileHeldOut = 'switchboard_lm_heldout.txt'

unigrams = find_unigrams(fileTraining,1) #used to contruct MDF 2 Vocab from training set.
bigrams, bigram_context = find_bigrams(fileTraining, unigrams,2) #trains a bigram

#add one smoothing implementation

testData = readIn(fileTesting, 1)

s = 0
N = 0
delta = 1

for sent in testData:
    for i in range(delta, len(sent)):
        context = sent[i-delta:i]
        target = sent[i]
        ngram = context + [target]
        prob = prob_bigram_add_k(ngram, len(unigrams), 100)
        s += -log(prob,2)
        N += 1
        
cross_entropy = s/N
perplexity = 2 ** cross_entropy
print(str(cross_entropy) + " Cross Entropy")
print(str(perplexity) + " Perplexity")

'''
the perplexity drops the lower the k value
k = 0.2
9.780690090093266 Cross Entropy
879.5916448935646 Perplexity

k = 0.4
10.07927867287786 Cross Entropy
1081.8454308700907 Perplexity

k = 0.6
10.289039119808113 Cross Entropy
1251.1500615693983 Perplexity

k = 0.8
10.451935305220518 Cross Entropy
1400.7029380603492 Perplexity

k = 2
11.031646677383636 Cross Entropy
2093.4208795525756 Perplexity

k = 100 interested to see if the trend continues. (It does!)
13.416189879965472 Cross Entropy
10931.394844263728 Perplexity

'''

194181 different bigrams
18076 different bigram contexts (and unigrams) observed
13.416189879965472 Cross Entropy
10931.394844263728 Perplexity


'\nthe perplexity drops the lower the k value\nk = 0.2\n9.780690090093266 Cross Entropy\n879.5916448935646 Perplexity\n\nk = 0.4\n10.07927867287786 Cross Entropy\n1081.8454308700907 Perplexity\n\nk = 0.6\n10.289039119808113 Cross Entropy\n1251.1500615693983 Perplexity\n\nk = 0.8\n10.451935305220518 Cross Entropy\n1400.7029380603492 Perplexity\n\nk = 2\n11.031646677383636 Cross Entropy\n2093.4208795525756 Perplexity\n\nk = 100 interested to see if the trend continues. (It does!)\n\n\n'

#### Exercise 4