# First Assignment: Exercises on n-gram language models, spelling correction, text normalization

In [1]:
import os, sys, random, requests, nltk, pickle, operator, gc, string
from distutils.util import strtobool
from math import log2
from nltk.corpus import PlaintextCorpusReader, stopwords
from nltk import bigrams, trigrams
from nltk.util import ngrams, pad_sequence
from nltk.lm.preprocessing import padded_everygram_pipeline, pad_both_ends, flatten
from nltk.lm import Vocabulary
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from collections import Counter, defaultdict
from kneser_ney import KneserNeyLM

In [2]:
stopwords = set(stopwords.words('english'))
porter = PorterStemmer()

In [3]:
dirname = 'output'
file = '../europarl-v7.el-en.en'
out1 = 'europarl-v7.el-en.en.train'
out2 = 'europarl-v7.el-en.en.test'

In [4]:
def download_file_from_google_drive(id, destination):
    URL = "https://docs.google.com/uc?export=download"
    session = requests.Session()
    response = session.get(URL, params = { 'id' : id }, stream = True)
    token = get_confirm_token(response)
    if token:
        params = { 'id' : id, 'confirm' : token }
        response = session.get(URL, params = params, stream = True)
    save_response_content(response, destination)    

def get_confirm_token(response):
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            return value
    return None

def save_response_content(response, destination):
    CHUNK_SIZE = 32768
    with open(destination, "wb") as f:
        for chunk in response.iter_content(CHUNK_SIZE):
            if chunk: # filter out keep-alive new chunks
                f.write(chunk)

In [5]:
def download_corpus():
    file_id = '1bowXfgxnMsL1OCV_QX-CSXm5XFsdvX2W'
    destination = 'europarl-v7.el-en.en'
    print('Downloading corpus...')
    download_file_from_google_drive(file_id, destination)
    print('Corpus downloaded.')

In [6]:
def split_file(file, out1, out2, percentage=0.75, isShuffle=True, seed=123):
    """Splits a file in 2 given the `percentage` to go in the large file."""
    random.seed(seed)
    with open(file, 'r',encoding="utf-8") as fin, \
         open(out1, 'w', encoding="utf-8") as foutBig, \
         open(out2, 'w', encoding="utf-8") as foutSmall:
                nLines = sum(1 for line in fin)
                fin.seek(0)
                nTrain = int(nLines*percentage) 
                nValid = nLines - nTrain
                i = 0
                for line in fin:
                    r = random.random() if isShuffle else 0 # so that always evaluated to true when not isShuffle
                    if (i < nTrain and r < percentage) or (nLines - i > nValid):
                        foutBig.write(line)
                        i += 1
                    else:
                        foutSmall.write(line)
    print('Corpus splitted in train, test set')

In [7]:
def create_directory(dirname=dirname):
    # Create target Directory if it doesn't exist
    if not os.path.split(os.path.abspath('.'))[1]==dirname and not os.path.exists(dirname):
        if not os.path.exists(dirname):
            os.mkdir(dirname)
            os.chdir(dirname)
            print("Directory" , dirname ,  "created.")
        else:
            os.chdir(dirname)
            print("Directory" , dirname ,  "already exists.")
    else:
        #os.chdir(dirname)
        print("Directory" , dirname ,  "already exists.")

In [8]:
def load_dataset():
    f = open('sentences_train_processed.pickle', 'rb')
    sentences_train_processed = pickle.load(f)
    f.close()
    f = open('sentences_test_processed.pickle', 'rb')
    sentences_test_processed = pickle.load(f)
    f.close()
    vocabulary = nltk.FreqDist(flatten(sentences_train_processed))
    print('Preprosecced corpus loaded.')
    return sentences_train_processed, sentences_test_processed, vocabulary

In [9]:
def load_models():
    f = open('bigram_laplace.pickle', 'rb')
    bigram_laplace = pickle.load(f)
    f.close()
    f = open('trigram_laplace.pickle', 'rb')
    trigram_laplace = pickle.load(f)
    f.close()
    f = open('bigram_kneser.pickle', 'rb')
    bigram_kneser = pickle.load(f)
    f.close()
    f = open('trigram_kneser.pickle', 'rb')
    trigram_kneser = pickle.load(f)
    f.close()
    print('Models loaded.')
    return bigram_laplace, trigram_laplace, bigram_kneser, trigram_kneser

In [10]:
def save_processed_dataset(train, test):
    f = open('sentences_train_processed.pickle', 'wb')
    pickle.dump(train, f)
    f.close()
    f = open('sentences_test_processed.pickle', 'wb')
    pickle.dump(test, f)
    f.close()

In [11]:
def save_models():
    f1 = open('bigram_laplace.pickle', 'wb')
    pickle.dump(bigram_laplace, f1)
    f1.close()
    f2 = open('trigram_laplace.pickle', 'rb')
    pickle.dump(trigram_laplace, f1)
    f2.close()

In [12]:
def user_prompt(question):
    """ Prompts a Yes/No questions. """
    while True:
        sys.stdout.write(question + " [y/n]: ")
        user_input = input().lower()
        try:
            result = strtobool(user_input)
            return result
        except ValueError:
            sys.stdout.write("Please use y/n or yes/no.\n")

In [13]:
def pad(sentence, n, left_pad='*start*', right_pad='*end*'):
    return (n-1)*[left_pad]+sentence+[right_pad]

In [14]:
# A sentence generator. Returns random (correct) sentences from the test subset or (incorrect) sentences of 
# the same length (in words) consisting of randomly selected vocabulary words.
def my_sentences(sent_number=3, sent_len=7, random_select=False):
    sentences = []
    if random_select:
        for _ in range(sent_number):
            text = []
            for _ in range(sent_len):
                word=random.choice(list(vocabulary))
                text.append(word)
            sentences.append(text)
    else:
        for i in range(sent_number):
            sentence = random.choice([sent for sent in test if len(sent)==sent_len])
            sentences.append(sentence)
    return sentences

In [15]:
def get_unigram_probability(word):
    if word in vocabulary:
        unigram_probability = vocabulary[word]/vocabulary.N()
        message = ''
    else:
        unigram_probability = vocabulary['*UNK*']/vocabulary.N()
        message = '<<-- OOV word'
    return(unigram_probability,message)

In [16]:
# Unigram model with Laplace smoothing
def unigram_laplace(word):
        return (vocabulary[word]+1)/(vocabulary.N()+len(vocabulary))

In [17]:
def preprocess_corpus(cut_off = 10, test=True):
    #os.chdir(dirname)

    if os.path.isfile('sentences_train_processed.pickle') and os.path.isfile('sentences_test_processed.pickle'):
        print('Corpus is already preprocessed.')
        answer = user_prompt('Do you want to re-preprocess corpus?')
        if answer == 0:
            print('Loading preprocessed corpus...')
            #os.chdir('..')
            return load_dataset()
        
    corpus_root = '.'
    corpus = PlaintextCorpusReader(corpus_root, '.*')
    sentences_train = corpus.sents('europarl-v7.el-en.en.train')
    sentences_test = corpus.sents('europarl-v7.el-en.en.test')

    if test is True:
        lines = 100
    else:
        lines = None
        
    print('Processing corpus...')
    print('Lowercasing...')
    words_train = list(flatten([[word.lower() for word in sent] for sent in sentences_train[:lines]]))
    #words_train = list(flatten([[word.lower() for word in sent if word.lower() not in stopwords and word not in string.punctuation] for sent in sentences_train]))

    freq_1gram = nltk.FreqDist(words_train)
    total_count = freq_1gram.N()

    #Some statistics about corpus
    print('Number of sentences:', len(sentences_train[:lines]))
    print('Number of tokens in corpus:', total_count)
    print('Corpus Vocabulary length:', len(freq_1gram))

    #Make vocabulary with cut-off value, replace OOV words in train-test set
    print('Making vocabulary with cut-off value, replacing OOV words in train-test set...')
    sentences_train_processed = [['*UNK*' if freq_1gram[word.lower()]<cut_off else word.lower() for word in sent] for sent in sentences_train[:lines]]
    sentences_test_processed = [['*UNK*' if freq_1gram[word.lower()]<cut_off else word.lower() for word in sent] for sent in sentences_test[:lines]]
    vocabulary = nltk.FreqDist(flatten(sentences_train_processed))
    print('Vocabulary length:', len(vocabulary)) #after cut-off OOV words
    save_processed_dataset(sentences_train_processed, sentences_test_processed)
    print('Preprocessed corpus saved!')
    #os.chdir('..')
    return sentences_train_processed, sentences_test_processed, vocabulary

In [18]:
def do_tests():
    ## Get number of OOV words
    ## and also compute unigram probabilities for a test sentence including OOV words
    print('There are',vocabulary['*UNK*'],'OOV words.\n')
    print('The 10 most common vocabulary words:')
    print(vocabulary.most_common()[:10],'\n')
    test_sentence = 'I admit that, At present, the matter seems to be somwhat confused.\n'
    test_sentence_tokens = word_tokenize(test_sentence)

    print('Unigram probabilities including OOV probabilities for sentence:')
    print(test_sentence)
    print('{0:10s} {1:10s}   {2:10s}'.format('word', 'probability', 'message'))
    print(40*'=')

    for word in test_sentence_tokens:
        unigram_probability,message = get_unigram_probability(word)
        print('{0:10s} {1:.3}   {2:10s}'.format(word, unigram_probability, message))

In [19]:
def build_models(sentences_train_processed):
    #os.chdir(dirname)
    if (os.path.isfile('bigram_laplace.pickle') and os.path.isfile('trigram_laplace.pickle') and os.path.isfile('bigram_kneser.pickle') and os.path.isfile('trigram_kneser.pickle')) is True:
        print('Models are already trained. Do you want to retrain?')
        answer = user_prompt('say')
        if answer == 0:
            print('Loading trained models...')
            return load_models()
        
    ## Bigram model
    print('Training bigram model with Laplace smoothing...')
    eur_2grams = list(bigrams(flatten(pad(sent, n=2) for sent in sentences_train_processed)))
    freq_2gram = nltk.FreqDist(eur_2grams)
    cfreq_2gram = nltk.ConditionalFreqDist(eur_2grams)

    ### Laplace smoothing
    #We compute Conditional probability Distribution with Laplace smothing
    #bigram_laplace = nltk.ConditionalProbDist(cfreq_2gram, nltk.MLEProbDist)
    bigram_laplace = nltk.ConditionalProbDist(cfreq_2gram, nltk.LaplaceProbDist, bins=len(vocabulary))
    
    ## Trigram model
    print('Training trigram model with Laplace smoothing...')
    eur_3grams = list(trigrams(flatten(pad(sent, n=3) for sent in sentences_train_processed)))
    condition_pairs = [((w1, w2), w3) for w1, w2, w3 in eur_3grams]
    cfreq_3gram = nltk.ConditionalFreqDist(condition_pairs)

    ### Laplace smoothing
    trigram_laplace = nltk.ConditionalProbDist(cfreq_3gram, nltk.LaplaceProbDist, bins=len(vocabulary))     
        
    print('Training bigram model with Kneser-Ney smoothing...')
    bigram_kneser = KneserNeyLM(2, eur_2grams)


    print('Training trigram model with Kneser-Ney smoothing...')
    trigram_kneser = KneserNeyLM(3, eur_3grams)

    #trigram_kneser.score_sent(('This', 'is','a', 'sentence'))
        
    #save models
    print('Saving all models...')
    f = open('bigram_laplace.pickle', 'wb')
    pickle.dump(bigram_laplace, f)
    f.close()
    f = open('trigram_laplace.pickle', 'wb')
    pickle.dump(trigram_laplace, f)
    f.close() 
    f = open('bigram_kneser.pickle', 'wb')
    pickle.dump(bigram_kneser, f)
    f.close()
    f = open('trigram_kneser.pickle', 'wb')
    pickle.dump(trigram_kneser, f)
    f.close()
    print('Models saved')
    
    #os.chdir('..')
    return bigram_laplace, trigram_laplace, bigram_kneser, trigram_kneser


In [20]:
def model_tests():
        #Test laplace smoothed models
        #Do some tests on bigram model
        print('=== Some tests on bigram model \n')
        print('P(declare|session):', bigram_laplace['session'].prob('declare'),'\n')

        print('Words coming after "declare" and their probability:')
        test_ngram = ('declare')
        prob_sum = 0
        for i in list(sorted(bigram_laplace[test_ngram].samples()))[:10]:
            prob_sum += bigram_laplace[test_ngram].prob(i)
            print("{0} {1} {2}".format(test_ngram, i, bigram_laplace[test_ngram].prob(i)))
        print('Total probability:',prob_sum, '\n')
        
        # Generate sentence from bigram model
        print('Generate 3 (20 world) sentences from bigram model:')
        for i in range(3):
            current = random.choice(list(vocabulary))  # choose a random starting word
            text = [current]
            for index in range(20):
                next = bigram_laplace[current].generate() # generate next word from bigram model
                current = next                
                text.append(current)  
            print(' '.join(text),'\n')


        #Do some tests on trigram model
        print('=== Some tests on trigram model \n')

        print('P(resumption,of|the):', trigram_laplace[('resumption','of')].prob('the'),'\n')

        print('Words coming after "resumption of" and their probability:')
        test_ngram = ('resumption','of')
        prob_sum = 0
        for i in list(sorted(trigram_laplace[test_ngram].samples()))[:10]:
            prob_sum += trigram_laplace[test_ngram].prob(i)
            print("{0}{1}:{2}".format(test_ngram, i, trigram_laplace[test_ngram].prob(i)))
        print('Total probability:',prob_sum, '\n')
        
        # Generate sentence from trigram model
        print('Generate 3 (20 world) sentences from trigram model:')
        for index in range(3):
            prev = random.choice(list(vocabulary))  # choose a random starting word
            current = bigram_laplace[prev].generate()   # generate next word from bigram model
            text = [current]
            for index in range(20):
                next = trigram_laplace[(prev, current)].generate() # generate next word from trigram model
                prev, current = current, next                
                text.append(current)  
            print(' '.join(text),'\n')

In [21]:
def bigram_sent_prob(sentences):
    print('probabilities of word bigrams')
    for i, sentence in enumerate(sentences):
        sent_prob = 0
        print(i+1,'====',' '.join(sentence))
        for j,k in list(ngrams(sentence, 2, pad_left = True, pad_right = True, right_pad_symbol='*end*', left_pad_symbol='*start*')):
            sent_prob = sent_prob + log2(bigram_laplace[j].prob(k))
        print('Sentence probability:',sent_prob)
        print()

In [22]:
def trigram_sent_prob(sentences):
    print('probabilities of word trigrams')
    for i, sentence in enumerate(sentences):
            sent_prob = 0
            print(i+1,'====',' '.join(sentence))
            for j,k,l in list(ngrams(sentence, 3, pad_left = True, pad_right = True, right_pad_symbol='*end*', left_pad_symbol='*start*')):
                sent_prob = sent_prob + log2(trigram_laplace[(j,k)].prob(l))
            print('Sentence probability:',sent_prob)
            print()

In [23]:
def perplexity(test, pstart=True):
    entropy2 = 0
    entropy2_kneser = 0
    N = 0
    eur_2grams_test = list(bigrams(flatten(pad(sent, n=2) for sent in test)))
    eur_3grams_test = list(trigrams(flatten(pad(sent, n=3) for sent in test)))
    for j,k in eur_2grams_test:
        if pstart is True:
            entropy2 += log2(bigram_laplace[j].prob(k))
            entropy2_kneser += log2(-bigram_kneser.score_sent((j,k)))
            N += 1
        else:
            if k != '*start*':
                entropy2 += log2(bigram_laplace[j].prob(k))
                entropy2_kneser += log2(-bigram_kneser.score_sent((j,k)))
                N += 1
    entropy2 = -entropy2/N
    entropy2_kneser = entropy2_kneser/N

    
    entropy3 = 0
    entropy3_kneser = 0
    N2 = 0
    for j,k,l in eur_3grams_test:
        if pstart is True:
            entropy3 += log2(trigram_laplace[(j,k)].prob(l))
            entropy3_kneser += log2(-trigram_kneser.score_sent((j,k,l)))
            N2 += 1
        else:
            if l != '*start*':
                entropy3 += log2(trigram_laplace[(j,k)].prob(l))
                entropy3_kneser += log2(-trigram_kneser.score_sent((j,k,l)))
                N2 += 1
    entropy3 = -entropy3/N2
    entropy3_kneser = entropy3_kneser/N2
    
    #print('Entropy of bigram model:',float('%.4g' % entropy), float('%.4g' % entropy_kneser))
    print('======= Perplexity =======')
    print('{0:10s} {1:10s} {2:10s}'.format('Model','Laplace smooth', 'Kneser-Ney smooth'))
    print('{0:10s} {1:10} {2:10}'.format('Bigram',float('%.4g' % pow(2.0, entropy2)), float('%.4g' % pow(2.0, entropy2_kneser))))
    print('{0:10s} {1:10} {2:10}'.format('Trigram',float('%.4g' % pow(2.0, entropy3)), float('%.4g' % pow(2.0, entropy3_kneser))))

    print()
    print('======= Entropy =======')
    print('{0:10s} {1:10s} {2:10s}'.format('Model','Laplace smooth', 'Kneser-Ney smooth'))
    print('{0:10s} {1:10} {2:10}'.format('Bigram',float('%.4g' % entropy2), float('%.4g' % entropy2_kneser)))
    print('{0:10s} {1:10} {2:10}'.format('Trigram',float('%.4g' % entropy3), float('%.4g' % entropy3_kneser)))
    print()
    print('Tested on', N, 'bigrams,', N2, 'trigrams')

In [24]:
def perplexity_interpolated(test, pstart=True):
    entropy = 0
    entropy_kneser = 0
    N = 0
    l1 = 2/10
    l2 = 8/10
    l3 = 1/10

    eur_3grams_test = list(trigrams(flatten(pad(sent, n=3) for sent in test)))

    for j,k,l in eur_3grams_test:
        if pstart is True:
            entropy += log2(l1*trigram_laplace[(j,k)].prob(l)+l2*(bigram_laplace[k].prob(l))+l3*unigram_laplace(l))
            entropy_kneser += log2(-l1*trigram_kneser.score_sent((j,k,l))-l2*bigram_kneser.score_sent((j,k))+l3*unigram_laplace(l))
            N += 1
        else:
            if l != '*start*':
                entropy += log2(l1*trigram_laplace[(j,k)].prob(l)+l2*(bigram_laplace[k].prob(l))+l3*unigram_laplace(l))
                entropy_kneser += log2(-l1*trigram_kneser.score_sent((j,k,l))-l2*bigram_kneser.score_sent((j,k))+l3*unigram_laplace(l))
                N += 1
    entropy = -entropy/N
    entropy_kneser = entropy_kneser/N
    
    #print('Entropy of bigram model:',float('%.4g' % entropy), float('%.4g' % entropy_kneser))
    print('======= Perplexity =======')
    print('{0:10s} {1:10s} {2:10s}'.format('Model','Laplace smooth', 'Kneser-Ney smooth'))
    print('{0:10s} {1:10} {2:10}'.format('Trigram',float('%.4g' % pow(2.0, entropy)), float('%.4g' % pow(2.0, entropy_kneser))))

    print()
    print('======= Entropy =======')
    print('{0:10s} {1:10s} {2:10s}'.format('Model','Laplace smooth', 'Kneser-Ney smooth'))
    print('{0:10s} {1:10} {2:10}'.format('Trigram',float('%.4g' % entropy), float('%.4g' % entropy_kneser)))
    print()
    print('Tested on', N, 'trigrams')

# ==== Exercice 4 ====

# Part I

Download corpus, create output directory, split corpus in train, test set (defaults: percentage=0.75, isShuffle=True, seed=123)

In [25]:
download_corpus()

Downloading corpus...
Corpus downloaded.


In [26]:
create_directory()

Directory output created.


In [27]:
split_file(file, out1, out2)

Corpus splitted in train, test set


## Preprocessing
Lowercase, replace OOV words in train-test set with *UNK* token

In [28]:
train, test, vocabulary = preprocess_corpus(test=False)

Processing corpus...
Lowercasing...
Number of sentences: 986576
Number of tokens in corpus: 27063156
Corpus Vocabulary length: 64143
Making vocabulary with cut-off value, replacing OOV words in train-test set...
Vocabulary length: 22622
Preprocessed corpus saved!


Do some tests: Get number of OOV words, compute unigram probabilities for a test sentence including OOV words, print 10 most common vocabulary words

In [29]:
do_tests()

There are 106424 OOV words.

The 10 most common vocabulary words:
[('the', 1836392), (',', 1281104), ('.', 950979), ('of', 880323), ('to', 805597), ('and', 710636), ('in', 581890), ('that', 426592), ('a', 403882), ('is', 401500)] 

Unigram probabilities including OOV probabilities for sentence:
I admit that, At present, the matter seems to be somwhat confused.

word       probability   message   
I          0.00393   <<-- OOV word
admit      2.81e-05             
that       0.0158             
,          0.0473             
At         0.00393   <<-- OOV word
present    0.000338             
,          0.0473             
the        0.0679             
matter     0.000404             
seems      0.000144             
to         0.0298             
be         0.00754             
somwhat    0.00393   <<-- OOV word
confused   9.9e-06             
.          0.0351             


## Model training

We generate the padded bigrams, trigrams, compute the Frequency Distribution, Conditional Frequency Distribution, Conditional probability Distribution and train models with Laplace or Kneser-Ney smoothing

In [30]:
bigram_laplace, trigram_laplace, bigram_kneser, trigram_kneser = build_models(train)

Training bigram model with Laplace smoothing...
Training trigram model with Laplace smoothing...
Training bigram model with Kneser-Ney smoothing...
Training trigram model with Kneser-Ney smoothing...
Saving all models...
Models saved


## Model testing

Do some tests on bigram, trigram models trained with laplace smoothing, like finding conditional probabilities of a word given its context and sentence generation from the relevant probability distribution

In [31]:
model_tests()

=== Some tests on bigram model 

P(declare|session): 3.9767756303189374e-05 

Words coming after "declare" and their probability:
declare ' 0.00017100594245650035
declare *UNK* 0.00012825445684237527
declare , 0.0003420118849130007
declare . 8.550297122825018e-05
declare 11 8.550297122825018e-05
declare 14 8.550297122825018e-05
declare 20 0.00012825445684237527
declare 2007 0.00012825445684237527
declare 2009 0.00012825445684237527
declare 2010 0.00012825445684237527
Total probability: 0.0014107990252661279 

Generate 3 (20 world) sentences from bigram model:
backed proposals faithfully transpose within unesco monument ( bankers : independence had led ones spending existing com now moving back , 

bielan , must oppose these restrictive eligibility period only giving permanent general , which is not conflict control until tonight will 

subservient . *end* *start* i wished at hand then buy ! *end* *start* everything *UNK* who cover throughout bilateral overseas budget 

=== Some tests o

# Part II

Check the log-probabilities that your trained models return when given (correct) sentences from the test subset vs. (incorrect) sentences of the same length (in words) consisting of randomly selected vocabulary words.  

We use the Laplace smoothed models.

## Bigram sentence probability

In [32]:
bigram_sent_prob(my_sentences(random_select=False))

probabilities of word bigrams
1 ==== i attempted to contact an official .
Sentence probability: -71.38011548653269

2 ==== i find such a remark disturbing .
Sentence probability: -62.9145618750343

3 ==== i do however take your point .
Sentence probability: -58.58715226409559



In [33]:
bigram_sent_prob(my_sentences(random_select=True))

probabilities of word bigrams
1 ==== empire djibouti ritual crumbling hatzidakis affording raided
Sentence probability: -121.22550844540186

2 ==== anguish scientifically repay synchronisation bona jeopardised 863
Sentence probability: -121.2281776495057

3 ==== almadén opponent carries friendly fosters emigrants eye
Sentence probability: -120.37230155918866



## Trigram sentence probability

In [34]:
trigram_sent_prob(my_sentences(random_select=False))

probabilities of word trigrams
1 ==== mrs van *UNK* spoke about tibet .
Sentence probability: -111.94015643004988

2 ==== it was a very controversial amendment .
Sentence probability: -85.27026810846932

3 ==== we cannot therefore support paragraph 11 .
Sentence probability: -98.01354122839139



In [35]:
trigram_sent_prob(my_sentences(random_select=True))

probabilities of word trigrams
1 ==== partial concentration sin pretend incomparably inaugurated pulses
Sentence probability: -134.0834537631206

2 ==== statehood aforesaid rejection requirements payback multiannual germà
Sentence probability: -135.66828872151385

3 ==== channelling carnage asleep regularisations hoc domiciled preoccupied
Sentence probability: -134.66835249408723



# Part III - Entropy, Perplexity

Estimate the language **cross-entropy** and **perplexity** of your models on the test subset of the corpus, treating the entire test subset as a single sequence, with *start* (or *start1*, *start2*) at the beginning of each sentence, and *end* at the end of each sentence. Do not include probabilities of the form P(*start*|...) (or P(*start1*|...) or P(*start2*|...)) in the computation of perplexity, but include probabilities of the form P(*end*|...). 

In [36]:
perplexity(test[:100], pstart=True)

Model      Laplace smooth Kneser-Ney smooth
Bigram          171.5      14.26
Trigram         678.5      17.97

Model      Laplace smooth Kneser-Ney smooth
Bigram          7.422      3.834
Trigram         9.406      4.167

Tested on 3337 bigrams, 3436 trigrams


In [37]:
perplexity(test[:100], pstart=False)

Model      Laplace smooth Kneser-Ney smooth
Bigram          200.6      13.97
Trigram        1008.0      17.28

Model      Laplace smooth Kneser-Ney smooth
Bigram          7.648      3.804
Trigram         9.978      4.111

Tested on 3238 bigrams, 3238 trigrams


# Part IV - Linear interpolation


We combine unigram, bigram and trigram models using linear interpolation and check if the combined model performs better. Best l1, l2, l3 parameters found after some trials on a validation set of 100 sentences

In [38]:
perplexity_interpolated(test[:100], pstart=True)

Model      Laplace smooth Kneser-Ney smooth
Trigram         139.5      15.34

Model      Laplace smooth Kneser-Ney smooth
Trigram         7.124       3.94

Tested on 3436 trigrams


In [39]:
perplexity_interpolated(test[:100], pstart=False)

Model      Laplace smooth Kneser-Ney smooth
Trigram         179.3      14.91

Model      Laplace smooth Kneser-Ney smooth
Trigram         7.486      3.898

Tested on 3238 trigrams


# ======== ADDITIONAL WORK =========

## Check Entropy and perplexity of LMs trained with NLTK's pipeline 

We use Nltk's pipeline to train LMs with maximum order 3, using Laplace smothing

In [40]:
from nltk.lm import MLE, Laplace

In [41]:
def nltk_train(sentences_train_processed):
    if os.path.isfile('lm_nltk_laplace.pickle') is True:
        print('Model trained with Nltk exist in your disk. Do you want to retrain?')
        answer = user_prompt('say')
        if answer == 0:
            print('Loading trained model...')
            f = open('lm_nltk_laplace.pickle', 'rb')
            lm_nltk = pickle.load(f)
            f.close()
            print('Model loaded.')
            return lm_nltk

    print('Training unigram, bigram, trigram model with Laplace smoothing...')
    train, vocab = padded_everygram_pipeline(3, sentences_train_processed)
    lm_nltk = Laplace(3)
    lm_nltk.fit(train, vocab)

    f = open('lm_nltk_laplace.pickle', 'wb')
    pickle.dump(lm_nltk, f)
    f.close()
    print('Model saved.')
    return lm_nltk

In [42]:
lm = nltk_train(train)

Training unigram, bigram, trigram model with Laplace smoothing...
Model saved.


In [43]:
len(lm.vocab)

22625

In [44]:
print(lm.vocab)

<Vocabulary with cutoff=1 unk_label='<UNK>' and 22625 items>


In [45]:
'*UNK*' in lm.vocab

True

In [46]:
lm.vocab['europe']

48312

In [47]:
lm.logscore('europe')

-9.32713347750879

In [48]:
lm.logscore('demonstrate', ['europe','can'])

-12.49741527084874

In [49]:
eur_2grams_test = list(bigrams(flatten(pad_both_ends(sent, n=2) for sent in test[:100])))
lm.perplexity(eur_2grams_test)

263.6186366559596

In [50]:
eur_3grams_test = list(trigrams(flatten(pad_both_ends(sent, n=3) for sent in test[:100])))
lm.perplexity(eur_3grams_test)

988.670572284987