In [1]:
import nltk
from nltk import sent_tokenize,word_tokenize,TweetTokenizer #Tokenizing sentences
from pprint import pprint
from collections import Counter
from nltk.corpus import stopwords
from nltk.util import ngrams
import math
import numpy as np
import random

# I)

## Reading - Preprocessing

In [2]:
source = open("europarl-v7-en-full.txt",encoding = "utf8").read()

## Split source into training / development / test set - 70/10/20

In [3]:
source_sents = sent_tokenize(source)
sents_len=len(source_sents)
source_sents=source_sents[0:sents_len]
train_sents=source_sents[0:round(sents_len*0.7)]
development_sents=source_sents[round(sents_len*0.7):round(sents_len*0.8)]
test_sents=source_sents[round(sents_len*0.8):]

## Tokenize sets by sentence

In [4]:
tweet_wt = TweetTokenizer()

train_sents_tokenized = []
development_sents_tokenized = []
test_sents_tokenized = []
for sent in train_sents:
    train_sents_tokenized.append(tweet_wt.tokenize(sent))
for sent in development_sents:
    development_sents_tokenized.append(tweet_wt.tokenize(sent))
for sent in test_sents:
    test_sents_tokenized.append(tweet_wt.tokenize(sent))

## Tokenize sets by Token

In [5]:
train_tokens = tweet_wt.tokenize(' '.join(train_sents))
development_tokens = tweet_wt.tokenize(' '.join(development_sents))
test_tokens = tweet_wt.tokenize(' '.join(test_sents))

## Finding tokens appearing in the training test rarely... Replacing those with *UKN* in all sets

In [6]:
count = Counter(train_tokens)
frequent_tokens=set([k for k,v in count.items() if v >= 10])
train_tokens=[train_token if (train_token in frequent_tokens) else "*UNK*" for train_token in train_tokens]
development_tokens=[development_token if (development_token in frequent_tokens) else "*UNK*" for development_token in development_tokens]
test_tokens=[test_token if (test_token in frequent_tokens) else "*UNK*" for test_token in test_tokens]

## Converting tokens into Bigram and Trigram models

In [7]:
unigram_counter = Counter()
bigram_counter = Counter()
trigram_counter = Counter()

for sent in train_sents_tokenized:
    unigram_counter.update([gram for gram in ngrams(sent, 1, pad_left=True, pad_right=True, left_pad_symbol='*start*',right_pad_symbol='*end*') ])
    bigram_counter.update([gram for gram in ngrams(sent, 2, pad_left=True, pad_right=True, left_pad_symbol='*start*',right_pad_symbol='*end*') ])
    trigram_counter.update([gram for gram in ngrams(sent, 3, pad_left=True, pad_right=True, left_pad_symbol='*start*',right_pad_symbol='*end*') ])
    
unigram_counter[('*start*',)] = len(train_sents_tokenized)

## Tuning alpha wrt perplexity for the bigram model

In [9]:
vocab_size = len(set(train_tokens))
lowest_brigram_alpha=1
lowest_perplexity=100000
for alpha in np.arange(0.005,0.025,0.0005):
    #Reseting probability for each alpha
    bigram_cnt = 0
    sum_prob = 0
    for sent in development_sents_tokenized:
        sent = ['*start*'] + sent + ['*end*']
        for idx in range(1,len(sent)):
            bigram_prob = (bigram_counter[(sent[idx-1], sent[idx])] + round(alpha,4)) / (unigram_counter[(sent[idx-1],)] + round(alpha,4)*vocab_size)
            sum_prob += math.log2(bigram_prob)
            bigram_cnt+=1

    HC = -sum_prob / bigram_cnt
    perpl = math.pow(2,HC)
    #print("For alpha=",round(alpha,4),"perplexity: {0:.3f}".format(perpl))
    if(perpl<lowest_perplexity):
        lowest_perplexity=perpl
        lowest_brigram_alpha=round(alpha,4)
print("The alpha that produces the lowest perplexity for the bigram model is: ", lowest_brigram_alpha)
b_alpha=lowest_brigram_alpha

The alpha that produces the lowest perplexity for the bigram model is:  0.005


## Tuning alpha wrt perplexity for the trigram model

In [10]:
lowest_trigram_alpha=1
lowest_perplexity=100000
for alpha in np.arange(0.001,0.01,0.0005):
    trigram_cnt = 0
    sum_prob = 0    
    for sent in development_sents_tokenized:
        sent = ['*start*'] + ['*start*'] + sent + ['*end*'] + ['*end*']
        for idx in range(2,len(sent)):
            trigram_prob = (trigram_counter[(sent[idx-2],sent[idx-1], sent[idx])] +alpha) / (bigram_counter[(sent[idx-2],sent[idx-1])] + alpha*vocab_size)
            sum_prob += math.log2(trigram_prob)
            trigram_cnt+=1

    HC = -sum_prob / trigram_cnt
    perpl = math.pow(2,HC)
    #print("For alpha=",round(alpha,4),"perplexity: {0:.3f}".format(perpl))
    if(perpl<lowest_perplexity):
        lowest_perplexity=perpl
        lowest_trigram_alpha=round(alpha,4)
print("The alpha that produces the lowest perplexity for the trigram model is: ", lowest_trigram_alpha)
t_alpha=lowest_trigram_alpha

The alpha that produces the lowest perplexity for the trigram model is:  0.001


## Combining both Training Development sets

## II)

## Log probabilities for the Test set - Bigram

In [11]:
sent_count=0
sent_len=0
probabilities_test=[]
for i,sent in enumerate(test_sents_tokenized):
    sum_prob = 0
    sent = ['*start*'] + sent + ['*end*']
    for idx in range(1,len(sent)):
        bigram_prob = (bigram_counter[(sent[idx-1], sent[idx])] +b_alpha) / (unigram_counter[(sent[idx-1],)] + b_alpha*vocab_size)
        sum_prob += math.log2(bigram_prob)
        
        sent_count+=1
        sent_len+=len(sent)
    probabilities_test.append(sum_prob)
    if(i<5):
        print("Sentence: ",' '.join(sent[1:len(sent)-1]),"\nLog Probability: ",sum_prob, "\nProbability: ",np.exp(sum_prob),"\n")

print("--------------------------------------------------------------------------------------------------------------")
print("Mean Probability", np.mean(probabilities_test))
print("Mean Log Probability", np.mean(np.exp(probabilities_test)))

Sentence:  But a precondition of this is security and stability . 
Log Probability:  -70.86533279202636 
Probability:  1.6733123260260945e-31 

Sentence:  It does not seem that much readjustment of the PHARE programme would be needed here . 
Log Probability:  -116.54686658220805 
Probability:  2.4229193606402698e-51 

Sentence:  Commissioner van den Broek will doubtless be able to say more about this . 
Log Probability:  -99.35738653800568 
Probability:  7.073515185515553e-44 

Sentence:  Possible aid in rebuilding the country's infrastructure can only be meaningfully discussed once the scale of the damage is known . 
Log Probability:  -184.55752335860745 
Probability:  7.041837706145331e-81 

Sentence:  In conclusion , it must be obvious that given the complex problems in Albania the European Union is faced with a major challenge . 
Log Probability:  -142.44487798501174 
Probability:  1.370804481976066e-62 

-----------------------------------------------------------------------------

## Log probabilities for the Test set - Trigram

In [12]:
sent_count=0
sent_len=0
probabilities_test=[]
for i,sent in enumerate(test_sents_tokenized):
    sum_prob = 0
    sent = ['*start*'] + ['*start*'] + sent + ['*end*'] + ['*end*']
    for idx in range(2,len(sent)):
        trigram_prob = (trigram_counter[(sent[idx-2],sent[idx-1], sent[idx])] +t_alpha) / (bigram_counter[(sent[idx-2],sent[idx-1])] + t_alpha*vocab_size)
        #trigram_prob = (trigram_counter[(sent[idx-2],sent[idx-1], sent[idx])] +t_alpha) / (bigram_counter[(sent[idx-1],sent[idx])] + t_alpha*vocab_size)
        sum_prob += math.log2(trigram_prob)
        
        sent_count+=1
        sent_len+=len(sent)
    probabilities_test.append(sum_prob)
    if(i<5):
        print("Sentence: ",' '.join(sent[2:len(sent)-2]),"\nLog Probability: ",sum_prob, "\nProbability: ",np.exp(sum_prob),"\n")

print("--------------------------------------------------------------------------------------------------------------")
print("Mean Probability", np.mean(probabilities_test))
print("Mean Log Probability", np.mean(np.exp(probabilities_test)))

Sentence:  But a precondition of this is security and stability . 
Log Probability:  -77.50101120744583 
Probability:  2.196536835433812e-34 

Sentence:  It does not seem that much readjustment of the PHARE programme would be needed here . 
Log Probability:  -136.6612327865106 
Probability:  4.454312790001304e-60 

Sentence:  Commissioner van den Broek will doubtless be able to say more about this . 
Log Probability:  -96.86940814112474 
Probability:  8.514331969147488e-43 

Sentence:  Possible aid in rebuilding the country's infrastructure can only be meaningfully discussed once the scale of the damage is known . 
Log Probability:  -231.38160352281668 
Probability:  3.252717723708976e-101 

Sentence:  In conclusion , it must be obvious that given the complex problems in Albania the European Union is faced with a major challenge . 
Log Probability:  -147.27276186260752 
Probability:  1.097115089803781e-64 

-------------------------------------------------------------------------------

## Log probabilities for the random sentences - Bigram

In [13]:
#Using average length of sentences in the test set to create new sentences of approximately equal size
sent_avg_len=sent_len/sent_count
sent_avg_len = round(sent_avg_len)
probabilities_random=[]
#I will create the same number of sentences, the test set contains (1431)
for i in range(0,len(test_sents)):
    sent=[]
    #They will have a length between mean +/- sqrt(mean)
    for j in range(0,random.randint(sent_avg_len-round(math.sqrt(sent_avg_len)),sent_avg_len+round(math.sqrt(sent_avg_len)))):
        #Each time I pick a random word from the test set
        sent.append(test_tokens[random.randint(0, len(test_tokens)-1)])
    sum_prob = 0
    sent = ['*start*'] + sent + ['*end*']
    for idx in range(1,len(sent)):
        bigram_prob = (bigram_counter[(sent[idx-1], sent[idx])] +b_alpha) / (unigram_counter[(sent[idx-1],)] + b_alpha*vocab_size)
        sum_prob += math.log2(bigram_prob)
    probabilities_random.append(sum_prob)
    if(i<5):
        print("Sentence: ",' '.join(sent[1:len(sent)-1]),"\nLog Probability: ",sum_prob, "\nProbability: ",np.exp(sum_prob),"\n")

print("--------------------------------------------------------------------------------------------------------------")
print("Mean Log Probability", np.mean(probabilities_random))
print("Mean Probability", np.mean(np.exp(probabilities_random)))

Sentence:  simply reluctance to half addition Bulgaria . include expect too offered it Russia been and in port , After this governments Union Community in corruption its and the to fact market However budget because unilateral received of I make 
Log Probability:  -648.4521052897862 
Probability:  2.403415701558637e-282 

Sentence:  II so would that to suffers President root-and-branch of fact this interesting problem maintained and *UNK* in s , have field very in come men great its just important remain 20 can the report debate to a propose on Mr will to the many 
Log Probability:  -694.5217438992203 
Probability:  2.360690556639613e-302 

Sentence:  gentlemen positions their out unemployed which of . the to by An a proposing in *UNK* taking world to chairman of of , the negotiations , long period was its government that It on fraud thinks which and situation , am of nuclear within 
Log Probability:  -654.8127316811176 
Probability:  4.1537843994178366e-285 

Sentence:  must case dead

## Log probabilities for the random sentences - Trigram

In [14]:
#Using average length of sentences in the test set to create new sentences of approximately equal size
sent_avg_len=sent_len/sent_count
sent_avg_len = round(sent_avg_len)
probabilities_random=[]
#I will create the same number of sentences, the test set contains (1431)
for i in range(0,len(test_sents)):
    sent=[]
    #They will have a length between mean +/- sqrt(mean)
    for j in range(0,random.randint(sent_avg_len-round(math.sqrt(sent_avg_len)),sent_avg_len+round(math.sqrt(sent_avg_len)))):
        #Each time I pick a random word from the test set
        sent.append(test_tokens[random.randint(0, len(test_tokens)-1)])
    sum_prob = 0
    sent = ['*start*'] + ['*start*'] + sent + ['*end*'] + ['*end*']
    for idx in range(2,len(sent)):
        trigram_prob = (trigram_counter[(sent[idx-2],sent[idx-1], sent[idx])] +t_alpha) / (bigram_counter[(sent[idx-2],sent[idx-1])] + t_alpha*vocab_size)
        #trigram_prob = (trigram_counter[(sent[idx-2],sent[idx-1], sent[idx])] +t_alpha) / (bigram_counter[(sent[idx-1],sent[idx])] + t_alpha*vocab_size)
        sum_prob += math.log2(trigram_prob)
    probabilities_random.append(sum_prob)
    if(i<5):
        print("Sentence: ",' '.join(sent[2:len(sent)-2]),"\nLog Probability: ",sum_prob, "\nProbability: ",np.exp(sum_prob),"\n")

print("--------------------------------------------------------------------------------------------------------------")
print("Mean Log Probability", np.mean(probabilities_random))
print("Mean Probability", np.mean(np.exp(probabilities_random)))

Sentence:  world effective period their how need is economic I serious pathetic a , the must interests to programme to like *UNK* , is have why an down of greater on Mr for emerge , on are the Council it doubt the has want not House always 
Log Probability:  -731.0744490758302 
Probability:  3.150657e-318 

Sentence:  with governments improving responsibility such the . which present be new care one change framework relevant that solution to a The clearly Presidents it minutes *UNK* in take so but on be I are this , 20 in civil phase would only unknown 
Log Probability:  -657.4688782196216 
Probability:  2.9167161482606673e-286 

Sentence:  will , would But understand contribute is reply even the $ internal fishing want the group agri-environmental regrettable being propose Article Mr , a Committee as , political issue . coherent which quite not for the I 
Log Probability:  -588.6153255634179 
Probability:  2.3313750484514888e-256 

Sentence:  . true but the There Dutch a water comment

### We can clearly see a huge difference in the probabilities

## III)

## Perplexity of the whole corpus as a sequence

## Forming test into one sentence and getting rid of all \*start\* - Bigram

In [15]:
test_one_sent=[]
for i,sent in enumerate(test_sents_tokenized):
    sum_prob = 0
    sent = ['*start*'] + sent + ['*end*']
    test_one_sent.extend(sent)

test_one_sent=[word for word in test_one_sent if word != '*start*']

## Calculating HC and Perplexity

In [16]:
bigram_cnt = 0
sum_prob = 0
for idx in range(1,len(test_one_sent)):
    bigram_prob = (bigram_counter[(test_one_sent[idx-1], test_one_sent[idx])] + b_alpha) / (unigram_counter[(test_one_sent[idx-1],)] + b_alpha*vocab_size)
    sum_prob += math.log2(bigram_prob)
    bigram_cnt+=1

HC = -sum_prob / bigram_cnt
perpl = math.pow(2,HC)
print("Cross Entropy: {0:.3f}".format(HC))
print("Perplexity: {0:.3f}".format(perpl))

Cross Entropy: 7.220
Perplexity: 149.095


## Forming test into one sentence and getting rid of all \*start\* - Trigram

In [17]:
test_one_sent=[]
for i,sent in enumerate(test_sents_tokenized):
    sum_prob = 0
    sent = ['*start*'] + ['*start*'] + sent + ['*end*'] + ['*end*']
    test_one_sent.extend(sent)

test_one_sent=[word for word in test_one_sent if word != '*start*']

## Calculating HC and Perplexity

In [18]:
trigram_cnt = 0
sum_prob = 0
for idx in range(2,len(test_one_sent)):
    trigram_prob = (trigram_counter[(test_one_sent[idx-2],test_one_sent[idx-1], test_one_sent[idx])] +t_alpha) / (bigram_counter[(test_one_sent[idx-2],test_one_sent[idx-1])] + t_alpha*vocab_size)
    #trigram_prob = (trigram_counter[(test_one_sent[idx-2],test_one_sent[idx-1], test_one_sent[idx])] +t_alpha) / (bigram_counter[(test_one_sent[idx-1],test_one_sent[idx])] + t_alpha*vocab_size)
    sum_prob += math.log2(trigram_prob)
    trigram_cnt+=1

HC = -sum_prob / trigram_cnt
perpl = math.pow(2,HC)
print("Cross Entropy: {0:.3f}".format(HC))
print("Perplexity: {0:.3f}".format(perpl))

Cross Entropy: 8.013
Perplexity: 258.305


## IV

## Tuning for ideal lambda

In [19]:
lowest_lamda = 2
lowest_perplexity = 100000
for lamda in np.arange(0,1.01,0.05):
    ngram_cnt = 0
    sum_prob = 0
    for sent in development_sents_tokenized:
        sent = ['*start*'] + ['*start*'] + sent + ['*end*'] + ['*end*']
        for idx in range(2,len(sent)):
            trigram_prob = (trigram_counter[(sent[idx-2],sent[idx-1], sent[idx])] +t_alpha) / (bigram_counter[(sent[idx-2],sent[idx-1])] + t_alpha*vocab_size)
            #trigram_prob = (trigram_counter[(sent[idx-2],sent[idx-1], sent[idx])] +t_alpha) / (bigram_counter[(sent[idx-1],sent[idx])] + t_alpha*vocab_size)
            bigram_prob = (bigram_counter[(sent[idx-1], sent[idx])] +b_alpha) / (unigram_counter[(sent[idx-1],)] + b_alpha*vocab_size)

            sum_prob += (lamda * math.log2(trigram_prob)) +((1-lamda) * math.log2(bigram_prob))
            ngram_cnt+=1 

    HC = -sum_prob / ngram_cnt
    perpl = math.pow(2,HC)
    if(perpl<lowest_perplexity):
        lowest_perplexity=perpl
        lowest_lamda=lamda
    print("lamdba:", round(lamda,2), "- Perplexity:", round(perpl,2))
print("The lambda that produces the lowest perplexity for the interpolated model is: ", lowest_lamda)

lamdba: 0.0 - Perplexity: 145.0
lamdba: 0.05 - Perplexity: 143.52
lamdba: 0.1 - Perplexity: 142.06
lamdba: 0.15 - Perplexity: 140.62
lamdba: 0.2 - Perplexity: 139.18
lamdba: 0.25 - Perplexity: 137.77
lamdba: 0.3 - Perplexity: 136.36
lamdba: 0.35 - Perplexity: 134.98
lamdba: 0.4 - Perplexity: 133.6
lamdba: 0.45 - Perplexity: 132.24
lamdba: 0.5 - Perplexity: 130.89
lamdba: 0.55 - Perplexity: 129.56
lamdba: 0.6 - Perplexity: 128.24
lamdba: 0.65 - Perplexity: 126.94
lamdba: 0.7 - Perplexity: 125.64
lamdba: 0.75 - Perplexity: 124.37
lamdba: 0.8 - Perplexity: 123.1
lamdba: 0.85 - Perplexity: 121.85
lamdba: 0.9 - Perplexity: 120.61
lamdba: 0.95 - Perplexity: 119.38
lamdba: 1.0 - Perplexity: 118.16
The lambda that produces the lowest perplexity for the interpolated model is:  1.0
