In [1]:
import nltk
from nltk import sent_tokenize,word_tokenize,TweetTokenizer #Tokenizing sentences
from pprint import pprint
from collections import Counter
from nltk.corpus import stopwords
from nltk.util import ngrams
import math
import numpy as np
import random
import pandas as pd

# I)

## Reading - Preprocessing

In [2]:
source = open("europarl-v7-en.txt",encoding = "utf8").read()

## Split source into training / development / test set - 70/10/20

In [3]:
source_sents = sent_tokenize(source)
sents_len=len(source_sents)
#source_sents=source_sents[0:sents_len]
train_sents=source_sents[0:round(sents_len*0.7)]
development_sents=source_sents[round(sents_len*0.7):round(sents_len*0.8)]
test_sents=source_sents[round(sents_len*0.8):]

## Tokenize sets by Token

In [4]:
tweet_wt = TweetTokenizer()

train_tokens = tweet_wt.tokenize(' '.join(train_sents))
development_tokens = tweet_wt.tokenize(' '.join(development_sents))
test_tokens = tweet_wt.tokenize(' '.join(test_sents))

## Finding tokens appearing in the training test rarely... Replacing those with *UKN* in all sets

In [5]:
count = Counter(train_tokens)
frequent_tokens=set([k for k,v in count.items() if v >= 10])
train_tokens=[train_token if (train_token in frequent_tokens) else "*UNK*" for train_token in train_tokens]
development_tokens=[development_token if (development_token in frequent_tokens) else "*UNK*" for development_token in development_tokens]
test_tokens=[test_token if (test_token in frequent_tokens) else "*UNK*" for test_token in test_tokens]

## Tokenize sets by sentence

In [6]:
train_sents_tokenized = []
development_sents_tokenized = []
test_sents_tokenized = []

for sent in train_sents:
    sent_tmp = tweet_wt.tokenize(sent)
    sent = [word if (word in frequent_tokens) else "*UNK*" for word in sent_tmp]
    train_sents_tokenized.append(sent)
for sent in development_sents:
    sent_tmp = tweet_wt.tokenize(sent)
    sent = [word if (word in frequent_tokens) else "*UNK*" for word in sent_tmp]
    development_sents_tokenized.append(sent)
for sent in test_sents:
    sent_tmp = tweet_wt.tokenize(sent)
    sent = [word if (word in frequent_tokens) else "*UNK*" for word in sent_tmp]
    test_sents_tokenized.append(sent)

## Converting tokens into Bigram and Trigram models

In [7]:
unigram_counter = Counter()
bigram_counter = Counter()
trigram_counter = Counter()

for sent in train_sents_tokenized:
    unigram_counter.update([gram for gram in ngrams(sent, 1, pad_left=True, pad_right=True, left_pad_symbol='*start*',right_pad_symbol='*end*') ])
    bigram_counter.update([gram for gram in ngrams(sent, 2, pad_left=True, pad_right=True, left_pad_symbol='*start*',right_pad_symbol='*end*') ])
    trigram_counter.update([gram for gram in ngrams(sent, 3, pad_left=True, pad_right=True, left_pad_symbol='*start*',right_pad_symbol='*end*') ])
    
unigram_counter[('*start*',)] = len(train_sents_tokenized)

## II)

## Tuning alpha wrt perplexity for the bigram model

In [8]:
vocab_size = len(set(train_tokens))
lowest_brigram_alpha=1
lowest_perplexity=100000
for alpha in np.arange(0.005,0.025,0.0005):
    #Reseting probability for each alpha
    bigram_cnt = 0
    sum_prob = 0
    for sent in development_sents_tokenized:
        sent = ['*start*'] + sent + ['*end*']
        for idx in range(1,len(sent)):
            bigram_prob = (bigram_counter[(sent[idx-1], sent[idx])] + round(alpha,4)) / (unigram_counter[(sent[idx-1],)] + round(alpha,4)*vocab_size)
            sum_prob += math.log2(bigram_prob)
            bigram_cnt+=1

    HC = -sum_prob / bigram_cnt
    perpl = math.pow(2,HC)
    #print("For alpha=",round(alpha,4),"perplexity: {0:.3f}".format(perpl))
    if(perpl<lowest_perplexity):
        lowest_perplexity=perpl
        lowest_brigram_alpha=round(alpha,4)
print("The alpha that produces the lowest perplexity for the bigram model is: ", lowest_brigram_alpha)
b_alpha=lowest_brigram_alpha

The alpha that produces the lowest perplexity for the bigram model is:  0.006


## Tuning alpha wrt perplexity for the trigram model

In [9]:
lowest_trigram_alpha=1
lowest_perplexity=100000
for alpha in np.arange(0.001,0.01,0.0005):
    trigram_cnt = 0
    sum_prob = 0    
    for sent in development_sents_tokenized:
        sent = ['*start*'] + ['*start*'] + sent + ['*end*'] + ['*end*']
        for idx in range(2,len(sent)):
            trigram_prob = (trigram_counter[(sent[idx-2],sent[idx-1], sent[idx])] +alpha) / (bigram_counter[(sent[idx-2],sent[idx-1])] + alpha*vocab_size)
            sum_prob += math.log2(trigram_prob)
            trigram_cnt+=1

    HC = -sum_prob / trigram_cnt
    perpl = math.pow(2,HC)
    #print("For alpha=",round(alpha,4),"perplexity: {0:.3f}".format(perpl))
    if(perpl<lowest_perplexity):
        lowest_perplexity=perpl
        lowest_trigram_alpha=round(alpha,4)
print("The alpha that produces the lowest perplexity for the trigram model is: ", lowest_trigram_alpha)
t_alpha=lowest_trigram_alpha

The alpha that produces the lowest perplexity for the trigram model is:  0.0015


## Log probabilities for the Test set - Bigram

In [10]:
sent_count=0
probabilities_test=[]
df_sent_bigram = pd.DataFrame({'Sentence': [], 'Log-Probability': [], 'Probability':[]})
for i,sent in enumerate(test_sents_tokenized):
    sum_prob = 0
    sent = ['*start*'] + sent + ['*end*']
    for idx in range(1,len(sent)):
        bigram_prob = (bigram_counter[(sent[idx-1], sent[idx])] +b_alpha) / (unigram_counter[(sent[idx-1],)] + b_alpha*vocab_size)
        sum_prob += math.log2(bigram_prob)
        
        sent_count+=1
    probabilities_test.append(sum_prob)
    df_sent_bigram = df_sent_bigram.append({'Sentence': ' '.join(sent[1:len(sent)-1]),  "Log-Probability":sum_prob, "Probability": np.exp(sum_prob)},ignore_index=True)
    #print("Sentence: ",' '.join(sent[1:len(sent)-1]),"\nLog Probability: ",sum_prob, "\nProbability: ",np.exp(sum_prob),"\n")

pd.set_option('display.max_colwidth', -1)  
df_sent_bigram.set_index("Sentence").head(5)

Unnamed: 0_level_0,Log-Probability,Probability
Sentence,Unnamed: 1_level_1,Unnamed: 2_level_1
"In the agricultural sector , rapid restructuring is being called for in order to concentrate the land in a few hands and forge another link in the chain controlled by the network of multinationals .",-265.154277,6.997787e-116
"Furthermore , the liberalisation of trade and the abolition of duty and subsidies have hit agricultural production directly , reducing farmers ' incomes , *UNK* farming and increasing unemployment .",-242.691735,3.9839940000000004e-106
Agricultural production in Greece - and elsewhere - is being sacrificed in order to protect and corner a larger share of the international market for processed products from central and northern Europe .,-256.218822,5.315933e-112
"We believe that , rather than defending the interests of the people , the EU will again endeavour at the new round of talks to stake a bigger claim for the European monopolies , in competition with the other imperialist centres , i . e .",-349.601468,1.479147e-152
the USA and Japan .,-38.140553,2.7275220000000004e-17


In [11]:
print("We compute the mean probability and the mean log probability of all the sentences:")
print("Mean Probability:", np.mean(probabilities_test))
print("Mean Log Probability:", np.mean(np.exp(probabilities_test)))
# df_mean_bigram = pd.DataFrame({'Mean Probability': [], 'Mean Log Probability': []})
# df_mean_bigram = df_mean_bigram.append({'Mean Probability': np.mean(probabilities_test), 'Mean Log Probability': np.mean(np.exp(probabilities_test))},ignore_index=True)

# df_mean_bigram

We compute the mean probability and the mean log probability of all the sentences:
Mean Probability: -196.81242769991334
Mean Log Probability: 7.039408974668455e-07


## Log probabilities for the Test set - Trigram

In [12]:
sent_count=0
probabilities_test=[]
df_sent_trigram = pd.DataFrame({'Sentence': [], 'Log-Probability': [], 'Probability':[]})
for i,sent in enumerate(test_sents_tokenized):
    sum_prob = 0
    sent = ['*start*'] + ['*start*'] + sent + ['*end*'] + ['*end*']
    for idx in range(2,len(sent)):
        trigram_prob = (trigram_counter[(sent[idx-2],sent[idx-1], sent[idx])] +t_alpha) / (bigram_counter[(sent[idx-2],sent[idx-1])] + t_alpha*vocab_size)
        #trigram_prob = (trigram_counter[(sent[idx-2],sent[idx-1], sent[idx])] +t_alpha) / (bigram_counter[(sent[idx-1],sent[idx])] + t_alpha*vocab_size)
        sum_prob += math.log2(trigram_prob)
        
        sent_count+=1
    probabilities_test.append(sum_prob)
    df_sent_trigram = df_sent_trigram.append({'Sentence': ' '.join(sent[2:len(sent)-2]),  "Log-Probability":sum_prob, "Probability": np.exp(sum_prob)},ignore_index=True)

df_sent_trigram.set_index("Sentence").head(5)

Unnamed: 0_level_0,Log-Probability,Probability
Sentence,Unnamed: 1_level_1,Unnamed: 2_level_1
"In the agricultural sector , rapid restructuring is being called for in order to concentrate the land in a few hands and forge another link in the chain controlled by the network of multinationals .",-326.922753,1.045327e-142
"Furthermore , the liberalisation of trade and the abolition of duty and subsidies have hit agricultural production directly , reducing farmers ' incomes , *UNK* farming and increasing unemployment .",-268.100934,3.674886e-117
Agricultural production in Greece - and elsewhere - is being sacrificed in order to protect and corner a larger share of the international market for processed products from central and northern Europe .,-321.887925,1.6063880000000002e-140
"We believe that , rather than defending the interests of the people , the EU will again endeavour at the new round of talks to stake a bigger claim for the European monopolies , in competition with the other imperialist centres , i . e .",-385.237366,4.937849e-168
the USA and Japan .,-23.819847,4.520348e-11


In [13]:
print("We compute the mean probability and the mean log probability of all the sentences: ")
print("Mean Probability:", np.mean(probabilities_test))
print("Mean Log Probability:", np.mean(np.exp(probabilities_test)))

We compute the mean probability and the mean log probability of all the sentences: 
Mean Probability: -218.50332321483137
Mean Log Probability: 0.3728236029654936


## Log probabilities for the random sentences - Bigram

In [14]:
#Using the length of sentences in the test set to create new sentences of approximately equal size
probabilities_random=[]
df_sent_rand_bigram = pd.DataFrame({'Sentence': [], 'Log-Probability': [], 'Probability':[]})
#I will create the same number of sentences, the test set contains (33444)
for i in range(0,len(test_sents)):
    sent=[]
    for j in range(len(test_sents_tokenized[i])):
        #Each time I pick a random word from the test set
        sent.append(test_tokens[random.randint(0, len(test_tokens)-1)])
    sum_prob = 0
    sent = ['*start*'] + sent + ['*end*']
    for idx in range(1,len(sent)):
        bigram_prob = (bigram_counter[(sent[idx-1], sent[idx])] +b_alpha) / (unigram_counter[(sent[idx-1],)] + b_alpha*vocab_size)
        sum_prob += math.log2(bigram_prob)
    probabilities_random.append(sum_prob)
    df_sent_rand_bigram = df_sent_rand_bigram.append({'Sentence': ' '.join(sent[1:len(sent)-1]),  "Log-Probability":sum_prob, "Probability": np.exp(sum_prob)},ignore_index=True)

df_sent_rand_bigram.set_index("Sentence").head(5)

Unnamed: 0_level_0,Log-Probability,Probability
Sentence,Unnamed: 1_level_1,Unnamed: 2_level_1
"to this useful of exactly from have as composition the *UNK* *UNK* is *UNK* the peace contributed fruit world , which is I under been four visits , risks , ' Affairs in energy solved",-436.002537,4.431024e-190
"here matter of such next to where is realise the destroyed measures full us At item , *UNK* of of , unanimous another on may m is then debate but",-429.660071,2.5176899999999996e-187
"the the 30 in fully the to send Mr adjourned should them going *UNK* that fit is - , to like essential the , animal adoption hour We the was for , to",-499.620301,1.041503e-217
"use plenary me m policies package after , of is to see that , which September proposals *UNK* discussed not of of want shows provides will I its of coordinators to peace and , the efficient as define the If He and were the . are",-640.711701,5.526418e-279
starting employment without an as,-89.458562,1.408122e-39


In [15]:
print("We compute the mean probability and the mean log probability of all the sentences: ")
print("Mean Probability:", np.mean(probabilities_random))
print("Mean Log Probability:", np.mean(np.exp(probabilities_random)))

We compute the mean probability and the mean log probability of all the sentences: 
Mean Probability: -417.6092893327352
Mean Log Probability: 1.1929618387029627e-08


## Log probabilities for the random sentences - Trigram

In [16]:
#Using the length of sentences in the test set to create new sentences of approximately equal size
probabilities_random=[]
df_sent_rand_trigram = pd.DataFrame({'Sentence': [], 'Log-Probability': [], 'Probability':[]})
#I will create the same number of sentences, the test set contains (33444)
for i in range(0,len(test_sents)):
    sent=[]
    for j in range(len(test_sents_tokenized[i])):
        #Each time I pick a random word from the test set
        sent.append(test_tokens[random.randint(0, len(test_tokens)-1)])
    sum_prob = 0
    sent = ['*start*'] + ['*start*'] + sent + ['*end*'] + ['*end*']
    for idx in range(2,len(sent)):
        trigram_prob = (trigram_counter[(sent[idx-2],sent[idx-1], sent[idx])] +t_alpha) / (bigram_counter[(sent[idx-2],sent[idx-1])] + t_alpha*vocab_size)
        #trigram_prob = (trigram_counter[(sent[idx-2],sent[idx-1], sent[idx])] +t_alpha) / (bigram_counter[(sent[idx-1],sent[idx])] + t_alpha*vocab_size)
        sum_prob += math.log2(trigram_prob)
    probabilities_random.append(sum_prob)
    df_sent_rand_trigram = df_sent_rand_trigram.append({'Sentence': ' '.join(sent[2:len(sent)-2]),  "Log-Probability":sum_prob, "Probability": np.exp(sum_prob)},ignore_index=True)

        
df_sent_rand_trigram.set_index("Sentence").head(5)

Unnamed: 0_level_0,Log-Probability,Probability
Sentence,Unnamed: 1_level_1,Unnamed: 2_level_1
"delegation . very economic necessarily , the in institutions , matter violence , the *UNK* , bananas is of *UNK* port that to this . to ' is development available , . up , flags",-491.248619,4.50231e-214
( the since aim on must is accepted . responsibility equality paragraphs a that UN can involved When parliamentary a financial further still close That Mr five This to dates,-410.380474,5.943259e-179
' was regard the others obligations reiterating like all Diamantopoulou extend transport at during however in . all to States moment countries close and . is parliaments de the chosen four where them,-463.842826,3.594349e-202
". amendments the extended accidents has that various of months consequences our concentration Madam inadequate offices in State the to , food 17 Monetary also , : manufactured first scope sustainable deep that Spain should make this the satisfactory will gap ' , it the be",-638.299577,6.166173999999999e-278
about have ) stem not,-93.13009,3.58192e-41


In [17]:
print("We compute the mean probability and the mean log probability of all the sentences: ")
print("Mean Probability:", np.mean(probabilities_random))
print("Mean Log Probability:", np.mean(np.exp(probabilities_random)))

We compute the mean probability and the mean log probability of all the sentences: 
Mean Probability: -406.7445951225196
Mean Log Probability: 0.011642462651415454


### We can clearly see a huge difference in the probabilities

## III)

## Perplexity of the whole corpus as a sequence

## Adding only \*end\* (without \*start\*) to all test sentences - Bigram

In [18]:
test_one_sent=[]
for i,sent in enumerate(test_sents_tokenized):
    sent = sent + ['*end*']
    test_one_sent.extend(sent)

## Calculating HC and Perplexity

In [19]:
bigram_cnt = 0
sum_prob = 0
for idx in range(1,len(test_one_sent)):
    bigram_prob = (bigram_counter[(test_one_sent[idx-1], test_one_sent[idx])] + b_alpha) / (unigram_counter[(test_one_sent[idx-1],)] + b_alpha*vocab_size)
    sum_prob += math.log2(bigram_prob)
    bigram_cnt+=1

HC = -sum_prob / bigram_cnt
perpl = math.pow(2,HC)
print("Cross Entropy: {0:.3f}".format(HC))
print("Perplexity: {0:.3f}".format(perpl))

Cross Entropy: 6.954
Perplexity: 123.977


## Adding only \*end\* (without \*start\*) to all test sentences - Trigram

In [20]:
test_one_sent=[]
for i,sent in enumerate(test_sents_tokenized):
    sent = sent + ['*end*'] + ['*end*']
    test_one_sent.extend(sent)

## Calculating HC and Perplexity

In [21]:
trigram_cnt = 0
sum_prob = 0
for idx in range(2,len(test_one_sent)):
    trigram_prob = (trigram_counter[(test_one_sent[idx-2],test_one_sent[idx-1], test_one_sent[idx])] +t_alpha) / (bigram_counter[(test_one_sent[idx-2],test_one_sent[idx-1])] + t_alpha*vocab_size)
    #trigram_prob = (trigram_counter[(test_one_sent[idx-2],test_one_sent[idx-1], test_one_sent[idx])] +t_alpha) / (bigram_counter[(test_one_sent[idx-1],test_one_sent[idx])] + t_alpha*vocab_size)
    sum_prob += math.log2(trigram_prob)
    trigram_cnt+=1

HC = -sum_prob / trigram_cnt
perpl = math.pow(2,HC)
print("Cross Entropy: {0:.3f}".format(HC))
print("Perplexity: {0:.3f}".format(perpl))

Cross Entropy: 8.118
Perplexity: 277.859


## IV

## Tuning for ideal lambda

In [None]:
df = pd.DataFrame({'lambda': [], 'Perplexity': []})
lowest_lamda = 2
lowest_perplexity = 100000
for lamda in np.arange(0,1.01,0.05):
    ngram_cnt = 0
    sum_prob = 0
    for sent in development_sents_tokenized:
        sent = ['*start*'] + ['*start*'] + sent + ['*end*'] + ['*end*']
        for idx in range(2,len(sent)):
            trigram_prob = (trigram_counter[(sent[idx-2],sent[idx-1], sent[idx])] +t_alpha) / (bigram_counter[(sent[idx-2],sent[idx-1])] + t_alpha*vocab_size)
            #trigram_prob = (trigram_counter[(sent[idx-2],sent[idx-1], sent[idx])] +t_alpha) / (bigram_counter[(sent[idx-1],sent[idx])] + t_alpha*vocab_size)
            bigram_prob = (bigram_counter[(sent[idx-1], sent[idx])] +b_alpha) / (unigram_counter[(sent[idx-1],)] + b_alpha*vocab_size)

            sum_prob += (lamda * math.log2(trigram_prob)) +((1-lamda) * math.log2(bigram_prob))
            ngram_cnt+=1 

    HC = -sum_prob / ngram_cnt
    perpl = math.pow(2,HC)
    if(perpl<lowest_perplexity):
        lowest_perplexity=perpl
        lowest_lamda=lamda
    df = df.append({'lambda': round(lamda,2),  'Perplexity': round(perpl,2)}, ignore_index=True)
df.set_index('lambda')

In [None]:
print("The lambda that produces the lowest perplexity for the interpolated model is: ", lowest_lamda)