In [1]:
import nltk
from nltk import sent_tokenize, word_tokenize, TweetTokenizer #Tokenizing sentences
from pprint import pprint
from collections import Counter
from nltk.corpus import stopwords
from nltk.util import ngrams
import pandas as pd
import math
import numpy as np
import random

# I)

## Reading

In [2]:
with open("europarl-v7-en.txt", encoding = "utf8") as file:
    source = file.read()

## Split source into training / development / test set - 70/10/20

In [3]:
source_sents = sent_tokenize(source)
sents_len=len(source_sents)
train_sents=source_sents[0:round(sents_len*0.7)]
development_sents=source_sents[round(sents_len*0.7):round(sents_len*0.8)]
test_sents=source_sents[round(sents_len*0.8):]

## Tokenize sets by Token

In [4]:
tweet_wt = TweetTokenizer()

train_tokens = tweet_wt.tokenize(' '.join(train_sents))
development_tokens = tweet_wt.tokenize(' '.join(development_sents))
test_tokens = tweet_wt.tokenize(' '.join(test_sents))

## Finding tokens appearing in the training test rarely... Replacing those with \*UKN\* in all sets

In [5]:
count = Counter(train_tokens)
frequent_tokens=set([k for k,v in count.items() if v >= 10])
train_tokens=[train_token if (train_token in frequent_tokens) else "*UNK*" for train_token in train_tokens]
development_tokens=[development_token if (development_token in frequent_tokens) else "*UNK*" for development_token in development_tokens]
test_tokens=[test_token if (test_token in frequent_tokens) else "*UNK*" for test_token in test_tokens]

## Tokenize sets by sentence

In [6]:
train_sents_tokenized = []
development_sents_tokenized = []
test_sents_tokenized = []

for sent in train_sents:
    sent_tmp = tweet_wt.tokenize(sent)
    sent = [word if (word in frequent_tokens) else "*UNK*" for word in sent_tmp]
    train_sents_tokenized.append(sent)
for sent in development_sents:
    sent_tmp = tweet_wt.tokenize(sent)
    sent = [word if (word in frequent_tokens) else "*UNK*" for word in sent_tmp]
    development_sents_tokenized.append(sent)
for sent in test_sents:
    sent_tmp = tweet_wt.tokenize(sent)
    sent = [word if (word in frequent_tokens) else "*UNK*" for word in sent_tmp]
    test_sents_tokenized.append(sent)

## Converting tokens into Bigram and Trigram models

In [7]:
unigram_counter = Counter()
bigram_counter = Counter()
trigram_counter = Counter()

for sent in train_sents_tokenized:
    unigram_counter.update([gram for gram in ngrams(sent, 1, pad_left=True, pad_right=True, left_pad_symbol='*start*',right_pad_symbol='*end*') ])
    bigram_counter.update([gram for gram in ngrams(sent, 2, pad_left=True, pad_right=True, left_pad_symbol='*start*',right_pad_symbol='*end*') ])
    trigram_counter.update([gram for gram in ngrams(sent, 3, pad_left=True, pad_right=True, left_pad_symbol='*start*',right_pad_symbol='*end*') ])
    
unigram_counter[('*start*',)] = len(train_sents_tokenized)

# II)

## Declaring ngram methods

In [8]:
vocab_size = len(set(train_tokens))

In [9]:
def log_bigram_prob(sent, idx, alpha, vocab_size):
    return math.log2((bigram_counter[(sent[idx-1], sent[idx])] + round(alpha,4)) / (unigram_counter[(sent[idx-1],)] + round(alpha,4)*vocab_size))

def log_trigram_prob(sent, idx, alpha, vocab_size):
    return math.log2((trigram_counter[(sent[idx-2],sent[idx-1], sent[idx])] +alpha) / (bigram_counter[(sent[idx-2],sent[idx-1])] + alpha*vocab_size))

def bigram(sents_tokenized):
    bigram_cnt = 0
    sum_prob = 0    
    for sent in sents_tokenized:
        sent = ['*start*'] + sent + ['*end*']
        for idx in range(1,len(sent)):
            sum_prob += log_bigram_prob(sent, idx, alpha, vocab_size)
            bigram_cnt+=1
    return sum_prob, bigram_cnt

def trigram(sents_tokenized, alpha):
    trigram_cnt = 0
    sum_prob = 0    
    for sent in sents_tokenized:
        sent = ['*start*'] + ['*start*'] + sent + ['*end*'] + ['*end*']
        for idx in range(2,len(sent)):
            sum_prob += log_trigram_prob(sent, idx, alpha, vocab_size)                                                   
            trigram_cnt+=1
    return sum_prob, trigram_cnt
            
def best_alpha(perpl,lowest_perplexity,lowest_alpha):
    if(perpl<lowest_perplexity):
        lowest_perplexity=perpl
        lowest_alpha=round(alpha,4)
    return lowest_perplexity, lowest_alpha

def HC_entropy(sum_prob, ngram_cnt):
    HC = -sum_prob / ngram_cnt
    perpl = math.pow(2,HC)
    return HC, perpl

## Tuning alpha wrt perplexity for the bigram model

In [10]:
lowest_bigram_alpha=1
lowest_perplexity=100000
for alpha in np.arange(0.005,0.025,0.0005):
    sum_prob, bigram_cnt = bigram(development_sents_tokenized)
    HC, perpl = HC_entropy(sum_prob, bigram_cnt)
    lowest_perplexity, lowest_bigram_alpha = best_alpha(perpl, lowest_perplexity, lowest_bigram_alpha)
    
print("The alpha that produces the lowest perplexity for the bigram model is: ", lowest_bigram_alpha)
b_alpha=lowest_bigram_alpha

The alpha that produces the lowest perplexity for the bigram model is:  0.006


## Tuning alpha wrt perplexity for the trigram model

In [11]:
lowest_trigram_alpha=1
lowest_perplexity=100000
for alpha in np.arange(0.001,0.01,0.0005):
    sum_prob, trigram_cnt = trigram(development_sents_tokenized, alpha)
    HC, perpl = HC_entropy(sum_prob, bigram_cnt)
    lowest_perplexity, lowest_trigram_alpha = best_alpha(perpl, lowest_perplexity, lowest_trigram_alpha)

print("The alpha that produces the lowest perplexity for the trigram model is: ", lowest_trigram_alpha)
t_alpha=lowest_trigram_alpha

The alpha that produces the lowest perplexity for the trigram model is:  0.0015


## Functions definition for computing probabilities

In [12]:
def bigram_sentences(sents_tokenized):
    probabilities_list = []
    sentences_list = []
    for sent in sents_tokenized:
        sum_prob = 0
        sent = ['*start*'] + sent + ['*end*']
        for idx in range(1,len(sent)):
            sum_prob += log_bigram_prob(sent, idx, alpha, vocab_size)
        probabilities_list.append(sum_prob)
        sentences_list.append(' '.join(sent[1:len(sent)-2]))
    return probabilities_list, sentences_list

def trigram_sentences(sents_tokenized):
    probabilities_list = []
    sentences_list = []
    for sent in sents_tokenized:
        sum_prob = 0
        sent = ['*start*'] + ['*start*'] + sent + ['*end*'] + ['*end*']
        for idx in range(2,len(sent)):
            sum_prob += log_trigram_prob(sent, idx, alpha, vocab_size)
        probabilities_list.append(sum_prob)
        sentences_list.append(' '.join(sent[2:len(sent)-3]))
    return probabilities_list, sentences_list

## Log probabilities for the Test set - Bigram

In [13]:
probabilities_list, sentences_list = bigram_sentences(test_sents_tokenized)
df_sent_bigram = pd.DataFrame({'Sentence': sentences_list, 'Probability': probabilities_list, 'Log-Probability': np.exp(probabilities_list)})
df_sent_bigram.set_index("Sentence").head(5)

Unnamed: 0_level_0,Probability,Log-Probability
Sentence,Unnamed: 1_level_1,Unnamed: 2_level_1
"In the agricultural sector , rapid restructuring is being called for in order to concentrate the land in a few hands and forge another link in the chain controlled by the network of multinationals",-266.395743,2.0220829999999998e-116
"Furthermore , the liberalisation of trade and the abolition of duty and subsidies have hit agricultural production directly , reducing farmers ' incomes , *UNK* farming and increasing unemployment",-243.377202,2.0073550000000002e-106
Agricultural production in Greece - and elsewhere - is being sacrificed in order to protect and corner a larger share of the international market for processed products from central and northern Europe,-257.540603,1.4175460000000001e-112
"We believe that , rather than defending the interests of the people , the EU will again endeavour at the new round of talks to stake a bigger claim for the European monopolies , in competition with the other imperialist centres , i . e",-347.941678,7.777665e-152
the USA and Japan,-38.691652,1.571914e-17


In [14]:
print("We compute the mean probability and the mean log probability of all the sentences:")
print("Mean Probability:", np.mean(probabilities_list))
print("Mean Log Probability:", np.mean(np.exp(probabilities_list)))

We compute the mean probability and the mean log probability of all the sentences:
Mean Probability: -197.0272754091232
Mean Log Probability: 7.031446990887461e-07


## Log probabilities for the Test set - Trigram

In [15]:
probabilities_list, sentences_list = trigram_sentences(test_sents_tokenized)
df_sent_trigram = pd.DataFrame({'Sentence': sentences_list, 'Probability': probabilities_list, 'Log-Probability': np.exp(probabilities_list)})
df_sent_trigram.set_index("Sentence").head(5)

Unnamed: 0_level_0,Probability,Log-Probability
Sentence,Unnamed: 1_level_1,Unnamed: 2_level_1
"In the agricultural sector , rapid restructuring is being called for in order to concentrate the land in a few hands and forge another link in the chain controlled by the network of multinationals",-332.989616,2.423524e-145
"Furthermore , the liberalisation of trade and the abolition of duty and subsidies have hit agricultural production directly , reducing farmers ' incomes , *UNK* farming and increasing unemployment",-276.676814,6.9308490000000006e-121
Agricultural production in Greece - and elsewhere - is being sacrificed in order to protect and corner a larger share of the international market for processed products from central and northern Europe,-329.409687,8.693415e-144
"We believe that , rather than defending the interests of the people , the EU will again endeavour at the new round of talks to stake a bigger claim for the European monopolies , in competition with the other imperialist centres , i . e",-389.252624,8.907042e-170
the USA and Japan,-30.019313,9.178632e-14


In [16]:
print("We compute the mean probability and the mean log probability of all the sentences:")
print("Mean Probability:", np.mean(probabilities_list))
print("Mean Log Probability:", np.mean(np.exp(probabilities_list)))

We compute the mean probability and the mean log probability of all the sentences:
Mean Probability: -225.5061431192479
Mean Log Probability: 0.016201686058953817


## Log probabilities for the random sentences - Bigram

In [17]:
def create_random_text(sents_tokenized):
    text = []
    for sent in sents_tokenized:
        rand_sent = []
        for i in range(len(sent)):
            rand_sent.append(test_tokens[random.randint(0, len(test_tokens)-1)])
        text.append(rand_sent)
    return text

In [18]:
probabilities_random, sentences_random = bigram_sentences(create_random_text(test_sents_tokenized))
df_sent_rand_bigram = pd.DataFrame({'Sentence': sentences_random, 'Probability': probabilities_random, 'Log-Probability': np.exp(probabilities_random)})
df_sent_rand_bigram.set_index("Sentence").head(5)

Unnamed: 0_level_0,Probability,Log-Probability
Sentence,Unnamed: 1_level_1,Unnamed: 2_level_1
"have awarded our extension Agency information for this of conferred Facility , , in are . wish whose out % Cooperation should wide huge work in October and Europe and to , confirmed obvious",-484.398147,4.25165e-211
"achieved rather progress Union *UNK* Maij-Weggen remark , adopted regards State the ) public to been *UNK* action travelling Member can point years an why committee of believe paid",-463.63298,4.433585e-202
"( which a animal *UNK* banks safety themselves We whether climate and , correctly Research the International before Socialist and democratic competition has to Access especially be , of Members . and",-424.079523,6.677336e-185
"They . % quite insist the its . five it *UNK* joint underpin reason and is accept . to , , are the active up on which I also I Europe clearly is , , your which those and true security an this employment conversion",-660.718836,1.130982e-287
tunnel position then of,-101.373244,9.422353e-45


In [19]:
print("We compute the mean probability and the mean log probability of all the sentences: ")
print("Mean Probability:", np.mean(probabilities_random))
print("Mean Log Probability:", np.mean(np.exp(probabilities_random)))

We compute the mean probability and the mean log probability of all the sentences: 
Mean Probability: -409.31964839900786
Mean Log Probability: 1.7878202905338187e-08


## Log probabilities for the random sentences - Trigram

In [20]:
probabilities_random, sentences_random = trigram_sentences(create_random_text(test_sents_tokenized))
df_sent_rand_trigram = pd.DataFrame({'Sentence': sentences_random, 'Probability': probabilities_random, 'Log-Probability': np.exp(probabilities_random)})
df_sent_rand_trigram.set_index("Sentence").head(5)

Unnamed: 0_level_0,Probability,Log-Probability
Sentence,Unnamed: 1_level_1,Unnamed: 2_level_1
"are to legislation official and hope something . contacts compensate I main account in . course , . should have Conference on Member resumption legislation to especially John second environmental of municipal to a",-508.550584,1.378121e-221
"an of to justice 1 be participate we this trigger the EU this to all four again even the it I , the phenomenon we and , , present",-425.98936,9.889439999999999e-186
"No development problem full we glad interventions the only thoughts area the statement far *UNK* Greek few likewise over random , once intends this the first employment what consider concerning *UNK* participation",-430.771105,8.288698e-188
"must which his I - its Mr . an agree the of who Convention the question and will following the in return priorities We . also enterprise for 80 the range set other the on Convention as , define of by the Secondly *UNK* politics",-629.517616,4.017659e-274
on . in retain,-84.100018,2.991069e-37


In [21]:
print("We compute the mean probability and the mean log probability of all the sentences: ")
print("Mean Probability:", np.mean(probabilities_random))
print("Mean Log Probability:", np.mean(np.exp(probabilities_random)))

We compute the mean probability and the mean log probability of all the sentences: 
Mean Probability: -396.51164876638256
Mean Log Probability: 0.0005975755979599098


### We can clearly see a difference in the probabilities

# III)

## Perplexity of the whole corpus as a sequence

## Adding only *end* (without *start*) to all test sentences - Bigram

In [22]:
test_one_sent=[]
for i,sent in enumerate(test_sents_tokenized):
    sent = sent + ['*end*']
    test_one_sent.extend(sent)

## Calculating HC and Perplexity

In [23]:
bigram_cnt = 0
sum_prob = 0
for idx in range(1,len(test_one_sent)):
    bigram_prob = (bigram_counter[(test_one_sent[idx-1], test_one_sent[idx])] + b_alpha) / (unigram_counter[(test_one_sent[idx-1],)] + b_alpha*vocab_size)
    sum_prob += math.log2(bigram_prob)
    bigram_cnt+=1

HC = -sum_prob / bigram_cnt
perpl = math.pow(2,HC)
print("Cross Entropy: {0:.3f}".format(HC))
print("Perplexity: {0:.3f}".format(perpl))

Cross Entropy: 6.954
Perplexity: 123.977


## Adding only *end* (without *start*) to all test sentences - Trigram

In [24]:
test_one_sent=[]
for i,sent in enumerate(test_sents_tokenized):
    sent = sent + ['*end*'] + ['*end*']
    test_one_sent.extend(sent)

## Calculating HC and Perplexity

In [25]:
trigram_cnt = 0
sum_prob = 0
for idx in range(2,len(test_one_sent)):
    trigram_prob = (trigram_counter[(test_one_sent[idx-2],test_one_sent[idx-1], test_one_sent[idx])] +t_alpha) / (bigram_counter[(test_one_sent[idx-2],test_one_sent[idx-1])] + t_alpha*vocab_size)
    sum_prob += math.log2(trigram_prob)
    trigram_cnt+=1

HC = -sum_prob / trigram_cnt
perpl = math.pow(2,HC)
print("Cross Entropy: {0:.3f}".format(HC))
print("Perplexity: {0:.3f}".format(perpl))

Cross Entropy: 8.118
Perplexity: 277.859


# IV

## Tuning for ideal lambda

In [26]:
df = pd.DataFrame({'lambda': [], 'Perplexity': []})
lowest_lamda = 2
lowest_perplexity = 100000
for lamda in np.arange(0,1.01,0.05):
    ngram_cnt = 0
    sum_prob = 0
    for sent in development_sents_tokenized:
        sent = ['*start*'] + ['*start*'] + sent + ['*end*'] + ['*end*']
        for idx in range(2,len(sent)):
            bigram_prob = (bigram_counter[(sent[idx-1], sent[idx])] +b_alpha) / (unigram_counter[(sent[idx-1],)] + b_alpha*vocab_size)
            trigram_prob = (trigram_counter[(sent[idx-2],sent[idx-1], sent[idx])] +t_alpha) / (bigram_counter[(sent[idx-2],sent[idx-1])] + t_alpha*vocab_size)
            sum_prob += ((1-lamda) * math.log2(bigram_prob)) + (lamda * math.log2(trigram_prob))
            ngram_cnt+=1

    HC, perpl = HC_entropy(sum_prob, ngram_cnt)
    if(perpl<lowest_perplexity):
        lowest_perplexity=perpl
        lowest_lamda=lamda
    df = df.append({'lambda': round(lamda,2),  'Perplexity': round(perpl,2)}, ignore_index=True)
df.set_index('lambda')

Unnamed: 0_level_0,Perplexity
lambda,Unnamed: 1_level_1
0.0,121.8
0.05,122.89
0.1,124.0
0.15,125.11
0.2,126.23
0.25,127.36
0.3,128.51
0.35,129.66
0.4,130.82
0.45,132.0


In [27]:
print("The lambda that produces the lowest perplexity for the interpolated model is: ", lowest_lamda)

The lambda that produces the lowest perplexity for the interpolated model is:  0.0
