<div align="center">
    <h1>MDSAA TEXT MINING 2020-2021</h1>
    <h2>MACHINE TRANSLATION METRICS</h2>
    <p style="text-align:center">David Sotto-Mayor Machado (m20201023@novaims.unl.pt), Maikel Sousa (m20200735@novaims.unl.pt), Catarina Moreira (m20201034@novaims.unl.pt)</p>
</div>

<h2>Import Libraries</h2>

In [1]:
import os, sys, re, time, gzip, zlib, logging, transformers, urllib.request, shutil
import pandas as pd
import numpy as np
#from comet.models import download_model
import jieba.posseg as pseg
import comet
from collections import Counter
from rouge import Rouge
from tqdm.notebook import tqdm
#from bert_score import score
#from nltk.translate.bleu_score import sentence_bleu
from nltk.corpus import stopwords
from nltk import download
from stopwordsiso import stopwords as swordsiso
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import matplotlib.pyplot as plt
from matplotlib import rcParams
#from bert_score import score
from tqdm import tqdm_notebook as tqdm
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import SnowballStemmer
from bs4 import BeautifulSoup
import string
from sklearn.feature_extraction.text import CountVectorizer
from IPython.display import display_html 
import matplotlib.pyplot as plt
import scipy.stats as stats
from zipfile import *
import platform
#from bleurt import score
#import bleurt
import jieba
import bert



In [2]:
# Silence every Python warning for the rest of the notebook session
import warnings

warnings.filterwarnings(action="ignore")

<h2>Functions</h2>

In [3]:
def clean(text_list, stop, lemmatize, stemmer):
    """
    Preprocess a collection of sentences written in the Latin alphabet.

    Every string is lowercased, stripped of markup token by token, reduced to
    the letters a-z, filtered against `stop` and optionally lemmatized and
    stemmed. Entries that are not strings (e.g. None/NaN) are returned as-is.

    :param text_list: Iterable of strings (list or pandas Series).
    :param stop: Collection of stop words to remove (tested with `in`).
    :param lemmatize: Tag to apply lemmatization if True (uses the notebook-level `lemma`).
    :param stemmer: Tag to apply the stemmer if True (uses the notebook-level `snowball_stemmer`).
    :return: List with one preprocessed entry per input entry, in input order.
    """
    updates = []
    # Iterate over the values themselves: `text_list[j]` is a label lookup on a
    # pandas Series and breaks as soon as the index is not exactly 0..n-1.
    for text in text_list:

        # Non-string entries cannot be cleaned; keep them so positions still line up
        if not isinstance(text, str):
            updates.append(text)
            continue

        #LOWERCASE TEXT
        text = text.lower()

        #REMOVE TAGS (explicit parser so the result does not depend on which parsers are installed)
        text = " ".join(BeautifulSoup(word, "html.parser").get_text() for word in text.split())

        #REMOVE NUMERICAL DATA AND PUNCTUATION
        text = re.sub("[^a-zA-Z]", ' ', text)

        #REMOVE STOP WORDS
        text = " ".join(word for word in text.split() if word not in stop)

        if lemmatize:
            text = " ".join(lemma.lemmatize(word) for word in text.split())

        if stemmer:
            text = " ".join(snowball_stemmer.stem(word) for word in text.split())

        updates.append(text)

    return updates

def update_df(dataframe, list_updated,column1):
    """
    Overwrite column `column1` of `dataframe` in place with `list_updated`.

    The new values are wrapped in a one-column DataFrame and handed to
    `DataFrame.update`, which aligns them with `dataframe` on the index.
    """
    replacement = pd.DataFrame({column1: list_updated})
    dataframe.update(replacement)

In [4]:
def clean_finlandes(text_list,stop, lemmatize, stemmer):
    """
    Preprocess a collection of Finnish sentences.

    Markup is stripped first, then the text is upper-cased and every character
    that is not a letter (a-z, A-Z, Å, Ä, Ö) is replaced by a space. Entries
    that are not strings (e.g. None/NaN) are returned as-is.

    :param text_list: Iterable of strings (list or pandas Series).
    :param stop: Stop-word collection; accepted for signature parity with
                 `clean` but currently not applied.
    :param lemmatize: Tag to apply lemmatization if True (uses the notebook-level `lemma`).
    :param stemmer: Tag to apply the stemmer if True (uses the notebook-level `snowball_stemmer`).
    :return: List with one preprocessed entry per input entry, in input order.
    """
    updates = []
    # Iterate over the values: `text_list[j]` is a label lookup on a pandas
    # Series and breaks as soon as the index is not exactly 0..n-1.
    for text in text_list:

        # Non-string entries cannot be cleaned; keep them so positions still line up
        if not isinstance(text, str):
            updates.append(text)
            continue

        #REMOVE TAGS (must happen before punctuation is removed, otherwise the
        #angle brackets are already gone and the tag names stay in the text)
        text = BeautifulSoup(text, "html.parser").get_text()

        #UPPERCASE TEXT (the character class below only lists the upper-case Å/Ä/Ö)
        text = text.upper()

        #REMOVE NUMERICAL DATA AND PUNCTUATION
        #Å (U+00C5) is the Finnish letter; Ǻ is kept only so previously accepted input still passes
        text = re.sub("[^a-zA-ZÅǺÄÖ]", ' ', text)

        if lemmatize:
            text = " ".join(lemma.lemmatize(word) for word in text.split())

        if stemmer:
            text = " ".join(snowball_stemmer.stem(word) for word in text.split())

        updates.append(text)

    return updates

def update_df(dataframe, list_updated,column1):
    """
    Replace the values of `dataframe[column1]` in place by `list_updated`.

    Same definition as the `update_df` declared earlier in this notebook;
    running this cell simply rebinds the name.
    """
    new_values = pd.DataFrame(data={column1: list_updated})
    dataframe.update(new_values)

In [5]:
def clean_chinese(text_list, stop):
    """
    Preprocess a collection of Chinese sentences.

    Markup is stripped, digits are blanked and runs of whitespace, ASCII
    punctuation and CJK punctuation are collapsed into a single space.
    Entries that are not strings (e.g. None/NaN) are returned as-is.

    :param text_list: Iterable of strings (list or pandas Series).
    :param stop: Stop-word collection; accepted for signature parity with
                 `clean` but currently not applied.
    :return: List with one preprocessed entry per input entry, in input order.
    """
    # The `r` raw-string prefix used to sit INSIDE the literal, so the first
    # alternative only matched punctuation preceded by a literal "r" (turning
    # e.g. "her energy" into "he energy"). It is now a real raw string.
    punctuation = re.compile(
        r"[\s+\.\!\/_,$%^*(+\"\']+|[+——！，。. ？、~@#￥%……&*（）：；《）《》“”()»〔〕-]+"
    )

    updates = []
    # Iterate over the values: `text_list[j]` is a label lookup on a pandas
    # Series and breaks as soon as the index is not exactly 0..n-1.
    for text in text_list:

        # Non-string entries cannot be cleaned; keep them so positions still line up
        if not isinstance(text, str):
            updates.append(text)
            continue

        #REMOVE TAGS (before punctuation removal, which would otherwise damage closing tags)
        text = BeautifulSoup(text, "html.parser").get_text()

        #REMOVE NUMERICAL DATA AND PUNCTUATION
        text = re.sub("[0-9]", ' ', text)
        text = punctuation.sub(" ", text)

        updates.append(text)

    return updates

def update_df(dataframe, list_updated,column1):
    """
    Write `list_updated` into column `column1` of `dataframe`, modifying it in place.

    Same definition as the `update_df` declared earlier in this notebook;
    running this cell simply rebinds the name.
    """
    dataframe.update(other=pd.DataFrame({column1: list_updated}))

<h2>Import Corpora</h2>

In [6]:
#To import all the corpus csv files
folder='testset'
filename = 'scores.csv'

# os.listdir returns entries in arbitrary order and may contain stray files
# (e.g. '.DS_Store'). Keep only the sub-folders that hold a scores file and
# sort them, so that positional indices into `corpora` are reproducible.
corpus = sorted(sub for sub in os.listdir(folder)
                if os.path.isfile(os.path.join(folder, sub, filename)))

# One DataFrame per language pair, in the same order as `corpus`
corpora = [pd.read_csv(os.path.join(folder, sub, filename)) for sub in corpus]

print('\033[1m',len(corpora),'\033[0mcorpora imported.')

[1m 6 [0mcorpora imported.


In [7]:
#Remove, in place, every row whose reference or translation is only "." and renumber the rows
for frame in corpora:
    for column in ("reference", "translation"):
        placeholder_rows = frame.index[frame[column] == "."]
        frame.drop(placeholder_rows, inplace=True)
    frame.reset_index(drop=True, inplace=True)

In [8]:
#To list the different translation language pairs
corpus

['cs-en', 'de-en', 'en-fi', 'en-zh', 'ru-en', 'zh-en']

In [9]:
# Position of each language pair inside `corpus` / `corpora`.
# Looked up by folder name instead of hard-coding 0..5, so the constants stay
# correct whatever order the folders were listed in; a missing pair raises
# ValueError here rather than silently pointing at the wrong corpus.
csen, deen, enfi, enzh, ruen, zhen = (
    corpus.index(pair) for pair in ('cs-en', 'de-en', 'en-fi', 'en-zh', 'ru-en', 'zh-en')
)

In [10]:
#To verify one of the corpus
corpora[0].head(5)

Unnamed: 0,source,reference,translation
0,"Památník, důstojné pietní místo, stojí vůlí dě...","The monument, a dignified piecemeal place, sta...","The memorial, a solemn place of commemoration,..."
1,Pracovník centra Čang Č-čung sdělil agentuře N...,Centre worker Zhang Zu-chung told the New Chin...,Centre worker Chang Chi-Chung told New China t...
2,Veterináři nicméně odeberou namátkové vzorky v...,"However, veterinarians take random samples of ...","However, veterinarians are taking samples of e..."
3,Uživatel @TheePharoah jí neustále retweetoval ...,User @ TheePharoah constantly retweeted her po...,A user with the handle @TheePharoah was being ...
4,Lucii bylo tehdy pouhých 19 let a rozhodně net...,Lucia was only 19 at the time and certainly ha...,"At that time, Lucie was only 19 years old, and..."


<h2>Pre-processing Data</h2>

As we know, one important step before starting our experiments is to divide the corpora into training and test sets. However, these corpora are already divided, and the data given to us is the training set. 

### Corpus inspection


Here, we are going to look into our data, understand it and think how to solve the problem.

###### Check the training set of each corpus

In [11]:
#To check the training set of the first corpora
corpora[0][:5]

Unnamed: 0,source,reference,translation
0,"Památník, důstojné pietní místo, stojí vůlí dě...","The monument, a dignified piecemeal place, sta...","The memorial, a solemn place of commemoration,..."
1,Pracovník centra Čang Č-čung sdělil agentuře N...,Centre worker Zhang Zu-chung told the New Chin...,Centre worker Chang Chi-Chung told New China t...
2,Veterináři nicméně odeberou namátkové vzorky v...,"However, veterinarians take random samples of ...","However, veterinarians are taking samples of e..."
3,Uživatel @TheePharoah jí neustále retweetoval ...,User @ TheePharoah constantly retweeted her po...,A user with the handle @TheePharoah was being ...
4,Lucii bylo tehdy pouhých 19 let a rozhodně net...,Lucia was only 19 at the time and certainly ha...,"At that time, Lucie was only 19 years old, and..."


In [12]:
#To check the training set of the second corpora
corpora[1][:5]

Unnamed: 0,source,reference,translation
0,Das Publikum ist fast gleichmäßig zwischen Sch...,The audience is almost evenly split between bl...,The audience is almost evenly split between bl...
1,Du kannst ihre Energie durch den Bildschirm sp...,"You can feel their energy through the screen. """"","You can feel her energy through the screen."""
2,"Da die Adresse unbekannt ist, wird die Mithilf...","As the address is unknown, the help of the pop...","As the address is unknown, the assistance of t..."
3,"Arsenal-Manager Arsene Wenger, dessen Verein i...","Arsenal manager Arsene Wenger, whose club is o...","Arsenal manager Arsene Wenger, whose club is o..."
4,Landwirtschaftsminister im Interview - Wie sch...,Agriculture Minister in the interview - How do...,Minister of Agriculture in interview – How do ...


In [13]:
#To check the training set of the third corpora
corpora[2][:5]

Unnamed: 0,source,reference,translation
0,One local resident who did not wish to be name...,"Eräs paikallinen asukas, joka ei halunnut nime...",Toisen nimettömänä pysyttelevän asukkaan mukaa...
1,"Still, she clings to a chant she's committed t...",Silti hän takertuu chant hän on sitoutunut mui...,"Silti hän luottaa edelleen iskulauseeseen, jon..."
2,"I don't want to be asked, 'What were you doing...","En halua, että minulta kysytään: ""Mitä te teit...","En halua, että kenenkään tarvitsee kysyä minul..."
3,"""I wouldn't say it was a lie – that's a pretty...","""En sanoisi, että se oli valhe - se on aika ro...","En sanoisi, että se oli valhe, se on aika kova..."
4,Kari Kola took part in the opening ceremony of...,Kari Kola osallistui valon vuoden avajaisiin v...,Kari Kola oli mukana Valon teemavuoden avajais...


In [14]:
#To check the training set of the fourth corpora
corpora[3][:5]

Unnamed: 0,source,reference,translation
0,The future and the destinies of the citizens o...,世界上每个国家公民的未来和命运日益联系在一起。,世界各国人民前途命运越来越紧密地联系在一起。
1,"After all that hard work, the finished result ...",经过那么多的努力，最终的结果现在已经可以揭晓了。,经过这么艰辛的工作，最终的结果现在才得以公布。
2,Author: researcher of Suning Institute of Fina...,作者：苏宁金融研究所研究员，财经专栏作家，财经评论员。,作者：苏宁金融研究院特约研究员，财经专栏作家，财经评论员。
3,“The Great Wall” tells the story of a Chinese ...,《长城》讲述了古代一支中国精锐部队在世界著名的中国长城上与怪物桃蒂英勇作战的故事。,《长城》讲述了在古代，一支中国精英部队为保卫人类，在举世闻名的长城上与怪兽饕餮进行生死决战的故事。
4,Our comrades from the Political Bureau should ...,政治局同志要学习历史，讲道理，不能混淆公、私利益，叫白黑，模糊义与利的界限，处理基于裙带关系...,中央政治局的同志都应该明史知理，不能颠倒了公私、混淆了是非、模糊了义利、放纵了亲情，要带头树...


In [15]:
#To check the training set of the fifth corpora
corpora[4][:5]

Unnamed: 0,source,reference,translation
0,Через полчаса обуглившийся клубень достают и п...,"After half an hour, the charred tuber is taken...","After half-an-hour, the charred tuber is retri..."
1,"Здесь никто не думает отменять смертную казнь,...","Here, no one thinks to abolish the death penal...","Here, no one is concerned with abolishing the ..."
2,"Собеседники ""Известий"" в ОНФ отмечают, что док...","The interlocutors of"" Izvestiya ""in the onf no...",Izvestia’s sources in the ONF note that the re...
3,На древней Венере могли существовать океаны.,On the ancient Venus could exist in the oceans.,Oceans could have existed on ancient Venus.
4,До этого момента убийства оставались лишь исто...,"Up to this point, the murders were just a stor...","Up until this point, the murders have remained..."


In [16]:
#To check the training set of the sixth corpora
corpora[5][:5]

Unnamed: 0,source,reference,translation
0,已经批准筹建的，暂停批准开业,"Where the preparation has been approved, the a...",Approval of opening on these establishments wi...
1,王丰源在首发式发言中说，来美国前想找本书看看别人的经验，但他翻遍新华书店没找到关于留学美国中...,"In his opening speech, Mr. Wang said he wanted...",Wang Fengyuan spoke at the launch of his new b...
2,“如果你不致力于创造透明文化，你会失去人才，”维特拉诺说道。,"""if you're not committed to creating a culture...","""If you're not committed to creating a culture..."
3,不过前提是多国联军先停止对也门的袭击。,"The premise, however, is that the coalition fo...","However, the premise is that the multinational..."
4,“在此之前，我和前男友住在骑士桥的一个更大的房子里，”乔安妮说道。,"""before that, my ex and I lived in a bigger ho...","""Before this, I was living with my ex in Knigh..."


As we can see, in each corpus most of the 10 most frequent words are stopwords. 
These words carry no semantic meaning and will not help us. 

###### Initial Preprocessing

In [17]:
# Stop-word sets used by the cleaning functions
english_stopwords = set(stopwords.words('english'))
finnish_stopwords= set(stopwords.words('finnish'))
chinese_stopwords= set(swordsiso('zh'))

exclude = set(string.punctuation)

# Notebook-level helpers looked up by name inside clean() / clean_finlandes()
lemma = WordNetLemmatizer()
# `snowball_stemmer` was referenced by the cleaning functions but never
# defined, so calling them with stemmer=True raised NameError.
# NOTE(review): English is assumed here; confirm before stemming Finnish text.
snowball_stemmer = SnowballStemmer('english')

In [18]:
csen

0

In [19]:
#To update the reference and translation in corpora 0
# cs-en: reference and translation are English, so the English stop words
# apply (the Finnish set was passed before and removed nothing useful).
updates_0_reference = clean(corpora[csen]["reference"], english_stopwords, lemmatize = True, stemmer = False)
updates_0_translation = clean(corpora[csen]["translation"], english_stopwords, lemmatize = True, stemmer = False)

In [20]:
#Write the cleaned cs-en sentences back into their DataFrame
update_df(dataframe=corpora[csen], list_updated=updates_0_reference, column1='reference')
update_df(dataframe=corpora[csen], list_updated=updates_0_translation, column1='translation')

In [21]:
deen

1

In [22]:
#Clean the English reference and translation of the de-en corpus (corpora 1)
updates_1_reference, updates_1_translation = (
    clean(corpora[deen][column], english_stopwords, lemmatize=True, stemmer=False)
    for column in ("reference", "translation")
)

In [23]:
#Write the cleaned de-en sentences back into their DataFrame
update_df(dataframe=corpora[deen], list_updated=updates_1_reference, column1='reference')
update_df(dataframe=corpora[deen], list_updated=updates_1_translation, column1='translation')

In [24]:
enfi

2

In [25]:
#To update the reference and translation in corpora 2
# en-fi: reference and translation are Finnish, so hand over the Finnish
# stop words rather than the English ones.
updates_2_reference = clean_finlandes(corpora[enfi]["reference"], finnish_stopwords, lemmatize = True, stemmer = False)
updates_2_translation = clean_finlandes(corpora[enfi]["translation"], finnish_stopwords, lemmatize = True, stemmer = False)

In [26]:
#Write the cleaned en-fi sentences back into their DataFrame
update_df(dataframe=corpora[enfi], list_updated=updates_2_reference, column1='reference')
update_df(dataframe=corpora[enfi], list_updated=updates_2_translation, column1='translation')

In [27]:
enzh

3

In [28]:
#Clean the Chinese reference and translation of the en-zh corpus (corpora 3)
updates_3_reference, updates_3_translation = (
    clean_chinese(corpora[enzh][column], chinese_stopwords)
    for column in ("reference", "translation")
)

In [29]:
#Write the cleaned en-zh sentences back into their DataFrame
update_df(dataframe=corpora[enzh], list_updated=updates_3_reference, column1='reference')
update_df(dataframe=corpora[enzh], list_updated=updates_3_translation, column1='translation')

In [30]:
ruen

4

In [31]:
#Clean the English reference and translation of the ru-en corpus (corpora 4)
updates_4_reference, updates_4_translation = (
    clean(corpora[ruen][column], english_stopwords, lemmatize=True, stemmer=False)
    for column in ("reference", "translation")
)

In [32]:
#Write the cleaned ru-en sentences back into their DataFrame
update_df(dataframe=corpora[ruen], list_updated=updates_4_reference, column1='reference')
update_df(dataframe=corpora[ruen], list_updated=updates_4_translation, column1='translation')

In [33]:
zhen

5

In [34]:
#To update the reference and translation in corpora 5
# zh-en: only the source is Chinese; reference and translation are English,
# so they go through the same English pipeline as the other *-en corpora
# (the Chinese cleaner ignored the stop words and did not lowercase).
updates_5_reference = clean(corpora[zhen]["reference"], english_stopwords, lemmatize = True, stemmer = False)
updates_5_translation = clean(corpora[zhen]["translation"], english_stopwords, lemmatize = True, stemmer = False)

In [35]:
#Write the cleaned zh-en sentences back into their DataFrame
update_df(dataframe=corpora[zhen], list_updated=updates_5_reference, column1='reference')
update_df(dataframe=corpora[zhen], list_updated=updates_5_translation, column1='translation')

### Evaluation Methods


#### BERT-Score

In [36]:
from bert_score import score

In [37]:
#preparation
#transformers: only let ERROR-level records through for these three sub-modules
for _module in (transformers.tokenization_utils,
                transformers.configuration_utils,
                transformers.modeling_utils):
    _module.logger.setLevel(logging.ERROR)

In [None]:
# BERT-Score precision, recall and F1 per corpus (one result per language pair)
BSP=[]
BSR=[]
BSF1=[]
for i in range(len(corpus)):
    # lang is the target language: the last two letters of the pair name, e.g. 'cs-en' -> 'en'
    Ptemp, Rtemp, F1temp = score(list(corpora[i].translation), list(corpora[i].reference),\
                                 lang=corpus[i][-2:], verbose=True)
    # Keep all three results: the following cell indexes BSP[i] and BSR[i],
    # which stayed empty while these appends were commented out.
    BSP.append(Ptemp)
    BSR.append(Rtemp)
    BSF1.append(F1temp)
    print(i)

Downloading:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

calculating scores...
computing bert embedding.


  0%|          | 0/179 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/137 [00:00<?, ?it/s]

done in 1729.13 seconds, 5.05 sentences/sec
0
calculating scores...
computing bert embedding.


  0%|          | 0/429 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/444 [00:00<?, ?it/s]



done in 3743.27 seconds, 7.59 sentences/sec
1


Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/714M [00:00<?, ?B/s]

calculating scores...
computing bert embedding.


  0%|          | 0/161 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/127 [00:00<?, ?it/s]

done in 1631.71 seconds, 4.96 sentences/sec
2


Downloading:   0%|          | 0.00/624 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/110k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/269k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/412M [00:00<?, ?B/s]

calculating scores...
computing bert embedding.


  0%|          | 0/404 [00:00<?, ?it/s]

In [None]:
# Append the BERT-Score precision / recall / F1 values to every corpus and
# relabel all columns.
# `a` is only another name for the same list object, not a copy.
a = corpora
# Requires BSP, BSR and BSF1 to each hold one result per corpus; `.numpy()` is
# called on every element. join='inner' keeps only rows whose index exists in
# all concatenated frames.
# NOTE(review): the 21 labels assigned below presume that each frame already
# carries 18 columns (including BLEU, ROUGE and WMDistance) before the three
# BERT-Score columns are added — confirm against the frames loaded at this point.
for i in range(len(corpora)):
    corpora[i]=pd.concat([corpora[i],pd.DataFrame(BSP[i].numpy()),pd.DataFrame(BSR[i].numpy()),\
                          pd.DataFrame(BSF1[i].numpy())], axis=1, join='inner')
    corpora[i].columns=['index',"source","reference","translation","z-score","avg-score","annotators","BLEU",\
                        "ROUGE-1 F1-Score","ROUGE-1 Precision","ROUGE-1 Recall","ROUGE-2 F1-Score","ROUGE-2 Precision",\
                        "ROUGE-2 Recall","ROUGE-L F1-Score","ROUGE-L Precision","ROUGE-L Recall",\
                        "WMDistance","BERT-Score Precision","BERT-Score Recall","BERT-Score F1-Score"]

In [None]:
#create a backup-point for corpora
# Make sure the target folder exists; to_excel does not create missing directories
os.makedirs('bckcorpora', exist_ok=True)
for i in range(len(corpus)):
    corpora[i].to_excel('bckcorpora/'+corpus[i]+'.xlsx')

#### BLEURT

Refer to https://github.com/google-research/bleurt for bleurt installation

<p>Once it is installed, locate scorer.py in the package folder and cast the tf.constant calls in the predict method to int64:</p>
<p>def predict(self, input_dict):<br>&nbsp;&nbsp;&nbsp;predictions = self._bleurt_model_ops(input_ids=tf.constant(input_dict["input_ids"]<b>,dtype='int64'</b>),<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;input_mask=tf.constant(input_dict["input_mask"]<b>,dtype='int64'</b>),<br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;segment_ids=tf.constant(input_dict["segment_ids"]<b>,dtype='int64'</b>))["predictions"].numpy()<br>&nbsp;&nbsp;&nbsp;return predictions</p>


In [None]:
from bleurt import score

In [None]:
ls

In [None]:
# Restore the corpora from the backup files written to 'bckcorpora/' so this
# section can start from the saved state. The previous version reset `corpora`
# to an empty list, which made every later `corpora[i]` lookup fail.
backup_paths = [os.path.join('bckcorpora', pair + '.xlsx') for pair in corpus]
if all(os.path.isfile(path) for path in backup_paths):
    # index_col=0: the first sheet column is the row index written by to_excel
    corpora = [pd.read_excel(path, index_col=0) for path in backup_paths]
elif 'corpora' not in globals():
    # Nothing in memory and no backup on disk: keep the old "empty" outcome
    corpora = []

print('\033[1m',len(corpora),'\033[0mcorpora imported.')

In [None]:
len(corpora)

In [None]:
#create a backup-point for corpora
#for i in range(len(corpus)):
#    corpora[i].read_excel('bckcorpora/'+corpus[i]+'.xlsx')

In [None]:
corpora[1]

In [None]:
#Download the BLEURT base checkpoint, unless it is already unpacked
checkpoint_zip = 'bleurt-base-128.zip'
if not(os.path.isfile('bleurt-base-128/vocab.txt')):
    # Test for the local archive: os.path.isfile() on the URL is always False,
    # so the archive used to be downloaded again even when it was already on disk.
    if not(os.path.isfile(checkpoint_zip)):
        urllib.request.urlretrieve("https://storage.googleapis.com/bleurt-oss/bleurt-base-128.zip",\
                                   checkpoint_zip)
    with ZipFile(checkpoint_zip, 'r') as zipObj:
        zipObj.extractall()

In [None]:

checkpoint = "bleurt-base-128"
bleurtscores=[]
# Build the scorer once: it only depends on the checkpoint, so recreating it
# for every corpus repeated the same construction on each iteration.
scorer = score.BleurtScorer(checkpoint)
for i in tqdm(range(len(corpus))):
    ref=corpora[i].reference.to_list()
    can=corpora[i].translation.to_list()
    # One BLEURT score per (reference, candidate) pair
    scores=scorer.score(references=ref, candidates=can)
    bleurtscores.append(scores)
bleurtscores[0]

#### COMET

In [None]:
# `download_model` is only present as a commented-out import at the top of the
# notebook, so bring it into scope here; otherwise this cell raises NameError.
from comet.models import download_model

# Each call takes a model name and a target folder
model = download_model("wmt-large-da-estimator-1719", "comet/")
model1 = download_model("wmt-base-da-estimator-1719", "comet1/")
model2 = download_model("wmt-large-hter-estimator", "comet2/")
model3 = download_model("wmt-base-hter-estimator", "comet3/")
model4 = download_model("emnlp-base-da-ranker", "comet4/")
model5 = download_model("wmt-large-qe-estimator-1719", "comet5/")

In [None]:
# Average COMET score of `model` for every corpus
cometscores0=[]
for i in (range(len(corpus))):
    data = {"src": corpora[i].source, "mt": corpora[i].translation, "ref": corpora[i].reference}
    # Re-shape the three columns into one {"src", "mt", "ref"} dict per segment
    data = [dict(zip(data, t)) for t in zip(*data.values())]

    # Single inference pass (predict used to be called twice, the first result being thrown away)
    _, sgm_scores = model.predict(data, cuda=True, show_progress=True)
    corpus_score = sum(sgm_scores)/len(sgm_scores)

    cometscores0.append(corpus_score)


In [None]:
# The list filled by the previous cell is `cometscores0`; `cometscores` is never defined
cometscores0[0]

In [None]:
# Append the values stored in `cometscores0` to each corpus as an extra column
# (join='inner' keeps only rows whose index exists in both frames).
# NOTE(review): `cometscores0[i]` is filled with a per-corpus average
# (sum/len of the segment scores), so `.numpy()` is presumably not available
# on it and the value is not per-segment — confirm whether the segment scores
# were meant to be stored instead.
for i in range(len(corpus)):
    corpora[i]=pd.concat([corpora[i],pd.DataFrame(cometscores0[i].numpy())], axis=1, join='inner')

In [None]:
# Average COMET score of `model1` for every corpus
cometscores1=[]
for i in (range(len(corpus))):
    data = {"src": corpora[i].source, "mt": corpora[i].translation, "ref": corpora[i].reference}
    # Re-shape the three columns into one {"src", "mt", "ref"} dict per segment
    data = [dict(zip(data, t)) for t in zip(*data.values())]

    # Single inference pass (predict used to be called twice, the first result being thrown away)
    _, sgm_scores = model1.predict(data, cuda=True, show_progress=True)
    corpus_score = sum(sgm_scores)/len(sgm_scores)

    cometscores1.append(corpus_score)

In [None]:
# Append the values stored in `cometscores1` to each corpus as an extra column
# (join='inner' keeps only rows whose index exists in both frames).
# NOTE(review): `cometscores1[i]` is filled with a per-corpus average
# (sum/len of the segment scores), so `.numpy()` is presumably not available
# on it and the value is not per-segment — confirm whether the segment scores
# were meant to be stored instead.
for i in range(len(corpus)):
    corpora[i]=pd.concat([corpora[i],pd.DataFrame(cometscores1[i].numpy())], axis=1, join='inner')


In [None]:
# Average COMET score of `model2` for every corpus
cometscores2=[]
for i in (range(len(corpus))):
    data = {"src": corpora[i].source, "mt": corpora[i].translation, "ref": corpora[i].reference}
    # Re-shape the three columns into one {"src", "mt", "ref"} dict per segment
    data = [dict(zip(data, t)) for t in zip(*data.values())]

    # Single inference pass (predict used to be called twice, the first result being thrown away)
    _, sgm_scores = model2.predict(data, cuda=True, show_progress=True)
    corpus_score = sum(sgm_scores)/len(sgm_scores)

    cometscores2.append(corpus_score)

In [None]:
# Append the values stored in `cometscores2` to each corpus as an extra column
# (join='inner' keeps only rows whose index exists in both frames).
# NOTE(review): `cometscores2[i]` is filled with a per-corpus average
# (sum/len of the segment scores), so `.numpy()` is presumably not available
# on it and the value is not per-segment — confirm whether the segment scores
# were meant to be stored instead.
for i in range(len(corpus)):
    corpora[i]=pd.concat([corpora[i],pd.DataFrame(cometscores2[i].numpy())], axis=1, join='inner')


In [None]:
# Average COMET score of `model3` for every corpus
cometscores3=[]
for i in (range(len(corpus))):
    data = {"src": corpora[i].source, "mt": corpora[i].translation, "ref": corpora[i].reference}
    # Re-shape the three columns into one {"src", "mt", "ref"} dict per segment
    data = [dict(zip(data, t)) for t in zip(*data.values())]

    # Single inference pass (predict used to be called twice, the first result being thrown away)
    _, sgm_scores = model3.predict(data, cuda=True, show_progress=True)
    corpus_score = sum(sgm_scores)/len(sgm_scores)

    cometscores3.append(corpus_score)

In [None]:
# Append the values stored in `cometscores3` to each corpus as an extra column
# (join='inner' keeps only rows whose index exists in both frames).
# NOTE(review): `cometscores3[i]` is filled with a per-corpus average
# (sum/len of the segment scores), so `.numpy()` is presumably not available
# on it and the value is not per-segment — confirm whether the segment scores
# were meant to be stored instead.
for i in range(len(corpus)):
    corpora[i]=pd.concat([corpora[i],pd.DataFrame(cometscores3[i].numpy())], axis=1, join='inner')


In [None]:
# Average COMET score of `model4` for every corpus
cometscores4=[]
for i in (range(len(corpus))):
    data = {"src": corpora[i].source, "mt": corpora[i].translation, "ref": corpora[i].reference}
    # Re-shape the three columns into one {"src", "mt", "ref"} dict per segment
    data = [dict(zip(data, t)) for t in zip(*data.values())]

    # Single inference pass (predict used to be called twice, the first result being thrown away)
    _, sgm_scores = model4.predict(data, cuda=True, show_progress=True)
    corpus_score = sum(sgm_scores)/len(sgm_scores)

    cometscores4.append(corpus_score)

In [None]:
# Append the values stored in `cometscores4` to each corpus as an extra column
# (join='inner' keeps only rows whose index exists in both frames).
# NOTE(review): `cometscores4[i]` is filled with a per-corpus average
# (sum/len of the segment scores), so `.numpy()` is presumably not available
# on it and the value is not per-segment — confirm whether the segment scores
# were meant to be stored instead.
for i in range(len(corpus)):
    corpora[i]=pd.concat([corpora[i],pd.DataFrame(cometscores4[i].numpy())], axis=1, join='inner')


In [None]:
# Average COMET score of `model5` for every corpus
cometscores5=[]
for i in (range(len(corpus))):
    data = {"src": corpora[i].source, "mt": corpora[i].translation, "ref": corpora[i].reference}
    # Re-shape the three columns into one {"src", "mt", "ref"} dict per segment
    data = [dict(zip(data, t)) for t in zip(*data.values())]

    # Single inference pass (predict used to be called twice, the first result being thrown away)
    _, sgm_scores = model5.predict(data, cuda=True, show_progress=True)
    corpus_score = sum(sgm_scores)/len(sgm_scores)

    cometscores5.append(corpus_score)

In [None]:
# Append the values stored in `cometscores5` to each corpus as an extra column
# (join='inner' keeps only rows whose index exists in both frames).
# NOTE(review): `cometscores5[i]` is filled with a per-corpus average
# (sum/len of the segment scores), so `.numpy()` is presumably not available
# on it and the value is not per-segment — confirm whether the segment scores
# were meant to be stored instead.
for i in range(len(corpus)):
    corpora[i]=pd.concat([corpora[i],pd.DataFrame(cometscores5[i].numpy())], axis=1, join='inner')


In [None]:
#Correlation between the WMDistance column and the human Z-Score, per language pair
correl=[]
KendallT=[]
KendallP=[]
for i in range(len(corpus)):
    #Pearson correlation
    correl.append(round(corpora[i]['WMDistance'].corr(corpora[i]['z-score']),2))
    #Kendall rank correlation: tau statistic and its p-value
    T,P=stats.kendalltau(corpora[i]['WMDistance'], corpora[i]['z-score'])
    KendallT.append(round(T,2))
    KendallP.append(round(P,2))
#One row per language pair; building the frame from a dict keeps the values
#numeric (going through np.array turned every cell into a string)
corrWMDistance=pd.DataFrame({'Corpus': corpus,
                             'WMDistance (Pearson)': correl,
                             'WMDistance (Kendall Tau-T)': KendallT,
                             'WMDistance (Kendall Tau-P)': KendallP})
#calculate the mean of the correlations in the entire corpora
avgPearson=corrWMDistance['WMDistance (Pearson)'].astype(float).mean()
avgTau=corrWMDistance['WMDistance (Kendall Tau-T)'].astype(float).mean()
avgP=corrWMDistance['WMDistance (Kendall Tau-P)'].astype(float).mean()
#DataFrame.append was removed in pandas 2.0; concatenate the summary row instead
average_row=pd.DataFrame([{'Corpus':'Average','WMDistance (Pearson)':round(avgPearson,2),
                           'WMDistance (Kendall Tau-T)':round(avgTau,2),
                           'WMDistance (Kendall Tau-P)':round(avgP,2)}])
corrWMDistance=pd.concat([corrWMDistance, average_row], ignore_index=True)
corrWMDistance.set_index('Corpus', inplace=True)
corrWMDistance

In [None]:
#Correlation between the WMDistance column and the human Z-Score, per language pair
correl=[]
KendallT=[]
KendallP=[]
for i in range(len(corpus)):
    #Pearson correlation
    correl.append(round(corpora[i]['WMDistance'].corr(corpora[i]['z-score']),2))
    #Kendall rank correlation: tau statistic and its p-value
    T,P=stats.kendalltau(corpora[i]['WMDistance'], corpora[i]['z-score'])
    KendallT.append(round(T,2))
    KendallP.append(round(P,2))
#One row per language pair; building the frame from a dict keeps the values
#numeric (going through np.array turned every cell into a string)
corrWMDistance=pd.DataFrame({'Corpus': corpus,
                             'WMDistance (Pearson)': correl,
                             'WMDistance (Kendall Tau-T)': KendallT,
                             'WMDistance (Kendall Tau-P)': KendallP})
#calculate the mean of the correlations in the entire corpora
avgPearson=corrWMDistance['WMDistance (Pearson)'].astype(float).mean()
avgTau=corrWMDistance['WMDistance (Kendall Tau-T)'].astype(float).mean()
avgP=corrWMDistance['WMDistance (Kendall Tau-P)'].astype(float).mean()
#DataFrame.append was removed in pandas 2.0; concatenate the summary row instead
average_row=pd.DataFrame([{'Corpus':'Average','WMDistance (Pearson)':round(avgPearson,2),
                           'WMDistance (Kendall Tau-T)':round(avgTau,2),
                           'WMDistance (Kendall Tau-P)':round(avgP,2)}])
corrWMDistance=pd.concat([corrWMDistance, average_row], ignore_index=True)
corrWMDistance.set_index('Corpus', inplace=True)
corrWMDistance

In [None]:
#Correlation between the WMDistance column and the human Z-Score, per language pair
correl=[]
KendallT=[]
KendallP=[]
for i in range(len(corpus)):
    #Pearson correlation
    correl.append(round(corpora[i]['WMDistance'].corr(corpora[i]['z-score']),2))
    #Kendall rank correlation: tau statistic and its p-value
    T,P=stats.kendalltau(corpora[i]['WMDistance'], corpora[i]['z-score'])
    KendallT.append(round(T,2))
    KendallP.append(round(P,2))
#One row per language pair; building the frame from a dict keeps the values
#numeric (going through np.array turned every cell into a string)
corrWMDistance=pd.DataFrame({'Corpus': corpus,
                             'WMDistance (Pearson)': correl,
                             'WMDistance (Kendall Tau-T)': KendallT,
                             'WMDistance (Kendall Tau-P)': KendallP})
#calculate the mean of the correlations in the entire corpora
avgPearson=corrWMDistance['WMDistance (Pearson)'].astype(float).mean()
avgTau=corrWMDistance['WMDistance (Kendall Tau-T)'].astype(float).mean()
avgP=corrWMDistance['WMDistance (Kendall Tau-P)'].astype(float).mean()
#DataFrame.append was removed in pandas 2.0; concatenate the summary row instead
average_row=pd.DataFrame([{'Corpus':'Average','WMDistance (Pearson)':round(avgPearson,2),
                           'WMDistance (Kendall Tau-T)':round(avgTau,2),
                           'WMDistance (Kendall Tau-P)':round(avgP,2)}])
corrWMDistance=pd.concat([corrWMDistance, average_row], ignore_index=True)
corrWMDistance.set_index('Corpus', inplace=True)
corrWMDistance

In [None]:
#Correlation between the WMDistance column and the human Z-Score, per language pair
correl=[]
KendallT=[]
KendallP=[]
for i in range(len(corpus)):
    #Pearson correlation
    correl.append(round(corpora[i]['WMDistance'].corr(corpora[i]['z-score']),2))
    #Kendall rank correlation: tau statistic and its p-value
    T,P=stats.kendalltau(corpora[i]['WMDistance'], corpora[i]['z-score'])
    KendallT.append(round(T,2))
    KendallP.append(round(P,2))
#One row per language pair; building the frame from a dict keeps the values
#numeric (going through np.array turned every cell into a string)
corrWMDistance=pd.DataFrame({'Corpus': corpus,
                             'WMDistance (Pearson)': correl,
                             'WMDistance (Kendall Tau-T)': KendallT,
                             'WMDistance (Kendall Tau-P)': KendallP})
#calculate the mean of the correlations in the entire corpora
avgPearson=corrWMDistance['WMDistance (Pearson)'].astype(float).mean()
avgTau=corrWMDistance['WMDistance (Kendall Tau-T)'].astype(float).mean()
avgP=corrWMDistance['WMDistance (Kendall Tau-P)'].astype(float).mean()
#DataFrame.append was removed in pandas 2.0; concatenate the summary row instead
average_row=pd.DataFrame([{'Corpus':'Average','WMDistance (Pearson)':round(avgPearson,2),
                           'WMDistance (Kendall Tau-T)':round(avgTau,2),
                           'WMDistance (Kendall Tau-P)':round(avgP,2)}])
corrWMDistance=pd.concat([corrWMDistance, average_row], ignore_index=True)
corrWMDistance.set_index('Corpus', inplace=True)
corrWMDistance

In [None]:
#Correlation between the WMDistance column and the human Z-Score, per language pair
correl=[]
KendallT=[]
KendallP=[]
for i in range(len(corpus)):
    #Pearson correlation
    correl.append(round(corpora[i]['WMDistance'].corr(corpora[i]['z-score']),2))
    #Kendall rank correlation: tau statistic and its p-value
    T,P=stats.kendalltau(corpora[i]['WMDistance'], corpora[i]['z-score'])
    KendallT.append(round(T,2))
    KendallP.append(round(P,2))
#One row per language pair; building the frame from a dict keeps the values
#numeric (going through np.array turned every cell into a string)
corrWMDistance=pd.DataFrame({'Corpus': corpus,
                             'WMDistance (Pearson)': correl,
                             'WMDistance (Kendall Tau-T)': KendallT,
                             'WMDistance (Kendall Tau-P)': KendallP})
#calculate the mean of the correlations in the entire corpora
avgPearson=corrWMDistance['WMDistance (Pearson)'].astype(float).mean()
avgTau=corrWMDistance['WMDistance (Kendall Tau-T)'].astype(float).mean()
avgP=corrWMDistance['WMDistance (Kendall Tau-P)'].astype(float).mean()
#DataFrame.append was removed in pandas 2.0; concatenate the summary row instead
average_row=pd.DataFrame([{'Corpus':'Average','WMDistance (Pearson)':round(avgPearson,2),
                           'WMDistance (Kendall Tau-T)':round(avgTau,2),
                           'WMDistance (Kendall Tau-P)':round(avgP,2)}])
corrWMDistance=pd.concat([corrWMDistance, average_row], ignore_index=True)
corrWMDistance.set_index('Corpus', inplace=True)
corrWMDistance

#### METEOR

In [None]:
from meteor import meteor_score

In [None]:
round(meteor_score(['this is a cat'], 'non matching hypothesis'),4) 

In [None]:
import tensorflow as tf
import pickle
import numpy as np
from numpy.random import multinomial, shuffle
import sys
sys.path.append('..')
from pprint import pprint
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.meteor.meteor import Meteor

In [None]:
def score(ref, hypo):
    """
    Compute METEOR and CIDEr for a whole set of sentence pairs.

    ref, references: either a dictionary (id -> sentence or list of sentences)
         or a sequence of sentences (list, pandas Series, ...)
    hypo, hypotheses in the same form, with the same ids / the same order
    score, dictionary mapping the metric name to its value
    """
    def _as_caption_dict(sentences):
        # Normalise the input to {id: [sentence, ...]}; sequences are keyed by position
        if isinstance(sentences, dict):
            items = sentences.items()
        else:
            items = enumerate(sentences)
        return {key: (list(value) if isinstance(value, (list, tuple)) else [str(value)])
                for key, value in items}

    gts = _as_caption_dict(ref)
    res = _as_caption_dict(hypo)

    scorers = [
        (Meteor(),"METEOR"),
        (Cider(), "CIDEr")
    ]
    final_scores = {}
    for scorer, method in scorers:
        # Hand over the complete collections: `ref[0]` / `hypo[0]` only passed
        # the first entry, so the metric never saw the full corpus.
        # The local result is named `value` so it no longer shadows this function.
        value, _segment_values = scorer.compute_score(gts, res)
        if isinstance(value, list):
            # A scorer that reports several values is paired with several method names
            for m, s in zip(method, value):
                final_scores[m] = s
        else:
            final_scores[method] = value
    return final_scores

In [None]:
# METEOR / CIDEr for every language pair, keyed by the pair name. The dict is
# the cell output; with six bare calls only the last result was displayed and
# the other five were discarded.
meteor_cider_scores = {pair: score(frame.reference, frame.translation)
                       for pair, frame in zip(corpus, corpora)}
meteor_cider_scores