<div align="center">
    <h1>MDSAA TEXT MINING 2020-2021</h1>
    <h2>MACHINE TRANSLATION METRICS</h2>
    <p style="text-align:center">David Sotto-Mayor Machado (m20201023@novaims.unl.pt), Maikel Sousa (m20200735@novaims.unl.pt), Catarina Moreira (m20201034@novaims.unl.pt)</p>
</div>

<h2>Import Libraries</h2>

In [1]:
import os, sys, re, time, gzip, zlib, logging, transformers, urllib.request, shutil
import pandas as pd
import numpy as np
from collections import Counter
from rouge import Rouge
from tqdm.notebook import tqdm
from nltk.translate.bleu_score import sentence_bleu
from nltk.corpus import stopwords
from nltk import download
from stopwordsiso import stopwords as swordsiso
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import matplotlib.pyplot as plt
from matplotlib import rcParams
from bert_score import score
import scipy.stats as stats

In [2]:
#ignore warnings
import warnings
warnings.filterwarnings("ignore")

<h2>Import Corpora</h2>

In [3]:
#Import all the corpus csv files
folder='corpus'
corpus = os.listdir(folder)
filename = 'scores.csv'
corpora=[]
for sub in corpus:
    corpora.append(pd.read_csv('{}/{}/{}'.format(folder, sub, filename)))
print(len(corpora),'corpora imported')

6 corpora imported


In [4]:
#drop rows with missing translations or references
for i in range(len(corpus)):
    corpora[i].drop(corpora[i][corpora[i]["reference"] == "."].index, inplace=True)
    corpora[i].drop(corpora[i][corpora[i]["translation"] == "."].index, inplace=True)
    corpora[i].reset_index(drop=True, inplace=True)

In [5]:
#list the different translation language pairs
corpus

['cs-en', 'de-en', 'en-fi', 'en-zh', 'ru-en', 'zh-en']

In [6]:
#Verify one of the corpus
corpora[0].head(5)

Unnamed: 0,source,reference,translation,z-score,avg-score,annotators
0,Uchopíte pak zbraň mezi své předloktí a rameno...,You will then grab the weapon between your for...,You then grasp the gun between your forearm an...,-0.675383,60.0,3
1,"Ale je-li New York změna, pak je to také znovu...","But if New York is changed, then it's also a r...","But if New York is change, it is also reinvent...",-0.829403,44.0,2
2,"Dlouho a intenzivně jsem během léta přemýšlel,...",I have been thinking over and over again over ...,I have thought long and hard over the course o...,0.803185,96.5,2
3,"Najdou si jiný způsob, jak někde podvádět.",They find another way to cheat somewhere.,They will find another way how to defraud others.,0.563149,90.5,2
4,Zpráva o výměně v čele prezidentovy administra...,The report on the replacement of the president...,The news of the replacement at the top of the ...,0.021549,74.666667,3


<h2>BLEU - Bi-Lingual Evaluation Understudy</h2>

In [7]:
#Calculate BLEU Score for the entire Corpora 
for i in tqdm(range(len(corpus))):
    #Temp Series to store the BLEU-4 scores (initiated as 0 and type float64)
    bleus=np.zeros(corpora[i].shape[0],'float64')
    for j in tqdm(range(corpora[i].shape[0])):
        #The human translation
        reference=corpora[i].loc[j].reference.lower().split()
        #The automatic translation bein evaluated
        translation=corpora[i].loc[j].translation.lower().split()
        #Calculate the BLEU-4 (cumulative 4-gram BLEU score)
        bleus[j]=sentence_bleu(translation,reference, weights=(0.25, 0.25, 0.25, 0.25))
    corpora[i]['BLEU']=bleus
#Examine BLEU-4 Score for one language pair
corpora[1].head(5)

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/11585 [00:00<?, ?it/s]

  0%|          | 0/21704 [00:00<?, ?it/s]

  0%|          | 0/6748 [00:00<?, ?it/s]

  0%|          | 0/10221 [00:00<?, ?it/s]

  0%|          | 0/17977 [00:00<?, ?it/s]

  0%|          | 0/26418 [00:00<?, ?it/s]

Unnamed: 0,source,reference,translation,z-score,avg-score,annotators,BLEU
0,"Ihr Zeitlupentempo maßen sie, als sie vor Spit...",Her timeless pace measures them when they equi...,Their slow speed was measured by researchers o...,-0.345024,76.0,1,9.257325e-232
1,"Er sagte, dass die Bereiche ruhige Treffpunkte...",He said the areas offer quiet meeting points b...,He said the spaces provided calm meeting point...,0.9038,97.5,2,0.0
2,Für die Geschäftsleute an der B 27 ist es nur ...,"For businessmen at the B 27, it's only a small...",This is only a small consolation for businesse...,0.700503,94.0,1,1.000369e-231
3,Diese Fähigkeit sei möglicherweise angeboren o...,This ability may be born or developed with gen...,"This ability may be innate, or may develop as ...",-1.256572,51.5,2,0.0
4,Weil sie Wassertemperaturen um die sechs Grad ...,Because they prefer water temperatures around ...,They generally only come to the surface in win...,0.293909,87.0,2,0.0


In [8]:
#Empty list to keep correlations of BLEU and Z-Score
correl=[]
KendallT=[]
KendallP=[]
#Clalculate the correlation of columns BLEU and Z-Score on all corpora
for i in range(len(corpus)):
    correl.append(corpora[i].BLEU.corr(corpora[i]['z-score']))
    T,P=stats.kendalltau(corpora[i].BLEU, corpora[i]['z-score'])
    KendallT.append(T)
    KendallP.append(P)
#Create a DataFrame with the correlation calculated for each language pair
corrbleu=pd.DataFrame(np.array([corpus,correl,KendallT,KendallP])).T
#Rename the columns
corrbleu.columns = ['Corpus', 'BLEU (Pearson)','BLEU (Kendalltau-T)','BLEU (Kendalltau-P)']
#calculate the mean of the correlations in the entire corpora
avgPearson=corrbleu['BLEU (Pearson)'].astype(float).mean()
avgTau=corrbleu['BLEU (Kendalltau-T)'].astype(float).mean()
avgP=corrbleu['BLEU (Kendalltau-P)'].astype(float).mean()
corrbleu=corrbleu.append({'Corpus':'Average','BLEU (Pearson)':round(avgPearson,2),\
                         'BLEU (Kendalltau-T)':round(avgTau,2),'BLEU (Kendalltau-P)':round(avgP,2)}, ignore_index=True)
corrbleu.set_index('Corpus', inplace=True)
corrbleu

Unnamed: 0_level_0,BLEU (Pearson),BLEU (Kendalltau-T),BLEU (Kendalltau-P)
Corpus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cs-en,-0.0066457238550517,0.0144545412978836,0.0359871612890432
de-en,0.0032126147207286,0.0104924703021975,0.0365549951383828
en-fi,0.0283750214071662,-0.014068149871644,0.1516126785955457
en-zh,-0.0156250656277276,0.0066910966166923,0.3969954273363127
ru-en,-0.0095122160724398,0.0094435724226507,0.0904995361265225
zh-en,0.0012337911742889,-0.0353156016465032,3.3419059744954125e-15
Average,0.0,-0.0,0.12


<h2>ROUGE-N</h2>

In [9]:
#Empty list to store Rouge-N scores
rgn=[]
#Instantiate Rouge
rouge = Rouge()
#Get the Rouge-N scores for each language pair
for i in tqdm(range(len(corpus))):
    rgn.append(pd.DataFrame(rouge.get_scores(corpora[4].translation, corpora[4].reference)))
#validate output
rgn[3].head(5)

  0%|          | 0/6 [00:00<?, ?it/s]

Unnamed: 0,rouge-1,rouge-2,rouge-l
0,"{'f': 0.5185185135253774, 'p': 0.5384615384615...","{'f': 0.23999999500800012, 'p': 0.25, 'r': 0.2...","{'f': 0.4615384565384616, 'p': 0.4615384615384..."
1,"{'f': 0.586206891646849, 'p': 0.68, 'r': 0.515...","{'f': 0.2857142808163266, 'p': 0.3333333333333...","{'f': 0.45283018372374517, 'p': 0.5, 'r': 0.41..."
2,"{'f': 0.4999999950347222, 'p': 0.4615384615384...","{'f': 0.27272726776859507, 'p': 0.25, 'r': 0.3}","{'f': 0.4999999950347222, 'p': 0.4615384615384..."
3,"{'f': 0.49122806517697754, 'p': 0.482758620689...","{'f': 0.25454544954710745, 'p': 0.25, 'r': 0.2...","{'f': 0.4905660327376291, 'p': 0.4814814814814..."
4,"{'f': 0.33333332847222225, 'p': 0.4, 'r': 0.28...","{'f': 0.1999999952000001, 'p': 0.25, 'r': 0.16...","{'f': 0.33333332847222225, 'p': 0.4, 'r': 0.28..."


In [10]:
#Empty lists to store the dataframes with Rouge-1, Rouge-2, Rouge-L scores split into columns
rg1=[]
rg2=[]
rgl=[]
#For each column of rgn store the 'exlosion' of its dictionay in a new dataframe
for i in tqdm(range(len(corpus))):
    #'exploding' dictionaries in a temp dataframe to preserve the other colmns
    temp=rgn[i]
    rg1.append(pd.json_normalize(temp['rouge-1']))
    rg1[i].columns=["ROUGE-1 F1-Score","ROUGE-1 Precision","ROUGE-1 Recall"]
    temp=rgn[i]
    rg2.append(pd.json_normalize(temp['rouge-2']))
    rg2[i].columns=["ROUGE-2 F1-Score","ROUGE-2 Precision","ROUGE-2 Recall"]
    temp=rgn[i]
    rgl.append(pd.json_normalize(temp['rouge-l']))
    rgl[i].columns=["ROUGE-L F1-Score","ROUGE-L Precision","ROUGE-L Recall"]
rg1[0].head(5)

  0%|          | 0/6 [00:00<?, ?it/s]

Unnamed: 0,ROUGE-1 F1-Score,ROUGE-1 Precision,ROUGE-1 Recall
0,0.518519,0.538462,0.5
1,0.586207,0.68,0.515152
2,0.5,0.461538,0.545455
3,0.491228,0.482759,0.5
4,0.333333,0.4,0.285714


In [11]:
for i in tqdm(range(len(corpus))):
    corpora[i]=pd.concat([corpora[i],rg1[i],rg2[i],rgl[i]], axis=1, join='inner')
corpora[3].head(5)

  0%|          | 0/6 [00:00<?, ?it/s]

Unnamed: 0,source,reference,translation,z-score,avg-score,annotators,BLEU,ROUGE-1 F1-Score,ROUGE-1 Precision,ROUGE-1 Recall,ROUGE-2 F1-Score,ROUGE-2 Precision,ROUGE-2 Recall,ROUGE-L F1-Score,ROUGE-L Precision,ROUGE-L Recall
0,"""In the GISS model's simulation, Venus' slow s...",GSIS的科学家AnthonyDelGenio在新闻稿中解释说：“在GISS模型的模拟模型中...,戈达德太空研究所科学家安东尼·德尔·杰尼奥在新闻发布会上解释说：“在戈达德太空研究所的模型模...,-1.171867,50.0,1,0.0,0.518519,0.538462,0.5,0.24,0.25,0.230769,0.461538,0.461538,0.461538
1,Ai Yanhan of China in the Women's 4 x 200m Fre...,中国在英国女性4x200mFreestreyWTE中的最后被称为：“中国14岁的孩子从球下降...,参加女子4x200米自由泳接力赛决赛的中国小将艾衍含被这样描述：“那名14岁的中国小姑娘犯了...,-2.255403,26.5,2,0.0,0.586207,0.68,0.515152,0.285714,0.333333,0.25,0.45283,0.5,0.413793
2,"Then came 2012, when nothing much went right f...",然后来到2012年，当她和她的队友们没有什么好处。,2012年，她和她的队友都不被看好。,-2.508996,21.0,1,0.0,0.5,0.461538,0.545455,0.272727,0.25,0.3,0.5,0.461538,0.545455
3,"Since last year, Guodian Group has exported a ...",自去年以来，GoudianGroup从南非通过南非港口出口了163套风力发电项目。,自去年以来，国电集团共计有163套风电项目陆续从连云港港出口南非。,-2.41678,23.0,1,0.0,0.491228,0.482759,0.5,0.254545,0.25,0.259259,0.490566,0.481481,0.5
4,"Some alleged that the Kempinski hotel simply ""...","一些人指称，Kempinski旅馆只是""被捕""，以满足阿拉伯客户的要求。",有人认为凯宾斯基酒店简直是为了满足阿拉伯客户的要求而“卑躬屈膝”。,-1.489676,45.0,7,0.0,0.333333,0.4,0.285714,0.2,0.25,0.166667,0.333333,0.4,0.285714


In [12]:
#Empty list to keep correlations of BLEU and Z-Score
correl1=[]
correl2=[]
correl3=[]
correl4=[]
correl5=[]
correl6=[]
correl7=[]
correl8=[]
correl9=[]
#Clalculate the correlation of columns ROUGE... and Z-Score on all corpora
for i in range(len(corpus)):
    correl1.append(corpora[i]['ROUGE-1 F1-Score'].corr(corpora[i]['z-score']))
    correl1[i]=round(correl1[i],2)
    correl2.append(corpora[i]['ROUGE-1 Precision'].corr(corpora[i]['z-score']))
    correl2[i]=round(correl2[i],2)
    correl3.append(corpora[i]['ROUGE-1 Recall'].corr(corpora[i]['z-score']))
    correl3[i]=round(correl3[i],2)
    correl4.append(corpora[i]['ROUGE-2 F1-Score'].corr(corpora[i]['z-score']))
    correl4[i]=round(correl4[i],2)
    correl5.append(corpora[i]['ROUGE-2 Precision'].corr(corpora[i]['z-score']))
    correl5[i]=round(correl5[i],2)
    correl6.append(corpora[i]['ROUGE-2 Recall'].corr(corpora[i]['z-score']))
    correl6[i]=round(correl6[i],2)
    correl7.append(corpora[i]['ROUGE-L F1-Score'].corr(corpora[i]['z-score']))
    correl7[i]=round(correl7[i],2)
    correl8.append(corpora[i]['ROUGE-L Precision'].corr(corpora[i]['z-score']))
    correl8[i]=round(correl8[i],2)
    correl9.append(corpora[i]['ROUGE-L Recall'].corr(corpora[i]['z-score']))
    correl9[i]=round(correl9[i],2)
#Create a DataFrame with the correlation calculated for each language pair
corrouge=pd.DataFrame(np.array([corpus,correl1,correl2,correl3,correl4,correl5,correl6,correl7,correl8,correl9])).T
#Rename the columns
corrouge.columns = ['Corpus', 'ROUGE-1 F1-Score', 'ROUGE-1 Precision', 'ROUGE-1 Recall', 'ROUGE-2 F1-Score',\
                    'ROUGE-2 Precision', 'ROUGE-2 Recall', 'ROUGE-L F1-Score', 'ROUGE-L Precision', 'ROUGE-L Recall',]
#calculate the mean of the correlations in the entire corpora
avg1=corrouge['ROUGE-1 F1-Score'].astype(float).mean()
avg2=corrouge['ROUGE-1 Precision'].astype(float).mean()
avg3=corrouge['ROUGE-1 Recall'].astype(float).mean()
avg4=corrouge['ROUGE-2 F1-Score'].astype(float).mean()
avg5=corrouge['ROUGE-2 Precision'].astype(float).mean()
avg6=corrouge['ROUGE-2 Recall'].astype(float).mean()
avg7=corrouge['ROUGE-L F1-Score'].astype(float).mean()
avg8=corrouge['ROUGE-L Precision'].astype(float).mean()
avg9=corrouge['ROUGE-L Recall'].astype(float).mean()
corrouge=corrouge.append({'Corpus':'Average','ROUGE-1 F1-Score':round(avg1,2),'ROUGE-1 Precision':round(avg2,2),\
                          'ROUGE-1 Recall':round(avg3,2),'ROUGE-2 F1-Score':round(avg4,2),'ROUGE-2 Precision':round(avg5,2),\
                          'ROUGE-2 Recall':round(avg6,2),'ROUGE-L F1-Score':round(avg7,2),'ROUGE-L Precision':round(avg8,2),\
                          'ROUGE-L Recall':round(avg9,2),}, ignore_index=True)
corrouge.set_index('Corpus', inplace=True)
corrouge

Unnamed: 0_level_0,ROUGE-1 F1-Score,ROUGE-1 Precision,ROUGE-1 Recall,ROUGE-2 F1-Score,ROUGE-2 Precision,ROUGE-2 Recall,ROUGE-L F1-Score,ROUGE-L Precision,ROUGE-L Recall
Corpus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
cs-en,0.01,0.01,0.01,0.02,0.02,0.02,0.01,0.01,0.02
de-en,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
en-fi,0.02,0.03,0.01,0.03,0.03,0.02,0.03,0.03,0.03
en-zh,-0.0,-0.01,0.0,-0.01,-0.01,-0.0,-0.01,-0.01,-0.0
ru-en,0.33,0.33,0.3,0.3,0.31,0.29,0.34,0.34,0.31
zh-en,0.03,0.03,0.03,0.03,0.03,0.03,0.04,0.04,0.04
Average,0.07,0.07,0.06,0.06,0.06,0.06,0.07,0.07,0.07


<h2>WMD - World Movers Distance</h2>

In [13]:
download('stopwords')
#Get the english and finnish stopwords from nltk.corpus.stopwords
english_stopwords=stopwords.words('english')
finnish_stopwords=stopwords.words('finnish')
#Get the chinese stopwords from stopwordsiso
chinese_stopwords=swordsiso('zh')
#join all stopwords
all_stopwords=english_stopwords
all_stopwords.extend(finnish_stopwords)
all_stopwords.extend(chinese_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
%%time
if not(os.path.isfile('GoogleNews-vectors-negative300.bin')):
    if not(os.path.isfile('GoogleNews-vectors-negative300.bin.gz')):
        urllib.request.urlretrieve("https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz",\
                                   "GoogleNews-vectors-negative300.bin.gz")
    with gzip.open('GoogleNews-vectors-negative300.bin.gz', 'rb') as f_in:
        with open('GoogleNews-vectors-negative300.bin', 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
model = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True)

Wall time: 1min 13s


In [15]:
#create a copy of the corpora, remove stopwords, set lowercase and calculate the distance
corp=corpora
for i in tqdm(range(len(corpus))):
    results=np.zeros(corpora[i].shape[0],'float64')
    corp[i]['reference'].apply(lambda x : [word for word in x.lower().split() if word not in all_stopwords])
    corp[i]['translation'].apply(lambda x : [word for word in x.lower().split() if word not in all_stopwords])
    for j in tqdm(range(corp[i].shape[0])):
        results[i]=model.wmdistance(corp[i].loc[j].reference,corp[i].loc[j].translation)
    corpora[i]['WMDistance']=results
corpora[0].head(5)

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/11585 [00:00<?, ?it/s]

  0%|          | 0/17977 [00:00<?, ?it/s]

  0%|          | 0/6748 [00:00<?, ?it/s]

  0%|          | 0/10221 [00:00<?, ?it/s]

  0%|          | 0/17977 [00:00<?, ?it/s]

  0%|          | 0/17977 [00:00<?, ?it/s]

Unnamed: 0,source,reference,translation,z-score,avg-score,annotators,BLEU,ROUGE-1 F1-Score,ROUGE-1 Precision,ROUGE-1 Recall,ROUGE-2 F1-Score,ROUGE-2 Precision,ROUGE-2 Recall,ROUGE-L F1-Score,ROUGE-L Precision,ROUGE-L Recall,WMDistance
0,Uchopíte pak zbraň mezi své předloktí a rameno...,You will then grab the weapon between your for...,You then grasp the gun between your forearm an...,-0.675383,60.0,3,8.3191e-232,0.518519,0.538462,0.5,0.24,0.25,0.230769,0.461538,0.461538,0.461538,0.108759
1,"Ale je-li New York změna, pak je to také znovu...","But if New York is changed, then it's also a r...","But if New York is change, it is also reinvent...",-0.829403,44.0,2,9.134374999999999e-232,0.586207,0.68,0.515152,0.285714,0.333333,0.25,0.45283,0.5,0.413793,0.0
2,"Dlouho a intenzivně jsem během léta přemýšlel,...",I have been thinking over and over again over ...,I have thought long and hard over the course o...,0.803185,96.5,2,9.336117999999999e-232,0.5,0.461538,0.545455,0.272727,0.25,0.3,0.5,0.461538,0.545455,0.0
3,"Najdou si jiný způsob, jak někde podvádět.",They find another way to cheat somewhere.,They will find another way how to defraud others.,0.563149,90.5,2,0.0,0.491228,0.482759,0.5,0.254545,0.25,0.259259,0.490566,0.481481,0.5,0.0
4,Zpráva o výměně v čele prezidentovy administra...,The report on the replacement of the president...,The news of the replacement at the top of the ...,0.021549,74.666667,3,0.0,0.333333,0.4,0.285714,0.2,0.25,0.166667,0.333333,0.4,0.285714,0.0


In [16]:
#Empty list to keep correlations of WMDistance and Z-Score
correl=[]
#Clalculate the correlation of columns WMDistance and Z-Score on all corpora
for i in range(len(corpus)):
    correl.append(corpora[i].WMDistance.corr(corpora[i]['z-score']))
    correl[i]=round(correl[i],2)
#Create a DataFrame with the correlation calculated for each language pair
corrWMDistance=pd.DataFrame(np.array([corpus,correl])).T
#Rename the columns
corrWMDistance.columns = ['Corpus', 'WMDistance']
#calculate the mean of the correlations in the entire corpora
avg=corrWMDistance['WMDistance'].astype(float).mean()
corrWMDistance=corrWMDistance.append({'Corpus':'Average','WMDistance':round(avg,2)}, ignore_index=True)
corrWMDistance.set_index('Corpus', inplace=True)
corrWMDistance

Unnamed: 0_level_0,WMDistance
Corpus,Unnamed: 1_level_1
cs-en,-0.01
de-en,0.01
en-fi,0.02
en-zh,-0.03
ru-en,0.01
zh-en,0.0
Average,0.0


<h2>BERT-Score</h2>

In [17]:
#preparation
#transformers
transformers.tokenization_utils.logger.setLevel(logging.ERROR)
transformers.configuration_utils.logger.setLevel(logging.ERROR)
transformers.modeling_utils.logger.setLevel(logging.ERROR)
#Visualization options
rcParams["xtick.major.size"] = 0
rcParams["xtick.minor.size"] = 0
rcParams["ytick.major.size"] = 0
rcParams["ytick.minor.size"] = 0
rcParams["axes.labelsize"] = "large"
rcParams["axes.axisbelow"] = True
rcParams["axes.grid"] = True

In [18]:
BSP=[]
BSR=[]
BSF1=[]
for i in tqdm(range(len(corpus))):
    Ptemp, Rtemp, F1temp = score(corpora[i].translation, corpora[i].reference, lang=corpus[i][-2:], verbose=True)
    BSP.append(Ptemp)
    BSR.append(Rtemp)
    BSF1.append(F1temp)
for i in range(len(corpus)):
    corpora[i]=pd.concat([corpora[i],BSP[i],BSR[i],BSF1[i]], axis=1, join='inner')
corpora[4].head(5)

  0%|          | 0/6 [00:00<?, ?it/s]

calculating scores...
computing bert embedding.


  0%|          | 0/178 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/182 [00:00<?, ?it/s]

KeyError: 'You will then grab the weapon between your forearm and shoulder just before you hit it in your face with a free elbow.'