<div class="alert alert-block alert-warning">
    
# TM Project - Test Set Preparation<a id='title'></a><br>
    
</div>
    


In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
import string 
from tqdm.notebook import tqdm
from nltk.stem.snowball import SnowballStemmer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics.pairwise import euclidean_distances, manhattan_distances, cosine_similarity
from sklearn.metrics import jaccard_score

In [2]:
# Load the scores.csv file of each test-set language pair into its own dataframe.
en_fi = pd.read_csv("corpus/testset/en-fi/scores.csv")
cs_en = pd.read_csv("corpus/testset/cs-en/scores.csv")
en_zh = pd.read_csv("corpus/testset/en-zh/scores.csv")
ru_en = pd.read_csv("corpus/testset/ru-en/scores.csv")
zh_en = pd.read_csv("corpus/testset/zh-en/scores.csv")
de_en = pd.read_csv("corpus/testset/de-en/scores.csv")

<div class="alert alert-block alert-warning">

<b>Preprocessing</b>

</div>

In [4]:
# For translations into English and Finnish (cs-en, ru-en, zh-en, de-en, en-fi)

def preprocessing(dataframe, column, toEnglish=True, stemming=False, stopwrd=False):
    """Normalise the text in one dataframe column and return it as a list of strings.

    Every value is lower-cased, split on whitespace and stripped of the ASCII
    punctuation characters in ``string.punctuation``. Stop-word removal and
    stemming are optional and both disabled by default.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the text to clean.
    column : str
        Name of the column to process.
    toEnglish : bool, default True
        Language used for stop words and stemming: English when True,
        Finnish otherwise. Has no effect unless ``stemming`` or ``stopwrd`` is set.
    stemming : bool, default False
        Apply the Snowball stemmer of the selected language to every token.
    stopwrd : bool, default False
        Drop the NLTK stop words of the selected language.

    Returns
    -------
    list of str
        One cleaned string per row, in row order (tokens joined by one space).

    Notes
    -----
    * Missing values (NaN / None) yield an empty string instead of raising
      ``AttributeError`` on ``.lower()``.
    * A token made only of punctuation becomes an empty string but is kept,
      so the output can contain consecutive spaces.
    """
    language = 'english' if toEnglish else 'finnish'

    # Loop-invariant resources are built once instead of once per row / per word.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words(language)) if stopwrd else None
    stemmer = SnowballStemmer(language) if stemming else None

    processed_corpus = []
    # Materialise the column a single time; rebuilding the list on every
    # iteration made the loop quadratic in the number of rows.
    for text in tqdm(dataframe[column].tolist()):
        if pd.isna(text):
            processed_corpus.append("")
            continue

        # Lower-case, tokenise on whitespace, then strip punctuation per token.
        words = [word.translate(strip_punctuation) for word in text.lower().split()]

        if stop_words is not None:
            words = [word for word in words if word not in stop_words]

        if stemmer is not None:
            words = [stemmer.stem(word) for word in words]

        processed_corpus.append(" ".join(words))
    return processed_corpus

In [5]:
# For translations into Chinese (en-zh)

# !pip install jieba
import jieba

def preprocessing_chinese(dataframe, column, punc=None):
    """Strip punctuation from a column of Chinese text and segment it with jieba.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the text to clean.
    column : str
        Name of the column to process.
    punc : str, optional
        Characters to delete before segmentation. When omitted, ASCII
        punctuation plus a set of common full-width / CJK punctuation marks is
        used. Pass an empty string to skip punctuation removal.
        NOTE(review): the function used to read a notebook-level ``punc``
        variable that is not defined in the visible cells; pass that value
        here if the exact original character set is required.

    Returns
    -------
    list of str
        One string per row, in row order, with the jieba segments joined by a
        single space. Missing values (NaN / None) yield an empty string.
    """
    if punc is None:
        punc = string.punctuation + "，。！？；：、（）《》〈〉【】「」『』“”‘’…—～·"

    # re.escape keeps characters such as ']', '\\', '^' and '-' literal inside
    # the character class; an empty class would be an invalid pattern.
    strip_punctuation = re.compile("[%s]+" % re.escape(punc)) if punc else None

    processed_corpus = []
    # Iterate over the values themselves so the result does not depend on the
    # frame having a default 0..n-1 index.
    for value in tqdm(dataframe[column].tolist()):
        text = "" if pd.isna(value) else str(value)

        if strip_punctuation is not None:
            text = strip_punctuation.sub("", text)

        processed_corpus.append(" ".join(jieba.cut(text)))
    return processed_corpus

In [17]:
# Count the missing values in every column of the ru-en corpus.
ru_en.isna().sum() # --> row 9191

source               0
reference            1
translation          0
clean_translation    0
dtype: int64

In [41]:
# Replace the null reference (row 9191) with the placeholder word 'na'.
# .loc with a boolean mask writes into ru_en itself and is safe to re-run; the
# earlier chained form `ru_en.reference[9191] = 'na'` assigns through an
# intermediate Series and is not guaranteed to update the frame.
ru_en.loc[ru_en['reference'].isna(), 'reference'] = 'na'

In [20]:
# Clean the corpora whose translations are in English or Finnish.
# Only the defaults are used: lower-casing and punctuation removal.
for corpus in (cs_en, ru_en, zh_en, de_en, en_fi):
    corpus['clean_translation'] = preprocessing(corpus, 'translation')
    corpus['clean_reference'] = preprocessing(corpus, 'reference')

HBox(children=(FloatProgress(value=0.0, max=13157.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=13157.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=25352.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=25352.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=28404.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=28404.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8097.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8097.0), HTML(value='')))




In [21]:
# Applying the preprocessing to the corpus with Chinese translations
# NOTE(review): both calls use the generic `preprocessing` (lower-casing,
# whitespace split, ASCII punctuation removal); `preprocessing_chinese` is not
# called in the visible cells - confirm this is intended.

en_zh['clean_reference']= preprocessing(en_zh, 'reference')
en_zh['clean_translation']= preprocessing(en_zh, 'translation')

HBox(children=(FloatProgress(value=0.0, max=22128.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=22128.0), HTML(value='')))




In [43]:
# Replace empty references with the word 'na' --> some empty records were found when applying TER.
# The results are written with plain column assignment / .loc so they land in
# the dataframes themselves; `inplace=True` on an attribute-accessed Series and
# chained `frame.column[label] = ...` are not guaranteed to update the frame.
ru_en['clean_reference'] = ru_en['clean_reference'].replace({'': 'na'})

zh_en.loc[14515, 'clean_reference'] = 'na'

en_zh.loc[4406, 'clean_reference'] = 'na'

<div class="alert alert-block alert-warning">

<b>BLEU</b>

</div>

In [26]:
from nltk.translate.bleu_score import sentence_bleu

In [27]:
# Applying BLEU for all language pairs
# Each row is scored against its single reference and stored in the 'bleu' column.
# NOTE(review): 'clean_reference' / 'clean_translation' are passed as plain
# strings, while NLTK documents token lists for sentence_bleu; presumably the
# n-grams are therefore counted over characters rather than words - confirm
# this is intended before changing it, since it alters the feature values.

for i in [en_fi, cs_en, ru_en, zh_en, de_en, en_zh]:
    i['bleu'] = i.apply(lambda row: sentence_bleu([row['clean_reference']],row['clean_translation']), axis=1)

<div class="alert alert-block alert-warning">

<b>TER</b>

</div>

In [28]:
import pyter

In [29]:
def ter_sc(df, ref, trans):
    """Compute a TER score for every row of ``df``.

    ``ref`` and ``trans`` are the names of the reference and translation
    columns. Both texts are split on whitespace, because ``pyter.ter`` works on
    token lists, and the translation is passed first. Returns the scores as a
    list in row order.
    """
    hypotheses = df[trans].to_list()
    references = df[ref].to_list()

    return [
        pyter.ter(hypotheses[row].split(), references[row].split())
        for row in tqdm(range(len(hypotheses)))
    ]

In [58]:
# Score every language pair with TER and keep the result in the 'ter_1' column
for corpus in (en_fi, cs_en, ru_en, zh_en, de_en, en_zh):
    corpus['ter_1'] = ter_sc(corpus, 'clean_reference', 'clean_translation')

HBox(children=(FloatProgress(value=0.0, max=22128.0), HTML(value='')))




<div class="alert alert-block alert-warning">

<b>CHRF</b>

</div>

In [60]:
from nltk.translate.chrf_score import sentence_chrf

In [61]:
# Score every language pair with chrF, row by row, into the 'chrf' column
for corpus in (en_fi, cs_en, ru_en, zh_en, de_en, en_zh):
    corpus['chrf'] = corpus.apply(
        lambda row: sentence_chrf([row['clean_reference']], row['clean_translation']),
        axis=1,
    )

<div class="alert alert-block alert-warning">

<b>METEOR</b>

</div>

In [62]:
from nltk.translate import meteor_score

In [63]:
def meteor_sc(df, ref, trans):
    """Compute a METEOR score for every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the reference and translation texts.
    ref : str
        Name of the column with the reference texts.
    trans : str
        Name of the column with the translations (hypotheses) to score.

    Returns
    -------
    list
        One score per row, in row order.

    Notes
    -----
    The reference is passed first and the translation second, which is the
    order ``single_meteor_score(reference, hypothesis)`` expects. The two
    columns used to be swapped, so the translation was scored as if it were
    the reference.
    NOTE(review): scores computed with the swapped version (e.g. for a
    training set) are not comparable and presumably need recomputing.
    NOTE(review): the texts are passed as plain strings, as before; newer NLTK
    releases may require pre-tokenized input - confirm against the installed
    version.
    """
    references = df[ref].to_list()
    hypotheses = df[trans].to_list()

    return [
        meteor_score.single_meteor_score(reference, hypothesis)
        for reference, hypothesis in tqdm(zip(references, hypotheses), total=len(references))
    ]

In [64]:
# Score every language pair with METEOR and keep the result in the 'meteor' column
for corpus in (en_fi, cs_en, ru_en, zh_en, de_en, en_zh):
    corpus['meteor'] = meteor_sc(corpus, "clean_reference", "clean_translation")

HBox(children=(FloatProgress(value=0.0, max=8097.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8732.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=13157.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=25352.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=28404.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=22128.0), HTML(value='')))




<div class="alert alert-block alert-warning">

<b>GLEU</b>

</div>

In [65]:
import nltk.translate.gleu_score as gleu

In [66]:
# Applying GLEU for all language pairs
# Each row is scored against its single reference and stored in the 'gleu' column.
# NOTE(review): the cleaned columns are plain strings, while NLTK documents
# token lists for sentence_gleu; presumably the n-grams are therefore counted
# over characters rather than words - confirm this is intended.
for i in [en_fi, cs_en, ru_en, zh_en, de_en, en_zh]:
    i['gleu'] = i.apply(lambda row: gleu.sentence_gleu([row['clean_reference']],row['clean_translation']), axis=1)

Save the test corpora, together with their MT metric scores, to CSV files that are used to feed the ensemble model.

In [67]:
# Write each test corpus to its own CSV file in the working directory.
for corpus, csv_path in (
    (en_fi, 'en_fi_test.csv'),
    (cs_en, 'cs_en_test.csv'),
    (ru_en, 'ru_en_test.csv'),
    (zh_en, 'zh_en_test.csv'),
    (en_zh, 'en_zh_test.csv'),
    (de_en, 'de_en_test.csv'),
):
    corpus.to_csv(csv_path)