<div class="alert alert-block alert-warning">
    
# Text Mining Project<a id='title'></a></b><br>
    
</div>

<div class="alert alert-block alert-warning">

1 - [<font color='#000000'> Preprocessing</font>](#pro) <br>
2 - [<font color='#000000'> Simpler metrics</font>](#simple)<br>
3 - [<font color='#000000'> More complex metrics</font>](#da)<br> 
4 - [<font color='#000000'> Training and Development Split</font>](#trdv)<br> 
5 - [<font color='#000000'> Models with bag of words</font>](#bow)<br> 
6 - [<font color='#000000'> Models with word embeddings</font>](#embed)<br> 
7 - [<font color='#000000'> Combining metrics</font>](#comb)<br> 
8 - [<font color='#000000'> Correlations (Pearson and Kendall)</font>](#corr)<br> 
9 - [<font color='#000000'> Applying it to test set</font>](#test)<br> 
</div>

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
import string 
from tqdm.notebook import tqdm
from nltk.stem.snowball import SnowballStemmer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics.pairwise import euclidean_distances, manhattan_distances, cosine_similarity
from sklearn.metrics import jaccard_score

In [4]:
en_fi, cs_en, en_zh, ru_en, zh_en, de_en = pd.read_csv("corpus/en-fi/scores.csv"), pd.read_csv("corpus/cs-en/scores.csv"), pd.read_csv("corpus/en-zh/scores.csv"), pd.read_csv("corpus/ru-en/scores.csv"), pd.read_csv("corpus/zh-en/scores.csv"), pd.read_csv("corpus/de-en/scores.csv")

In [5]:
de_en.head()

Unnamed: 0,source,reference,translation,z-score,avg-score,annotators
0,"Ihr Zeitlupentempo maßen sie, als sie vor Spit...",Her timeless pace measures them when they equi...,Their slow speed was measured by researchers o...,-0.345024,76.0,1
1,"Er sagte, dass die Bereiche ruhige Treffpunkte...",He said the areas offer quiet meeting points b...,He said the spaces provided calm meeting point...,0.9038,97.5,2
2,Für die Geschäftsleute an der B 27 ist es nur ...,"For businessmen at the B 27, it's only a small...",This is only a small consolation for businesse...,0.700503,94.0,1
3,Diese Fähigkeit sei möglicherweise angeboren o...,This ability may be born or developed with gen...,"This ability may be innate, or may develop as ...",-1.256572,51.5,2
4,Weil sie Wassertemperaturen um die sechs Grad ...,Because they prefer water temperatures around ...,They generally only come to the surface in win...,0.293909,87.0,2


<div class="alert alert-block alert-warning">

## 1. Preprocessing</font> <a class="anchor" id="pro"></a>

  [Back to introduction](#title)
  
  </div>

In [7]:
# For translations for English and Finnish (cs-en, ru-en, zh-en, de-en, en-fi)

def preprocessing(dataframe, column, toEnglish=True, stemming=False, stopwrd=False):
# The default is not removing stopwords or stemming, just lowercasing and removing punctuation
 
    processed_corpus = []
    for i in tqdm(range(len(dataframe))):
        text = list(dataframe[column])[i]
        
        # Convert to lowercase
        text = text.lower()
        
        # Convert to list from string
        text = text.split()
                
        # Remove punctuation
        text = [word.translate(str.maketrans('', '', string.punctuation)) for word in text]
        
        # Remove stopwords
        if stopwrd:
            # Default is English
            if toEnglish:
                stop_en = stopwords.words('english') 
                text = [word for word in text if word not in stop_en]
            else:
                stop_fi = stopwords.words('finnish') 
                text = [word for word in text if word not in stop_fi]

        # Stemming
        if stemming:
            # Default is English            
            if toEnglish:
                stem_en = SnowballStemmer('english')
                text = [stem_en.stem(word) for word in text]
            else:
                stem_fi = SnowballStemmer('finnish')
                text = [stem_fi.stem(word) for word in text]
        
        text = " ".join(text)        
        processed_corpus.append(text)
    return processed_corpus

In [8]:
# For translations for Chinese (en-zh)

# !pip install jieba
import jieba

def preprocessing_chinese(dataframe, column):
    
    processed_corpus = []
    no_punc=dataframe[column].str.replace(r"[%s]+"%punc, "").astype(str)
    
    
    for i in tqdm(range(len(no_punc))):
        text = no_punc[i]
        
        text=[word for word in jieba.cut(text)]
        
        text = " ".join(text)   
        
        processed_corpus.append(text)
    return processed_corpus

In [9]:
# Applying the preprocessing to the corpora with English translations

for i in [cs_en, ru_en, zh_en, de_en]:
    i['clean_translation_1'] = preprocessing(i, 'translation') # no punctuation and all in lower case
    i['clean_reference_1']= preprocessing(i, 'reference')
    
    i['clean_translation_2'] = preprocessing(i, 'translation', stemming=True, stopwrd=False) # without punctuation, stopwords and with stemming
    i['clean_reference_2']= preprocessing(i, 'reference', stemming=True, stopwrd=False)
    
# Applying the preprocessing to the corpus with Finnish translations

en_fi['clean_translation_1'] = preprocessing(en_fi, 'translation')
en_fi['clean_reference_1']= preprocessing(en_fi, 'reference')

en_fi['clean_translation_2'] = preprocessing(en_fi, 'translation', stemming=True, stopwrd=False, toEnglish=False)
en_fi['clean_reference_2']= preprocessing(en_fi, 'reference', stemming=True, stopwrd=False, toEnglish=False)

# Applying the preprocessing to the corpus with Chinese translations

en_zh['clean_reference']= preprocessing(en_zh, 'reference')
en_zh['clean_translation']= preprocessing(en_zh, 'translation')

HBox(children=(FloatProgress(value=0.0, max=11585.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11585.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11585.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11585.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=17980.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=17980.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=17980.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=17980.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26419.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26419.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26419.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26419.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=21704.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=21704.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=21704.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=21704.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6748.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6748.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6748.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6748.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10221.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10221.0), HTML(value='')))




In [10]:
# Delete rows that have an empty reference
for i in [en_fi, cs_en, ru_en, zh_en, de_en]:
    i = i[~(i.clean_reference_1 == '')]
    i = i[~(i.clean_reference_2 == '')]
    i = i[~(i.clean_translation_2 == '')]
    i = i[~(i.clean_translation_1 == '')]

In [None]:
zh_en.isna().sum() # run for all languages to see if any reference/translation is empty

In [111]:
zh_en[zh_en.clean_reference_1.isnull()] # The reference is empty -- > error
zh_en = zh_en[~zh_en.clean_reference_1.isnull()] #--> removing those records

zh_en.iloc[24028:24029] #--> this line has an issue because it does not have a reference --> error (caught when running TER)
zh_en = zh_en[~((zh_en.clean_reference_1=='   ')|(zh_en.clean_reference_1=='     '))] #--> removing this record and all similar

ru_en[ru_en.clean_reference_1.isnull()] # The reference is empty -- > error
ru_en = ru_en[~ru_en.clean_reference_1.isnull()] #--> removing those records

Saving the preprocessed corpora into new csv's to facilitate their usage

In [11]:
# en_fi.to_csv('en_fi_clean.csv')
# cs_en.to_csv('cs_en_clean.csv')
# ru_en.to_csv('ru_en_clean.csv')
# zh_en.to_csv('zh_en_clean.csv')
# en_zh.to_csv('en_zh_clean.csv')
# de_en.to_csv('de_en_clean.csv')

In [21]:
de_en.head(2)

Unnamed: 0,source,reference,translation,z-score,avg-score,annotators,chrf,clean_translation_1,clean_reference_1,clean_translation_2,clean_reference_2
0,"Ihr Zeitlupentempo maßen sie, als sie vor Spit...",Her timeless pace measures them when they equi...,Their slow speed was measured by researchers o...,-0.345024,76.0,1,0.333695,their slow speed was measured by researchers o...,her timeless pace measures them when they equi...,slow speed measur research svalbard fit six an...,timeless pace measur equip six anim broadcast ...
1,"Er sagte, dass die Bereiche ruhige Treffpunkte...",He said the areas offer quiet meeting points b...,He said the spaces provided calm meeting point...,0.9038,97.5,2,0.740196,he said the spaces provided calm meeting point...,he said the areas offer quiet meeting points b...,said space provid calm meet point refuge volunt,said area offer quiet meet point refuge volunt


<div class="alert alert-block alert-warning">

## 2. Simpler Metrics</font> <a class="anchor" id="simple"></a>

  [Back to introduction](#title)
  
  </div>

<div class="alert alert-block alert-warning">
    
### 2.1. Distance Metrics to Measure Sentence Similarity
**Euclidean Distance, Manhattan Distance, Cosine Similarity & Jaccard Similarity**

https://github.com/makcedward/nlp/blob/master/sample/nlp-3_basic_distance_measurement_in_text_mining.ipynb

https://scikit-learn.org/stable/modules/classes.html#pairwise-metrics
    
</div>

In [6]:
# Calculating the Jaccard Similarity

def calculate_position(values):
    x = []
    for pos, matrix in enumerate(values):
        if matrix > 0:
            x.append(pos)
    return x

def padding(sentence1, sentence2):
    x1 = sentence1.copy()
    x2 = sentence2.copy()
    
    diff = len(x1) - len(x2)
    
    if diff > 0:
        for i in range(0, diff):
            x2.append(-1)
    elif diff < 0:
        for i in range(0, abs(diff)):
            x1.append(-1)
    
    return x1, x2

In [7]:
def transform_and_calculate(data, translation, reference, measurement=euclidean_distances, avg_type='micro'):
    translations = data[translation].to_list()
    references = data[reference].to_list()
    
    final_results = []

    for i in tqdm(range(len(translations))):
        text = translations[i] + ' ' + references[i]

        tokens_on_each_sentence = [translations[i].split()] + [references[i].split()]

        tokens = text.split()

        label_enc = LabelEncoder()
        onehot_enc = OneHotEncoder()

        encoded_all_tokens = label_enc.fit_transform(list(set(tokens)))
        encoded_all_tokens = encoded_all_tokens.reshape(len(encoded_all_tokens), 1)

        onehot_enc.fit(encoded_all_tokens)

        results = []

        for a in tokens_on_each_sentence:
            encoded_words = label_enc.transform(a)
            
            encoded_words = onehot_enc.transform(encoded_words.reshape(len(encoded_words), 1))

            results.append(np.sum(encoded_words.toarray(), axis=0))

        final_results.append(results)

    all_dists = []
        
    if measurement == jaccard_score:
        for i in tqdm(range(len(final_results))):
            y1, y2 = calculate_position(final_results[i][0]), calculate_position(final_results[i][1])

            x1, x2 = padding(y1, y2)

            dist = measurement(x1, x2, average=avg_type)

            all_dists.append(dist)
            
    else: # either euclidean_distances, manhattan_distances or cosine_similarity
        for i in tqdm(range(len(final_results))):
            dist = measurement([final_results[i][0]], [final_results[i][1]])[0][0]

            all_dists.append(dist)

    return all_dists

In [9]:
# Applying the 4 distance metrics to Preprocessing 1 for all language pairs except en-zh

for i in [en_fi, cs_en, ru_en, zh_en, de_en]:
    i['euclidean_dists_1'] = transform_and_calculate(i, 'clean_translation_1', 'clean_reference_1', euclidean_distances)
    i['manhattan_dists_1'] = transform_and_calculate(i, 'clean_translation_1', 'clean_reference_1', manhattan_distances)
    i['cosine_sim_1'] = transform_and_calculate(i, 'clean_translation_1', 'clean_reference_1', cosine_similarity)
    i['jaccard_sim_1'] = transform_and_calculate(i, 'clean_translation_1', 'clean_reference_1', jaccard_score)

HBox(children=(FloatProgress(value=0.0, max=17977.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=17977.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=17977.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=17977.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=17977.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=17977.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=17977.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=17977.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26416.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26416.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26416.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26416.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26416.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26416.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26416.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26416.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=21704.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=21704.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=21704.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=21704.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=21704.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=21704.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=21704.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=21704.0), HTML(value='')))




In [10]:
# Applying the 4 distance metrics to Preprocessing 2 for all language pairs except en-zh

for i in [en_fi, cs_en, ru_en, zh_en, de_en]:
    i['euclidean_dists_2'] = transform_and_calculate(i, 'clean_translation_2', 'clean_reference_2', euclidean_distances)
    i['manhattan_dists_2'] = transform_and_calculate(i, 'clean_translation_2', 'clean_reference_2', manhattan_distances)
    i['cosine_sim_2'] = transform_and_calculate(i, 'clean_translation_2', 'clean_reference_2', cosine_similarity)
    i['jaccard_sim_2'] = transform_and_calculate(i, 'clean_translation_2', 'clean_reference_2', jaccard_score)

HBox(children=(FloatProgress(value=0.0, max=6748.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6748.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6748.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6748.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6748.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6748.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6748.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6748.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11585.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11585.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11585.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11585.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11585.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11585.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11585.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11585.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=17977.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=17977.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=17977.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=17977.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=17977.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=17977.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=17977.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=17977.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26416.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26416.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26416.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26416.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26416.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26416.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26416.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26416.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=21704.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=21704.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=21704.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=21704.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=21704.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=21704.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=21704.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=21704.0), HTML(value='')))




In [11]:
# Applying the 4 distance metrics to the Chinese translation (en-zh)

en_zh['euclidean_dists'] = transform_and_calculate(en_zh, 'clean_translation', 'clean_reference', euclidean_distances)
en_zh['manhattan_dists'] = transform_and_calculate(en_zh, 'clean_translation', 'clean_reference', manhattan_distances)
en_zh['cosine_sim'] = transform_and_calculate(en_zh, 'clean_translation', 'clean_reference', cosine_similarity)
en_zh['jaccard_sim'] = transform_and_calculate(en_zh, 'clean_translation', 'clean_reference', jaccard_score)

HBox(children=(FloatProgress(value=0.0, max=10221.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10221.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10221.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10221.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10221.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10221.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10221.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10221.0), HTML(value='')))




<div class="alert alert-block alert-warning">

### 2.2. BLEU

(Bi-Lingual Evaluation Understudy) score was first proposed in 2002. The most widely used metric for MT evaluation, due to its presumed high correlation with human rankings of MT output.
    
</div>

In [109]:
#!pip install bleu
#!pip install --upgrade pip

In [13]:
from nltk.translate.bleu_score import sentence_bleu

In [14]:
# Applying BLEU for both Preprocessings for all language pairs except en-zh

for i in [en_fi, cs_en, ru_en, zh_en, de_en]:
    i['bleu_1'] = i.apply(lambda row: sentence_bleu([row['clean_reference_1']],row['clean_translation_1']), axis=1)
    i['bleu_2'] = i.apply(lambda row: sentence_bleu([row['clean_reference_2']],row['clean_translation_2']), axis=1)

In [15]:
# Applying BLEU for en-zh language pair

en_zh['bleu'] = en_zh.apply(lambda row: sentence_bleu([row['clean_reference']],row['clean_translation']), axis=1)

<div class="alert alert-block alert-warning">

### 2.3. ROUGE

ROUGE is the Recall-Oriented Understudy for Gisting Evaluation. Its main metrics are:
- ROUGE-N (N-gram)
- ROUGE-L (Longest Common Subsequence)
- ROUGE-S (Skip-gram concurrence metric)

It's derived from BLEU - focuses on recall rather than precision, so it looks at how many n-grams in the reference translation show up in the output, rather than the reverse.

**We'll use ROUGE-1 (with f1-score) and also ROUGE-L.**
We focus on the f1-score because it's a more reliable measure for our model performance, as it relies not only on the model capturing as many words as possible (recall) but doing so without outputting irrelevant words (precision).

Notes -> Sentence-level: Compute longest common subsequence (LCS) between two pieces of text. Newlines are ignored. This is called rougeL in this package.

https://github.com/pltrdy/rouge/blob/master/rouge/rouge.py

`pip install rouge`
    
</div>

In [32]:
from rouge import Rouge

def rouge_sc(df, ref, trans):
    
    reference = df[ref].to_list()
    translation = df[trans].to_list()
    
    rouge = Rouge(stats=["f"])

    rouge_results = rouge.get_scores(translation, reference, avg=False)

    rouge_1 = []
    rouge_l = []

    for i in tqdm(range(len(rouge_results))):
        rouge_1.append(rouge_results[i]['rouge-1']['f'])
        rouge_l.append(rouge_results[i]['rouge-l']['f'])

    return rouge_1, rouge_l

In [33]:
# Applying ROUGE-1 and ROUGE-L for both Preprocessings for all language pairs except en-zh

for i in [en_fi, cs_en, ru_en, zh_en, de_en]:
    i['rouge-1_1'], i['rouge-l_1'] = rouge_sc(i, "clean_reference_1", "clean_translation_1")
    i['rouge-1_2'], i['rouge-l_2'] = rouge_sc(i, "clean_reference_2", "clean_translation_2")
    
    
# Applying ROUGE-1 and ROUGE-L for en-zh language pair
    
en_zh['rouge-1'], en_zh['rouge-l'] = rouge_sc(en_zh, "clean_reference", "clean_translation")

HBox(children=(FloatProgress(value=0.0, max=17977.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=17977.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26418.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26418.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=21704.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=21704.0), HTML(value='')))




<div class="alert alert-block alert-warning">

### 2.4. TER
The Translation Error Rate, measures the number of edits (words deletion, addition and substitution) needed to change the original output translation into an acceptable human-level translation (match the closest reference translation in fluency and semantics).
It is generally preferred to BLEU for estimation of sentence post-editing effort.

TER = $\frac{E}{R}$ = (minimum number of edits) / (average length of reference text)

<b>Note:</b> TERp, or TER-plus, is an extension of TER that also considers paraphrases, stemming, and synonyms.

    
https://blog.taus.net/automated-mt-evaluation-metrics#:~:text=Translation%20Error%20Rate%20(TER)%20is,into%20a%20human%20translated%20reference

https://pypi.org/project/pyter3/#description
</div>

In [37]:
# !pip install pyter3
import pyter

In [38]:
def ter_sc(df, ref, trans):
    
    translation = df[trans].to_list()
    reference = df[ref].to_list()
    
    ter = []

    # This function requires the text to be split
    for i in tqdm(range(len(translation))):
        ter.append(pyter.ter(translation[i].split(), reference[i].split()))
        
    return ter

In [None]:
# Applying TER for both Preprocessings for all language pairs except en-zh

for i in [en_fi, cs_en, ru_en, zh_en, de_en]:
    i['ter_1'] = ter_sc(i, 'clean_reference_1', 'clean_translation_1')
    i['ter_2'] = ter_sc(i, 'clean_reference_2', 'clean_translation_2')

    
# Applying TER for en-zh language pair
    
en_zh['ter'] = ter_sc(en_zh, 'clean_reference', 'clean_translation')

<div class="alert alert-block alert-warning">

### 2.5. CHRF
(Character n-gram F-score) computes the precision, recall and fscore from the ngram overlaps. It returns the support which is the true positive score.

By underspecifying the input type, the function will be agnostic as to how it computes the ngrams and simply take the whichever element in the list; it could be either token or character.
    
</div>

In [18]:
from nltk.translate.chrf_score import sentence_chrf

In [19]:
# Applying CHRF for both Preprocessings for all language pairs except en-zh

for i in [en_fi, cs_en, ru_en, zh_en, de_en]:
    i['chrf_1'] = i.apply(lambda row: sentence_chrf([row['clean_reference_1']],row['clean_translation_1']), axis=1)
    i['chrf_2'] = i.apply(lambda row: sentence_chrf([row['clean_reference_2']],row['clean_translation_2']), axis=1)

    
# Applying CHRF for en-zh language pair
    
en_zh['chrf'] = en_zh.apply(lambda row: sentence_chrf([row['clean_reference']],row['clean_translation']), axis=1)

<div class="alert alert-block alert-warning">

## 3. More Complex Metrics</font> <a class="anchor" id="da"></a>

  [Back to introduction](#title)
  
</div>

<div class="alert alert-block alert-warning">

### 3.1. METEOR
METEOR (Metric for Evaluation of Translation with Explicit ORdering) is similar to BLEU, but more advanced.

The metric is based on the harmonic mean of unigram precision and recall, with recall weighted higher than precision. It also has several features that are not found in other metrics: considers synonyms and compares the stems of words, along with the standard exact word matching. 

The metric was designed to fix some of the problems found in the more popular BLEU metric, and also produce good correlation with human judgement at the sentence or segment level. This differs from the BLEU metric in that BLEU seeks correlation at the corpus level.
    
</div>

In [38]:
# import nltk
# nltk.download('wordnet')

In [24]:
from nltk.translate import meteor_score

In [25]:
def meteor_sc(df, ref, trans):
    
    translation = df[ref].to_list()
    reference = df[trans].to_list()
    
    meteor = []

    for i in tqdm(range(len(translation))):
        meteor.append(meteor_score.single_meteor_score(reference[i], translation[i]))

    return meteor

In [22]:
# Applying METEOR for both Preprocessings for all language pairs except en-zh

for i in [en_fi, cs_en, ru_en, zh_en, de_en]:
    i['meteor_1'] = meteor_sc(i, "clean_reference_1", "clean_translation_1")
    i['meteor_2'] = meteor_sc(i, "clean_reference_2", "clean_translation_2")

    
# Applying METEOR for en-zh language pair
    
en_zh['meteor'] = meteor_sc(en_zh, "clean_reference", "clean_translation")

HBox(children=(FloatProgress(value=0.0, max=6748.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6748.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11585.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11585.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=17980.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=17980.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26419.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26419.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=21704.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=21704.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10221.0), HTML(value='')))




<div class="alert alert-block alert-warning">

### 3.2. BERTscore
    
BERTScore is an automatic evaluation metric for text generation. It computes a similarity score for each token in the candidate sentence with each token in the reference sentence, but, instead of exact matches, it computes token similarity using contextual embeddings. 

BERTScore is more robust, correlates better with human judgments and provides stronger model selection performance than existing metrics.
    
</div>

In [3]:
# !pip install bert-score
from bert_score import score

In [4]:
def bert_sc(df, ref, trans, language):
    references = df[ref].to_list()
    translations = df[trans].to_list()

    prec_bert, recall_bert, f1_bert = score(cands=translations, refs=references, lang=language, verbose=True)
    
    return f1_bert

# Here it's not computationally feasible to run all files in a loop as for the other methods, so they're run one by one

In [6]:
# Applying BERT for the en-zh language pair

en_zh['bert'] = bert_sc(en_zh, "clean_reference", "clean_translation",'zh')

calculating scores...
computing bert embedding.


HBox(children=(FloatProgress(value=0.0, max=191.0), HTML(value='')))


computing greedy matching.


HBox(children=(FloatProgress(value=0.0, max=160.0), HTML(value='')))


done in 274.72 seconds, 37.21 sentences/sec


In [7]:
# Applying BERT for the en-fi language pair

en_fi['bert_1'] = bert_sc(en_fi, "clean_reference_1", "clean_translation_1", 'others')
en_fi['bert_2'] = bert_sc(en_fi, "clean_reference_2", "clean_translation_2", 'others')

calculating scores...
computing bert embedding.


HBox(children=(FloatProgress(value=0.0, max=148.0), HTML(value='')))


computing greedy matching.


HBox(children=(FloatProgress(value=0.0, max=106.0), HTML(value='')))


done in 131.52 seconds, 51.31 sentences/sec
calculating scores...
computing bert embedding.


HBox(children=(FloatProgress(value=0.0, max=148.0), HTML(value='')))


computing greedy matching.


HBox(children=(FloatProgress(value=0.0, max=106.0), HTML(value='')))


done in 137.15 seconds, 49.20 sentences/sec


In [5]:
# After the first two language pairs, BERTscore could not be run, so we were not able to use it in our analysis

cs_en['bert_1'] = bert_sc(cs_en, "clean_reference_1", "clean_translation_1", 'en')

calculating scores...
computing bert embedding.


HBox(children=(FloatProgress(value=0.0, max=222.0), HTML(value='')))




RuntimeError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 2.00 GiB total capacity; 1.16 GiB already allocated; 86.93 MiB free; 1.34 GiB reserved in total by PyTorch)

<div class="alert alert-block alert-warning">

### 3.3. GLEU - Google-BLEU

For the GLEU score, we record all sub-sequences of 1, 2, 3 or 4 tokens in output and target sequence (n-grams). We then compute a recall, which is the ratio of the number of matching n-grams to the number of total n-grams in the target (ground truth) sequence, and a precision, which is the ratio of the number of matching n-grams to the number of total n-grams in the generated output sequence. Then GLEU score is simply the minimum of recall and precision. This GLEU score’s range is always between 0
(no matches) and 1 (all match) and it is symmetrical when switching output and target. According to our experiments, GLEU score correlates quite well with the BLEU metric on a corpus level but does not have its drawbacks for our per sentence reward objective.

https://github.com/gcunhase/NLPMetrics/blob/master/notebooks/gleu.ipynb

https://arxiv.org/pdf/1609.08144.pdf
    
</div>

In [72]:
import nltk.translate.gleu_score as gleu

In [73]:
# Applying GLEU for both Preprocessings for all language pairs except en-zh

for i in [en_fi, cs_en, ru_en, zh_en, de_en]:
    i['gleu_1'] = i.apply(lambda row: gleu.sentence_gleu([row['clean_reference_1']],row['clean_translation_1']), axis=1)
    i['gleu_2'] = i.apply(lambda row: gleu.sentence_gleu([row['clean_reference_2']],row['clean_translation_2']), axis=1)
    
    
# Applying GLEU for en-zh language pair
    
en_zh['gleu'] = en_zh.apply(lambda row: gleu.sentence_gleu([row['clean_reference']],row['clean_translation']), axis=1)

Saving into new csv's the corpora with all metrics to facilitate their usage

In [8]:
# en_fi.to_csv('en_fi_new.csv')
# cs_en.to_csv('cs_en_new.csv')
# ru_en.to_csv('ru_en_new.csv')
# zh_en.to_csv('zh_en_new.csv')
# en_zh.to_csv('en_zh_new.csv')
# de_en.to_csv('de_en_new.csv')

<div class="alert alert-block alert-warning">

## 4. Creation of Train/Dev Split</font> <a class="anchor" id="trdv"></a>

  [Back to introduction](#title)
</div>

In [2]:
en_fi, cs_en, en_zh, ru_en, zh_en, de_en = pd.read_csv("en_fi_new.csv"), pd.read_csv("cs_en_new.csv"), pd.read_csv("en_zh_new.csv"), pd.read_csv("ru_en_new.csv"), pd.read_csv("zh_en_new.csv"), pd.read_csv("de_en_new.csv")

<b> All english translations together <b>

In [87]:
DF_all = pd.concat([cs_en, ru_en, zh_en, de_en])
DF_all.reset_index(inplace=True, drop=True)

In [88]:
train, dev = train_test_split(DF_all, test_size=0.3, random_state=42, shuffle=True)
train.reset_index(inplace=True, drop=True)
dev.reset_index(inplace=True, drop=True)

In [89]:
len(train), len(dev)

(54377, 23305)

<b> Finnish split <b>

In [114]:
train_fi, dev_fi = train_test_split(en_fi, test_size=0.3, random_state=42, shuffle=True)
train_fi.reset_index(inplace=True, drop=True)
dev_fi.reset_index(inplace=True, drop=True)

In [115]:
len(train_fi), len(dev_fi)

(4723, 2025)

<b> Chinese split <b>

In [116]:
train_zh, dev_zh = train_test_split(en_zh, test_size=0.3, random_state=42, shuffle=True)
train_zh.reset_index(inplace=True, drop=True)
dev_zh.reset_index(inplace=True, drop=True)

In [117]:
len(train_zh), len(dev_zh)

(7154, 3067)

<div class="alert alert-block alert-warning">

## 5. Models with Bag of Words</font> <a class="anchor" id="bow"></a>

  [Back to introduction](#title)
  
</div>

<b> English BoW <b>

In [185]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(
    max_df=0.8,
    stop_words=None,
    max_features=10000, 
    ngram_range=(1,3)
)

In [186]:
# Joining the reference and translation in a single BoW, both for training and development set
trans_ref_train = (train.clean_reference_1 + train.clean_translation_1).to_list()
trans_ref_dev = (dev.clean_reference_1 + dev.clean_translation_1).to_list()

In [187]:
bow_train = cv.fit_transform(trans_ref_train)
bow_train.shape

(54377, 10000)

In [188]:
bow_dev = cv.transform(trans_ref_dev)
bow_dev.shape

(23305, 10000)

<b> Finnish BoW <b>

In [148]:
# Doing all the same for the Finnish translation corpus

cv_fi = CountVectorizer(
    max_df=0.8,
    stop_words=None,
    max_features=10000, 
    ngram_range=(1,3)
)

In [149]:
trans_ref_train_fi = (train_fi.clean_reference_1 + train_fi.clean_translation_1).to_list()
trans_ref_dev_fi = (dev_fi.clean_reference_1 + dev_fi.clean_translation_1).to_list()

In [150]:
bow_train_fi = cv_fi.fit_transform(trans_ref_train_fi)
bow_train_fi.shape

(4723, 10000)

In [151]:
bow_dev_fi = cv_fi.transform(trans_ref_dev_fi)
bow_dev_fi.shape

(2025, 10000)

<b> Chinese BoW <b>

In [152]:
# Doing all the same for the Chinese translation corpus

cv_zh = CountVectorizer(
    max_df=0.8,
    stop_words=None,
    max_features=10000, 
    ngram_range=(1,3)
)

In [153]:
trans_ref_train_zh = (train_zh.clean_reference + train_zh.clean_translation).to_list()
trans_ref_dev_zh = (dev_zh.clean_reference + dev_zh.clean_translation).to_list()

In [154]:
bow_train_zh = cv_zh.fit_transform(trans_ref_train_zh)
bow_train_zh.shape

(7154, 10000)

In [155]:
bow_dev_zh = cv_zh.transform(trans_ref_dev_zh)
bow_dev_zh.shape

(3067, 10000)

<div class="alert alert-block alert-warning">

### Neural Networks (Sklearn)
</div>

<b> English NN <b>

In [156]:
from sklearn.neural_network import MLPRegressor

regr = MLPRegressor(hidden_layer_sizes=(10,10) ,random_state=42, max_iter=50, alpha=0.01).fit(bow_train, train['z-score'])

In [157]:
pred_train = regr.predict(bow_train)
pred_dev = regr.predict(bow_dev)

In [158]:
from sklearn.metrics import mean_squared_error

mean_squared_error(train['z-score'], pred_train, squared=False)
print('Train:',mean_squared_error(train['z-score'], pred_train, squared=False),'\nDev:',mean_squared_error(dev['z-score'], pred_dev, squared=False))

Train: 0.425469736865002 
Dev: 0.9779057685676135


In [159]:
train['NN_bow'] = pred_train
dev['NN_bow'] = pred_dev

<b> Finnish NN <b>

In [160]:
regr_fi = MLPRegressor(hidden_layer_sizes=(10,10) ,random_state=42, max_iter=50, alpha=0.01).fit(bow_train_fi, train_fi['z-score'])

In [161]:
pred_train_fi = regr_fi.predict(bow_train_fi)
pred_dev_fi = regr_fi.predict(bow_dev_fi)

In [162]:
mean_squared_error(train['z-score'], pred_train, squared=False)
print('Train:',mean_squared_error(train_fi['z-score'], pred_train_fi, squared=False),'\nDev:',mean_squared_error(dev_fi['z-score'], pred_dev_fi, squared=False))

Train: 0.18256874535946918 
Dev: 0.9883580176079034


In [163]:
train_fi['NN_bow'] = pred_train_fi
dev_fi['NN_bow'] = pred_dev_fi

<b> Chinese NN <b>

In [164]:
regr_zh = MLPRegressor(hidden_layer_sizes=(10,10),random_state=42, max_iter=50, alpha=0.01).fit(bow_train_zh, train_zh['z-score'])

In [165]:
pred_train_zh = regr_zh.predict(bow_train_zh)
pred_dev_zh = regr_zh.predict(bow_dev_zh)

In [166]:
print('Train:',mean_squared_error(train_zh['z-score'], pred_train_zh, squared=False),'\nDev:',mean_squared_error(dev_zh['z-score'], pred_dev_zh, squared=False))

Train: 0.6464679058498025 
Dev: 0.9186680390548998


In [167]:
train_zh['NN_bow'] = pred_train_zh
dev_zh['NN_bow'] = pred_dev_zh

<div class="alert alert-block alert-warning">

### Deep Learning (Keras)
    
</div>

In [106]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.metrics import RootMeanSquaredError

<b> English <b>

In [189]:
n_words = bow_train.toarray().shape[1]

# define network
model = Sequential()
model.add(Dense(64, input_shape=(n_words,), activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(1))

# compile network
model.compile(optimizer='rmsprop',loss='mse',metrics=[RootMeanSquaredError()])

# fit network
model.fit(bow_train.toarray(), train['z-score'], epochs=30, verbose=2,validation_data=(bow_dev.toarray(), dev['z-score']))

Epoch 1/30
1700/1700 - 95s - loss: 0.7280 - root_mean_squared_error: 0.8532 - val_loss: 0.6958 - val_root_mean_squared_error: 0.8342
Epoch 2/30
1700/1700 - 153s - loss: 0.6669 - root_mean_squared_error: 0.8167 - val_loss: 0.6869 - val_root_mean_squared_error: 0.8288
Epoch 3/30
1700/1700 - 100s - loss: 0.6227 - root_mean_squared_error: 0.7891 - val_loss: 0.6933 - val_root_mean_squared_error: 0.8327
Epoch 4/30
1700/1700 - 32s - loss: 0.5845 - root_mean_squared_error: 0.7645 - val_loss: 0.6986 - val_root_mean_squared_error: 0.8358
Epoch 5/30
1700/1700 - 62s - loss: 0.5504 - root_mean_squared_error: 0.7419 - val_loss: 0.6902 - val_root_mean_squared_error: 0.8308
Epoch 6/30
1700/1700 - 62s - loss: 0.5164 - root_mean_squared_error: 0.7186 - val_loss: 0.6954 - val_root_mean_squared_error: 0.8339
Epoch 7/30
1700/1700 - 37s - loss: 0.4863 - root_mean_squared_error: 0.6973 - val_loss: 0.7276 - val_root_mean_squared_error: 0.8530
Epoch 8/30
1700/1700 - 109s - loss: 0.4576 - root_mean_squared_erro

<tensorflow.python.keras.callbacks.History at 0x1bfb646bf10>

In [190]:
# evaluate
loss, rmse = model.evaluate(bow_dev.toarray(), dev['z-score'], verbose=0)
print('Dev RMSE: %f' % (rmse))

Dev RMSE: 0.936933


In [191]:
train['DL_bow'] = model.predict(bow_train.toarray())
dev['DL_bow'] = model.predict(bow_dev.toarray())

<b> Finnish <b>

In [176]:
n_words_fi = bow_train_fi.toarray().shape[1]

# define network
model_fi = Sequential()
model_fi.add(Dense(64, input_shape=(n_words_fi,), activation='relu'))
model_fi.add(Dense(64, activation='relu'))
model_fi.add(Dense(1))

# compile network
model_fi.compile(optimizer='rmsprop',loss='mse',metrics=[RootMeanSquaredError()])

# fit network
model_fi.fit(bow_train_fi.toarray(), train_fi['z-score'], epochs=30, verbose=2,validation_data=(bow_dev_fi.toarray(), dev_fi['z-score']))

Epoch 1/30
148/148 - 6s - loss: 0.7253 - root_mean_squared_error: 0.8516 - val_loss: 0.6960 - val_root_mean_squared_error: 0.8343
Epoch 2/30
148/148 - 3s - loss: 0.5229 - root_mean_squared_error: 0.7231 - val_loss: 0.6950 - val_root_mean_squared_error: 0.8337
Epoch 3/30
148/148 - 3s - loss: 0.4176 - root_mean_squared_error: 0.6462 - val_loss: 0.6927 - val_root_mean_squared_error: 0.8323
Epoch 4/30
148/148 - 3s - loss: 0.3432 - root_mean_squared_error: 0.5859 - val_loss: 0.7371 - val_root_mean_squared_error: 0.8586
Epoch 5/30
148/148 - 2s - loss: 0.2905 - root_mean_squared_error: 0.5390 - val_loss: 0.7260 - val_root_mean_squared_error: 0.8520
Epoch 6/30
148/148 - 2s - loss: 0.2480 - root_mean_squared_error: 0.4980 - val_loss: 0.7267 - val_root_mean_squared_error: 0.8525
Epoch 7/30
148/148 - 2s - loss: 0.2155 - root_mean_squared_error: 0.4642 - val_loss: 0.7952 - val_root_mean_squared_error: 0.8917
Epoch 8/30
148/148 - 2s - loss: 0.1887 - root_mean_squared_error: 0.4344 - val_loss: 0.781

<tensorflow.python.keras.callbacks.History at 0x1bfb6811e80>

In [177]:
# evaluate
loss, rmse = model_fi.evaluate(bow_dev_fi.toarray(), dev_fi['z-score'], verbose=0)
print('Dev RMSE: %f' % (rmse))

Dev RMSE: 0.905059


In [178]:
train_fi['DL_bow'] = model_fi.predict(bow_train_fi.toarray())
dev_fi['DL_bow'] = model_fi.predict(bow_dev_fi.toarray())

<b> Chinese <b>

In [179]:
n_words_zh = bow_train_zh.toarray().shape[1]

# define network
model_zh = Sequential()
model_zh.add(Dense(64, input_shape=(n_words,), activation='relu'))
model_zh.add(Dense(64, activation='relu'))
model_zh.add(Dense(1))

# compile network
model_zh.compile(optimizer='rmsprop',loss='mse',metrics=[RootMeanSquaredError()])

# fit network
model_zh.fit(bow_train_zh.toarray(), train_zh['z-score'], epochs=30, verbose=2,validation_data=(bow_dev_zh.toarray(), dev_zh['z-score']))

Epoch 1/30
224/224 - 6s - loss: 0.8314 - root_mean_squared_error: 0.9118 - val_loss: 0.7860 - val_root_mean_squared_error: 0.8866
Epoch 2/30
224/224 - 4s - loss: 0.6931 - root_mean_squared_error: 0.8325 - val_loss: 0.7693 - val_root_mean_squared_error: 0.8771
Epoch 3/30
224/224 - 4s - loss: 0.6382 - root_mean_squared_error: 0.7989 - val_loss: 0.7769 - val_root_mean_squared_error: 0.8814
Epoch 4/30
224/224 - 4s - loss: 0.6101 - root_mean_squared_error: 0.7811 - val_loss: 0.7687 - val_root_mean_squared_error: 0.8767
Epoch 5/30
224/224 - 4s - loss: 0.5942 - root_mean_squared_error: 0.7708 - val_loss: 0.7764 - val_root_mean_squared_error: 0.8811
Epoch 6/30
224/224 - 4s - loss: 0.5822 - root_mean_squared_error: 0.7630 - val_loss: 0.7710 - val_root_mean_squared_error: 0.8781
Epoch 7/30
224/224 - 4s - loss: 0.5716 - root_mean_squared_error: 0.7560 - val_loss: 0.7875 - val_root_mean_squared_error: 0.8874
Epoch 8/30
224/224 - 4s - loss: 0.5644 - root_mean_squared_error: 0.7512 - val_loss: 0.779

<tensorflow.python.keras.callbacks.History at 0x1bfbfa6c130>

In [182]:
# evaluate
loss, rmse = model_zh.evaluate(bow_dev_zh.toarray(), dev_zh['z-score'], verbose=0)
print('Dev RMSE: %f' % (rmse))

Dev RMSE: 0.903543


In [183]:
train_zh['DL_bow'] = model_zh.predict(bow_train_zh.toarray())
dev_zh['DL_bow'] = model_zh.predict(bow_dev_zh.toarray())

<div class="alert alert-block alert-warning">

## 6. Models with Word Embeddings</font> <a class="anchor" id="embed"></a>
Done seperately for each language pair.

  [Back to introduction](#title)
  
</div>

<b> de-en

In [3]:
train_ref_de_en = np.load('C:/Users/matip/Documents/Mestrado/2. Text Mining/Project/corpus/Embeddings/corpus/de-en/laser.reference_embeds.npy', allow_pickle=True)
train_source_de_en = np.load('C:/Users/matip/Documents/Mestrado/2. Text Mining/Project/corpus/Embeddings/corpus/de-en/laser.source_embeds.npy', allow_pickle=True)
train_trans_de_en = np.load('C:/Users/matip/Documents/Mestrado/2. Text Mining/Project/corpus/Embeddings/corpus/de-en/laser.translation_embeds.npy', allow_pickle=True)

In [4]:
train, dev = train_test_split(de_en, test_size=0.3, random_state=42, shuffle=False)

train.reset_index(inplace=True, drop=True)
dev.reset_index(inplace=True, drop=True)

In [5]:
len(train)

15192

In [6]:
# Joining reference source and translation
embed_de_en = np.concatenate((train_ref_de_en, train_source_de_en, train_trans_de_en),axis=1)

In [7]:
# Splitting the array to fit the train/dev split
train_embed_de_en = embed_de_en[:15192]
dev_embed_de_en = embed_de_en[15192:]

<b>cs-en

In [38]:
train_ref_cs_en = np.load('C:/Users/matip/Documents/Mestrado/2. Text Mining/Project/corpus/Embeddings/corpus/cs-en/laser.reference_embeds.npy', allow_pickle=True)
train_source_cs_en = np.load('C:/Users/matip/Documents/Mestrado/2. Text Mining/Project/corpus/Embeddings/corpus/cs-en/laser.source_embeds.npy', allow_pickle=True)
train_trans_cs_en = np.load('C:/Users/matip/Documents/Mestrado/2. Text Mining/Project/corpus/Embeddings/corpus/cs-en/laser.translation_embeds.npy', allow_pickle=True)

In [39]:
train_cs, dev_cs = train_test_split(cs_en, test_size=0.3, random_state=42, shuffle=False)

train_cs.reset_index(inplace=True, drop=True)
dev_cs.reset_index(inplace=True, drop=True)

In [40]:
len(train_cs)

8109

In [41]:
# Joining reference source and translation in an array
embed_cs_en = np.concatenate((train_ref_cs_en, train_source_cs_en, train_trans_cs_en),axis=1)

In [42]:
# Splitting the array to fit the train/dev split
train_embed_cs_en = embed_cs_en[:8109]
dev_embed_cs_en = embed_cs_en[8109:]

<b>ru-en

In [43]:
train_ref_ru_en = np.load('C:/Users/matip/Documents/Mestrado/2. Text Mining/Project/corpus/Embeddings/corpus/ru-en/laser.reference_embeds.npy', allow_pickle=True)
train_source_ru_en = np.load('C:/Users/matip/Documents/Mestrado/2. Text Mining/Project/corpus/Embeddings/corpus/ru-en/laser.source_embeds.npy', allow_pickle=True)
train_trans_ru_en = np.load('C:/Users/matip/Documents/Mestrado/2. Text Mining/Project/corpus/Embeddings/corpus/ru-en/laser.translation_embeds.npy', allow_pickle=True)

In [44]:
ru_en = pd.read_csv("ru_en_clean.csv")
train_ru, dev_ru = train_test_split(ru_en, test_size=0.3, random_state=42, shuffle=False)

train_ru.reset_index(inplace=True, drop=True)
dev_ru.reset_index(inplace=True, drop=True)

In [45]:
len(train_ru)

12586

In [46]:
# Joining reference source and translation in an array
embed_ru_en = np.concatenate((train_ref_ru_en, train_source_ru_en, train_trans_ru_en),axis=1)

In [47]:
# Splitting the array to fit the train/dev split
train_embed_ru_en = embed_ru_en[:12586]
dev_embed_ru_en = embed_ru_en[12586:]

<b>zh-en

In [48]:
train_ref_zh_en = np.load('C:/Users/matip/Documents/Mestrado/2. Text Mining/Project/corpus/Embeddings/corpus/zh-en/laser.reference_embeds.npy', allow_pickle=True)
train_source_zh_en = np.load('C:/Users/matip/Documents/Mestrado/2. Text Mining/Project/corpus/Embeddings/corpus/zh-en/laser.source_embeds.npy', allow_pickle=True)
train_trans_zh_en = np.load('C:/Users/matip/Documents/Mestrado/2. Text Mining/Project/corpus/Embeddings/corpus/zh-en/laser.translation_embeds.npy', allow_pickle=True)

In [49]:
zh_en = pd.read_csv("zh_en_clean.csv")

train_zhen, dev_zhen = train_test_split(zh_en, test_size=0.3, random_state=42, shuffle=False)
train_zhen.reset_index(inplace=True, drop=True)
dev_zhen.reset_index(inplace=True, drop=True)

In [50]:
len(train_zhen)

18493

In [51]:
# Joining reference source and translation in an array
embed_zh_en = np.concatenate((train_ref_zh_en, train_source_zh_en, train_trans_zh_en),axis=1)

In [52]:
# Splitting the array to fit the train/dev split
train_embed_zh_en = embed_zh_en[:18493]
dev_embed_zh_en = embed_zh_en[18493:]

<b>en-fi

In [53]:
train_ref_en_fi = np.load('C:/Users/matip/Documents/Mestrado/2. Text Mining/Project/corpus/Embeddings/corpus/en-fi/laser.reference_embeds.npy', allow_pickle=True)
train_source_en_fi = np.load('C:/Users/matip/Documents/Mestrado/2. Text Mining/Project/corpus/Embeddings/corpus/en-fi/laser.source_embeds.npy', allow_pickle=True)
train_trans_en_fi = np.load('C:/Users/matip/Documents/Mestrado/2. Text Mining/Project/corpus/Embeddings/corpus/en-fi/laser.translation_embeds.npy', allow_pickle=True)

In [54]:
train_fi, dev_fi = train_test_split(en_fi, test_size=0.3, random_state=42, shuffle=False)

train_fi.reset_index(inplace=True, drop=True)
dev_fi.reset_index(inplace=True, drop=True)

In [55]:
len(train_fi)

4723

In [56]:
# Joining reference source and translation in an array
embed_en_fi = np.concatenate((train_ref_en_fi, train_source_en_fi, train_trans_en_fi),axis=1)

In [57]:
# Splitting the array to fit the train/dev split
train_embed_en_fi = embed_en_fi[:4723]
dev_embed_en_fi = embed_en_fi[4723:]

<b>en-zh

In [58]:
train_ref_en_zh = np.load('C:/Users/matip/Documents/Mestrado/2. Text Mining/Project/corpus/Embeddings/corpus/en-zh/laser.reference_embeds.npy', allow_pickle=True)
train_source_en_zh = np.load('C:/Users/matip/Documents/Mestrado/2. Text Mining/Project/corpus/Embeddings/corpus/en-zh/laser.source_embeds.npy', allow_pickle=True)
train_trans_en_zh = np.load('C:/Users/matip/Documents/Mestrado/2. Text Mining/Project/corpus/Embeddings/corpus/en-zh/laser.translation_embeds.npy', allow_pickle=True)

In [59]:
train_enzh, dev_enzh = train_test_split(en_zh, test_size=0.3, random_state=42, shuffle=False)
train_enzh.reset_index(inplace=True, drop=True)
dev_enzh.reset_index(inplace=True, drop=True)

In [60]:
len(train_enzh)

7154

In [61]:
# Joining reference source and translation in an array
embed_en_zh = np.concatenate((train_ref_en_zh, train_source_en_zh, train_trans_en_zh),axis=1)

In [62]:
# Splitting the array to fit the train/dev split
train_embed_en_zh = embed_en_zh[:7154]
dev_embed_en_zh = embed_en_zh[7154:]

<div class="alert alert-block alert-warning">

### Neural Networks (SkLearn)
    
</div>

<b> de-en

In [4]:
regr_zh = MLPRegressor(hidden_layer_sizes=(10,10),random_state=42, max_iter=50, alpha=0.01).fit(train_embed_de_en, train['z-score'])

pred_train = regr.predict(train_embed_de_en)
pred_dev = regr.predict(dev_embed_de_en)

print('Train:',mean_squared_error(train['z-score'], pred_train, squared=False),'\nDev:',mean_squared_error(dev['z-score'], pred_dev, squared=False))

In [None]:
train['NN_embedded'] = pred_train
dev['NN_embedded'] = pred_dev

<b> cs-en

In [244]:
regr_cs = MLPRegressor(hidden_layer_sizes=(10,10),random_state=42, max_iter=50, alpha=0.01).fit(train_embed_cs_en, train_cs['z-score'])

pred_train_cs = regr_cs.predict(train_embed_cs_en)
pred_dev_cs = regr_cs.predict(dev_embed_cs_en)

print('Train:',mean_squared_error(train_cs['z-score'], pred_train_cs, squared=False),'\nDev:',mean_squared_error(dev_cs['z-score'], pred_dev_cs, squared=False))

Train: 0.5520173117593772 
Dev: 0.800841242261568


In [245]:
train_cs['NN_embedded'] = pred_train_cs
dev_cs['NN_embedded'] = pred_dev_cs

<b> ru-en

In [38]:
regr_ru = MLPRegressor(hidden_layer_sizes=(10,10),random_state=42, max_iter=50, alpha=0.01).fit(train_embed_ru_en, train_ru['z-score'])

pred_train_ru = regr_ru.predict(train_embed_ru_en)
pred_dev_ru = regr_ru.predict(dev_embed_ru_en)

print('Train:',mean_squared_error(train_ru['z-score'], pred_train_ru, squared=False),'\nDev:',mean_squared_error(dev_ru['z-score'], pred_dev_ru, squared=False))

Train: 0.6775352037724004 
Dev: 0.916386646256473


In [39]:
train_ru['NN_embedded'] = pred_train_ru
dev_ru['NN_embedded'] = pred_dev_ru

<b> zh-en

In [40]:
regr_zhen = MLPRegressor(hidden_layer_sizes=(10,10),random_state=42, max_iter=50, alpha=0.01).fit(train_embed_zh_en, train_zhen['z-score'])

pred_train_zhen = regr_zhen.predict(train_embed_zh_en)
pred_dev_zhen = regr_zhen.predict(dev_embed_zh_en)

print('Train:',mean_squared_error(train_zhen['z-score'], pred_train_zhen, squared=False),'\nDev:',mean_squared_error(dev_zhen['z-score'], pred_dev_zhen, squared=False))

Train: 0.7057464035860933 
Dev: 0.850612245465507


In [41]:
train_zhen['NN_embedded'] = pred_train_zhen
dev_zhen['NN_embedded'] = pred_dev_zhen

<b> en-fi

In [42]:
regr_fi = MLPRegressor(hidden_layer_sizes=(10,10),random_state=42, max_iter=50, alpha=0.01).fit(train_embed_en_fi, train_fi['z-score'])

pred_train_fi = regr_fi.predict(train_embed_en_fi)
pred_dev_fi = regr_fi.predict(dev_embed_en_fi)

print('Train:',mean_squared_error(train_fi['z-score'], pred_train_fi, squared=False),'\nDev:',mean_squared_error(dev_fi['z-score'], pred_dev_fi, squared=False))

Train: 0.5672980908017875 
Dev: 0.7870533995512707


In [43]:
train_fi['NN_embedded'] = pred_train_fi
dev_fi['NN_embedded'] = pred_dev_fi

<b> en-zh

In [44]:
regr_enzh = MLPRegressor(hidden_layer_sizes=(10,10),random_state=42, max_iter=50, alpha=0.01).fit(train_embed_en_zh, train_enzh['z-score'])

pred_train_enzh = regr_enzh.predict(train_embed_en_zh)
pred_dev_enzh = regr_enzh.predict(dev_embed_en_zh)

print('Train:',mean_squared_error(train_enzh['z-score'], pred_train_enzh, squared=False),'\nDev:',mean_squared_error(dev_enzh['z-score'], pred_dev_enzh, squared=False))

Train: 0.6714207144097877 
Dev: 0.8379415212556378


In [45]:
train_enzh['NN_embedded'] = pred_train_enzh
dev_enzh['NN_embedded'] = pred_dev_enzh

<div class="alert alert-block alert-warning">

### Deep Learning (Keras)
    
</div>

In [21]:
train_embed_de_en.shape

(15192, 3072)

In [17]:
n_words = train_embed_de_en.shape[1]

# define network
model = Sequential()
model.add(Dense(64, input_shape=(n_words,), activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(1))

# Compile the model
model.compile(optimizer='rmsprop',loss='mse',metrics=[RootMeanSquaredError()])

# fit network
model.fit(train_embed_de_en, train['z-score'], epochs=30, verbose=2)

In [32]:
# evaluate
loss, rmse = model.evaluate(dev_embed_de_en, dev['z-score'], verbose=0)
print('Dev RMSE: %f' % (rmse))

Dev RMSE: 0.837514


In [33]:
train['DL_embed'] = model.predict(train_embed_de_en)
dev['DL_embed'] = model.predict(dev_embed_de_en)

<b> cs-en

In [82]:
n_words_cs = train_embed_cs_en.shape[1]

# define network
model_cs = Sequential()
model_cs.add(Dense(64, input_shape=(n_words_cs,), activation='relu'))
model_cs.add(Dense(64, activation='relu'))
model_cs.add(Dense(1))

# compile network
model_cs.compile(optimizer='rmsprop',loss='mse',metrics=[RootMeanSquaredError()])

# fit network
model_cs.fit(train_embed_cs_en, train_cs['z-score'], epochs=30, verbose=2,validation_data=(dev_embed_cs_en, dev_cs['z-score']))

Epoch 1/30
254/254 - 2s - loss: 0.7244 - root_mean_squared_error: 0.8511 - val_loss: 0.7687 - val_root_mean_squared_error: 0.8768
Epoch 2/30
254/254 - 1s - loss: 0.6606 - root_mean_squared_error: 0.8128 - val_loss: 0.7096 - val_root_mean_squared_error: 0.8424
Epoch 3/30
254/254 - 1s - loss: 0.6085 - root_mean_squared_error: 0.7801 - val_loss: 0.6657 - val_root_mean_squared_error: 0.8159
Epoch 4/30
254/254 - 1s - loss: 0.5649 - root_mean_squared_error: 0.7516 - val_loss: 0.6512 - val_root_mean_squared_error: 0.8069
Epoch 5/30
254/254 - 1s - loss: 0.5346 - root_mean_squared_error: 0.7312 - val_loss: 0.6574 - val_root_mean_squared_error: 0.8108
Epoch 6/30
254/254 - 1s - loss: 0.4996 - root_mean_squared_error: 0.7069 - val_loss: 0.6352 - val_root_mean_squared_error: 0.7970
Epoch 7/30
254/254 - 1s - loss: 0.4766 - root_mean_squared_error: 0.6904 - val_loss: 0.6443 - val_root_mean_squared_error: 0.8027
Epoch 8/30
254/254 - 1s - loss: 0.4504 - root_mean_squared_error: 0.6712 - val_loss: 0.640

<tensorflow.python.keras.callbacks.History at 0x168a5da7790>

In [83]:
# evaluate
loss, rmse = model_cs.evaluate(dev_embed_cs_en, dev_cs['z-score'], verbose=0)
print('Dev RMSE: %f' % (rmse))

Dev RMSE: 0.788632


In [84]:
train_cs['DL_embed'] = model_cs.predict(train_embed_cs_en)
dev_cs['DL_embed'] = model_cs.predict(dev_embed_cs_en)

<b> ru-en

In [85]:
n_words_ru = train_embed_ru_en.shape[1]

# define network
model_ru = Sequential()
model_ru.add(Dense(64, input_shape=(n_words_ru,), activation='relu'))
model_ru.add(Dense(64, activation='relu'))
model_ru.add(Dense(1))

# compile network
model_ru.compile(optimizer='rmsprop',loss='mse',metrics=[RootMeanSquaredError()])

# fit network
model_ru.fit(train_embed_ru_en, train_ru['z-score'], epochs=30, verbose=2,validation_data=(dev_embed_ru_en, dev_ru['z-score']))

Epoch 1/30
394/394 - 2s - loss: 0.7443 - root_mean_squared_error: 0.8627 - val_loss: 0.8555 - val_root_mean_squared_error: 0.9249
Epoch 2/30
394/394 - 1s - loss: 0.7051 - root_mean_squared_error: 0.8397 - val_loss: 0.8023 - val_root_mean_squared_error: 0.8957
Epoch 3/30
394/394 - 1s - loss: 0.6750 - root_mean_squared_error: 0.8216 - val_loss: 0.7534 - val_root_mean_squared_error: 0.8680
Epoch 4/30
394/394 - 1s - loss: 0.6536 - root_mean_squared_error: 0.8085 - val_loss: 0.7562 - val_root_mean_squared_error: 0.8696
Epoch 5/30
394/394 - 1s - loss: 0.6304 - root_mean_squared_error: 0.7940 - val_loss: 0.7986 - val_root_mean_squared_error: 0.8936
Epoch 6/30
394/394 - 1s - loss: 0.6173 - root_mean_squared_error: 0.7857 - val_loss: 0.7604 - val_root_mean_squared_error: 0.8720
Epoch 7/30
394/394 - 1s - loss: 0.5996 - root_mean_squared_error: 0.7743 - val_loss: 0.8123 - val_root_mean_squared_error: 0.9013
Epoch 8/30
394/394 - 1s - loss: 0.5859 - root_mean_squared_error: 0.7655 - val_loss: 0.756

<tensorflow.python.keras.callbacks.History at 0x168a55a4850>

In [86]:
# evaluate
loss, rmse = model_ru.evaluate(dev_embed_ru_en, dev_ru['z-score'], verbose=0)
print('Dev RMSE: %f' % (rmse))

Dev RMSE: 0.926702


In [87]:
train_ru['DL_embed'] = model_ru.predict(train_embed_ru_en)
dev_ru['DL_embed'] = model_ru.predict(dev_embed_ru_en)

<b> zh-en <b>

In [90]:
n_words_zh_en = train_embed_zh_en.shape[1]

# define network
model_zhen = Sequential()
model_zhen.add(Dense(64, input_shape=(n_words_zh_en,), activation='relu'))
model_zhen.add(Dense(64, activation='relu'))
model_zhen.add(Dense(1))

# compile network
model_zhen.compile(optimizer='rmsprop',loss='mse',metrics=[RootMeanSquaredError()])

# fit network
model_zhen.fit(train_embed_zh_en, train_zhen['z-score'], epochs=30, verbose=2,validation_data=(dev_embed_zh_en, dev_zhen['z-score']))

Epoch 1/30
578/578 - 2s - loss: 0.7215 - root_mean_squared_error: 0.8494 - val_loss: 0.7226 - val_root_mean_squared_error: 0.8500
Epoch 2/30
578/578 - 2s - loss: 0.6755 - root_mean_squared_error: 0.8219 - val_loss: 0.7094 - val_root_mean_squared_error: 0.8423
Epoch 3/30
578/578 - 2s - loss: 0.6481 - root_mean_squared_error: 0.8051 - val_loss: 0.7482 - val_root_mean_squared_error: 0.8650
Epoch 4/30
578/578 - 2s - loss: 0.6316 - root_mean_squared_error: 0.7947 - val_loss: 0.6990 - val_root_mean_squared_error: 0.8361
Epoch 5/30
578/578 - 3s - loss: 0.6148 - root_mean_squared_error: 0.7841 - val_loss: 0.7313 - val_root_mean_squared_error: 0.8552
Epoch 6/30
578/578 - 2s - loss: 0.6010 - root_mean_squared_error: 0.7752 - val_loss: 0.6884 - val_root_mean_squared_error: 0.8297
Epoch 7/30
578/578 - 2s - loss: 0.5879 - root_mean_squared_error: 0.7668 - val_loss: 0.6884 - val_root_mean_squared_error: 0.8297
Epoch 8/30
578/578 - 2s - loss: 0.5762 - root_mean_squared_error: 0.7591 - val_loss: 0.678

<tensorflow.python.keras.callbacks.History at 0x168fa8fa940>

In [91]:
# # evaluate
loss, rmse = model_zhen.evaluate(dev_embed_zh_en, dev_zhen['z-score'], verbose=0)
print('Dev RMSE: %f' % (rmse))

Dev RMSE: 0.890740


In [92]:
train_zhen['DL_embed'] = model_zhen.predict(train_embed_zh_en)
dev_zhen['DL_embed'] = model_zhen.predict(dev_embed_zh_en)

<b> en-fi

In [63]:
n_words_fi = train_embed_en_fi.shape[1]

# define network
model_fi = Sequential()
model_fi.add(Dense(64, input_shape=(n_words_fi,), activation='relu'))
model_fi.add(Dense(64, activation='relu'))
model_fi.add(Dense(1))

# compile network
model_fi.compile(optimizer='rmsprop',loss='mse',metrics=[RootMeanSquaredError()])

# fit network
model_fi.fit(train_embed_en_fi, train_fi['z-score'], epochs=30, verbose=2,validation_data=(dev_embed_en_fi, dev_fi['z-score']))

Epoch 1/30
148/148 - 4s - loss: 0.7996 - root_mean_squared_error: 0.8942 - val_loss: 0.6469 - val_root_mean_squared_error: 0.8043
Epoch 2/30
148/148 - 1s - loss: 0.6990 - root_mean_squared_error: 0.8361 - val_loss: 0.5757 - val_root_mean_squared_error: 0.7587
Epoch 3/30
148/148 - 1s - loss: 0.6262 - root_mean_squared_error: 0.7913 - val_loss: 0.7170 - val_root_mean_squared_error: 0.8467
Epoch 4/30
148/148 - 1s - loss: 0.5923 - root_mean_squared_error: 0.7696 - val_loss: 0.5675 - val_root_mean_squared_error: 0.7533
Epoch 5/30
148/148 - 1s - loss: 0.5586 - root_mean_squared_error: 0.7474 - val_loss: 0.5717 - val_root_mean_squared_error: 0.7561
Epoch 6/30
148/148 - 1s - loss: 0.5398 - root_mean_squared_error: 0.7347 - val_loss: 0.5759 - val_root_mean_squared_error: 0.7589
Epoch 7/30
148/148 - 1s - loss: 0.5174 - root_mean_squared_error: 0.7193 - val_loss: 0.5619 - val_root_mean_squared_error: 0.7496
Epoch 8/30
148/148 - 1s - loss: 0.4907 - root_mean_squared_error: 0.7005 - val_loss: 0.561

<tensorflow.python.keras.callbacks.History at 0x168fc6f1430>

In [64]:
# # evaluate
loss, rmse = model_fi.evaluate(dev_embed_en_fi, dev_fi['z-score'], verbose=0)
print('Dev RMSE: %f' % (rmse))

Dev RMSE: 0.792861


In [65]:
train_fi['DL_embed'] = model_fi.predict(train_embed_en_fi)
dev_fi['DL_embed'] = model_fi.predict(dev_embed_en_fi)

<b> en-zh <b>

In [76]:
n_words_en_zh = train_embed_en_zh.shape[1]

# define network
model_enzh = Sequential()
model_enzh.add(Dense(64, input_shape=(n_words_en_zh,), activation='relu'))
model_enzh.add(Dense(64, activation='relu'))
model_enzh.add(Dense(1))

# compile network
model_enzh.compile(optimizer='rmsprop',loss='mse',metrics=[RootMeanSquaredError()])

# fit network
model_enzh.fit(train_embed_en_zh, train_enzh['z-score'], epochs=30, verbose=2,validation_data=(dev_embed_en_zh, dev_enzh['z-score']))

Epoch 1/30
224/224 - 2s - loss: 0.8283 - root_mean_squared_error: 0.9101 - val_loss: 0.7256 - val_root_mean_squared_error: 0.8518
Epoch 2/30
224/224 - 1s - loss: 0.7667 - root_mean_squared_error: 0.8756 - val_loss: 0.7904 - val_root_mean_squared_error: 0.8891
Epoch 3/30
224/224 - 1s - loss: 0.7294 - root_mean_squared_error: 0.8541 - val_loss: 1.1653 - val_root_mean_squared_error: 1.0795
Epoch 4/30
224/224 - 1s - loss: 0.6941 - root_mean_squared_error: 0.8331 - val_loss: 0.7039 - val_root_mean_squared_error: 0.8390
Epoch 5/30
224/224 - 1s - loss: 0.6640 - root_mean_squared_error: 0.8149 - val_loss: 0.6488 - val_root_mean_squared_error: 0.8055
Epoch 6/30
224/224 - 1s - loss: 0.6469 - root_mean_squared_error: 0.8043 - val_loss: 0.9750 - val_root_mean_squared_error: 0.9874
Epoch 7/30
224/224 - 1s - loss: 0.6304 - root_mean_squared_error: 0.7940 - val_loss: 0.8822 - val_root_mean_squared_error: 0.9393
Epoch 8/30
224/224 - 1s - loss: 0.6123 - root_mean_squared_error: 0.7825 - val_loss: 0.672

<tensorflow.python.keras.callbacks.History at 0x168a5508040>

In [77]:
# # evaluate
loss, rmse = model_enzh.evaluate(dev_embed_en_zh, dev_enzh['z-score'], verbose=0)
print('Dev RMSE: %f' % (rmse))

Dev RMSE: 0.835161


In [79]:
train_enzh['DL_embed'] = model_enzh.predict(train_embed_en_zh)
dev_enzh['DL_embed'] = model_enzh.predict(dev_embed_en_zh)

<div class="alert alert-block alert-warning">

## 7. Combining metrics</font> <a class="anchor" id="comb"></a>

  [Back to introduction](#title)

</div>

In [3]:
# Renaming columns so they fit and match each other to then be concatenated
en_zh.rename(columns={'bleu':'bleu_1','chrf':'chrf_1', 'meteor':'meteor_1','ter':'ter_1','gleu':'gleu_1'}, inplace=True)

In [4]:
# Joining all corpora in a single one
DF_all = pd.concat([cs_en, ru_en, zh_en, de_en, en_fi, en_zh])
DF_all.reset_index(inplace=True, drop=True)

In [5]:
# Splitting the concatenated corpus into train and development set
train, dev = train_test_split(DF_all, test_size=0.3, random_state=42, shuffle=True)

train.reset_index(inplace=True, drop=True)
dev.reset_index(inplace=True, drop=True)

In [6]:
len(train), len(dev)

(66255, 28396)

In [8]:
# Defining that the X variables are for the metrics that performed better (BLEU, CHRF, GLEU, METEOR, TER)
X_train = train[['bleu_1','chrf_1','gleu_1', 'meteor_1', 'ter_1']]

# Defining the target as the z-score
y_train = train['z-score']

In [9]:
X_train.isna().sum()

bleu_1      0
chrf_1      0
gleu_1      0
meteor_1    0
ter_1       0
dtype: int64

In [10]:
# Defining X and the target for development set
X_dev = dev[['bleu_1','chrf_1','gleu_1', 'meteor_1', 'ter_1']]
y_dev = dev['z-score']

In [11]:
# Using Gradient Boosting Regressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

In [12]:
modelGB = GradientBoostingRegressor(random_state=42)
modelGB.fit(X_train, y_train)

GradientBoostingRegressor(random_state=42)

In [13]:
pred_train = modelGB.predict(X_train)

In [14]:
pred_dev = modelGB.predict(X_dev)
print('Train:',mean_squared_error(y_train, pred_train, squared=False),'\nDev:',mean_squared_error(y_dev, pred_dev, squared=False))

Train: 0.795940796301352 
Dev: 0.8112477311786886


In [15]:
train['grad_boost'] = pred_train
dev['grad_boost'] = pred_dev

### Experimenting with other combinations (all corpora seperately and also for all English translations together)

<b> English translations

In [76]:
# Joining all corpora with English translation
DF_all = pd.concat([cs_en, ru_en, zh_en, de_en])
DF_all.reset_index(inplace=True, drop=True)

In [77]:
train, dev = train_test_split(DF_all, test_size=0.3, random_state=42, shuffle=True)

train.reset_index(inplace=True, drop=True)
dev.reset_index(inplace=True, drop=True)

In [78]:
len(train), len(dev)

(54377, 23305)

In [79]:
train, dev = train_test_split(DF_all, test_size=0.3, random_state=42, shuffle=True)

train.reset_index(inplace=True, drop=True)
dev.reset_index(inplace=True, drop=True)

In [80]:
X_train = train[['bleu_1','chrf_1','gleu_1', 'meteor_1', 'ter_1']]
y_train = train['z-score']

In [82]:
X_dev = dev[['bleu_1','chrf_1','gleu_1', 'meteor_1', 'ter_1']]
y_dev = dev['z-score']

In [84]:
modelGB = GradientBoostingRegressor(random_state=42)
modelGB.fit(X_train, y_train)

GradientBoostingRegressor(random_state=42)

In [85]:
pred_train = modelGB.predict(X_train)

In [86]:
pred_dev = modelGB.predict(X_dev)
print('Train:',mean_squared_error(y_train, pred_train, squared=False),'\nDev:',mean_squared_error(y_dev, pred_dev, squared=False))

Train: 0.8022200327555328 
Dev: 0.7967414039498808


In [87]:
train['grad_boost'] = pred_train
dev['grad_boost'] = pred_dev

<b> Finnish translations

In [90]:
DF_all = en_fi
DF_all.reset_index(inplace=True, drop=True)

In [91]:
train, dev = train_test_split(DF_all, test_size=0.3, random_state=42, shuffle=True)

train.reset_index(inplace=True, drop=True)
dev.reset_index(inplace=True, drop=True)

In [92]:
len(train), len(dev)

(4723, 2025)

In [94]:
X_train = train[['bleu_1','chrf_1','gleu_1', 'meteor_1', 'ter_1']]
y_train = train['z-score']

In [95]:
X_dev = dev[['bleu_1','chrf_1','gleu_1', 'meteor_1', 'ter_1']]
y_dev = dev['z-score']

In [97]:
modelGB = GradientBoostingRegressor(random_state=42)
modelGB.fit(X_train, y_train)

GradientBoostingRegressor(random_state=42)

In [98]:
pred_train = modelGB.predict(X_train)

In [99]:
pred_dev = modelGB.predict(X_dev)
print('Train:',mean_squared_error(y_train, pred_train, squared=False),'\nDev:',mean_squared_error(y_dev, pred_dev, squared=False))

Train: 0.6508263053284159 
Dev: 0.7010606121887464


In [100]:
train['grad_boost'] = pred_train
dev['grad_boost'] = pred_dev

<b> Chinese translations

In [103]:
DF_all = en_zh
DF_all.reset_index(inplace=True, drop=True)

In [104]:
train, dev = train_test_split(DF_all, test_size=0.3, random_state=42, shuffle=True)

train.reset_index(inplace=True, drop=True)
dev.reset_index(inplace=True, drop=True)

In [105]:
len(train), len(dev)

(7154, 3067)

In [107]:
X_train = train[['bleu_1','chrf_1','gleu_1', 'meteor_1', 'ter_1']]
y_train = train['z-score']

In [108]:
X_dev = dev[['bleu_1','chrf_1','gleu_1', 'meteor_1', 'ter_1']]
y_dev = dev['z-score']

In [110]:
modelGB = GradientBoostingRegressor(random_state=42)
modelGB.fit(X_train, y_train)

GradientBoostingRegressor(random_state=42)

In [111]:
pred_train = modelGB.predict(X_train)

In [112]:
pred_dev = modelGB.predict(X_dev)
print('Train:',mean_squared_error(y_train, pred_train, squared=False),'\nDev:',mean_squared_error(y_dev, pred_dev, squared=False))

Train: 0.7836489202961927 
Dev: 0.8077886102885207


In [113]:
train['grad_boost'] = pred_train
dev['grad_boost'] = pred_dev

<b> Czech-en translations

In [116]:
DF_all = cs_en
DF_all.reset_index(inplace=True, drop=True)

In [117]:
train, dev = train_test_split(DF_all, test_size=0.3, random_state=42, shuffle=True)

train.reset_index(inplace=True, drop=True)
dev.reset_index(inplace=True, drop=True)

In [118]:
len(train), len(dev)

(8109, 3476)

In [120]:
X_train = train[['bleu_1','chrf_1','gleu_1', 'meteor_1', 'ter_1']]
y_train = train['z-score']

In [121]:
X_dev = dev[['bleu_1','chrf_1','gleu_1', 'meteor_1', 'ter_1']]
y_dev = dev['z-score']

In [123]:
modelGB = GradientBoostingRegressor(random_state=42)
modelGB.fit(X_train, y_train)

GradientBoostingRegressor(random_state=42)

In [124]:
pred_train = modelGB.predict(X_train)

In [125]:
pred_dev = modelGB.predict(X_dev)
print('Train:',mean_squared_error(y_train, pred_train, squared=False),'\nDev:',mean_squared_error(y_dev, pred_dev, squared=False))

Train: 0.7232785380643042 
Dev: 0.7850163510714665


In [126]:
train['grad_boost'] = pred_train
dev['grad_boost'] = pred_dev

<b> Russian- en translations

In [135]:
DF_all = ru_en
DF_all.reset_index(inplace=True, drop=True)

In [136]:
train, dev = train_test_split(DF_all, test_size=0.3, random_state=42, shuffle=True)

train.reset_index(inplace=True, drop=True)
dev.reset_index(inplace=True, drop=True)

In [137]:
len(train), len(dev)

(12583, 5394)

In [139]:
X_train = train[['bleu_1','chrf_1','gleu_1', 'meteor_1', 'ter_1']]
y_train = train['z-score']

In [140]:
X_dev = dev[['bleu_1','chrf_1','gleu_1', 'meteor_1', 'ter_1']]
y_dev = dev['z-score']

In [142]:
modelGB = GradientBoostingRegressor(random_state=42)
modelGB.fit(X_train, y_train)

GradientBoostingRegressor(random_state=42)

In [143]:
pred_train = modelGB.predict(X_train)

In [144]:
pred_dev = modelGB.predict(X_dev)
print('Train:',mean_squared_error(y_train, pred_train, squared=False),'\nDev:',mean_squared_error(y_dev, pred_dev, squared=False))

Train: 0.7852682175642716 
Dev: 0.8363676830425896


In [145]:
train['grad_boost'] = pred_train
dev['grad_boost'] = pred_dev

<b> Chinese- en translations

In [148]:
DF_all = zh_en
DF_all.reset_index(inplace=True, drop=True)

In [149]:
train, dev = train_test_split(DF_all, test_size=0.3, random_state=42, shuffle=True)

train.reset_index(inplace=True, drop=True)
dev.reset_index(inplace=True, drop=True)

In [150]:
len(train), len(dev)

(18491, 7925)

In [152]:
X_train = train[['bleu_1','chrf_1','gleu_1', 'meteor_1', 'ter_1']]
y_train = train['z-score']

In [153]:
X_dev = dev[['bleu_1','chrf_1','gleu_1', 'meteor_1', 'ter_1']]
y_dev = dev['z-score']

In [155]:
modelGB = GradientBoostingRegressor(random_state=42)
modelGB.fit(X_train, y_train)

GradientBoostingRegressor(random_state=42)

In [156]:
pred_train = modelGB.predict(X_train)

In [157]:
pred_dev = modelGB.predict(X_dev)
print('Train:',mean_squared_error(y_train, pred_train, squared=False),'\nDev:',mean_squared_error(y_dev, pred_dev, squared=False))

Train: 0.7956424407116267 
Dev: 0.8094406698213367


In [158]:
train['grad_boost'] = pred_train
dev['grad_boost'] = pred_dev

<div class="alert alert-block alert-warning">

## 8. Correlations (Pearson's and Kendall's Tau)</font> <a class="anchor" id="corr"></a>

  [Back to introduction](#title)
  
</div>

In [73]:
from scipy.stats import pearsonr, kendalltau

In [202]:
metric_corr1=[]
for col in ['euclidean_dists_1','manhattan_dists_1','cosine_sim_1', 'jaccard_sim_1','bleu_1','rouge-1_1','rouge-l_1','chrf_1','ter_1', 'meteor_1','gleu_1']:
    for i in [cs_en, ru_en, zh_en, de_en, en_fi]:
        metric_corr1.append(round(pearsonr(i['z-score'], i[col])[0], 3))
        metric_corr1.append(round(kendalltau(i['z-score'], i[col])[0], 3))

In [203]:
metric_corr2=[]
for col in ['euclidean_dists_2','manhattan_dists_2','cosine_sim_2', 'jaccard_sim_2','bleu_2','rouge-1_2','rouge-l_2','chrf_2','ter_2', 'meteor_2','gleu_2']:
    for i in [cs_en, ru_en, zh_en, de_en, en_fi]:
        metric_corr2.append(round(pearsonr(i['z-score'], i[col])[0], 3))
        metric_corr2.append(round(kendalltau(i['z-score'], i[col])[0], 3))

In [204]:
en_zh_P=[]
en_zh_K = []
for col in ['euclidean_dists','manhattan_dists','cosine_sim', 'jaccard_sim','bleu','rouge-1','rouge-l','chrf','ter', 'meteor','gleu']:
    en_zh_P.append(round(pearsonr(en_zh['z-score'], en_zh[col])[0], 3))
    en_zh_K.append(round(kendalltau(en_zh['z-score'], en_zh[col])[0], 3))

In [205]:
metrics=['euclidean_dists','manhattan_dists','cosine_sim', 'jaccard_sim','bleu','rouge-1','rouge-l','chrf','ter', 'meteor','gleu']
cols=['cs_en_P','cs_en_K','ru_en_P','ru_en_K','zh_en_P','zh_en_K','de_en_P','de_en_K','en_fi_P','en_fi_K']

In [206]:
PreProc1 = pd.DataFrame(np.array(metric_corr1).reshape((11,10)), index=metrics, columns=cols)
PreProc2 = pd.DataFrame(np.array(metric_corr2).reshape((11,10)), index=metrics, columns=cols)

In [207]:
PreProc1['en_zh_P'] = en_zh_P
PreProc1['en_zh_K'] = en_zh_K

Saving to an Excel file the tables with the Pearson and Kendall's Tau correlation scores, for Preprocessing 1 and 2

In [209]:
# PreProc1.to_excel('correlations_proc1.xls')
# PreProc2.to_excel('correlations_proc2.xls')

<div class="alert alert-block alert-warning">

### For Neural Networks with bag of words:
    
</div>

<b> English

In [168]:
# Pearson for training and development set
print(round(pearsonr(train['z-score'], train['NN_bow'])[0], 3), round(pearsonr(dev['z-score'], dev['NN_bow'])[0], 3))

0.874 0.266


In [169]:
# Kendall's Tau for training and development set
print(round(kendalltau(train['z-score'], train['NN_bow'])[0], 3),round(kendalltau(dev['z-score'], dev['NN_bow'])[0], 3))

0.664 0.187


<b> Finnish

In [171]:
# Pearson for training and development set
print(round(pearsonr(train_fi['z-score'], train_fi['NN_bow'])[0], 3), round(pearsonr(dev_fi['z-score'], dev_fi['NN_bow'])[0], 3))

0.979 0.342


In [172]:
# Kendall's Tau for training and development set
print(round(kendalltau(train_fi['z-score'], train_fi['NN_bow'])[0], 3),round(kendalltau(dev_fi['z-score'], dev_fi['NN_bow'])[0], 3))

0.907 0.221


<b> Chinese

In [174]:
# Pearson for training and development set
print(round(pearsonr(train_zh['z-score'], train_zh['NN_bow'])[0], 3), round(pearsonr(dev_zh['z-score'], dev_zh['NN_bow'])[0], 3))

0.72 0.34


In [175]:
# Kendall's Tau for training and development set
print(round(kendalltau(train_zh['z-score'], train_zh['NN_bow'])[0], 3),round(kendalltau(dev_zh['z-score'], dev_zh['NN_bow'])[0], 3))

0.518 0.224


<div class="alert alert-block alert-warning">

### For Deep Learning with bag of words:
    
</div>

<b> English

In [192]:
# Pearson for training and development set
print(round(pearsonr(train['z-score'], train['DL_bow'])[0], 3), round(pearsonr(dev['z-score'], dev['DL_bow'])[0], 3))

0.882 0.287


In [193]:
# Kendall's Tau for training and development set
print(round(kendalltau(train['z-score'], train['DL_bow'])[0], 3),round(kendalltau(dev['z-score'], dev['DL_bow'])[0], 3))

0.71 0.202


<b> Finnish

In [194]:
# Pearson for training and development set
print(round(pearsonr(train_fi['z-score'], train_fi['DL_bow'])[0], 3), round(pearsonr(dev_fi['z-score'], dev_fi['DL_bow'])[0], 3))

0.969 0.385


In [195]:
# Kendall's Tau for training and development set
print(round(kendalltau(train_fi['z-score'], train_fi['DL_bow'])[0], 3),round(kendalltau(dev_fi['z-score'], dev_fi['DL_bow'])[0], 3))

0.872 0.244


<b> Chinese

In [196]:
# Pearson for training and development set
print(round(pearsonr(train_zh['z-score'], train_zh['DL_bow'])[0], 3), round(pearsonr(dev_zh['z-score'], dev_zh['DL_bow'])[0], 3))

0.673 0.353


In [197]:
# Kendall's Tau for training and development set
print(round(kendalltau(train_zh['z-score'], train_zh['DL_bow'])[0], 3),round(kendalltau(dev_zh['z-score'], dev_zh['DL_bow'])[0], 3))

0.496 0.233


<div class="alert alert-block alert-warning">

### For Neural Networks with word embeddings:

</div>

<b> de-en

In [127]:
# Pearson for training and development set
print(round(pearsonr(train['z-score'], train['NN_embedded'])[0], 3), round(pearsonr(dev['z-score'], dev['NN_embedded'])[0], 3))

0.951 0.31


In [128]:
# Kendall's Tau for training and development set
print(round(kendalltau(train['z-score'], train['NN_embedded'])[0], 3),round(kendalltau(dev['z-score'], dev['NN_embedded'])[0], 3))

0.788 0.22


<b> cs-en

In [246]:
# Pearson for training and development set
print(round(pearsonr(train_cs['z-score'], train_cs['NN_embedded'])[0], 3), round(pearsonr(dev_cs['z-score'], dev_cs['NN_embedded'])[0], 3))

0.776 0.484


In [247]:
# Kendall's Tau for training and development set
print(round(kendalltau(train_cs['z-score'], train_cs['NN_embedded'])[0], 3),round(kendalltau(dev_cs['z-score'], dev_cs['NN_embedded'])[0], 3))

0.548 0.321


<b> ru-en

In [46]:
# Pearson for training and development set
print(round(pearsonr(train_ru['z-score'], train_ru['NN_embedded'])[0], 3), round(pearsonr(dev_ru['z-score'], dev_ru['NN_embedded'])[0], 3))

0.653 0.274


In [47]:
# Kendall's Tau for training and development set
print(round(kendalltau(train_ru['z-score'], train_ru['NN_embedded'])[0], 3),round(kendalltau(dev_ru['z-score'], dev_ru['NN_embedded'])[0], 3))

0.436 0.193


<b> zh-en

In [48]:
# Pearson for training and development set
print(round(pearsonr(train_zhen['z-score'], train_zhen['NN_embedded'])[0], 3), round(pearsonr(dev_zhen['z-score'], dev_zhen['NN_embedded'])[0], 3))

0.606 0.35


In [49]:
# Kendall's Tau for training and development set
print(round(kendalltau(train_zhen['z-score'], train_zhen['NN_embedded'])[0], 3),round(kendalltau(dev_zhen['z-score'], dev_zhen['NN_embedded'])[0], 3))

0.418 0.239


<b> en-fi

In [50]:
# Pearson for training and development set
print(round(pearsonr(train_fi['z-score'], train_fi['NN_embedded'])[0], 3), round(pearsonr(dev_fi['z-score'], dev_fi['NN_embedded'])[0], 3))

0.785 0.392


In [51]:
# Kendall's Tau for training and development set
print(round(kendalltau(train_fi['z-score'], train_fi['NN_embedded'])[0], 3),round(kendalltau(dev_fi['z-score'], dev_fi['NN_embedded'])[0], 3))

0.585 0.266


<b> en-zh

In [52]:
# Pearson for training and development set
print(round(pearsonr(train_enzh['z-score'], train_enzh['NN_embedded'])[0], 3), round(pearsonr(dev_enzh['z-score'], dev_enzh['NN_embedded'])[0], 3))

0.7 0.439


In [53]:
# Kendall's Tau for training and development set
print(round(kendalltau(train_enzh['z-score'], train_enzh['NN_embedded'])[0], 3),round(kendalltau(dev_enzh['z-score'], dev_enzh['NN_embedded'])[0], 3))

0.498 0.285


<div class="alert alert-block alert-warning">

### For Deep Learning with word embeddings:
    
</div>

<b> cs-en

In [88]:
# Pearson for training and development set
print(round(pearsonr(train_cs['z-score'], train_cs['DL_embed'])[0], 3), round(pearsonr(dev_cs['z-score'], dev_cs['DL_embed'])[0], 3))

0.878 0.509


In [89]:
# Kendall's Tau for training and development set
print(round(kendalltau(train_cs['z-score'], train_cs['DL_embed'])[0], 3),round(kendalltau(dev_cs['z-score'], dev_cs['DL_embed'])[0], 3))

0.667 0.348


<b> ru-en

In [46]:
# Pearson for training and development set
print(round(pearsonr(train_ru['z-score'], train_ru['DL_embed'])[0], 3), round(pearsonr(dev_ru['z-score'], dev_ru['DL_embed'])[0], 3))

0.653 0.274


In [47]:
# Kendall's Tau for training and development set
print(round(kendalltau(train_ru['z-score'], train_ru['DL_embed'])[0], 3),round(kendalltau(dev_ru['z-score'], dev_ru['DL_embed'])[0], 3))

0.436 0.193


<b> zh-en

In [93]:
# Pearson for training and development set
print(round(pearsonr(train_zhen['z-score'], train_zhen['DL_embed'])[0], 3), round(pearsonr(dev_zhen['z-score'], dev_zhen['DL_embed'])[0], 3))

0.693 0.34


In [94]:
# Kendall's Tau for training and development set
print(round(kendalltau(train_zhen['z-score'], train_zhen['DL_embed'])[0], 3),round(kendalltau(dev_zhen['z-score'], dev_zhen['DL_embed'])[0], 3))

0.481 0.236


<b> de-en

In [36]:
# Pearson for training and development set
print(round(pearsonr(train['z-score'], train['DL_embed'])[0], 3), round(pearsonr(dev['z-score'], dev['DL_embed'])[0], 3))

0.704 0.342


In [37]:
# Kendall's Tau for training and development set
print(round(kendalltau(train['z-score'], train['DL_embed'])[0], 3),round(kendalltau(dev['z-score'], dev['DL_embed'])[0], 3))

0.474 0.238


<b> en-fi

In [66]:
# Pearson for training and development set
print(round(pearsonr(train_fi['z-score'], train_fi['DL_embed'])[0], 3), round(pearsonr(dev_fi['z-score'], dev_fi['DL_embed'])[0], 3))

0.91 0.422


In [67]:
# Kendall's Tau for training and development set
print(round(kendalltau(train_fi['z-score'], train_fi['DL_embed'])[0], 3),round(kendalltau(dev_fi['z-score'], dev_fi['DL_embed'])[0], 3))

0.738 0.288


<b> en-zh

In [80]:
# Pearson for training and development set
print(round(pearsonr(train_enzh['z-score'], train_enzh['DL_embed'])[0], 3), round(pearsonr(dev_enzh['z-score'], dev_enzh['DL_embed'])[0], 3))

0.783 0.452


In [81]:
# Kendall's Tau for training and development set
print(round(kendalltau(train_enzh['z-score'], train_enzh['DL_embed'])[0], 3),round(kendalltau(dev_enzh['z-score'], dev_enzh['DL_embed'])[0], 3))

0.576 0.298


<div class="alert alert-block alert-warning">

### For Ensemble with past metrics:
    
</div>

<b> English only

In [88]:
# Pearson for training and development set
print(round(pearsonr(train['z-score'], train['grad_boost'])[0], 3), round(pearsonr(dev['z-score'], dev['grad_boost'])[0], 3))

0.397 0.388


In [89]:
# Kendall's Tau for training and development set
print(round(kendalltau(train['z-score'], train['grad_boost'])[0], 3),round(kendalltau(dev['z-score'], dev['grad_boost'])[0], 3))

0.261 0.263


<b> Finnish

In [101]:
# Pearson for training and development set
print(round(pearsonr(train['z-score'], train['grad_boost'])[0], 3), round(pearsonr(dev['z-score'], dev['grad_boost'])[0], 3))

0.681 0.62


In [102]:
# Kendall's Tau for training and development set
print(round(kendalltau(train['z-score'], train['grad_boost'])[0], 3),round(kendalltau(dev['z-score'], dev['grad_boost'])[0], 3))

0.457 0.398


<b> Chinese

In [114]:
# Pearson for training and development set
print(round(pearsonr(train['z-score'], train['grad_boost'])[0], 3), round(pearsonr(dev['z-score'], dev['grad_boost'])[0], 3))

0.542 0.494


In [115]:
# Kendall's Tau for training and development set
print(round(kendalltau(train['z-score'], train['grad_boost'])[0], 3),round(kendalltau(dev['z-score'], dev['grad_boost'])[0], 3))

0.362 0.331


<b> Czech

In [127]:
# Pearson for training and development set
print(round(pearsonr(train['z-score'], train['grad_boost'])[0], 3), round(pearsonr(dev['z-score'], dev['grad_boost'])[0], 3))

0.554 0.454


In [128]:
# Kendall's Tau for training and development set
print(round(kendalltau(train['z-score'], train['grad_boost'])[0], 3),round(kendalltau(dev['z-score'], dev['grad_boost'])[0], 3))

0.362 0.314


<b> Russian

In [146]:
# Pearson for training and development set
print(round(pearsonr(train['z-score'], train['grad_boost'])[0], 3), round(pearsonr(dev['z-score'], dev['grad_boost'])[0], 3))

0.448 0.359


In [147]:
# Kendall's Tau for training and development set
print(round(kendalltau(train['z-score'], train['grad_boost'])[0], 3),round(kendalltau(dev['z-score'], dev['grad_boost'])[0], 3))

0.283 0.249


<b> Chinese - eng translations

In [159]:
# Pearson for training and development set
print(round(pearsonr(train['z-score'], train['grad_boost'])[0], 3), round(pearsonr(dev['z-score'], dev['grad_boost'])[0], 3))

0.422 0.367


In [160]:
# Kendall's Tau for training and development set
print(round(kendalltau(train['z-score'], train['grad_boost'])[0], 3),round(kendalltau(dev['z-score'], dev['grad_boost'])[0], 3))

0.268 0.241


<b> German

In [74]:
# Pearson for training and development set
print(round(pearsonr(train['z-score'], train['grad_boost'])[0], 3), round(pearsonr(dev['z-score'], dev['grad_boost'])[0], 3))

0.419 0.404


In [75]:
# Kendall's Tau for training and development set
print(round(kendalltau(train['z-score'], train['grad_boost'])[0], 3),round(kendalltau(dev['z-score'], dev['grad_boost'])[0], 3))

0.277 0.273


<div class="alert alert-block alert-warning">

## 9. Applying it to test set</font> <a class="anchor" id="test"></a>

  [Back to introduction](#title)
  
</div>

In [34]:
# Importing the test corpora already with the the MT metrics used in the ensemble model -> retrieved from the TM Project - Test Set Preparation
en_fi, cs_en, en_zh, ru_en, zh_en, de_en = pd.read_csv("en_fi_test.csv"), pd.read_csv("cs_en_test.csv"), pd.read_csv("en_zh_test.csv"), pd.read_csv("ru_en_test.csv"), pd.read_csv("zh_en_test.csv"), pd.read_csv("de_en_test.csv")

In [35]:
# Joining all corpora
test_set = pd.concat([cs_en, ru_en, zh_en, de_en, en_fi, en_zh])
test_set.reset_index(inplace=True, drop=True)

In [19]:
# Defining the X variables as the same metrics used in the training of the Ensemble model
X_test = test_set[['bleu','chrf','gleu', 'meteor', 'ter_1']]

In [37]:
pred_test = modelGB.predict(X_test)

In [38]:
len(en_fi), len(cs_en), len(en_zh), len(ru_en), len(zh_en), len(de_en)

(8097, 8732, 22128, 13157, 25352, 28404)

In [39]:
test_set['metric_scores'] = pred_test

In [292]:
test_set[(21889-13157):21889]

Unnamed: 0           0
source               0
reference            0
translation          0
clean_translation    0
clean_reference      0
bleu                 0
ter_1                0
chrf                 0
meteor               0
gleu                 0
metric               0
dtype: int64

In [377]:
# Splitting the joined test set in separate sets to later join in each language-pair corpus

cs_en_test = test_set[:8732]

en_zh_test = test_set[-22128:]

en_fi_test = test_set[(83742-8097):83742]

de_en_test = test_set[(75645-28404):75645]

zh_en_test = test_set[(47241-25352):47241]

ru_en_test = test_set[(21889-13157):21889]

In [383]:
# Keeping only the columns: Source, Reference, Translation and Metric Scores (the predictions of the z-score)

cs_en_test = cs_en_test[['source','reference','translation','metric_scores']]
ru_en_test = ru_en_test[['source','reference','translation','metric_scores']]
zh_en_test = zh_en_test[['source','reference','translation','metric_scores']]
de_en_test = de_en_test[['source','reference','translation','metric_scores']]
en_fi_test = en_fi_test[['source','reference','translation','metric_scores']]
en_zh_test = en_zh_test[['source','reference','translation','metric_scores']]

In [384]:
# Dropping the index to have a cleaner aspect

cs_en_test.reset_index(inplace=True, drop=True)
en_zh_test.reset_index(inplace=True, drop=True)
zh_en_test.reset_index(inplace=True, drop=True)
de_en_test.reset_index(inplace=True, drop=True)
en_fi_test.reset_index(inplace=True, drop=True)
ru_en_test.reset_index(inplace=True, drop=True)

Saving the final corpora already with the predictions into the resulting csv's

In [385]:
cs_en_test.to_csv('corpus/testset/cs-en/scores.csv')
en_zh_test.to_csv('corpus/testset/en-zh/scores.csv')
zh_en_test.to_csv('corpus/testset/zh-en/scores.csv')
de_en_test.to_csv('corpus/testset/de-en/scores.csv')
en_fi_test.to_csv('corpus/testset/en-fi/scores.csv')
ru_en_test.to_csv('corpus/testset/ru-en/scores.csv')