In [1]:
import os
import sys
import pickle
import numpy as np

# Evaluation Metrics
Similar to classical classification task, we can evaluate our model performance using F-score, precision and recall. 

In the field of Natural Language Processing (NLP), it is common to use BLEU and ROUGE to measure precision and recall. 

## What is BLEU?
BLEU (BiLingual Evaluation Understudy) stands for  measures how well a candidate translation matches a set of reference translations by counting the percentage of n-grams in the candidate translation overlapping with the references. BLEU was first introduced in Papineni et. al. (2001).

## What is ROUGE?
ROUGE stands for Recall-Oriented Understudy for Gisting Evaluation. It comes with mainly two metrics, ROUGE-N and ROUGE-L. 

ROUGE-N is a recall-related measure because the denominator of the equation is the total sum of the number of n-grams occurring at the reference summary side. 

ROUGE-N: Overlap of N-grams[2] between the system and reference summaries.
- ROUGE-1 refers to the overlap of 1-gram (each word) between the system and reference summaries.
- ROUGE-2 refers to the overlap of bigrams between the system and reference summaries.

ROUGE-L: Longest Common Subsequence (LCS)[3] based statistics. Longest common subsequence problem takes into account sentence level structure similarity naturally and identifies longest co-occurring in sequence n-grams automatically.

Rouge applies in cases with multiple reference summary, however, because we have only one ground truth (i.e., one title), we will simplify the definition of rouge as following:

Recall (ROUGE) = $\frac{Count_{match}(gram_n)}{Count(gram_n)}$

n stands for the length of the n-gram ($gram_n$), and $Count_{match}(gram_n)$ is the maximum number of n-grams co-occurring in a candidate summary and a set of reference summaries"

## How do we calculate F-score?
$F_1 = 2 * \frac{Precision * Recall}{Precision + Recall}$

--- 

In [2]:
#Relative import of rouge_evaluation
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
    
from myeval import rouge_evaluation

# BLEU
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

In [3]:
# Import data
with open("glove_self_trained/predictions_base",'rb') as f:
    basePred = pickle.load(f)
with open("glove_self_trained/predictions_attention",'rb') as f:
    attentionPred = pickle.load(f)

# kNN
knn_pred = np.load("kNN/baseline_generated.npy")
knn_truth = np.load("kNN/baseline_true.npy")

# word2vec
with open("word2vec/predictions_base",'rb') as f:
    word2vec_basePred = pickle.load(f)
# with open("word2vec/predictions_attention",'rb') as f:
#     word2vec_attnPred = pickle.load(f)

In [4]:
# Recall
def one_gram_recall(truth, pred):
    truth2 = list(set(truth.split())) # get unique words in truth
    pred2 = list(set(pred.split())) # get unique words in predicted title
    return np.sum([word in pred2 for word in truth2])/len(truth.split())

def ngrams(txt_input, n):
    txt_input = txt_input.split(' ')
    output = []
    for i in range(len(txt_input)-n+1):
        output.append(txt_input[i:i+n])
    return output

def two_gram_recall(truth, pred, n = 2):
    '''
    truth = basePred['Truth'][i]
    pred = basePred[model][i]
    two_gram_recall(truth_2gram, pred_2gram)
    '''
    truth_2gram = ngrams(truth, n)
    pred_2gram = ngrams(pred, n)
    return np.sum([gram in truth_2gram for gram in pred_2gram])/len(truth.split())

$Recall_{\text{1gram}} = \frac{\text{# of words overlap between predicted and true title}}{\text{# of words in true title}} $

In [25]:
i = 4
print("True title:", basePred['Truth'][i])
print("Predicted title:", basePred['Greedy'][i])

True title: learning shuffle ideals under restricted distributions
Predicted title: learning on the complexity of the presence of barn


In [26]:
ngrams(basePred['Truth'][i],2)

[['learning', 'shuffle'],
 ['shuffle', 'ideals'],
 ['ideals', 'under'],
 ['under', 'restricted'],
 ['restricted', 'distributions']]

In [27]:
ngrams(basePred['Greedy'][i], 2)

[['learning', 'on'],
 ['on', 'the'],
 ['the', 'complexity'],
 ['complexity', 'of'],
 ['of', 'the'],
 ['the', 'presence'],
 ['presence', 'of'],
 ['of', 'barn']]

## Baseline kNN

In [145]:
models = ["kNN"]
metrics = ['rouge1', 'rouge2', 'bleu']
kNN_eval_dict = {model: {metric: [] for metric in metrics} for model in models}
# true = "Pavlos likes cross validation"
# pred = "Pavlos likes cheese"
        
for i in range(len(knn_pred)):
    for model in models:
#         rouge1_res = rouge_evaluation(basePred[model][i], basePred['Truth'][i])[2]
#         rouge2_res = rouge_evaluation(basePred[model][i], basePred['Truth'][i], method="rouge-2")[2]
#         rougeL_res = rouge_evaluation(basePred[model][i], basePred['Truth'][i], method="rouge-l")[2]
        rouge1_res = one_gram_recall(knn_truth[i], knn_pred[i])
        rouge2_res = two_gram_recall(knn_truth[i], knn_pred[i])
        bleu = sentence_bleu(knn_truth[i], knn_pred[i], smoothing_function=SmoothingFunction().method3)

        kNN_eval_dict[model]['rouge1'].append(rouge1_res)
        kNN_eval_dict[model]['rouge2'].append(rouge2_res)
        kNN_eval_dict[model]['bleu'].append(bleu)

    

In [148]:
print(np.mean(kNN_eval_dict['kNN']['rouge1']))
print(np.mean(kNN_eval_dict['kNN']['rouge2']))
print(np.mean(kNN_eval_dict['kNN']['bleu']))

kNN_f1 = 2 * (np.mean(kNN_eval_dict['kNN']['rouge1']) * np.mean(kNN_eval_dict['kNN']['rouge1']))/(np.mean(kNN_eval_dict['kNN']['rouge1']) + np.mean(kNN_eval_dict['kNN']['rouge1']))
kNN_f1


0.17277510273913035
0.045383005228656854
0.013455188123427974


0.17277510273913035

## Glove: self-trained embeddings

In [131]:
models = ["Greedy", "Non-Greedy"]
metrics = ['rouge1', 'rouge2', 'bleu']
baseline_eval_dict = {model: {metric: [] for metric in metrics} for model in models}
# true = "Pavlos likes cross validation"
# pred = "Pavlos likes cheese"
        
for i in range(len(basePred['Truth'])):
    for model in models:
#         rouge1_res = rouge_evaluation(basePred[model][i], basePred['Truth'][i])[2]
#         rouge2_res = rouge_evaluation(basePred[model][i], basePred['Truth'][i], method="rouge-2")[2]
#         rougeL_res = rouge_evaluation(basePred[model][i], basePred['Truth'][i], method="rouge-l")[2]
        rouge1_res = one_gram_recall(basePred['Truth'][i], basePred[model][i])
        rouge2_res = two_gram_recall(basePred['Truth'][i], basePred[model][i])
        bleu = sentence_bleu(basePred['Truth'][i], basePred[model][i], smoothing_function=SmoothingFunction().method3)
        
        baseline_eval_dict[model]['rouge1'].append(rouge1_res)
        baseline_eval_dict[model]['rouge2'].append(rouge2_res)
        baseline_eval_dict[model]['bleu'].append(bleu)

attention_eval_dict = {model: {metric: [] for metric in metrics} for model in models}

for i in range(len(basePred['Truth'])):
    for model in models:
        rouge1_res = one_gram_recall(attentionPred['Truth'][i], attentionPred[model][i])
        rouge2_res = two_gram_recall(attentionPred['Truth'][i], attentionPred[model][i])
        bleu = sentence_bleu(attentionPred['Truth'][i], attentionPred[model][i], smoothing_function=SmoothingFunction().method3)
        
        attention_eval_dict[model]['rouge1'].append(rouge1_res)
        attention_eval_dict[model]['rouge2'].append(rouge2_res)
        attention_eval_dict[model]['bleu'].append(bleu)
    

In [132]:
# Greedy
np.mean(baseline_eval_dict['Greedy']['rouge1'])

0.09841122507197558

In [133]:
np.mean(baseline_eval_dict['Greedy']['bleu'])

0.013718839998325369

In [138]:
greedy_f1 = 2 * (np.mean(baseline_eval_dict['Greedy']['rouge1']) * np.mean(baseline_eval_dict['Greedy']['bleu']))/(np.mean(baseline_eval_dict['Greedy']['rouge1']) + np.mean(baseline_eval_dict['Greedy']['bleu']))
greedy_f1

0.024080746764129128

In [135]:
np.mean(baseline_eval_dict['Non-Greedy']['rouge1'])

0.1214838735962595

In [136]:
np.mean(baseline_eval_dict['Non-Greedy']['bleu'])

0.010275063935989353

In [139]:
nongreedy_f1 = 2 * (np.mean(baseline_eval_dict['Non-Greedy']['rouge1']) * np.mean(baseline_eval_dict['Non-Greedy']['bleu']))/(np.mean(baseline_eval_dict['Non-Greedy']['rouge1']) + np.mean(baseline_eval_dict['Non-Greedy']['bleu']))
nongreedy_f1

0.018947550606769226

In [155]:
# Attention Glove
# Greedy
print(np.mean(attention_eval_dict['Greedy']['rouge1']))

print(np.mean(attention_eval_dict['Greedy']['bleu']))

greedy_f1 = 2 * (np.mean(attention_eval_dict['Greedy']['rouge1']) * np.mean(attention_eval_dict['Greedy']['bleu']))/(np.mean(attention_eval_dict['Greedy']['rouge1']) + np.mean(attention_eval_dict['Greedy']['bleu']))
print(greedy_f1)

# Non-greedy
print(np.mean(attention_eval_dict['Non-Greedy']['rouge1']))
print(np.mean(attention_eval_dict['Non-Greedy']['bleu']))
nongreedy_f1 = 2 * (np.mean(attention_eval_dict['Non-Greedy']['rouge1']) * np.mean(attention_eval_dict['Non-Greedy']['bleu']))/(np.mean(attention_eval_dict['Non-Greedy']['rouge1']) + np.mean(attention_eval_dict['Non-Greedy']['bleu']))
print(nongreedy_f1)

0.10630318028264275
0.011577459224898126
0.020880795020128157
0.1294049162924353
0.010968986443292517
0.020223713166676118


## Word2Vec embedding

In [151]:
models = ["Greedy", "Non-Greedy"]
metrics = ['rouge1', 'rouge2', 'bleu']
word2vec_eval_dict = {model: {metric: [] for metric in metrics} for model in models}

for i in range(len(word2vec_basePred['Truth'])):
    for model in models:
#         rouge1_res = rouge_evaluation(word2vec_basePred[model][i], word2vec_basePred['Truth'][i])[2]
#         rouge2_res = rouge_evaluation(word2vec_basePred[model][i], word2vec_basePred['Truth'][i], method="rouge-2")[2]
#         rougeL_res = rouge_evaluation(word2vec_basePred[model][i], word2vec_basePred['Truth'][i], method="rouge-l")[2]
        rouge1_res = one_gram_recall(word2vec_basePred['Truth'][i], word2vec_basePred[model][i])
        rouge2_res = two_gram_recall(word2vec_basePred['Truth'][i], word2vec_basePred[model][i])
        bleu = sentence_bleu(word2vec_basePred['Truth'][i], word2vec_basePred[model][i], smoothing_function=SmoothingFunction().method3)
        
        word2vec_eval_dict[model]['rouge1'].append(rouge1_res)
        word2vec_eval_dict[model]['rouge2'].append(rouge2_res)
        word2vec_eval_dict[model]['bleu'].append(bleu)

# # attention
# word2vec_attn_eval_dict = {model: {metric: [] for metric in metrics} for model in models}
# for i in range(len(word2vec_attentionPred['Truth'])):
#     for model in models:
#         rouge1_res = one_gram_recall(word2vec_attentionPred['Truth'][i], word2vec_attentionPred[model][i])
#         rouge2_res = two_gram_recall(word2vec_attentionPred['Truth'][i], word2vec_attentionPred[model][i])
#         bleu = sentence_bleu(word2vec_attentionPred['Truth'][i], word2vec_attentionPred[model][i], smoothing_function=SmoothingFunction().method3)
        
#         word2vec_attn_eval_dict[model]['rouge1'].append(rouge1_res)
#         word2vec_attn_eval_dict[model]['rouge2'].append(rouge2_res)
#         word2vec_attn_eval_dict[model]['bleu'].append(bleu)

In [154]:
# Greedy
print(np.mean(word2vec_eval_dict['Greedy']['rouge1']))

print(np.mean(word2vec_eval_dict['Greedy']['bleu']))

greedy_f1 = 2 * (np.mean(word2vec_eval_dict['Greedy']['rouge1']) * np.mean(word2vec_eval_dict['Greedy']['bleu']))/(np.mean(word2vec_eval_dict['Greedy']['rouge1']) + np.mean(word2vec_eval_dict['Greedy']['bleu']))
print(greedy_f1)

# Non-greedy
print(np.mean(word2vec_eval_dict['Non-Greedy']['rouge1']))
print(np.mean(word2vec_eval_dict['Non-Greedy']['bleu']))
nongreedy_f1 = 2 * (np.mean(word2vec_eval_dict['Non-Greedy']['rouge1']) * np.mean(word2vec_eval_dict['Non-Greedy']['bleu']))/(np.mean(word2vec_eval_dict['Non-Greedy']['rouge1']) + np.mean(word2vec_eval_dict['Non-Greedy']['bleu']))
print(nongreedy_f1)

0.12605377168400114
0.013500808908447787
0.024389423499681085
0.12221925997360202
0.010770771719775033
0.019796908567863168


In [None]:
# Attention word2vec
# Greedy
print(np.mean(word2vec_attn_eval_dict['Greedy']['rouge1']))

print(np.mean(word2vec_attn_eval_dict['Greedy']['bleu']))

greedy_f1 = 2 * (np.mean(word2vec_attn_eval_dict['Greedy']['rouge1']) * np.mean(word2vec_attn_eval_dict['Greedy']['bleu']))/(np.mean(word2vec_attn_eval_dict['Greedy']['rouge1']) + np.mean(word2vec_attn_eval_dict['Greedy']['bleu']))
print(greedy_f1)

# Non-greedy
print(np.mean(word2vec_attn_eval_dict['Non-Greedy']['rouge1']))
print(np.mean(word2vec_attn_eval_dict['Non-Greedy']['bleu']))
nongreedy_f1 = 2 * (np.mean(word2vec_attn_eval_dict['Non-Greedy']['rouge1']) * np.mean(word2vec_attn_eval_dict['Non-Greedy']['bleu']))/(np.mean(word2vec_attn_eval_dict['Non-Greedy']['rouge1']) + np.mean(word2vec_attn_eval_dict['Non-Greedy']['bleu']))
print(nongreedy_f1)