# Evaluation notebook 

In [1]:

from pprint import pprint
import os
import json
import pandas as pandas

In [2]:
def read_file(file_path):
    '''
    Read a UTF-8 text file and return its lines as a list.

    Line terminators are kept on each element, exactly as produced by
    ``readlines()``.
    '''
    with open(file_path, mode='r', encoding="utf8") as handle:
        lines = handle.readlines()
    return lines

def read_file_as_text(file_path):
    '''
    Read a UTF-8 text file and return its whole content as one string.
    '''
    with open(file_path, mode='r', encoding="utf8") as handle:
        content = handle.read()
    return content

def read_reference_file_for_bleu_metric(path):
    '''
    Read a UTF-8 reference file and return one list of strings per line.

    Trailing commas and line terminators are stripped from each line, and the
    remainder is split on ",". A line without inner commas therefore becomes a
    one-item list; a line with inner commas yields several items.
    '''
    references = []
    with open(path, mode='r', encoding="utf8") as handle:
        for raw_line in handle:
            fields = raw_line.rstrip(",\r\n").split(",")
            references.append([str(field) for field in fields])
    return references

In [3]:
# Pipeline identifiers. They are used as the per-article sub-folder name and as
# the prefix of the saved score files.
# p1: first pipeline (Simplify & Summarize); p2: second pipeline (Summarize & Simplify).
p1 = 'simplify_summary'
p2 = 'summary_simplify'

In [4]:
def calculate_metrics_for_dataset(metric, pipeline=p1, data_dir='data/'):
    '''
    Score every processed article with ``metric`` and save the results as JSON.

    Parameters
    ----------
    metric : callable
        Called as ``metric(simplified_summary_path, ground_truth_path)``. Its
        return value must be JSON-serialisable. ``metric.__name__`` is used in
        the progress message and in the output file name.
    pipeline : str
        Name of the per-article sub-folder that holds ``simplified_summary.txt``.
    data_dir : str
        Root data folder. It must contain ``processed_articles.txt`` (one
        article folder name per line) and the ``wiki-auto`` folder. The score
        file ``<pipeline>_<metric name>`` is written into it.

    Returns
    -------
    dict
        Maps article folder name to the metric result. Articles without a
        ``simplified_summary.txt`` for the pipeline are skipped.
    '''
    evaluation_dataset = dict()
    metric_name = metric.__name__
    wiki_base_dir = os.path.join(data_dir, 'wiki-auto')

    # Resolve the article list relative to data_dir (it used to be hard-coded to
    # 'data/', so a custom data_dir was silently ignored here).
    articles_file = os.path.join(data_dir, 'processed_articles.txt')
    # Drop blank entries (e.g. from a trailing newline) and stray whitespace/'\r',
    # which would otherwise become empty or invalid folder names.
    processed_articles = [
        name.strip()
        for name in read_file_as_text(articles_file).split('\n')
        if name.strip()
    ]

    # base data/wiki-auto
    for folder in processed_articles:
        base_data_dir = os.path.join(wiki_base_dir, folder)
        ground_truth_file_path = os.path.join(base_data_dir, 'destination.txt')
        pipeline_dir = os.path.join(base_data_dir, pipeline)
        simplified_summary_file_path = os.path.join(pipeline_dir, 'simplified_summary.txt')
        if os.path.exists(simplified_summary_file_path):
            evaluation_dataset[folder] = metric(simplified_summary_file_path, ground_truth_file_path)
            # '\r' keeps the progress message on a single, continuously updated line
            print(f'Done generating {metric_name} scores for {folder}\r', end='', flush=True)

    # os.path.join works whether or not data_dir ends with a path separator
    output_file_path = os.path.join(data_dir, f'{pipeline}_{metric_name}')
    with open(output_file_path, 'w', encoding="utf8") as outputfile:
        json.dump(evaluation_dataset, outputfile)
    print('\nDone')
    return evaluation_dataset

## Lexical similarity based scores - ROUGE

### ROUGE - ROUGE-1, ROUGE-2, ROUGE-L and Google ROUGE

In [None]:
!pip install rouge
!pip install pandas
!pip install rouge-score

In [5]:
from rouge_score import rouge_scorer
from rouge import Rouge

In [6]:
def rouge_scores(hyp_file_path, ref_file_path):
    '''
    Score a hypothesis file against a reference file with the ``rouge`` package.

    Both files are read completely and passed to ``Rouge.get_scores`` as single
    strings with ``avg=True``; the value returned by that call is returned
    unchanged.
    '''
    scorer = Rouge()
    hypothesis_text = read_file_as_text(hyp_file_path)
    reference_text = read_file_as_text(ref_file_path)
    return scorer.get_scores(hypothesis_text, reference_text, avg=True)

# pprint(rouge_scores('data/wiki-auto/25/simplify_summary/simplified_summary.txt','data/wiki-auto/25/destination.txt'))
# rouge_scores('data/wiki-auto/25/summary_simplify/simplified_summary.txt','data/wiki-auto/25/destination.txt')


def convert_google_rouge_format_to_usable_format(scores):
    '''
    Convert a Google ``rouge_score`` result into plain nested dicts.

    ``scores`` maps 'rouge1' / 'rouge2' / 'rougeL' / 'rougeLsum' to objects
    exposing ``precision``, ``recall`` and ``fmeasure``. The result uses the keys
    'rouge-1', 'rouge-2', 'rouge-l' and 'rouge-lsum', each mapping to a dict with
    'p', 'r' and 'f'. Metrics missing from ``scores`` stay at 0 and unknown keys
    are ignored.
    '''
    renamed_keys = {
        'rouge1': 'rouge-1',
        'rouge2': 'rouge-2',
        'rougeL': 'rouge-l',
        'rougeLsum': 'rouge-lsum',
    }
    # All-zero placeholders so that every metric is always present in the result
    converted = {target: {"r": 0, "p": 0, "f": 0} for target in renamed_keys.values()}
    for key in scores:
        target = renamed_keys.get(key)
        if target is None:
            continue
        entry = scores[key]
        converted[target] = {"p": entry.precision, "r": entry.recall, "f": entry.fmeasure}
    return converted

def google_rouge_scores(hyp_file_path, ref_file_path):
    '''
    Score a hypothesis file against a reference file with Google's ``rouge_score``.

    ROUGE-1, ROUGE-2, ROUGE-L and ROUGE-Lsum are computed with stemming enabled on
    the full text of both files. The result is converted with
    ``convert_google_rouge_format_to_usable_format``.
    '''
    rouge_types = ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
    scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
    hypothesis_text = read_file_as_text(hyp_file_path)
    reference_text = read_file_as_text(ref_file_path)
    # The reference text is passed first and the hypothesis second
    raw_scores = scorer.score(reference_text, hypothesis_text)
    return convert_google_rouge_format_to_usable_format(raw_scores)

In [7]:
# Smoke test: Google ROUGE for a single article (folder 25), first pipeline
pprint(google_rouge_scores('data/wiki-auto/25/simplify_summary/simplified_summary.txt','data/wiki-auto/25/destination.txt'))
# Same article, second pipeline:
# pprint(google_rouge_scores('data/wiki-auto/25/summary_simplify/simplified_summary.txt','data/wiki-auto/25/destination.txt'))

{'rouge-1': {'f': 0.5666280417149478,
             'p': 0.6680327868852459,
             'r': 0.4919517102615694},
 'rouge-2': {'f': 0.1654592871631411,
             'p': 0.1950802869832593,
             'r': 0.14364779874213837},
 'rouge-l': {'f': 0.14629200463499423,
             'p': 0.17247267759562843,
             'r': 0.12701207243460766},
 'rouge-lsum': {'f': 0.5559096176129779,
                'p': 0.655396174863388,
                'r': 0.48264587525150904}}


### Normal ROUGE

In [9]:
%%time
# Calculate ROUGE score for the whole dataset for the first pipeline (Simplify & Summarize)
# The per-article scores are also written to a JSON file by calculate_metrics_for_dataset.
rouge_p1 = calculate_metrics_for_dataset(rouge_scores, pipeline=p1)

CPU times: user 10min 5s, sys: 540 ms, total: 10min 6s
Wall time: 10min 6s


In [10]:
%%time
# Calculate ROUGE score for the whole dataset for the second pipeline (Summarize & Simplify)
# The per-article scores are also written to a JSON file by calculate_metrics_for_dataset.
rouge_p2 = calculate_metrics_for_dataset(rouge_scores, pipeline=p2)

CPU times: user 9min 55s, sys: 724 ms, total: 9min 56s
Wall time: 9min 57s


### Google ROUGE

In [12]:
%%time
# Calculate Google ROUGE score for the whole dataset for the first pipeline (Simplify & Summarize)
# The per-article scores are also written to a JSON file by calculate_metrics_for_dataset.
google_rouge_p1 = calculate_metrics_for_dataset(google_rouge_scores, pipeline=p1)

CPU times: user 11min 5s, sys: 3.72 s, total: 11min 9s
Wall time: 11min 10s


In [13]:
%%time
# Calculate Google ROUGE score for the whole dataset for the second pipeline (Summarize & Simplify)
# The per-article scores are also written to a JSON file by calculate_metrics_for_dataset.
google_rouge_p2 = calculate_metrics_for_dataset(google_rouge_scores, pipeline=p2)

CPU times: user 10min 18s, sys: 2.73 s, total: 10min 21s
Wall time: 10min 22s


### Calculate average values for whole dataset

The next cell loads previously saved scores from file; use it when the evaluation cells above have not been run in this session.

In [35]:
%%time
with open('data/simplify_summary_rouge_scores', 'r', encoding="utf8") as f:
    rouge_p1 = json.load(f)

with open('data/summary_simplify_rouge_scores', 'r', encoding="utf8") as f:
    rouge_p2 = json.load(f)

with open('data/summary_simplify_google_rouge_scores', 'r', encoding="utf8") as f:
    google_rouge_p1 = json.load(f)

with open('data/summary_simplify_google_rouge_scores', 'r', encoding="utf8") as f:
    google_rouge_p2 = json.load(f)

Wall time: 18 ms


In [49]:
def calculate_avg_rouge_stats(rouge_dataframe, isGoogle=False):
    '''
    Average the per-article ROUGE scores over the whole dataset.

    ``rouge_dataframe`` maps an article id to a dict of ROUGE metrics, each
    holding 'r', 'p' and 'f'. The result maps 'rouge-1', 'rouge-2' and 'rouge-l'
    (plus 'rouge-lsum' when ``isGoogle`` is true) to the mean 'r', 'p' and 'f',
    rounded to 3 decimals.
    '''
    metric_names = ['rouge-1', 'rouge-2', 'rouge-l'] + (['rouge-lsum'] if isGoogle else [])
    n_articles = len(rouge_dataframe)
    per_article = rouge_dataframe.values()
    averages = {}
    for name in metric_names:
        averages[name] = {
            part: round(sum(scores[name][part] for scores in per_article) / n_articles, 3)
            for part in ('r', 'p', 'f')
        }
    return averages

In [58]:
# Dataset-wide averages of the per-article `rouge` package scores, one block per pipeline
print("The ROUGE-1,ROUGE-2 and ROUGE-l score for P1 - Simplify & Summary : ")
pprint(calculate_avg_rouge_stats(rouge_p1))

print("The ROUGE-1,ROUGE-2 and ROUGE-l score for P2 - Summary & Simplify : ")
pprint(calculate_avg_rouge_stats(rouge_p2))

The ROUGE-1,ROUGE-2 and ROUGE-l score for P1 - Simplify & Summary : 
{'rouge-1': {'f': 0.309, 'p': 0.344, 'r': 0.297},
 'rouge-2': {'f': 0.129, 'p': 0.145, 'r': 0.125},
 'rouge-l': {'f': 0.292, 'p': 0.326, 'r': 0.28}}
The ROUGE-1,ROUGE-2 and ROUGE-l score for P2 - Summary & Simplify : 
{'rouge-1': {'f': 0.309, 'p': 0.352, 'r': 0.291},
 'rouge-2': {'f': 0.13, 'p': 0.151, 'r': 0.123},
 'rouge-l': {'f': 0.293, 'p': 0.333, 'r': 0.275}}


In [57]:
# Dataset-wide averages of the per-article Google ROUGE scores; isGoogle=True adds 'rouge-lsum'
print("The Google ROUGE-1, ROUGE-2, ROUGE-l and ROUGE-Lsum score for P1 - Simplify & Summary : ")
pprint(calculate_avg_rouge_stats(google_rouge_p1, isGoogle=True))

print("The Google ROUGE-1, ROUGE-2, ROUGE-l and ROUGE-Lsum score for P2 - Summary & Simplify : ")
pprint(calculate_avg_rouge_stats(google_rouge_p2, isGoogle=True))

The Google ROUGE-1, ROUGE-2, ROUGE-l and ROUGE-Lsum score for P1 - Simplify & Summary : 
{'rouge-1': {'f': 0.509, 'p': 0.594, 'r': 0.478},
 'rouge-2': {'f': 0.228, 'p': 0.274, 'r': 0.21},
 'rouge-l': {'f': 0.272, 'p': 0.326, 'r': 0.251},
 'rouge-lsum': {'f': 0.494, 'p': 0.576, 'r': 0.464}}
The Google ROUGE-1, ROUGE-2, ROUGE-l and ROUGE-Lsum score for P2 - Summary & Simplify : 
{'rouge-1': {'f': 0.509, 'p': 0.594, 'r': 0.478},
 'rouge-2': {'f': 0.228, 'p': 0.274, 'r': 0.21},
 'rouge-l': {'f': 0.272, 'p': 0.326, 'r': 0.251},
 'rouge-lsum': {'f': 0.494, 'p': 0.576, 'r': 0.464}}


In [136]:
def get_min_max_rouge_scores(rf, isGoogle=False, stat='f'):
    '''
    Get the minimum and maximum ROUGE scores over all articles in ``rf``.

    The previous implementation computed the extrema for 'r', 'p' and 'f' in
    turn but stored each under the same key, so only the F-measure survived and
    the recall/precision extrema were unreachable. The statistic is now chosen
    explicitly; the default ('f') reproduces the old result.

    Parameters
    ----------
    rf : dict
        Maps an article id to a dict of ROUGE metrics, each holding 'r', 'p'
        and 'f' values.
    isGoogle : bool
        When true, 'rouge-lsum' is reported in addition to 'rouge-1',
        'rouge-2' and 'rouge-l'.
    stat : str
        Which value to take the extrema of: 'r' (recall), 'p' (precision) or
        'f' (F-measure, default).

    Returns
    -------
    dict
        ``{metric: {'max': float, 'min': float}}`` rounded to 5 decimals.

    Raises
    ------
    ValueError
        If ``stat`` is not one of 'r', 'p', 'f', or if ``rf`` is empty (raised
        by ``max``).
    '''
    if stat not in ('r', 'p', 'f'):
        raise ValueError(f"stat must be one of 'r', 'p' or 'f', got {stat!r}")
    cols = ['rouge-1', 'rouge-2', 'rouge-l']
    if isGoogle:
        cols.append('rouge-lsum')
    stats = {}
    for metric in cols:
        values = [x[metric][stat] for x in rf.values()]
        stats[metric] = {
            'max': round(max(values), 5),
            'min': round(min(values), 5),
        }
    return stats

In [137]:
# Min/max ROUGE scores for both pipelines and both ROUGE implementations
all_rouge_stats = {
    'p1': get_min_max_rouge_scores(rouge_p1),
    'p2': get_min_max_rouge_scores(rouge_p2),
    'google_p1': get_min_max_rouge_scores(google_rouge_p1, isGoogle=True),
    'google_p2': get_min_max_rouge_scores(google_rouge_p2, isGoogle=True),
}
pprint(all_rouge_stats)

{'google_p1': {'rouge-1': {'max': 0.81702, 'min': 0.09167},
               'rouge-2': {'max': 0.79828, 'min': 0.02473},
               'rouge-l': {'max': 0.81702, 'min': 0.055},
               'rouge-lsum': {'max': 0.81702, 'min': 0.08843}},
 'google_p2': {'rouge-1': {'max': 0.81702, 'min': 0.09167},
               'rouge-2': {'max': 0.79828, 'min': 0.02473},
               'rouge-l': {'max': 0.81702, 'min': 0.055},
               'rouge-lsum': {'max': 0.81702, 'min': 0.08843}},
 'p1': {'rouge-1': {'max': 0.64474, 'min': 0.07303},
        'rouge-2': {'max': 0.45026, 'min': 0.00685},
        'rouge-l': {'max': 0.64474, 'min': 0.0618}},
 'p2': {'rouge-1': {'max': 0.60606, 'min': 0.07386},
        'rouge-2': {'max': 0.41833, 'min': 0.00698},
        'rouge-l': {'max': 0.60606, 'min': 0.0625}}}


## Semantic similarity scores

In [None]:
!pip install jupyterlab pandas datasets matplotlib plotly scikit-learn tqdm ipywidgets 
!pip install numpy spacy textdistance fasttext 
!pip install tensorflow tensorflow_hub sentence-transformers openai
!conda install pyemd gensim

# Download the Spacy Model
!python -m spacy download en_core_web_sm

Code based on [this](https://towardsdatascience.com/semantic-textual-similarity-83b3ca4a840e) tutorial:

### Word Movers Distance (WMD)

> 💡 Word embeddings are models that encode words into numeric vectors such that similar words have vectors that are near each other in vector space.

There are several ways to generate word embeddings, the most prominent being Word2Vec, GloVe, and FastText.

Since we need to compare the similarity between texts that contain multiple words, the simplest way to go from individual word embeddings into a single sentence embedding is to calculate the element-wise average of all the word embeddings in that text. However, there is an even better approach to computing the similarity between texts directly from the word embeddings called Word Movers Distance (WMD).

[WMD](http://proceedings.mlr.press/v37/kusnerb15.pdf) is based on the concept of [Earth Movers Distance](https://www.cs.jhu.edu/~misha/Papers/Rubner98.pdf) and is the minimum distance that the word embeddings from one document need to “travel” to reach the word embeddings of the document we are comparing it to. Since each document includes multiple words, the WMD calculation needs to calculate the distances from each word to every other word. It also weights the “travel” by the term frequencies of each word. Thankfully, the [gensim](https://github.com/RaRe-Technologies/gensim) library implements this complex computation efficiently using the [Fast WMD algorithm](https://www.cs.huji.ac.il/w~werman/Papers/ICCV2009.pdf). We can easily use it with just a single line of code!

Though we can use any word embedding model with WMD, I decided to use the [FastText model](https://arxiv.org/pdf/1607.04606.pdf) pre-trained on Wikipedia primarily because FastText uses sub-word information and will never run into Out Of Vocabulary issues that Word2Vec or GloVe might encounter. Take note to preprocess the texts to remove stopwords, lower case, and lemmatize them to ensure that the WMD calculation only uses informative words. Finally, since the WMD is a distance metric while we are looking for a similarity metric, we multiply the WMD value by -1 (Negative WMD) so that more similar texts have numerically larger values.

In [9]:
from sklearn.metrics.pairwise import cosine_similarity
import spacy
import numpy as np
import gensim.downloader as api

# spaCy pipeline used by text_processing for tokens, lemmas and the is_alpha / is_stop flags
nlp = spacy.load("en_core_web_sm")

In [8]:
def text_processing(sentence):
    '''
    Tokenise ``sentence`` with the module-level spaCy pipeline ``nlp`` and return
    the lower-cased lemmas of its alphabetic, non-stop-word tokens as a list.
    '''
    lemmas = []
    for token in nlp(sentence):
        # Skip anything that is not purely alphabetic, and skip stop words
        if not token.is_alpha or token.is_stop:
            continue
        lemmas.append(token.lemma_.lower())
    return lemmas

def cos_sim(sentence1_emb, sentence2_emb):
    '''
    Return the diagonal of the pairwise cosine-similarity matrix of the two
    embedding arrays, i.e. the similarity of row i of the first argument with
    row i of the second.
    '''
    pairwise_similarities = cosine_similarity(sentence1_emb, sentence2_emb)
    return np.diag(pairwise_similarities)

!! Run the next cell only once to load the model !!

In [9]:
%%time
# Load the pre-trained model
# `model` is the module-level object whose wmdistance method wmd_scores calls.
model = api.load('fasttext-wiki-news-subwords-300')

Wall time: 5min 15s


In [7]:
def wmd_scores(file1, file2):
    '''
    Negative Word Mover's Distance between the contents of two text files.

    Each file is read as one single sentence (newlines replaced by spaces),
    reduced to tokens with ``text_processing`` and compared with
    ``model.wmdistance``. The distance is negated so that more similar texts get
    numerically larger values.
    '''
    # Load the text of each file as a single sentence
    sentence1 = read_file_as_text(file1).replace('\n', ' ')
    sentence2 = read_file_as_text(file2).replace('\n', ' ')

    # Text processing
    tokens1 = text_processing(sentence1)
    tokens2 = text_processing(sentence2)

    # Negative Word Movers Distance
    distance = model.wmdistance(tokens1, tokens2)
    return -distance

In [10]:
%%time
# Calculate WMD similarity for the whole dataset for the first pipeline (Simplify & Summarize)
# The per-article scores are also written to a JSON file by calculate_metrics_for_dataset.
wmd_p1 = calculate_metrics_for_dataset(wmd_scores, pipeline=p1)

Done generating wmd_scores scores for 99569
Done
Wall time: 1h 27min 57s


In [11]:
%%time
# Calculate WMD similarity for the whole dataset for the second pipeline (Summarize & Simplify)
# The per-article scores are also written to a JSON file by calculate_metrics_for_dataset.
wmd_p2 = calculate_metrics_for_dataset(wmd_scores, pipeline=p2)

Done generating wmd_scores scores for 99569
Done
Wall time: 1h 15min 23s


In [11]:
%%time
# Reload the saved per-article WMD scores for both pipelines
# (skips re-running the long evaluation cells above).
with open('data/simplify_summary_wmd_scores', 'r', encoding="utf8") as p1_f:
    wmd_p1 = json.load(p1_f)

with open('data/summary_simplify_wmd_scores', 'r', encoding="utf8") as p2_f:
    wmd_p2 = json.load(p2_f)

Wall time: 3.03 ms


In [130]:
def get_basic_wmd_stats(wmd_scores):
    '''
    Get basic WMD stats - average, min and max scores.

    ``wmd_scores`` maps an article id to a numeric score. Each statistic is
    rounded to 3 decimals and then sign-flipped. If the input holds negative
    WMD values (as returned by the ``wmd_scores`` function), the results are
    positive distances, and 'max_score' (taken from the largest input value) is
    then numerically smaller than 'min_score'.
    '''
    values = list(wmd_scores.values())
    mean_score = sum(values) / len(wmd_scores)
    return {
        'average': -round(mean_score, 3),
        'max_score': -round(max(values), 3),
        'min_score': -round(min(values), 3),
    }

In [131]:
# Basic WMD statistics per pipeline
print("The Word Mover Distance similarity scores for P1 - Simplify & Summary : ")
pprint(get_basic_wmd_stats(wmd_p1))

# This label previously said "P1" although the statistics printed below are for P2
print("The Word Mover Distance similarity scores for P2 - Summary & Simplify : ")
pprint(get_basic_wmd_stats(wmd_p2))

The Word Mover Distance similarity scores for P1 - Simplify & Summary : 
{'average': 0.589, 'max_score': 0.27, 'min_score': 1.028}
The Word Mover Distance similarity scores for P1 - Summary & Simplify : 
{'average': 0.592, 'max_score': 0.242, 'min_score': 0.993}


In [132]:
# Merge the per-article WMD scores of p1 and p2, plus their absolute difference
# (all values rounded to 3 decimals)
wmd = {}
for key in wmd_p1:
    wmd[key] = {
        'p1': round(wmd_p1[key], 3),
        'p2': round(wmd_p2[key], 3),
        'diff': round(abs(wmd_p2[key] - wmd_p1[key]), 3),
    }

In [133]:
# Number of merged articles, taken from the data instead of the hard-coded 500,
# so the average stays correct if the dataset size changes.
l = len(wmd)

print('Average WMD difference in score: ', round(sum(wmd[art]['diff'] for art in wmd.keys())/l, 6))
print('Max WMD difference in score: ', round(max(wmd[art]['diff'] for art in wmd.keys()), 6))
print('Min WMD difference in score: ', round(min(wmd[art]['diff'] for art in wmd.keys()), 6))

Average WMD difference in score:  0.026702
Max WMD difference in score:  0.247
Min WMD difference in score:  0.0
