# Evaluation notebook 
## ROUGE (ROUGE-1, ROUGE-2, ROUGE-L), SARI, BLEU

### ROUGE (ROUGE-1, ROUGE-2, ROUGE-L)

In [5]:
from pprint import pprint
import os
from rouge import Rouge
import json
import pandas as pandas
from easse import sari, bleu, fkgl

In [6]:
def read_file(file_path):
    """Read a text file and return its lines.

    Args:
        file_path: Path of the text file to read.

    Returns:
        list[str]: One entry per line, in file order. Each entry keeps its
        trailing newline, exactly as produced by ``readlines``.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.readlines()

In [7]:
def calculate_rouge_score(hyp_file_path, ref_file_path):
    """Calculate ROUGE-1, ROUGE-2 and ROUGE-L scores for a hypothesis file against a reference file.

    Both files are read in full and handed to ``Rouge.get_scores`` as one
    string each, so the comparison is made on the whole file content rather
    than line by line.

    Args:
        hyp_file_path: Path of the file holding the generated (hypothesis) text.
        ref_file_path: Path of the file holding the reference text.

    Returns:
        Whatever ``Rouge.get_scores(..., avg=True)`` returns. In this notebook
        that is a dict keyed by 'rouge-1', 'rouge-2' and 'rouge-l', each
        mapping to a dict with the keys 'r', 'p' and 'f'.
    """
    # Decode explicitly as UTF-8 so scores do not depend on the platform's
    # default locale encoding.
    with open(hyp_file_path, 'r', encoding='utf-8') as hypothesis_file:
        hypotheses = hypothesis_file.read()
    with open(ref_file_path, 'r', encoding='utf-8') as reference_file:
        references = reference_file.read()
    return Rouge().get_scores(hypotheses, references, avg=True)


# calculate_rouge_score('data/wiki-auto/25/simplify_summary/simplified_summary.txt','data/wiki-auto/25/destination.txt')
# calculate_rouge_score('data/wiki-auto/25/summary_simplify/simplified_summary.txt','data/wiki-auto/25/destination.txt')

In [73]:
def calculate_rouge_score_for_dataset(pipeline='simplify_summary', data_dir='data/wiki-auto', location='data/'):
    """Calculate the ROUGE scores for the whole dataset and save them to a JSON file.

    Every sub-folder of ``data_dir`` is treated as one sample. For each sample,
    ``<folder>/<pipeline>/simplified_summary.txt`` is scored against
    ``<folder>/destination.txt`` with ``calculate_rouge_score``.

    Args:
        pipeline: Name of the pipeline sub-folder inside each sample folder;
            also used as the prefix of the output file name.
        data_dir: Directory that contains one sub-folder per sample.
        location: Prefix prepended verbatim to the output file name. It is
            concatenated, not path-joined, so a directory must end with a
            path separator.

    Returns:
        None. The scores are written as JSON (sample folder name -> scores)
        to ``location + pipeline + '_rouge_scores'``.
    """
    evaluation_dataset = dict()
    # os.listdir returns entries in arbitrary order; sort them so the run
    # order and the saved JSON are reproducible.
    for folder in sorted(os.listdir(data_dir)):
        # raw data = data/wiki-auto/{id}
        base_data_dir = os.path.join(data_dir, folder)
        if not os.path.isdir(base_data_dir):
            # Stray files next to the sample folders are not samples.
            continue
        ground_truth_file_path = os.path.join(base_data_dir, 'destination.txt')
        simplified_summary_file_path = os.path.join(
            base_data_dir, pipeline, 'simplified_summary.txt')
        evaluation_dataset[folder] = calculate_rouge_score(
            simplified_summary_file_path, ground_truth_file_path)
        # '\r' with end='' keeps the progress message on a single console line.
        print(f'Done generating ROUGE scores for {folder} \r', end='', flush=True)
    with open(location + pipeline + '_rouge_scores', 'w') as outputfile:
        json.dump(evaluation_dataset, outputfile)

In [74]:
%%time
# Calculate ROUGE score for the whole dataset for the first pipeline (Simplify & Summarize)
calculate_rouge_score_for_dataset('simplify_summary')

CPU times: user 10min 5s, sys: 540 ms, total: 10min 6s
Wall time: 10min 6s


In [75]:
%%time
# Calculate ROUGE score for the whole dataset for the second pipeline (Summarize & Simplify)
calculate_rouge_score_for_dataset('summary_simplify')

CPU times: user 9min 55s, sys: 724 ms, total: 9min 56s
Wall time: 9min 57s


#### Load the saved ROUGE score files and calculate the average scores for the whole dataset

In [8]:
%%time
# Load the per-sample ROUGE scores that calculate_rouge_score_for_dataset saved
# for the 'simplify_summary' pipeline.
# orient='index' turns each top-level JSON key (the sample folder name) into a
# row; the columns are then 'rouge-1', 'rouge-2' and 'rouge-l'.
simplify_summary_df = pandas.read_json(
    'data/simplify_summary_rouge_scores',
    encoding='utf8',
    orient='index'
)
simplify_summary_df.head(10)

CPU times: user 12.2 ms, sys: 4.08 ms, total: 16.2 ms
Wall time: 24.3 ms


Unnamed: 0,rouge-1,rouge-2,rouge-l
1004,"{'r': 0.202247191011235, 'p': 0.19148936170212...","{'r': 0.051204819277108, 'p': 0.05151515151515...","{'r': 0.18539325842696602, 'p': 0.175531914893..."
1005,"{'r': 0.234042553191489, 'p': 0.27049180327868...","{'r': 0.081481481481481, 'p': 0.11398963730569...","{'r': 0.20567375886524802, 'p': 0.237704918032..."
10065,"{'r': 0.23333333333333303, 'p': 0.233333333333...","{'r': 0.07042253521126701, 'p': 0.070422535211...","{'r': 0.23333333333333303, 'p': 0.233333333333..."
10224,"{'r': 0.158415841584158, 'p': 0.16080402010050...","{'r': 0.032934131736526005, 'p': 0.02917771883...","{'r': 0.15346534653465302, 'p': 0.155778894472..."
10263,"{'r': 0.19626168224299, 'p': 0.375, 'f': 0.257...","{'r': 0.09202453987730001, 'p': 0.174418604651...","{'r': 0.177570093457943, 'p': 0.33928571428571..."
10294,"{'r': 0.252830188679245, 'p': 0.19881305637982...","{'r': 0.055187637969094004, 'p': 0.03703703703...","{'r': 0.23018867924528302, 'p': 0.181008902077..."
10335,"{'r': 0.340764331210191, 'p': 0.27973856209150...","{'r': 0.11384820239680402, 'p': 0.102333931777...","{'r': 0.32324840764331203, 'p': 0.265359477124..."
10415,"{'r': 0.251700680272108, 'p': 0.23717948717948...","{'r': 0.14351851851851802, 'p': 0.113138686131...","{'r': 0.23809523809523803, 'p': 0.224358974358..."
10416,"{'r': 0.37158469945355105, 'p': 0.456375838926...","{'r': 0.21951219512195103, 'p': 0.245136186770...","{'r': 0.37158469945355105, 'p': 0.456375838926..."
10418,"{'r': 0.29487179487179405, 'p': 0.326241134751...","{'r': 0.11489361702127601, 'p': 0.1125, 'f': 0...","{'r': 0.29487179487179405, 'p': 0.326241134751..."


In [9]:
# Load the per-sample ROUGE scores that calculate_rouge_score_for_dataset saved
# for the 'summary_simplify' pipeline.
# orient='index' turns each top-level JSON key (the sample folder name) into a
# row; the columns are then 'rouge-1', 'rouge-2' and 'rouge-l'.
summary_simplify_df = pandas.read_json(
    'data/summary_simplify_rouge_scores',
    encoding='utf8',
    orient='index'
)
summary_simplify_df.head(10)

Unnamed: 0,rouge-1,rouge-2,rouge-l
1004,"{'r': 0.230337078651685, 'p': 0.22404371584699...","{'r': 0.07228915662650601, 'p': 0.073619631901...","{'r': 0.21910112359550502, 'p': 0.213114754098..."
1005,"{'r': 0.234042553191489, 'p': 0.28205128205128...","{'r': 0.092592592592592, 'p': 0.13586956521739...","{'r': 0.20567375886524802, 'p': 0.247863247863..."
10065,"{'r': 0.21111111111111103, 'p': 0.283582089552...","{'r': 0.07042253521126701, 'p': 0.086956521739...","{'r': 0.21111111111111103, 'p': 0.283582089552..."
10224,"{'r': 0.17821782178217802, 'p': 0.173076923076...","{'r': 0.032934131736526005, 'p': 0.02956989247...","{'r': 0.173267326732673, 'p': 0.16826923076923..."
10263,"{'r': 0.19626168224299, 'p': 0.403846153846153...","{'r': 0.08588957055214701, 'p': 0.168674698795...","{'r': 0.15887850467289702, 'p': 0.326923076923..."
10294,"{'r': 0.24528301886792403, 'p': 0.198170731707...","{'r': 0.048565121412803, 'p': 0.03298350824587...","{'r': 0.226415094339622, 'p': 0.18292682926829..."
10335,"{'r': 0.312101910828025, 'p': 0.27762039660056...","{'r': 0.101864181091877, 'p': 0.09845559845559...","{'r': 0.29936305732484003, 'p': 0.266288951841..."
10415,"{'r': 0.23809523809523803, 'p': 0.239726027397...","{'r': 0.12037037037037, 'p': 0.103174603174603...","{'r': 0.224489795918367, 'p': 0.22602739726027..."
10416,"{'r': 0.34972677595628404, 'p': 0.438356164383...","{'r': 0.21602787456445902, 'p': 0.248, 'f': 0....","{'r': 0.34426229508196704, 'p': 0.431506849315..."
10418,"{'r': 0.224358974358974, 'p': 0.30434782608695...","{'r': 0.059574468085106004, 'p': 0.07216494845...","{'r': 0.224358974358974, 'p': 0.30434782608695..."


In [10]:
def calculate_avg_rouge_stats(rouge_dataframe):
    """Calculate the average ROUGE r, p and f scores over a whole dataset.

    Args:
        rouge_dataframe: DataFrame with one row per sample and the columns
            'rouge-1', 'rouge-2' and 'rouge-l'. Every cell is a dict with the
            keys 'r', 'p' and 'f'.

    Returns:
        dict: ``{'rouge-1': {...}, 'rouge-2': {...}, 'rouge-l': {...}}`` where
        each inner dict holds the mean 'r', 'p' and 'f' over all rows, rounded
        to 3 decimal places.

    Raises:
        ValueError: If ``rouge_dataframe`` has no rows, since an average is
            undefined in that case.
    """
    dataframe_size = len(rouge_dataframe)
    if dataframe_size == 0:
        raise ValueError('Cannot average ROUGE scores of an empty dataframe')

    avg_rouge_stats = {}
    for key in ('rouge-1', 'rouge-2', 'rouge-l'):
        totals = {'r': 0, 'p': 0, 'f': 0}
        # Walk the column once instead of materialising every row with .iloc
        # for each individual statistic. Plain sequential addition keeps the
        # sums identical to a row-by-row accumulation.
        for scores in rouge_dataframe[key]:
            for stat in totals:
                totals[stat] = totals[stat] + scores[stat]
        avg_rouge_stats[key] = {
            stat: round(total / dataframe_size, 3)
            for stat, total in totals.items()
        }
    return avg_rouge_stats

In [11]:
%%time
print("The ROUGE-1,ROUGE-2 and ROUGE-l score for the Simplify & Summary : ")
pprint(calculate_avg_rouge_stats(simplify_summary_df))

print("The ROUGE-1,ROUGE-2 and ROUGE-l score for the Summary & Simplify : ")
pprint(calculate_avg_rouge_stats(summary_simplify_df))

The ROUGE-1,ROUGE-2 and ROUGE-l score for the Simplify & Summary : 
{'rouge-1': {'f': 0.309, 'p': 0.344, 'r': 0.297},
 'rouge-2': {'f': 0.129, 'p': 0.145, 'r': 0.125},
 'rouge-l': {'f': 0.292, 'p': 0.326, 'r': 0.28}}
The ROUGE-1,ROUGE-2 and ROUGE-l score for the Summary & Simplify : 
{'rouge-1': {'f': 0.309, 'p': 0.352, 'r': 0.291},
 'rouge-2': {'f': 0.13, 'p': 0.151, 'r': 0.123},
 'rouge-l': {'f': 0.293, 'p': 0.333, 'r': 0.275}}
CPU times: user 658 ms, sys: 2.33 ms, total: 660 ms
Wall time: 658 ms


In [13]:
pprint(calculate_rouge_score('data/wiki-auto/1368/simplify_summary/simplified_summary.txt','data/wiki-auto/1368/destination.txt'))
pprint(calculate_rouge_score('data/wiki-auto/1368/summary_simplify/simplified_summary.txt','data/wiki-auto/1368/destination.txt')  )

{'rouge-1': {'f': 0.2988953816345476,
             'p': 0.28117359413202936,
             'r': 0.31900138696255204},
 'rouge-2': {'f': 0.09651217847179176,
             'p': 0.09417249417249417,
             'r': 0.09897109260166585},
 'rouge-l': {'f': 0.2820012945650219,
             'p': 0.265281173594132,
             'r': 0.30097087378640774}}
{'rouge-1': {'f': 0.3056265934958972,
             'p': 0.2835112692763938,
             'r': 0.3314840499306519},
 'rouge-2': {'f': 0.09906675738603936,
             'p': 0.09681945743685688,
             'r': 0.10142087212150906},
 'rouge-l': {'f': 0.27877237354704815,
             'p': 0.2586002372479241,
             'r': 0.30235783633841884}}


### BLEU

In [153]:
!easse evaluate -t pwkp_test -m 'bleu,fkgl' --refs_sents_paths data/wiki-auto/1368/destination.txt --orig_sents_path data/wiki-auto/1368/source.txt --sys_sents_path data/wiki-auto/1368/simplify_summary/simplified_summary.txt

{'bleu': 0.213, 'fkgl': 12.984}


In [23]:
def calculate_bleu_score_for_dataset(pipeline='simplify_summary', data_dir='data/wiki-auto', location='data/'):
    """Calculate the BLEU scores for the whole dataset and save them to a JSON file.

    Every sub-folder of ``data_dir`` is treated as one sample. For each sample,
    the lines of ``<folder>/<pipeline>/simplified_summary.txt`` are scored
    against the lines of ``<folder>/destination.txt`` with
    ``bleu.corpus_bleu``.

    Args:
        pipeline: Name of the pipeline sub-folder inside each sample folder;
            also used as the prefix of the output file name.
        data_dir: Directory that contains one sub-folder per sample.
        location: Prefix prepended verbatim to the output file name. It is
            concatenated, not path-joined, so a directory must end with a
            path separator.

    Returns:
        None. The scores are written as JSON (sample folder name -> score)
        to ``location + pipeline + '_bleu_scores'``.
    """
    evaluation_dataset = dict()
    # os.listdir returns entries in arbitrary order; sort them so the run
    # order and the saved JSON are reproducible.
    for folder in sorted(os.listdir(data_dir)):
        # raw data = data/wiki-auto/{id}
        base_data_dir = os.path.join(data_dir, folder)
        if not os.path.isdir(base_data_dir):
            # Stray files next to the sample folders are not samples.
            continue
        ground_truth_file_path = os.path.join(base_data_dir, 'destination.txt')
        simplified_summary_file_path = os.path.join(
            base_data_dir, pipeline, 'simplified_summary.txt')
        # NOTE(review): refs_sents receives a flat list of lines here, while
        # easse's documented examples pass a list of reference lists.
        # Confirm this call scores what is intended before trusting the numbers.
        evaluation_dataset[folder] = bleu.corpus_bleu(
            sys_sents=read_file(simplified_summary_file_path),
            refs_sents=read_file(ground_truth_file_path))
        # '\r' with end='' keeps the progress message on a single console line.
        print(f'Done generating BLEU scores for {folder} \r', end='', flush=True)
    with open(location + pipeline + '_bleu_scores', 'w') as outputfile:
        json.dump(evaluation_dataset, outputfile)

def calculate_avg_bleu_stats(file_path):
    """Return the mean of the BLEU scores stored in a JSON score file.

    Args:
        file_path: Path of a JSON file holding an object whose values are
            numeric scores, as written by ``calculate_bleu_score_for_dataset``.

    Returns:
        The arithmetic mean of all values in the file.

    Raises:
        ValueError: If the file contains no scores, since an average is
            undefined in that case.
    """
    with open(file_path, 'r') as inputfile:
        bleu_stats = json.load(inputfile)
    if not bleu_stats:
        raise ValueError(f'No BLEU scores found in {file_path}')
    return sum(bleu_stats.values()) / len(bleu_stats)


In [24]:
%%time
# Calculate BLEU score for the whole dataset for the first pipeline (Simplify & Summarize)
calculate_bleu_score_for_dataset('simplify_summary')

CPU times: user 1min 12s, sys: 180 ms, total: 1min 12s
Wall time: 1min 12s


In [25]:
%%time
# Calculate BLEU score for the whole dataset for the second pipeline (Summarize & Simplify)
calculate_bleu_score_for_dataset('summary_simplify')

CPU times: user 1min 12s, sys: 277 ms, total: 1min 12s
Wall time: 1min 12s


In [26]:
%%time
print("The bleuscore for the Simplify & Summary : ")
pprint(calculate_avg_bleu_stats('data/simplify_summary_bleu_scores'))

print("The bleu score for the Summary & Simplify : ")
pprint(calculate_avg_bleu_stats('data/summary_simplify_bleu_scores'))

The bleuscore for the Simplify & Summary : 
0.15187647507892169
The bleu score for the Summary & Simplify : 
0.15094121464884871
CPU times: user 1.81 ms, sys: 0 ns, total: 1.81 ms
Wall time: 1.41 ms


In [161]:
# Single-sample BLEU check for sample 1368 of the 'simplify_summary' pipeline.
# NOTE(review): refs_sents receives a flat list of lines here, while easse's
# documented examples pass a list of reference lists. Confirm this call scores
# what is intended.
bleu.corpus_bleu(sys_sents=read_file('data/wiki-auto/1368/simplify_summary/simplified_summary.txt'),
 refs_sents=read_file('data/wiki-auto/1368/destination.txt'))

0.11933771243068295

### SARI

In [212]:
# read_file('data/wiki-auto/1368/source.txt')

In [211]:
# Single-sample SARI check for sample 1368 of the 'simplify_summary' pipeline.
# NOTE(review): this cell currently fails with the AssertionError recorded in
# its output: corpus_sari requires orig_sents and sys_sents to contain the same
# number of entries, but source.txt and simplified_summary.txt yield different
# line counts when read with read_file.
sari.corpus_sari(
    orig_sents = read_file('data/wiki-auto/1368/source.txt'),
    sys_sents=read_file('data/wiki-auto/1368/simplify_summary/simplified_summary.txt'),
    refs_sents=read_file('data/wiki-auto/1368/destination.txt')) # unresolved: refs_sents may need to be a list of reference lists — TODO confirm against the easse API

AssertionError: Original sentences and system sentences don't have the same number of samples