# **Trabajo de Fin de Máster**

In [1]:
%cd /Users/andreea/Desktop/TFM/git/qubart/eval
%load_ext autoreload
%autoreload 2
from utils import *
from constants import *
from rouge import Rouge
import re
from tqdm.notebook import tqdm, trange
import time    # to be used in loop iterations
import string

/Users/andreea/Desktop/TFM/git/qubart/eval


  from collections import Iterable


In [2]:
corpus = load_corpus(f"/Users/andreea/Desktop/TFM/git/qubart/corpus/{CORPUS_GOT_REVIEWS}")['train']



In [3]:
rouge = Rouge()
rouge_measures = ["rouge-1", "rouge-2", "rouge-l"]
columns = ['Reference data length', 'Filtered data length', 'Summary length', 'Precision', 'Recall', 'F-Score', 'Filtered']


def get_length_text(text):
    words = [''.join(x for x in par if x not in string.punctuation) for par in text.split()]
    words = list(filter(lambda a: a != '', words))
    lenght_text = len(words)
    return lenght_text
    
def get_rouge_columns(df):
    '''
    '''
    df_aux = df
    df_r1 = df_aux.iloc[::3, :]
    df = df_aux.drop(df_aux.iloc[::3, :].index)
    df_r2 = df.iloc[::2, :]
    df_rl = df.drop(df.iloc[::2, :].index)
    return df_r1, df_r2, df_rl

# Show top n keywords for each topic
def get_word_topic(vectorizer, lda_model, n_words=1):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

def get_filter_by(input_data, num_topics):
    lda_model, vectorizer, data_lda = get_LDA(input_data, num_topics)
    topic_keywords = get_word_topic(vectorizer=vectorizer, lda_model=lda_model, n_words=1)
    return topic_keywords[np.argmax(data_lda)][0]    


def create_table_rouge(hyps, refs, rouge_measures, index, columns, ref_length, summary_length, filtered_length, filtered):
    scores = rouge.get_scores(hyps=hyps, refs=refs)
    table = get_df_rouge_scores(scores=[scores[0]],
                                rouge_measures=rouge_measures,
                                index=index,    
                                columns=columns,
                                ref_length = ref_length,
                                summary_length = summary_length,
                                filtered_length = filtered_length,
                                filtered=filtered)
    return table
    
def get_rouge_scores(sentences, filtered_data, summary, target_summary, is_filtered, episode, lead_n = 3):
    '''
    '''
    index = [f"Entry {episode} (R1)", f"Episode {episode} (R2)", f"Episode {episode} (RL)"]
    
    filtered_length = get_length_text(filtered_data) if filtered_data != "" else 0
    
    table_orig_text = create_table_rouge(hyps=summary, 
                                    refs=''.join(sentences),
                                    rouge_measures=rouge_measures,
                                    index=index,
                                    columns=columns,
                                    ref_length = get_length_text(''.join(sentences)),
                                    summary_length = get_length_text(summary),
                                    filtered_length = filtered_length,
                                    filtered=is_filtered)

    if target_summary != "":
        table_target = create_table_rouge(hyps=summary, 
                                    refs=''.join(target_summary),
                                    rouge_measures=rouge_measures,
                                    index=index,
                                    columns=columns,
                                    ref_length = get_length_text(''.join(sentences)),
                                    summary_length = get_length_text(''.join(target_summary)),
                                    filtered_length = filtered_length,
                                    filtered=is_filtered)
    else:
        table_target = pd.DataFrame()
    
    leadn_ref_text = summary if target_summary == "" else ''.join(target_summary)

    table_leadn = create_table_rouge(hyps=''.join(sentences[:lead_n]),
                                     refs=leadn_ref_text,
                                     rouge_measures=rouge_measures,
                                     index=index,
                                     columns=columns,
                                     ref_length = get_length_text(leadn_ref_text),
                                     summary_length = get_length_text(''.join(sentences[:lead_n])),
                                     filtered_length = filtered_length,
                                     filtered=is_filtered)
    
    if filtered_data != "":
        table_filtered = create_table_rouge(hyps=summary,
                                            refs=filtered_data,
                                            rouge_measures=rouge_measures,
                                            index=index,
                                            columns=columns,
                                            ref_length = get_length_text(''.join(sentences)),
                                            summary_length = get_length_text(summary),
                                            filtered_length = filtered_length,
                                            filtered=is_filtered)
        
        return table_orig_text, table_target, table_leadn, table_filtered
    
    return table_orig_text, table_target, table_leadn

def calculate_rouge_qubart(data, len_data, model="sshleifer/distilbart-cnn-12-6", filter_by=None, num_topics=10, similarity_threshold=0.3, use_columns=False):
    df_orig = pd.DataFrame(columns=columns)
    df_filtered = pd.DataFrame(columns=columns)
    df_leadn = pd.DataFrame(columns=columns)
    df_target = pd.DataFrame(columns=columns)

    for entry in trange(len_data):
        filtered = 0
        
        if use_columns:
            text = data[entry]['text']
            text = "".join(text)
        else:
            text = data[entry]
            
        sentences = re.split(REGEX_EOS, text)
        sentences = [sentence.strip() for sentence in sentences]
        
        target_summary = ""
        
        if use_columns:
            target_summary = data[entry]['summary']
        
        embedded_sentences = get_sentence_embeddings(sentences)
        
        if not filter_by:
            entry_filter_by = get_filter_by(text, num_topics)
        else:
            entry_filter_by = filter_by
            
        similar_sentences = get_similar_sentences(get_sentence_embeddings(entry_filter_by), embedded_sentences,
                                                 similarity_threshold=similarity_threshold)
        filtered_text = ""
        for idx in similar_sentences:
            filtered_text = filtered_text + sentences[idx]
            
        if filtered_text == "":
            summary = get_summary(text, model)
            table_orig, table_target, table_leadn = get_rouge_scores(sentences=sentences,
                                                                  filtered_data=filtered_text,
                                                                  target_summary = target_summary,
                                                                  summary=summary[0],
                                                                  is_filtered=filtered,
                                                                  episode=entry)
        else:
            filtered = 1
            summary = get_summary(filtered_text, model)
            table_orig, table_target, table_leadn, table_filtered = get_rouge_scores(sentences=sentences,
                                                                                  filtered_data=filtered_text,
                                                                                  target_summary = target_summary,
                                                                                  summary=summary[0],
                                                                                  is_filtered=filtered,
                                                                                  episode=entry)
            df_filtered = df_filtered.append(table_filtered)
        
        df_orig = df_orig.append(table_orig)
        df_leadn = df_leadn.append(table_leadn)
        if not table_target.empty:
            df_target = df_leadn.append(table_target)
    
    return df_orig, df_target, df_filtered, df_leadn

In [4]:
import jsonlines

def load_dataset(file, max_num_entries=100):
    with jsonlines.open(file) as f:
        counter = 0
        lines = {}
        for line in f.iter():
            if counter == max_num_entries:
                break
            else:
                lines[counter] = {'text': line['text'],
                                 'summary': line['summary']}
                counter = counter + 1
        return lines

In [5]:
xsum_lines = load_dataset('datasets/ACL2020_other_datasets/test_xsum.jsonl')
cnndm_lines = load_dataset('datasets/ACL2020_data/test_CNNDM_bert.jsonl')
wikihow_lines = load_dataset('datasets/ACL2020_other_datasets/test_wikihow.jsonl')

# **Game of Thrones**

In [6]:
df_orig, df_ref, df_filtered, df_leadn = calculate_rouge_qubart(data=corpus[REV_REVIEW], len_data=len(corpus[REV_REVIEW]), filter_by="Bran", similarity_threshold=0.3)
df_r1, df_r2, df_rl = get_rouge_columns(df_orig)
df_r1_filtered, df_r2_filtered, df_rl_filtered = get_rouge_columns(df_filtered)
df_r1_lead3, df_r2_lead3, df_rl_lead3 = get_rouge_columns(df_leadn)
if not df_ref.empty:
    df_r1_ref, df_r2_ref, df_rl_ref = get_rouge_columns(df_ref)

  0%|          | 0/73 [00:00<?, ?it/s]

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  ../aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


## **R1 metric**

In [7]:
df_r1.mean()

Reference data length    773.808219
Filtered data length      42.123288
Summary length            52.876712
Precision                  0.127238
Recall                     0.911770
F-Score                    0.210591
Filtered                   0.356164
dtype: float64

In [69]:
df_r1.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Entry 4 (R1),57,0,41,0.74,0.948718,0.831461,0
Entry 3 (R1),63,0,37,0.528302,0.823529,0.643678,0
Entry 0 (R1),146,0,58,0.380165,0.938776,0.541176,0
Entry 8 (R1),302,0,71,0.276498,0.952381,0.428571,0
Entry 2 (R1),195,0,50,0.273381,0.95,0.424581,0


In [8]:
df_r1_filtered.mean()

Reference data length    896.307692
Filtered data length     118.269231
Summary length            50.384615
Precision                  0.572589
Recall                     0.918358
F-Score                    0.665945
Filtered                   1.000000
dtype: float64

In [68]:
df_r1_filtered.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Entry 59 (R1),1181,40,51,0.909091,0.909091,0.909091,1
Entry 29 (R1),977,45,44,0.871795,0.944444,0.906667,1
Entry 19 (R1),898,38,45,0.909091,0.857143,0.882353,1
Entry 52 (R1),1129,64,62,0.843137,0.895833,0.868687,1
Entry 66 (R1),1419,56,45,0.770833,0.973684,0.860465,1


In [9]:
df_r1_lead3.mean()

Reference data length     52.876712
Filtered data length      42.123288
Summary length           112.054795
Precision                  0.456529
Recall                     0.238365
F-Score                    0.306507
Filtered                   0.356164
dtype: float64

In [70]:
df_r1_lead3.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Entry 4 (R1),41,0,57,0.948718,0.74,0.831461,0
Entry 8 (R1),71,0,96,0.952381,0.714286,0.816327,0
Entry 3 (R1),37,0,63,0.823529,0.528302,0.643678,0
Entry 58 (R1),52,0,106,0.926829,0.457831,0.612903,0
Entry 55 (R1),82,178,135,0.761905,0.5,0.603774,1


In [10]:
if not df_ref.empty:
    print(df_r1_ref.mean())

## **R2 metric**

In [11]:
df_r2.mean()

Reference data length    773.808219
Filtered data length      42.123288
Summary length            52.876712
Precision                  0.085473
Recall                     0.794329
F-Score                    0.143500
Filtered                   0.356164
dtype: float64

In [71]:
df_r2.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Episode 4 (R2),57,0,41,0.607143,0.85,0.708333,0
Episode 3 (R2),63,0,37,0.403226,0.694444,0.510204,0
Episode 0 (R2),146,0,58,0.342466,0.877193,0.492611,0
Episode 8 (R2),302,0,71,0.218121,0.928571,0.353261,0
Episode 2 (R2),195,0,50,0.202073,0.8125,0.323651,0


In [12]:
df_r2_filtered.mean()

Reference data length    896.307692
Filtered data length     118.269231
Summary length            50.384615
Precision                  0.508384
Recall                     0.832168
F-Score                    0.576648
Filtered                   1.000000
dtype: float64

In [72]:
df_r2_filtered.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Episode 29 (R2),977,45,44,0.822222,0.860465,0.840909,1
Episode 59 (R2),1181,40,51,0.894737,0.73913,0.809524,1
Episode 26 (R2),573,62,46,0.7,0.933333,0.8,1
Episode 52 (R2),1129,64,62,0.774194,0.8,0.786885,1
Episode 19 (R2),898,38,45,0.837838,0.704545,0.765432,1


In [13]:
df_r2_lead3.mean()

Reference data length     52.876712
Filtered data length      42.123288
Summary length           112.054795
Precision                  0.279275
Recall                     0.130826
F-Score                    0.173545
Filtered                   0.356164
dtype: float64

In [73]:
df_r2_lead3.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Episode 8 (R2),71,0,96,0.928571,0.677083,0.783133,0
Episode 4 (R2),41,0,57,0.85,0.607143,0.708333,0
Episode 58 (R2),52,0,106,0.823529,0.4,0.538462,0
Episode 3 (R2),37,0,63,0.694444,0.403226,0.510204,0
Episode 55 (R2),82,178,135,0.641975,0.403101,0.495238,1


In [14]:
if not df_ref.empty:
    print(df_r2_ref.mean())

## **RL metric**

In [15]:
df_rl.mean()

Reference data length    773.808219
Filtered data length      42.123288
Summary length            52.876712
Precision                  0.126351
Recall                     0.903553
F-Score                    0.209005
Filtered                   0.356164
dtype: float64

In [74]:
df_rl.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Episode 4 (RL),57,0,41,0.74,0.948718,0.831461,0
Episode 3 (RL),63,0,37,0.528302,0.823529,0.643678,0
Episode 0 (RL),146,0,58,0.380165,0.938776,0.541176,0
Episode 8 (RL),302,0,71,0.276498,0.952381,0.428571,0
Episode 2 (RL),195,0,50,0.273381,0.95,0.424581,0


In [16]:
df_rl_filtered.mean()

Reference data length    896.307692
Filtered data length     118.269231
Summary length            50.384615
Precision                  0.570030
Recall                     0.913803
F-Score                    0.662812
Filtered                   1.000000
dtype: float64

In [75]:
df_rl_filtered.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Episode 59 (RL),1181,40,51,0.909091,0.909091,0.909091,1
Episode 29 (RL),977,45,44,0.871795,0.944444,0.906667,1
Episode 19 (RL),898,38,45,0.909091,0.857143,0.882353,1
Episode 26 (RL),573,62,46,0.764706,0.975,0.857143,1
Episode 52 (RL),1129,64,62,0.823529,0.875,0.848485,1


In [17]:
df_rl_lead3.mean()

Reference data length     52.876712
Filtered data length      42.123288
Summary length           112.054795
Precision                  0.420480
Recall                     0.219908
F-Score                    0.282445
Filtered                   0.356164
dtype: float64

In [76]:
df_rl_lead3.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Episode 4 (RL),41,0,57,0.948718,0.74,0.831461,0
Episode 8 (RL),71,0,96,0.952381,0.714286,0.816327,0
Episode 3 (RL),37,0,63,0.823529,0.528302,0.643678,0
Episode 55 (RL),82,178,135,0.746032,0.489583,0.591195,1
Episode 58 (RL),52,0,106,0.878049,0.433735,0.580645,0


In [18]:
if not df_ref.empty:
    print(df_rl_ref.mean())

***

# **CNN/Dailymail Dataset**

In [19]:
df_orig_cnndm, df_ref_cnndm, df_filtered_cnndm, df_leadn_cnndm = calculate_rouge_qubart(data=cnndm_lines, len_data=len(cnndm_lines), similarity_threshold=0.2, use_columns=True)
df_cnndm_r1, df_cnndm_r2, df_cnndm_rl = get_rouge_columns(df_orig_cnndm)
df_cnndm_r1_filtered, df_cnndm_r2_filtered, df_cnndm_rl_filtered = get_rouge_columns(df_filtered_cnndm)
df_cnndm_r1_lead3, df_cnndm_r2_lead3, df_cnndm_rl_lead3 = get_rouge_columns(df_leadn_cnndm)
if not df_ref_cnndm.empty:
    df_cnndm_r1_ref, df_cnndm_r2_ref, df_cnndm_rl_ref = get_rouge_columns(df_ref_cnndm)

  0%|          | 0/100 [00:00<?, ?it/s]

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 

## **R1 metric**

In [20]:
df_cnndm_r1.mean()

Reference data length    667.410000
Filtered data length     601.490000
Summary length            58.480000
Precision                  0.166813
Recall                     0.908705
F-Score                    0.276730
Filtered                   0.980000
dtype: float64

In [77]:
df_cnndm_r1.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Entry 65 (R1),150,150,57,0.386792,0.891304,0.539474,1
Entry 20 (R1),201,201,53,0.35,0.933333,0.509091,1
Entry 22 (R1),270,270,49,0.310078,0.97561,0.470588,1
Entry 5 (R1),245,245,54,0.32,0.869565,0.467836,1
Entry 48 (R1),403,0,90,0.310345,0.913043,0.463235,0


In [21]:
df_cnndm_r1_filtered.mean()

Reference data length    668.632653
Filtered data length     613.765306
Summary length            58.183673
Precision                  0.177446
Recall                     0.908345
F-Score                    0.290987
Filtered                   1.000000
dtype: float64

In [78]:
df_cnndm_r1_filtered.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Entry 21 (R1),228,80,43,0.469697,0.837838,0.601942,1
Entry 65 (R1),150,150,57,0.386792,0.891304,0.539474,1
Entry 20 (R1),201,201,53,0.35,0.933333,0.509091,1
Entry 22 (R1),270,270,49,0.310078,0.97561,0.470588,1
Entry 5 (R1),245,245,54,0.32,0.869565,0.467836,1


In [22]:
df_cnndm_r1_lead3.mean()

Reference data length     53.810000
Filtered data length     601.490000
Summary length           514.970000
Precision                  0.828009
Recall                     0.174894
F-Score                    0.281301
Filtered                   0.980000
dtype: float64

In [79]:
df_cnndm_r1_lead3.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Entry 32 (R1),80,377,377,0.985507,0.356021,0.523077,1
Entry 42 (R1),79,546,188,0.723077,0.373016,0.492147,1
Entry 56 (R1),63,351,351,0.910714,0.326923,0.481132,1
Entry 78 (R1),77,498,290,0.768116,0.339744,0.471111,1
Entry 47 (R1),67,447,88,0.54902,0.394366,0.459016,1


In [23]:
if not df_ref_cnndm.empty:
    print(df_cnndm_r1_ref.mean())

Reference data length     66.168317
Filtered data length     607.910891
Summary length           510.247525
Precision                  0.823022
Recall                     0.175404
F-Score                    0.281157
Filtered                   0.980198
dtype: float64


In [80]:
df_cnndm_r1_ref.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Entry 32 (R1),80,377,377,0.985507,0.356021,0.523077,1
Entry 42 (R1),79,546,188,0.723077,0.373016,0.492147,1
Entry 56 (R1),63,351,351,0.910714,0.326923,0.481132,1
Entry 78 (R1),77,498,290,0.768116,0.339744,0.471111,1
Entry 47 (R1),67,447,88,0.54902,0.394366,0.459016,1


## **R2 metric**

In [24]:
df_cnndm_r2.mean()

Reference data length    667.410000
Filtered data length     601.490000
Summary length            58.480000
Precision                  0.093359
Recall                     0.797589
F-Score                    0.163495
Filtered                   0.980000
dtype: float64

In [81]:
df_cnndm_r2.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Episode 65 (R2),150,150,57,0.273333,0.732143,0.398058,1
Episode 20 (R2),201,201,53,0.244565,0.865385,0.381356,1
Episode 56 (R2),351,351,62,0.220833,0.868852,0.352159,1
Episode 48 (R2),403,0,90,0.216901,0.855556,0.346067,0
Episode 22 (R2),270,270,49,0.191388,0.851064,0.3125,1


In [25]:
df_cnndm_r2_filtered.mean()

Reference data length    668.632653
Filtered data length     613.765306
Summary length            58.183673
Precision                  0.101610
Recall                     0.796625
F-Score                    0.175592
Filtered                   1.000000
dtype: float64

In [82]:
df_cnndm_r2_filtered.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Episode 21 (R2),228,80,43,0.39759,0.785714,0.528,1
Episode 65 (R2),150,150,57,0.273333,0.732143,0.398058,1
Episode 20 (R2),201,201,53,0.244565,0.865385,0.381356,1
Episode 56 (R2),351,351,62,0.220833,0.868852,0.352159,1
Episode 18 (R2),641,409,85,0.201102,0.879518,0.327354,1


In [26]:
df_cnndm_r2_lead3.mean()

Reference data length     53.810000
Filtered data length     601.490000
Summary length           514.970000
Precision                  0.471494
Recall                     0.067722
F-Score                    0.115089
Filtered                   0.980000
dtype: float64

In [83]:
df_cnndm_r2_lead3.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Episode 32 (R2),80,377,377,0.914634,0.241158,0.381679,1
Episode 56 (R2),63,351,351,0.822581,0.2125,0.337748,1
Episode 73 (R2),79,598,428,0.904762,0.180952,0.301587,1
Episode 82 (R2),82,702,460,0.768293,0.1575,0.261411,1
Episode 16 (R2),54,691,247,0.673077,0.156951,0.254545,1


In [27]:
if not df_ref_cnndm.empty:
    print(df_cnndm_r2_ref.mean())

Reference data length     66.168317
Filtered data length     607.910891
Summary length           510.247525
Precision                  0.467309
Recall                     0.067393
F-Score                    0.114349
Filtered                   0.980198
dtype: float64


In [84]:
df_cnndm_r2_ref.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Episode 32 (R2),80,377,377,0.914634,0.241158,0.381679,1
Episode 56 (R2),63,351,351,0.822581,0.2125,0.337748,1
Episode 73 (R2),79,598,428,0.904762,0.180952,0.301587,1
Episode 82 (R2),82,702,460,0.768293,0.1575,0.261411,1
Episode 16 (R2),54,691,247,0.673077,0.156951,0.254545,1


## **RL metric**

In [28]:
df_cnndm_rl.mean()

Reference data length    667.410000
Filtered data length     601.490000
Summary length            58.480000
Precision                  0.165580
Recall                     0.901866
F-Score                    0.274668
Filtered                   0.980000
dtype: float64

In [85]:
df_cnndm_rl.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Episode 65 (RL),150,150,57,0.386792,0.891304,0.539474,1
Episode 20 (RL),201,201,53,0.35,0.933333,0.509091,1
Episode 22 (RL),270,270,49,0.310078,0.97561,0.470588,1
Episode 48 (RL),403,0,90,0.310345,0.913043,0.463235,0
Episode 5 (RL),245,245,54,0.312,0.847826,0.45614,1


In [29]:
df_cnndm_rl_filtered.mean()

Reference data length    668.632653
Filtered data length     613.765306
Summary length            58.183673
Precision                  0.176069
Recall                     0.901523
F-Score                    0.288723
Filtered                   1.000000
dtype: float64

In [86]:
df_cnndm_rl_filtered.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Episode 21 (RL),228,80,43,0.469697,0.837838,0.601942,1
Episode 65 (RL),150,150,57,0.386792,0.891304,0.539474,1
Episode 20 (RL),201,201,53,0.35,0.933333,0.509091,1
Episode 22 (RL),270,270,49,0.310078,0.97561,0.470588,1
Episode 5 (RL),245,245,54,0.312,0.847826,0.45614,1


In [30]:
df_cnndm_rl_lead3.mean()

Reference data length     53.810000
Filtered data length     601.490000
Summary length           514.970000
Precision                  0.784707
Recall                     0.164819
F-Score                    0.265509
Filtered                   0.980000
dtype: float64

In [87]:
df_cnndm_rl_lead3.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Episode 32 (RL),80,377,377,0.985507,0.356021,0.523077,1
Episode 56 (RL),63,351,351,0.910714,0.326923,0.481132,1
Episode 42 (RL),79,546,188,0.692308,0.357143,0.471204,1
Episode 78 (RL),77,498,290,0.73913,0.326923,0.453333,1
Episode 82 (RL),82,702,460,0.970149,0.286344,0.442177,1


In [31]:
if not df_ref_cnndm.empty:
    print(df_cnndm_rl_ref.mean())

Reference data length     66.168317
Filtered data length     607.910891
Summary length           510.247525
Precision                  0.779346
Recall                     0.164869
F-Score                    0.264860
Filtered                   0.980198
dtype: float64


In [88]:
df_cnndm_rl_ref.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Episode 32 (RL),80,377,377,0.985507,0.356021,0.523077,1
Episode 56 (RL),63,351,351,0.910714,0.326923,0.481132,1
Episode 42 (RL),79,546,188,0.692308,0.357143,0.471204,1
Episode 78 (RL),77,498,290,0.73913,0.326923,0.453333,1
Episode 82 (RL),82,702,460,0.970149,0.286344,0.442177,1


***

# **Wikihow**

In [32]:
df_orig_wikihow, df_ref_wikihow, df_filtered_wikihow, df_leadn_wikihow = calculate_rouge_qubart(data=wikihow_lines, len_data=len(wikihow_lines), similarity_threshold=0.2, use_columns=True)
df_wikihow_r1, df_wikihow_r2, df_wikihow_rl = get_rouge_columns(df_orig_wikihow)
df_wikihow_r1_filtered, df_wikihow_r2_filtered, df_wikihow_rl_filtered = get_rouge_columns(df_filtered_wikihow)
df_wikihow_r1_lead3, df_wikihow_r2_lead3, df_wikihow_rl_lead3 = get_rouge_columns(df_leadn_wikihow)
if not df_ref_wikihow.empty:
    df_wikihow_r1_ref, df_wikihow_r2_ref, df_wikihow_rl_ref = get_rouge_columns(df_ref_wikihow)

  0%|          | 0/100 [00:00<?, ?it/s]

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 

## **R1 metric**

In [33]:
df_wikihow_r1.mean()

Reference data length    600.980000
Filtered data length     550.780000
Summary length            54.490000
Precision                  0.204248
Recall                     0.875236
F-Score                    0.316176
Filtered                   0.950000
dtype: float64

In [89]:
df_wikihow_r1.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Entry 12 (R1),66,66,66,0.909091,0.925926,0.917431,1
Entry 67 (R1),80,80,52,0.633333,0.95,0.76,1
Entry 10 (R1),138,138,60,0.470588,0.930233,0.625,1
Entry 85 (R1),174,174,61,0.464646,0.938776,0.621622,1
Entry 23 (R1),165,0,55,0.444444,0.930233,0.601504,0


In [34]:
df_wikihow_r1_filtered.mean()

Reference data length    610.263158
Filtered data length     579.768421
Summary length            54.178947
Precision                  0.218269
Recall                     0.872658
F-Score                    0.321427
Filtered                   1.000000
dtype: float64

In [90]:
df_wikihow_r1_filtered.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Entry 12 (R1),66,66,66,0.909091,0.925926,0.917431,1
Entry 67 (R1),80,80,52,0.633333,0.95,0.76,1
Entry 10 (R1),138,138,60,0.470588,0.930233,0.625,1
Entry 85 (R1),174,174,61,0.464646,0.938776,0.621622,1
Entry 56 (R1),156,156,53,0.446809,0.913043,0.6,1


In [35]:
df_wikihow_r1_lead3.mean()

Reference data length     58.230000
Filtered data length     550.780000
Summary length           479.590000
Precision                  0.670821
Recall                     0.135838
F-Score                    0.211540
Filtered                   0.950000
dtype: float64

In [91]:
df_wikihow_r1_lead3.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Entry 92 (R1),159,609,127,0.433735,0.428571,0.431138,1
Entry 48 (R1),88,273,273,0.787234,0.286822,0.420455,1
Entry 26 (R1),64,266,266,0.804878,0.270492,0.404908,1
Entry 19 (R1),137,0,348,0.556962,0.275,0.368201,1
Entry 10 (R1),82,138,138,0.421875,0.317647,0.362416,1


In [36]:
if not df_ref_wikihow.empty:
    print(df_wikihow_r1_ref.mean())

Reference data length     61.306931
Filtered data length     548.980198
Summary length           475.069307
Precision                  0.670544
Recall                     0.136175
F-Score                    0.212106
Filtered                   0.950495
dtype: float64


In [92]:
df_wikihow_r1_ref.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Entry 92 (R1),159,609,127,0.433735,0.428571,0.431138,1
Entry 48 (R1),88,273,273,0.787234,0.286822,0.420455,1
Entry 26 (R1),64,266,266,0.804878,0.270492,0.404908,1
Entry 19 (R1),137,0,348,0.556962,0.275,0.368201,1
Entry 10 (R1),82,138,138,0.421875,0.317647,0.362416,1


## **R2 metric**

In [37]:
df_wikihow_r2.mean()

Reference data length    600.980000
Filtered data length     550.780000
Summary length            54.490000
Precision                  0.121984
Recall                     0.777090
F-Score                    0.197043
Filtered                   0.950000
dtype: float64

In [93]:
df_wikihow_r2.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Episode 12 (R2),66,66,66,0.850746,0.876923,0.863636,1
Episode 67 (R2),80,80,52,0.551724,0.888889,0.680851,1
Episode 10 (R2),138,138,60,0.38806,0.896552,0.541667,1
Episode 23 (R2),165,0,55,0.345324,0.888889,0.497409,0
Episode 85 (R2),174,174,61,0.316129,0.830508,0.457944,1


In [38]:
df_wikihow_r2_filtered.mean()

Reference data length    610.263158
Filtered data length     579.768421
Summary length            54.178947
Precision                  0.123994
Recall                     0.774288
F-Score                    0.199824
Filtered                   1.000000
dtype: float64

In [94]:
df_wikihow_r2_filtered.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Episode 12 (R2),66,66,66,0.850746,0.876923,0.863636,1
Episode 67 (R2),80,80,52,0.551724,0.888889,0.680851,1
Episode 10 (R2),138,138,60,0.38806,0.896552,0.541667,1
Episode 85 (R2),174,174,61,0.316129,0.830508,0.457944,1
Episode 8 (R2),179,179,68,0.306818,0.830769,0.448133,1


In [39]:
df_wikihow_r2_lead3.mean()

Reference data length     58.230000
Filtered data length     550.780000
Summary length           479.590000
Precision                  0.249316
Recall                     0.032478
F-Score                    0.053102
Filtered                   0.950000
dtype: float64

In [95]:
df_wikihow_r2_lead3.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Episode 26 (R2),64,266,266,0.483333,0.124464,0.197952,1
Episode 48 (R2),88,273,273,0.356164,0.109705,0.167742,1
Episode 92 (R2),159,609,127,0.147059,0.15748,0.152091,1
Episode 2 (R2),140,261,261,0.15,0.078947,0.103448,1
Episode 10 (R2),82,138,138,0.134146,0.08209,0.101852,1


In [40]:
if not df_ref_wikihow.empty:
    print(df_wikihow_r2_ref.mean())

Reference data length     61.306931
Filtered data length     548.980198
Summary length           475.069307
Precision                  0.249760
Recall                     0.032995
F-Score                    0.053879
Filtered                   0.950495
dtype: float64


In [96]:
df_wikihow_r2_ref.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Episode 26 (R2),64,266,266,0.483333,0.124464,0.197952,1
Episode 48 (R2),88,273,273,0.356164,0.109705,0.167742,1
Episode 92 (R2),159,609,127,0.147059,0.15748,0.152091,1
Episode 99 (R2),369,369,23,0.294118,0.084746,0.131579,1
Episode 2 (R2),140,261,261,0.15,0.078947,0.103448,1


## **RL metric**

In [41]:
df_wikihow_rl.mean()

Reference data length    600.980000
Filtered data length     550.780000
Summary length            54.490000
Precision                  0.203947
Recall                     0.874032
F-Score                    0.315720
Filtered                   0.950000
dtype: float64

In [97]:
df_wikihow_rl.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Episode 12 (RL),66,66,66,0.909091,0.925926,0.917431,1
Episode 67 (RL),80,80,52,0.633333,0.95,0.76,1
Episode 10 (RL),138,138,60,0.470588,0.930233,0.625,1
Episode 85 (RL),174,174,61,0.464646,0.938776,0.621622,1
Episode 23 (RL),165,0,55,0.444444,0.930233,0.601504,0


In [42]:
df_wikihow_rl_filtered.mean()

Reference data length    610.263158
Filtered data length     579.768421
Summary length            54.178947
Precision                  0.217953
Recall                     0.871392
F-Score                    0.320947
Filtered                   1.000000
dtype: float64

In [98]:
df_wikihow_rl_filtered.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Episode 12 (RL),66,66,66,0.909091,0.925926,0.917431,1
Episode 67 (RL),80,80,52,0.633333,0.95,0.76,1
Episode 10 (RL),138,138,60,0.470588,0.930233,0.625,1
Episode 85 (RL),174,174,61,0.464646,0.938776,0.621622,1
Episode 56 (RL),156,156,53,0.446809,0.913043,0.6,1


In [43]:
df_wikihow_rl_lead3.mean()

Reference data length     58.230000
Filtered data length     550.780000
Summary length           479.590000
Precision                  0.638360
Recall                     0.128151
F-Score                    0.200181
Filtered                   0.950000
dtype: float64

In [99]:
df_wikihow_rl_lead3.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Episode 48 (RL),88,273,273,0.787234,0.286822,0.420455,1
Episode 26 (RL),64,266,266,0.804878,0.270492,0.404908,1
Episode 92 (RL),159,609,127,0.39759,0.392857,0.39521,1
Episode 19 (RL),137,0,348,0.518987,0.25625,0.343096,1
Episode 10 (RL),82,138,138,0.390625,0.294118,0.33557,1


In [44]:
if not df_ref_wikihow.empty:  
    print(df_wikihow_rl_ref.mean())

Reference data length     61.306931
Filtered data length     548.980198
Summary length           475.069307
Precision                  0.637697
Recall                     0.128376
F-Score                    0.200564
Filtered                   0.950495
dtype: float64


In [100]:
df_wikihow_rl_ref.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Episode 48 (RL),88,273,273,0.787234,0.286822,0.420455,1
Episode 26 (RL),64,266,266,0.804878,0.270492,0.404908,1
Episode 92 (RL),159,609,127,0.39759,0.392857,0.39521,1
Episode 19 (RL),137,0,348,0.518987,0.25625,0.343096,1
Episode 10 (RL),82,138,138,0.390625,0.294118,0.33557,1


***

# **XSum**

In [45]:
df_orig_xsum, df_ref_xsum, df_filtered_xsum, df_leadn_xsum = calculate_rouge_qubart(data=xsum_lines, len_data=len(xsum_lines), similarity_threshold=0.2, use_columns=True)
df_xsum_r1, df_xsum_r2, df_xsum_rl = get_rouge_columns(df_orig_xsum)
df_xsum_r1_filtered, df_xsum_r2_filtered, df_xsum_rl_filtered = get_rouge_columns(df_filtered_xsum)
df_xsum_r1_lead3, df_xsum_r2_lead3, df_xsum_rl_lead3 = get_rouge_columns(df_leadn_xsum)
if not df_ref_xsum.empty:
    df_xsum_r1_ref, df_xsum_r2_ref, df_xsum_rl_ref = get_rouge_columns(df_ref_xsum)

  0%|          | 0/100 [00:00<?, ?it/s]

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 

## **R1 metric**

In [46]:
df_xsum_r1.mean()

Reference data length    405.980000
Filtered data length     342.100000
Summary length            50.990000
Precision                  0.249228
Recall                     0.883862
F-Score                    0.363612
Filtered                   0.910000
dtype: float64

In [101]:
df_xsum_r1.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Entry 38 (R1),63,63,61,0.938776,0.901961,0.92,1
Entry 47 (R1),56,0,58,0.934783,0.86,0.895833,0
Entry 24 (R1),59,0,53,0.84,0.954545,0.893617,0
Entry 0 (R1),51,51,53,0.723404,0.790698,0.755556,1
Entry 84 (R1),88,88,52,0.634921,0.930233,0.754717,1


In [47]:
df_xsum_r1_filtered.mean()

Reference data length    407.340659
Filtered data length     375.934066
Summary length            51.142857
Precision                  0.269266
Recall                     0.880395
F-Score                    0.376324
Filtered                   1.000000
dtype: float64

In [102]:
df_xsum_r1_filtered.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Entry 38 (R1),63,63,61,0.938776,0.901961,0.92,1
Entry 32 (R1),268,20,56,0.913043,0.65625,0.763636,1
Entry 0 (R1),51,51,53,0.723404,0.790698,0.755556,1
Entry 84 (R1),88,88,52,0.634921,0.930233,0.754717,1
Entry 11 (R1),84,84,53,0.597403,0.938776,0.730159,1


In [48]:
df_xsum_r1_lead3.mean()

Reference data length     22.380000
Filtered data length     342.100000
Summary length           349.400000
Precision                  0.620619
Recall                     0.082302
F-Score                    0.141569
Filtered                   0.910000
dtype: float64

In [103]:
df_xsum_r1_lead3.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Entry 84 (R1),20,88,88,0.631579,0.190476,0.292683,1
Entry 38 (R1),13,63,63,0.615385,0.163265,0.258065,1
Entry 53 (R1),26,109,109,0.565217,0.158537,0.247619,1
Entry 39 (R1),21,93,93,0.52381,0.15942,0.244444,1
Entry 45 (R1),27,139,139,0.576923,0.148515,0.23622,1


In [49]:
if not df_ref_xsum.empty:
    print(df_xsum_r1_ref.mean())

Reference data length     30.227723
Filtered data length     346.376238
Summary length           346.188119
Precision                  0.615941
Recall                     0.082619
F-Score                    0.141445
Filtered                   0.910891
dtype: float64


In [104]:
df_xsum_r1_ref.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Entry 84 (R1),20,88,88,0.631579,0.190476,0.292683,1
Entry 38 (R1),13,63,63,0.615385,0.163265,0.258065,1
Entry 53 (R1),26,109,109,0.565217,0.158537,0.247619,1
Entry 39 (R1),21,93,93,0.52381,0.15942,0.244444,1
Entry 45 (R1),27,139,139,0.576923,0.148515,0.23622,1


## **R2 metric**

In [50]:
df_xsum_r2.mean()

Reference data length    405.980000
Filtered data length     342.100000
Summary length            50.990000
Precision                  0.162368
Recall                     0.762958
F-Score                    0.242643
Filtered                   0.910000
dtype: float64

In [105]:
df_xsum_r2.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Episode 38 (R2),63,63,61,0.809524,0.836066,0.822581,1
Episode 47 (R2),56,0,58,0.818182,0.789474,0.803571,0
Episode 24 (R2),59,0,53,0.693548,0.826923,0.754386,0
Episode 11 (R2),84,84,53,0.516484,0.87037,0.648276,1
Episode 84 (R2),88,88,52,0.505882,0.86,0.637037,1


In [51]:
df_xsum_r2_filtered.mean()

Reference data length    407.340659
Filtered data length     375.934066
Summary length            51.142857
Precision                  0.169658
Recall                     0.759728
F-Score                    0.251296
Filtered                   1.000000
dtype: float64

In [106]:
df_xsum_r2_filtered.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Episode 38 (R2),63,63,61,0.809524,0.836066,0.822581,1
Episode 11 (R2),84,84,53,0.516484,0.87037,0.648276,1
Episode 84 (R2),88,88,52,0.505882,0.86,0.637037,1
Episode 76 (R2),189,85,49,0.460674,0.854167,0.59854,1
Episode 0 (R2),51,51,53,0.584906,0.607843,0.596154,1


In [52]:
df_xsum_r2_lead3.mean()

Reference data length     22.380000
Filtered data length     342.100000
Summary length           349.400000
Precision                  0.160156
Recall                     0.012272
F-Score                    0.022228
Filtered                   0.910000
dtype: float64

In [107]:
df_xsum_r2_lead3.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Episode 39 (R2),21,93,93,0.190476,0.042553,0.069565,1
Episode 45 (R2),27,139,139,0.214286,0.041379,0.069364,1
Episode 26 (R2),21,144,144,0.25,0.034965,0.06135,1
Episode 15 (R2),22,103,188,0.272727,0.031915,0.057143,1
Episode 50 (R2),20,217,261,0.35,0.027559,0.051095,1


In [53]:
if not df_ref_xsum.empty:
    print(df_xsum_r2_ref.mean())

Reference data length     30.227723
Filtered data length     346.376238
Summary length           346.188119
Precision                  0.158570
Recall                     0.012151
F-Score                    0.022008
Filtered                   0.910891
dtype: float64


In [108]:
df_xsum_r2_ref.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Episode 39 (R2),21,93,93,0.190476,0.042553,0.069565,1
Episode 45 (R2),27,139,139,0.214286,0.041379,0.069364,1
Episode 26 (R2),21,144,144,0.25,0.034965,0.06135,1
Episode 15 (R2),22,103,188,0.272727,0.031915,0.057143,1
Episode 50 (R2),20,217,261,0.35,0.027559,0.051095,1


## **RL metric**

In [54]:
df_xsum_rl.mean()

Reference data length    405.980000
Filtered data length     342.100000
Summary length            50.990000
Precision                  0.247542
Recall                     0.879709
F-Score                    0.361348
Filtered                   0.910000
dtype: float64

In [109]:
df_xsum_rl.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Episode 38 (RL),63,63,61,0.938776,0.901961,0.92,1
Episode 47 (RL),56,0,58,0.934783,0.86,0.895833,0
Episode 24 (RL),59,0,53,0.84,0.954545,0.893617,0
Episode 84 (RL),88,88,52,0.634921,0.930233,0.754717,1
Episode 11 (RL),84,84,53,0.597403,0.938776,0.730159,1


In [55]:
df_xsum_rl_filtered.mean()

Reference data length    407.340659
Filtered data length     375.934066
Summary length            51.142857
Precision                  0.267563
Recall                     0.876643
F-Score                    0.374091
Filtered                   1.000000
dtype: float64

In [110]:
df_xsum_rl_filtered.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Episode 38 (RL),63,63,61,0.938776,0.901961,0.92,1
Episode 32 (RL),268,20,56,0.913043,0.65625,0.763636,1
Episode 84 (RL),88,88,52,0.634921,0.930233,0.754717,1
Episode 11 (RL),84,84,53,0.597403,0.938776,0.730159,1
Episode 0 (RL),51,51,53,0.680851,0.744186,0.711111,1


In [56]:
df_xsum_rl_lead3.mean()

Reference data length     22.380000
Filtered data length     342.100000
Summary length           349.400000
Precision                  0.540540
Recall                     0.070307
F-Score                    0.121312
Filtered                   0.910000
dtype: float64

In [111]:
df_xsum_rl_lead3.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Episode 84 (RL),20,88,88,0.526316,0.15873,0.243902,1
Episode 83 (RL),31,409,409,0.75,0.117647,0.20339,1
Episode 39 (RL),21,93,93,0.428571,0.130435,0.2,1
Episode 64 (RL),21,708,221,0.636364,0.114754,0.194444,1
Episode 61 (RL),30,258,298,0.678571,0.110465,0.19,1


In [57]:
if not df_ref_xsum.empty:
    print(df_xsum_rl_ref.mean())

Reference data length     30.227723
Filtered data length     346.376238
Summary length           346.188119
Precision                  0.536288
Recall                     0.070459
F-Score                    0.121069
Filtered                   0.910891
dtype: float64


In [112]:
df_xsum_rl_ref.sort_values(by=['F-Score'], ascending=False).head()

Unnamed: 0,Reference data length,Filtered data length,Summary length,Precision,Recall,F-Score,Filtered
Episode 84 (RL),20,88,88,0.526316,0.15873,0.243902,1
Episode 83 (RL),31,409,409,0.75,0.117647,0.20339,1
Episode 39 (RL),21,93,93,0.428571,0.130435,0.2,1
Episode 64 (RL),21,708,221,0.636364,0.114754,0.194444,1
Episode 61 (RL),30,258,298,0.678571,0.110465,0.19,1


***