## Imports (needs to be run only once)

In [1]:
# On the first run, uncomment the two lines below to download the SpaCy model
# and install the other required packages, then comment them out again.
# !python -m spacy download fr_core_news_sm
# !python -m pip install ipynb

# MLSUM Corpus
from datasets import load_dataset

# Loading article data
import json

# Our packages
from eval.rouge_l import RougeLEval
from summ.lsa import LSASummarizer
from bert_score import BERTScorer

from tqdm import tqdm

# Load the 'fr' configuration of the 'mlsum' dataset (the MLSUM corpus).
# Later cells read its 'test' split through the 'text' and 'summary' columns.
dataset = load_dataset('mlsum', 'fr')

# Shared evaluator and summarizer instances, reused by every cell below.
rouge_l = RougeLEval()
lsa_summ = LSASummarizer()

Reusing dataset mlsum (/Users/josephkeenan/.cache/huggingface/datasets/mlsum/fr/1.0.0/77f23eb185781f439927ac2569ab1da1083195d8b2dab2b2f6bbe52feb600688)


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




## Summarize a single article

In [2]:
# Index, within the MLSUM test split, of the article to summarize.
ARTICLE_INDEX = 54

# Fetch that single row once instead of reading the whole 'text' and
# 'summary' columns just to pick one element out of each.
sample = dataset['test'][ARTICLE_INDEX]
article = sample['text']
ref_summ = sample['summary']

# Generate the summary. get_summary returns a pair; judging by the recorded
# output, element 0 is the sentence summary and element 1 a keyword-only form.
gen_summ = lsa_summ.get_summary(article)

# evaluate_one returns two score objects for the (reference, generated) pair.
scores1, scores2 = rouge_l.evaluate_one(ref_summ, gen_summ)

# Print both generated variants, then the reference, then both scores.
print(gen_summ[0])
print()
print(gen_summ[1])
print()
print(ref_summ)
print()
print(scores1)
print(scores2)

Douze personnes ont été abattues vendredi 31 mai par un tireur dans un bâtiment municipal de Virginia Beach (Etat de Virginie), station balnéaire de la côte est américaine.

personne abattre vendredi 31 mai tireur bâtiment municipal Virginia Beach etat Virginie station balnéaire côte américain

Le suspect principal, un employé des services de la ville, a tiré « à l’aveugle ». Il est lui aussi décédé.

{'rougeL': Score(precision=0.125, recall=0.19047619047619047, fmeasure=0.1509433962264151)}
{'rougeL': Score(precision=0.05, recall=0.1, fmeasure=0.06666666666666667)}


## Summarize a series of articles

In [4]:
# Number of test articles to summarize and evaluate. Change it here only:
# the slice below and the evaluate_many call both use this value.
N_ARTICLES = 5

texts = dataset['test']['text']
ref_summs = dataset['test']['summary']

# Summarize the first N_ARTICLES articles (tqdm shows a progress bar).
gen_summs = []
for text in tqdm(texts[:N_ARTICLES]):
    gen_summs.append(lsa_summ.get_summary(text))

# The third argument is presumably the number of (reference, generated)
# pairs to score -- TODO confirm against RougeLEval.evaluate_many.
scores1, scores2 = rouge_l.evaluate_many(ref_summs, gen_summs, N_ARTICLES)
results = rouge_l.get_results(scores1, scores2)

# Print each aggregated metric as a percentage, names padded to 25 columns.
for k, v in results.items():
    print(k.ljust(25), round(v*100, 3), '%')

100%|█████████████████████████████████████████████| 5/5 [00:09<00:00,  1.91s/it]
100%|█████████████████████████████████████████████| 5/5 [00:00<00:00, 67.48it/s]

Long precision avg        16.572 %
Keyword precision avg     18.134 %
Long recall avg           16.915 %
Keyword recall avg        10.552 %
Long F1-score avg         13.425 %
Keyword F1-score avg      11.365 %





#### Optional: Save generated summaries to file

In [24]:
# Write every generated summary pair to 'generated.txt', each summary followed
# by a blank line. The encoding is set explicitly because the summaries are
# French text with accented characters: relying on the platform default
# encoding can raise UnicodeEncodeError or produce a file that is not UTF-8.
# This also matches the explicit UTF-8 used when reading the scraped JSON.
with open('generated.txt', 'w', encoding='utf-8') as f:
    for summ1, summ2 in tqdm(gen_summs):
        f.write(summ1)
        f.write('\n\n')
        f.write(summ2)
        f.write('\n\n')

100%|████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<?, ?it/s]


## Summarize a series of scraped articles

In [2]:
# Read the scraped articles: a JSON list of records, each of which provides
# a 'text' and a 'summary' entry.
with open('data/actu_preliminary.json', 'r', encoding='utf-8') as scraped_file:
    data = json.load(scraped_file)

texts = [record['text'] for record in data]
ref_summs = [record['summary'] for record in data]

# Summarize every scraped article (tqdm shows a progress bar).
gen_summs = [lsa_summ.get_summary(body) for body in tqdm(texts)]

# Score all generated summaries against their references and aggregate.
scores1, scores2 = rouge_l.evaluate_many(ref_summs, gen_summs)
results = rouge_l.get_results(scores1, scores2)

# Print each aggregated metric as a percentage, names padded to 25 columns.
for metric, value in results.items():
    print(f"{metric.ljust(25)} {round(value*100, 3)} %")

100%|███████████████████████████████████████████| 47/47 [01:18<00:00,  1.68s/it]
100%|███████████████████████████████████████████| 47/47 [00:00<00:00, 85.18it/s]

Long precision avg        50.316 %
Keyword precision avg     58.72 %
Long recall avg           53.568 %
Keyword recall avg        48.97 %
Long F1-score avg         57.839 %
Keyword F1-score avg      52.371 %



