## Imports (needs to be run only once)

In [6]:
# On the first run, uncomment the two lines below to download the SpaCy model
# and install the other required packages, then comment them out again.
# !python -m spacy download fr_core_news_sm
# !python -m pip install ipynb

# MLSUM Corpus
from datasets import load_dataset

# Loading article data
import json

# Our packages
from eval.rouge_l import RougeLEval
from summ.lsa import LSASummarizer

from tqdm import tqdm

# Load the 'fr' configuration of the MLSUM corpus (served from the local
# HuggingFace cache when it has already been downloaded).
dataset = load_dataset(path='mlsum', name='fr')

# Shared evaluator / summarizer instances reused by every cell below.
rouge_l, lsa_summ = RougeLEval(), LSASummarizer()

Reusing dataset mlsum (C:\Users\maxim\.cache\huggingface\datasets\mlsum\fr\1.0.0\77f23eb185781f439927ac2569ab1da1083195d8b2dab2b2f6bbe52feb600688)


  0%|          | 0/3 [00:00<?, ?it/s]

## Summarize a single article

In [3]:
# Pick an article and its reference summary.
# Indexing the split by row first fetches only that example; the previous
# column-first form (dataset['train']['text'][4]) decoded the entire column
# just to read one element.
ARTICLE_INDEX = 4
example = dataset['train'][ARTICLE_INDEX]
article = example['text']
ref_summ = example['summary']

# Compute the summary and its evaluation.
# get_summary returns an indexable pair; judging by the recorded output,
# [0] is the readable summary and [1] its keyword/lemmatized form — TODO confirm.
gen_summ = lsa_summ.get_summary(article)
scores1, scores2 = rouge_l.evaluate_one(ref_summ, gen_summ)

# Both generated forms, the reference and the first score set are separated by
# blank lines; the second score set follows directly on the next line.
print(gen_summ[0], gen_summ[1], ref_summ, scores1, sep='\n\n')
print(scores2)

Les secours ont été prévenus vers 5 heures du matin, mais "l'incendie avait déjà bien démarré", a-t-il expliqué.

secours être prévenir 5 heure matin incendie bien démarrer -t il expliquer

Cinq personnes sont mortes, et treize autres ont été blessées à Nîmes, dans le Gard, dans un incendie qui s'est déclenché vendredi 1er janvier au petit matin.

{'rougeL': Score(precision=0.16666666666666666, recall=0.13333333333333333, fmeasure=0.14814814814814814)}
{'rougeL': Score(precision=0.21428571428571427, recall=0.2, fmeasure=0.20689655172413796)}


## Summarize a series of articles

In [8]:
# Number of test articles to summarize and evaluate. Defined once so the
# articles summarized and the count handed to evaluate_many cannot drift apart.
N_ARTICLES = 5

# Slicing the split by rows first reads only the examples we need, instead of
# materialising the full 'text' and 'summary' columns of the test split.
test_subset = dataset['test'][:N_ARTICLES]
texts = test_subset['text']
ref_summs = test_subset['summary']

# One generated summary per selected article (progress shown by tqdm)
gen_summs = [lsa_summ.get_summary(text) for text in tqdm(texts)]

# The third argument is presumably the number of (reference, generated) pairs
# to score — TODO confirm against RougeLEval.evaluate_many.
scores1, scores2 = rouge_l.evaluate_many(ref_summs, gen_summs, N_ARTICLES)
results = rouge_l.get_results(scores1, scores2)

# Report each aggregated metric as a percentage, labels padded to a common width
for k, v in results.items():
    print(k.ljust(25), round(v*100, 3), '%')

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:22<00:00, 16.51s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 43.59it/s]

Long precision avg        16.959 %
Keyword precision avg     18.962 %
Long recall avg           17.561 %
Keyword recall avg        8.933 %
Long F1-score avg         12.231 %
Keyword F1-score avg      9.841 %





#### Optional: Save generated summaries to file

In [24]:
# Write every generated summary pair to generated.txt, each item followed by a
# blank line. The encoding is set explicitly: without it the platform default
# is used (e.g. cp1252 on Windows), which can raise UnicodeEncodeError on
# characters found in French articles and makes the file non-portable.
with open('generated.txt', 'w', encoding='utf-8') as f:
    for summ1, summ2 in tqdm(gen_summs):
        f.write(f'{summ1}\n\n{summ2}\n\n')

100%|████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<?, ?it/s]


## Summarize a series of scraped articles

In [29]:
# Read the scraped articles; each entry is used below as a mapping with
# 'text' and 'summary' keys.
with open('data/actu_preliminary.json', 'r', encoding='utf-8') as jsonfile:
    data = json.load(jsonfile)

# Split the entries into parallel lists of article bodies and reference summaries
texts, ref_summs = [], []
for entry in data:
    texts.append(entry['text'])
    ref_summs.append(entry['summary'])

# Summarize every scraped article (progress shown by tqdm)
gen_summs = [lsa_summ.get_summary(body) for body in tqdm(texts)]

# Score all generated summaries against their references and aggregate
scores1, scores2 = rouge_l.evaluate_many(ref_summs, gen_summs)
results = rouge_l.get_results(scores1, scores2)

# NOTE(review): in the last recorded run "Long F1-score avg" (63.779 %) was
# higher than both the Long precision and Long recall averages. A per-article
# F1 can never exceed the mean of its precision and recall, so the labels or
# values returned by get_results may be mismatched — confirm in eval.rouge_l.
for label, value in results.items():
    print(label.ljust(25), round(value*100, 3), '%')

100%|██████████████████████████████████████████████████████████████████████████████████| 47/47 [14:35<00:00, 18.63s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 47/47 [00:00<00:00, 64.20it/s]

Long precision avg        54.567 %
Keyword precision avg     63.049 %
Long recall avg           57.536 %
Keyword recall avg        52.699 %
Long F1-score avg         63.779 %
Keyword F1-score avg      56.84 %



