## Imports and setup (only needs to be run once)

In [2]:
# If you get an import or missing-model error, uncomment the two lines below
# to download the required spaCy model and install the Python dependencies.
# Run the cell once, then comment them out again.
# !python -m spacy download fr_core_news_sm
# !python -m pip install -r requirements.txt

# MLSUM Corpus
from datasets import load_dataset

# Loading article data
import json

# Our packages
from eval.rouge_l import RougeLEval
from summ.lsa import LSASummarizer
from summ.bert_embed import BertEmbeddingsSummarizer

# BERTScore import
# you can install this package by running pip install bert-score
from bert_score import BERTScorer

from tqdm import tqdm

# French ('fr') configuration of the MLSUM corpus
dataset = load_dataset('mlsum', 'fr')

# ROUGE-L evaluator shared by every evaluation cell below
rouge_l = RougeLEval()

# One LSASummarizer plus one BertEmbeddingsSummarizer per pretrained checkpoint
lsa_summ = LSASummarizer()
flaubert_checkpoint = 'flaubert/flaubert_large_cased'
camembert_checkpoint = 'camembert/camembert-large'
flaubert_summ = BertEmbeddingsSummarizer(flaubert_checkpoint)
camembert_summ = BertEmbeddingsSummarizer(camembert_checkpoint)

Reusing dataset mlsum (C:\Users\maxim\.cache\huggingface\datasets\mlsum\fr\1.0.0\77f23eb185781f439927ac2569ab1da1083195d8b2dab2b2f6bbe52feb600688)


  0%|          | 0/3 [00:00<?, ?it/s]

Some weights of the model checkpoint at flaubert/flaubert_large_cased were not used when initializing FlaubertModel: ['pred_layer.proj.weight', 'pred_layer.proj.bias']
- This IS expected if you are initializing FlaubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing FlaubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at camembert/camembert-large were not used when initializing CamembertModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing CamembertModel from the checkpoint of a model trained on another task

## Summarize a single article

In [5]:
# Pick an article and its reference summary.
# The row is fetched once by index, so the article and its reference summary
# are guaranteed to come from the same record and the full 'text'/'summary'
# columns are not read just to extract one item each.
ARTICLE_INDEX = 54
sample = dataset['test'][ARTICLE_INDEX]
article = sample['text']
ref_summ = sample['summary']

# Compute the summary and its evaluation.
# get_summary returns two variants of the summary (indexed 0 and 1 below);
# evaluate_one returns one score object per variant.
gen_summ = flaubert_summ.get_summary(article)
scores1, scores2 = rouge_l.evaluate_one(ref_summ, gen_summ)

# Show both generated variants, then the reference, each followed by a blank line
print(gen_summ[0], end='\n\n')
print(gen_summ[1], end='\n\n')
print(ref_summ, end='\n\n')

# ROUGE-L scores for the first and second variant respectively
print(scores1)
print(scores2)

» Touché, le tireur a succombé à ses blessures.
Tim Kaine.
, a raconté M. Cervera.
Kaitlin McKeown / AP
Repérant l’
Article réservé à nos abonnés

Touché tireur succomber blessure
Tim Kaine
raconter m. Cervera
Kaitlin mckeown AP
repérant
article réserver abonné

Le suspect principal, un employé des services de la ville, a tiré « à l’aveugle ». Il est lui aussi décédé.

{'rougeL': Score(precision=0.12, recall=0.14285714285714285, fmeasure=0.13043478260869565)}
{'rougeL': Score(precision=0.0, recall=0.0, fmeasure=0.0)}


## Summarize a series of articles

In [6]:
# Number of test articles to summarize and evaluate.
# Kept in a single constant so the slice below and the count passed to
# evaluate_many can never get out of sync. Summarization is slow, so keep it small.
N_ARTICLES = 5

texts = dataset['test']['text']
ref_summs = dataset['test']['summary']

# Summarize the first N_ARTICLES articles
gen_summs = []
for text in tqdm(texts[:N_ARTICLES]):
    gen_summs.append(flaubert_summ.get_summary(text))

# Score the generated summaries against their references and aggregate the results
scores1, scores2 = rouge_l.evaluate_many(ref_summs, gen_summs, N_ARTICLES)
results = rouge_l.get_results(scores1, scores2)

# One line per aggregated metric, shown as a percentage
for metric, value in results.items():
    print(metric.ljust(25), round(value*100, 3), '%')

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [06:51<00:00, 82.33s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 10.18it/s]

Long precision avg        10.628 %
Long recall avg           9.801 %
Long F1-score avg         10.176 %
Keyword precision avg     5.288 %
Keyword recall avg        4.038 %
Keyword F1-score avg      4.538 %





#### Optional: Save generated summaries to file

In [None]:
# Write every pair of generated summaries to 'generated.txt', each summary
# followed by a blank line.
# The encoding is set explicitly: the summaries are French text, and the
# platform default encoding (e.g. cp1252 on Windows) may fail on or mis-encode
# some characters. utf-8 also matches how the scraped JSON data is read.
with open('generated.txt', 'w', encoding='utf-8') as f:
    for summ1, summ2 in tqdm(gen_summs):
        f.write(summ1)
        f.write('\n\n')
        f.write(summ2)
        f.write('\n\n')

## Summarize a series of scraped articles

In [None]:
# Load the scraped articles: a JSON array of records with 'text' and 'summary' fields
with open('data/actu_preliminary.json', 'r', encoding='utf-8') as fh:
    data = json.load(fh)

# Split each record into the article body and its reference summary
texts, ref_summs = [], []
for record in data:
    texts.append(record['text'])
    ref_summs.append(record['summary'])

# Summarize every scraped article with the LSA summarizer
gen_summs = [lsa_summ.get_summary(body) for body in tqdm(texts)]

# Score against the references and aggregate
scores1, scores2 = rouge_l.evaluate_many(ref_summs, gen_summs)
results = rouge_l.get_results(scores1, scores2)

# One line per aggregated metric, shown as a percentage
for metric, value in results.items():
    print(metric.ljust(25), round(value*100, 3), '%')