## ONLY if running on Colaboratory, run this cell first (once)

In [None]:
# Clone the newsjam repository into ./newsjam.
!git clone https://github.com/pie3636/newsjam.git
# Move the top-level contents of the checkout into the current working directory,
# presumably so that requirements.txt and the eval/summ packages used by the
# following cells are found from here.
!mv newsjam/* .

## Install missing modules if needed (only run once)

In [None]:
!python -m pip install -r requirements.txt
!python -m spacy download fr_core_news_sm
# Note: You'll have to restart the kernel/runtime after running this cell

## Imports (only run once)

In [1]:
# MLSUM Corpus
from datasets import load_dataset

# Loading article data
import json

# Our packages
from eval.rouge_l import RougeLEval
from eval.bert_eval import BERT_Eval
from eval.time import TimeEval
from summ.lsa import LSASummarizer
from summ.bert_embed import BertEmbeddingsSummarizer

from tqdm import tqdm

# Checkpoint names handed to the embedding-based summarizers.
flaubert_checkpoint = 'flaubert/flaubert_large_cased'
camembert_checkpoint = 'camembert/camembert-large'

# MLSUM, French configuration; the cells below read its 'test' split.
dataset = load_dataset('mlsum', 'fr')

# Evaluation helpers from the project's eval package.
rouge_l = RougeLEval()
bert = BERT_Eval()
timer = TimeEval()

# Summarizers from the project's summ package: LSA, then one embeddings
# summarizer per checkpoint.
lsa_summ = LSASummarizer()
flaubert_summ = BertEmbeddingsSummarizer(flaubert_checkpoint)
camembert_summ = BertEmbeddingsSummarizer(camembert_checkpoint)

Reusing dataset mlsum (C:\Users\maxim\.cache\huggingface\datasets\mlsum\fr\1.0.0\77f23eb185781f439927ac2569ab1da1083195d8b2dab2b2f6bbe52feb600688)


  0%|          | 0/3 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at flaubert/flaubert_large_cased were not used when initializing FlaubertModel: ['pred_layer.proj.bias', 'p

## Summarize a single article

In [None]:
# Pick one test article and its reference summary.
# The row is selected before the column: dataset['test']['text'][54] would load
# the whole 'text' column just to keep a single entry.
article_index = 54
sample = dataset['test'][article_index]
article = sample['text']
ref_summ = sample['summary']

# Computes the summary and evaluation through the timing evaluator. The summarizer
# class and the model name are passed (not the camembert_summ instance), and
# ref_summ is not passed to this call.
timer.evaluate_one(article, BertEmbeddingsSummarizer, 'camembert/camembert-large')

## Summarize a series of articles

In [2]:
# Take the first 10 test articles together with their reference summaries.
# The rows are sliced before the column is selected, so the whole split is not
# loaded just to keep ten entries.
n_articles = 10
batch = dataset['test'][:n_articles]
texts = batch['text']
ref_summs = batch['summary']

# Disabled alternative: summarize the first 5 articles with the FlauBERT
# summarizer and print the ROUGE-L results. Uncomment it if gen_summs is needed,
# e.g. for the optional "Save generated summaries to file" cell below.
# gen_summs = []
# for text in tqdm(texts[:5]):
#     gen_summs.append(flaubert_summ.get_summary(text))

# scores1, scores2 = rouge_l.evaluate_many(ref_summs, gen_summs, 5)
# results = rouge_l.get_results(scores1, scores2)

# for k, v in results.items():
#     print(k.ljust(25), round(v*100, 3), '%')

# Run the timing evaluator over the selected articles; the LSASummarizer class
# itself is passed, not the lsa_summ instance.
timer.evaluate_many(texts, LSASummarizer)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [03:24<00:00, 20.47s/it]


20.630670810000005

#### Optional: Save generated summaries to file

In [None]:
# Write every generated summary pair to generated.txt, with a blank line after
# each summary. gen_summs must already exist in the kernel; each of its entries
# is unpacked into two strings.
# encoding='utf-8' matches how the scraped articles file is read in this notebook
# and stops the write from depending on the platform's default encoding, which
# can fail on non-ASCII characters in the French summaries.
with open('generated.txt', 'w', encoding='utf-8') as f:
    for summ1, summ2 in tqdm(gen_summs):
        f.write(summ1)
        f.write('\n\n')
        f.write(summ2)
        f.write('\n\n')

## Summarize a series of scraped articles

In [None]:
# Load the scraped articles; every record is read for its 'text' and 'summary' fields.
with open('data/actu_preliminary.json', 'r', encoding='utf-8') as jsonfile:
    data = json.load(jsonfile)

texts = [record['text'] for record in data]
ref_summs = [record['summary'] for record in data]

# Summarize each article with the FlauBERT summarizer, with a tqdm progress bar.
gen_summs = [flaubert_summ.get_summary(body) for body in tqdm(texts)]

# Score the generated summaries against the references with ROUGE-L.
scores1, scores2 = rouge_l.evaluate_many(ref_summs, gen_summs)
results = rouge_l.get_results(scores1, scores2)

# One line per result: name padded to 25 characters, value shown as a percentage.
for metric, value in results.items():
    percentage = round(value*100, 3)
    print(metric.ljust(25), percentage, '%')

## Evaluate the generated summaries with BERTScore

In [None]:
# Split the generated and reference summaries into the four lists consumed by
# the BERTScore cells that follow.
# NOTE: ref_summs is rebound here to the value returned by split_summs.
(
    long_summs,
    short_summs,
    ref_summs,
    key_ref_sums,
) = bert.split_summs(gen_summs, ref_summs)

In [None]:
# Pass the four lists obtained from bert.split_summs to the BERT evaluator.
# As the last expression of the cell, any return value is displayed.
bert.bert_score(long_summs, short_summs, ref_summs, key_ref_sums)

In [None]:
# Call get_matrix on the long generated summaries and the reference summaries.
# NOTE(review): the meaning of the third argument (4) is not visible from this
# notebook; confirm against BERT_Eval.get_matrix.
bert.get_matrix(long_summs, ref_summs, 4)