## ONLY if running on Colaboratory, run this cell first (once)

In [None]:
# Fetch the project sources, then move the contents of the clone into the
# current working directory (presumably so that requirements.txt and the
# project packages imported below resolve from here).
# NOTE(review): with a default shell glob, `newsjam/*` skips dot-files, and
# re-running this cell fails because the `newsjam` directory already exists.
!git clone https://github.com/pie3636/newsjam.git
!mv newsjam/* .

## Install missing modules if needed (only run once)

In [None]:
!python -m pip install -r requirements.txt
!python -m spacy download fr_core_news_sm
# Note: You'll have to restart the kernel/runtime after running this cell

## Imports (only run once)

In [None]:
# MLSUM Corpus & CNN/Daily Mail Corpus
from datasets import load_dataset

# Loading article data
import json

# Our packages
from eval.rouge_l import RougeLEval
from eval.bert_eval import BERT_Eval
from eval.time import TimeEval

from summ.lsa import LSASummarizer
from summ.bert_embed import BertEmbeddingsSummarizer

from tqdm import tqdm

dataset_fr = load_dataset('mlsum', 'fr')
dataset_en = load_dataset('cnn_dailymail', '3.0.0')

rouge_l = RougeLEval()
bert = BERT_Eval()
timer = TimeEval()
lsa_summ = LSASummarizer()
flaubert_summ = BertEmbeddingsSummarizer('flaubert/flaubert_large_cased')
camembert_summ = BertEmbeddingsSummarizer('camembert/camembert-large')

## Summarize a single article

In [None]:
# Pick an article and its reference summary.
# The French corpus is loaded as `dataset_fr` in the imports cell (there is no
# `dataset_mlsum` variable), so that is the name to index here.
article_fr = dataset_fr['test']['text'][54]
ref_summ_fr = dataset_fr['test']['summary'][54]

# English counterpart, taken from the CNN/Daily Mail test split.
article_en = dataset_en['test']['article'][43]
ref_summ_en = dataset_en['test']['highlights'][43]

# Computes the summary and evaluation
# timer.evaluate_one(article_fr, BertEmbeddingsSummarizer, 'camembert/camembert-large')

## Summarize a series of articles

In [None]:
# Evaluation sample: the first 50 entries of the French test split, as two
# parallel lists (article bodies and their reference summaries).
texts, ref_summs = (
    dataset_fr['test'][column][:50] for column in ('text', 'summary')
)

# Alternative (disabled): summarize 5 articles and score them with ROUGE-L.
# gen_summs = []
# for text in tqdm(texts[:5]):
#     gen_summs.append(flaubert_summ.get_summary(text))

# scores1, scores2 = rouge_l.evaluate_many(ref_summs, gen_summs, 5)
# results = rouge_l.get_results(scores1, scores2)

# for k, v in results.items():
#     print(k.ljust(25), round(v*100, 3), '%')

# Hand the sampled articles and the summarizer class to the timing evaluator.
timer.evaluate_many(texts, LSASummarizer)

#### Optional: Save generated summaries to file

In [None]:
# Write every generated summary pair to a plain-text file, each summary
# followed by a blank line.
# Requires `gen_summs` to be in scope (it is built by a summarization cell),
# and each entry must unpack into two strings.
# The encoding is set explicitly so accented French text is written the same
# way on every platform instead of depending on the locale default.
with open('generated.txt', 'w', encoding='utf-8') as f:
    for summ1, summ2 in tqdm(gen_summs):
        f.write(summ1)
        f.write('\n\n')
        f.write(summ2)
        f.write('\n\n')

## Summarize a series of scraped articles

In [None]:
# Read the scraped articles (a JSON list of records with 'text' and 'summary').
with open('data/actu_preliminary.json', 'r', encoding='utf-8') as json_in:
    data = json.load(json_in)

# Split the records into article bodies and reference summaries.
texts = [record['text'] for record in data]
ref_summs = [record['summary'] for record in data]

# Summarize every article with the LSA summarizer, showing a progress bar.
gen_summs = [lsa_summ.get_summary(body) for body in tqdm(texts)]

# Score the generated summaries against the references with ROUGE-L.
scores1, scores2 = rouge_l.evaluate_many(ref_summs, gen_summs)
results = rouge_l.get_results(scores1, scores2)

# Report each metric as a percentage, with names padded to a common width.
for metric, value in results.items():
    print(metric.ljust(25), round(value*100, 3), '%')

In [None]:
# Dump every generated summary, one entry per line, for a quick visual check.
for generated in gen_summs:
    print(generated)

## Implementation of BERTScore

In [None]:
# Split the generated and reference summaries into the four sequences used by
# the BERTScore cells below.
# NOTE(review): `ref_summs` is both an input and an output here, so re-running
# this cell feeds it the already-split references — confirm that is harmless.
(
    long_summs,
    short_summs,
    ref_summs,
    key_ref_summs,
) = bert.split_summs(gen_summs, ref_summs, gen_keys=True)

In [None]:
# Print each long summary followed by its position in the list.
# `x.index` on a string is the bound `str.index` method (it prints as
# "<built-in method index of str object ...>"), not a position, so the
# index is taken from enumerate instead.
for idx, x in enumerate(long_summs):
    print(x, idx)

In [None]:
# Spot-check one generated summary against its reference: print both texts
# and their lengths.
# This cell only reads the lists. Overwriting entries with placeholder strings
# here would silently change the inputs of the scoring cells that follow.
x = 15
print(long_summs[x])
print(len(long_summs[x]))
print()
print(ref_summs[x])
print(len(ref_summs[x]))

In [None]:
# Pass the four sequences produced by `bert.split_summs` to the BERT evaluator
# (presumably computing BERTScore of the long/short summaries against the
# references and keyword references — confirm against BERT_Eval.bert_score).
# NOTE(review): the return value is not stored in a variable.
bert.bert_score(long_summs, short_summs, ref_summs, key_ref_summs)

In [None]:
# Ask the BERT evaluator for a matrix built from the long summaries and the
# references.
# NOTE(review): the meaning of the third argument (4) is not visible here —
# presumably an item index or a size; confirm against BERT_Eval.get_matrix.
bert.get_matrix(long_summs, ref_summs, 4)