## ONLY if running on Colaboratory, run this cell first (once)

In [None]:
# Clone the project repository and move its contents into the current
# directory. Later cells read `requirements.txt` and import the `eval` and
# `summ` packages by relative name — presumably all provided by this repository.
!git clone https://github.com/pie3636/newsjam.git
!mv newsjam/* .

## Install missing modules if needed (only run once)

In [None]:
!python -m pip install -r requirements.txt
!python -m spacy download fr_core_news_sm
# Note: You'll have to restart the kernel/runtime after running this cell

## Imports (only run once)

In [1]:
# MLSUM Corpus
from datasets import load_dataset

# Loading article data
import json

# Our packages
from eval.rouge_l import RougeLEval
from eval.bert_eval import BERT_Eval
from eval.time import TimeEval

from summ.lsa import LSASummarizer
from summ.bert_embed import BertEmbeddingsSummarizer

from tqdm import tqdm

# Load the 'fr' configuration of the 'mlsum' dataset (the MLSUM corpus).
dataset = load_dataset('mlsum', 'fr')

# Evaluators imported from the project's `eval` package.
rouge_l = RougeLEval()
bert = BERT_Eval()
timer = TimeEval()
# Summarizers imported from the project's `summ` package; the two
# embedding-based ones are passed a model identifier string.
lsa_summ = LSASummarizer()
flaubert_summ = BertEmbeddingsSummarizer('flaubert/flaubert_large_cased')
camembert_summ = BertEmbeddingsSummarizer('camembert/camembert-large')

ModuleNotFoundError: No module named 'PIL._binary'

## Summarize a single article

In [3]:
# Pick an article and its reference summary.
# The row is indexed first so that only one example is read, instead of
# materialising the whole 'text' and 'summary' columns of the test split.
ARTICLE_INDEX = 54
sample = dataset['test'][ARTICLE_INDEX]
article = sample['text']
ref_summ = sample['summary']

# Computes the summary and evaluation
# timer.evaluate_one(article, BertEmbeddingsSummarizer, 'camembert/camembert-large')

## Summarize a series of articles

In [None]:
# Take the first N_ARTICLES test articles and their reference summaries.
# Slicing the rows first reads only those examples rather than whole columns.
N_ARTICLES = 10
subset = dataset['test'][:N_ARTICLES]
texts = subset['text']
ref_summs = subset['summary']

# Disabled: summarize the first 5 of these articles with the FlauBERT-based
# summarizer and score them with the ROUGE-L evaluator.
# gen_summs = []
# for text in tqdm(texts[:5]):
#     gen_summs.append(flaubert_summ.get_summary(text))

# scores1, scores2 = rouge_l.evaluate_many(ref_summs, gen_summs, 5)
# results = rouge_l.get_results(scores1, scores2)

# for k, v in results.items():
#     print(k.ljust(25), round(v*100, 3), '%')

# Time the LSA summarizer class on the selected articles.
timer.evaluate_many(texts, LSASummarizer)

#### Optional: Save generated summaries to file

In [None]:
# Write every pair of generated summaries to 'generated.txt', each summary
# followed by a blank line.
# The encoding is explicit because the summaries are French text and the
# platform's default encoding is not guaranteed to be able to represent it.
# Requires `gen_summs` (an iterable of 2-item results) from a summarization cell.
with open('generated.txt', 'w', encoding='utf-8') as f:
    for summ1, summ2 in tqdm(gen_summs):
        f.writelines((summ1, '\n\n', summ2, '\n\n'))

## Summarize a series of scraped articles

In [None]:
# Read the scraped articles; every entry supplies a 'text' and a 'summary'.
with open('data/actu_preliminary.json', 'r', encoding='utf-8') as jsonfile:
    data = json.load(jsonfile)

texts, ref_summs = [], []
for entry in data:
    texts.append(entry['text'])
    ref_summs.append(entry['summary'])

# Summarize each article with the FlauBERT-based summarizer.
gen_summs = [flaubert_summ.get_summary(text) for text in tqdm(texts)]

# Score the generated summaries against the references.
scores1, scores2 = rouge_l.evaluate_many(ref_summs, gen_summs)
results = rouge_l.get_results(scores1, scores2)

# Print each result as a percentage, with the names padded to 25 characters.
for metric, value in results.items():
    print(metric.ljust(25), round(value * 100, 3), '%')

## Implementation of BERTScore

In [None]:
# Split the generated and reference summaries into the four lists used by the
# BERTScore cells below.
# NOTE(review): this rebinds `ref_summs`, so running the cell a second time
# passes the previous output back in as input — confirm split_summs tolerates that.
long_summs, short_summs, ref_summs, key_ref_sums =  bert.split_summs(gen_summs, ref_summs)

In [None]:
# Run the project's BERTScore evaluation on the four lists returned by split_summs.
bert.bert_score(long_summs, short_summs, ref_summs, key_ref_sums)

In [None]:
# NOTE(review): the meaning of the third argument (4) is not visible from this
# notebook — presumably an index or a size; confirm against BERT_Eval.get_matrix.
bert.get_matrix(long_summs, ref_summs, 4)

In [4]:
import spacy


In [5]:
# Load the spaCy pipeline "fr_core_news_sm" (downloaded by the install cell at the top).
nlp = spacy.load("fr_core_news_sm")

In [6]:
# Run the pipeline on the article picked earlier; the cells below iterate over `doc.sents`.
doc = nlp(article)

In [7]:
import summ.utils
# Compute the keyword sentences of the parsed article with the project helper
# and print them after the full article text for comparison.
keyword_sentences = summ.utils.get_keyword_sentences(doc)
print(article)
print(keyword_sentences)

La zone du bâtiment municipal de Virginia Beach a été sécurisée. Kaitlin McKeown / AP Douze personnes ont été abattues vendredi 31 mai par un tireur dans un bâtiment municipal de Virginia Beach (Etat de Virginie), station balnéaire de la côte est américaine. Le bilan, dans un premier temps établi à 11 morts, a été revu à la hausse après le décès d’une victime qui « a succombé à ses blessures sur le chemin de l’hôpital », a détaillé le chef de la police de Virginia Beach, James Cervera. Quatre autres personnes blessées sont soignées dans les hôpitaux de la région et d’autres auraient pu s’y rendre par leurs propres moyens, a précisé le responsable policier. Il était peu après 16 heures vendredi (22 heures à Paris) quand le suspect, « un employé de longue date » est entré dans le bâtiment et a commencé « immédiatement à tirer à l’aveugle sur toutes les victimes », a raconté M. Cervera. Repérant l’étage du bâtiment où se trouvait le tireur par le bruit du sifflement des balles, les polici

In [None]:
# Accumulators filled by the cells below:
# - embeddings: the token embeddings that are kept
# - word_idx_to_sent: the (sentence index, token index) pair of each recorded token
embeddings = []
word_idx_to_sent = []


In [10]:
# Work on the first sentence of the article only; `i` is the sentence index
# used when (i, j) pairs are recorded in the cells below.
i = 0
sent = list(doc.sents)[0]
print(sent)

La zone du bâtiment municipal de Virginia Beach a été sécurisée.


In [8]:
from transformers import AutoModel, AutoTokenizer
# Load the FlauBERT tokenizer and model from one shared checkpoint name.
FLAUBERT_CHECKPOINT = 'flaubert/flaubert_large_cased'
tokenizer = AutoTokenizer.from_pretrained(FLAUBERT_CHECKPOINT)
model = AutoModel.from_pretrained(FLAUBERT_CHECKPOINT)

Some weights of the model checkpoint at flaubert/flaubert_large_cased were not used when initializing FlaubertModel: ['pred_layer.proj.bias', 'pred_layer.proj.weight']
- This IS expected if you are initializing FlaubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing FlaubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
# Split the sentence text into the tokenizer's sub-word tokens and show them.
tokenized_sent = tokenizer.tokenize(sent.text)
print(tokenized_sent)

['La</w>', 'zone</w>', 'du</w>', 'bâtiment</w>', 'municipal</w>', 'de</w>', 'Virginia</w>', 'Beach</w>', 'a</w>', 'été</w>', 'sécurisée</w>', '.</w>']


In [None]:
# Convert the tokens to ids and ask the summarizer for the sentence embeddings.
# NOTE(review): other cells in this notebook call get_sent_embeds with the token
# list (`tokenized_sent`) rather than with ids — confirm which input it expects.
encoded_sent = tokenizer.encode(tokenized_sent)
sentence_embeds = flaubert_summ.get_sent_embeds(encoded_sent)
print(sentence_embeds)

In [None]:
from spacy.lang.fr.stop_words import STOP_WORDS
import string

# Keep only the content tokens of the current sentence.
# The tokenizer's tokens carry an end-of-word marker and their original case
# ('La</w>', '.</w>'), so the raw token never equals a (lower-case) stop word
# or a punctuation character. The marker is stripped and the word lower-cased
# before testing; the tests are applied to whole words only, so that a
# fragment of a longer word is never mistaken for a stop word.
WORD_END = '</w>'

previous_closed_word = True
for j, token in enumerate(tokenized_sent):
    closes_word = token.endswith(WORD_END)
    is_whole_word = closes_word and previous_closed_word
    previous_closed_word = closes_word

    word = token[:-len(WORD_END)] if closes_word else token
    # True for tokens made only of punctuation (vacuously for an empty word).
    is_punct = all(ch in string.punctuation for ch in word)
    if is_whole_word and (is_punct or word.lower() in STOP_WORDS):
        continue

    print(token)
    # NOTE(review): assumes row j of sentence_embeds[0] belongs to
    # tokenized_sent[j]; if the embeddings also cover special tokens the
    # index is shifted — confirm against get_sent_embeds.
    embeddings.append(sentence_embeds[0][j].detach().numpy())
    word_idx_to_sent.append((i, j))

In [None]:
# Display the shape of the embeddings returned for the current sentence.
sentence_embeds.shape

In [None]:
# Record one (sentence index, token index) pair per token of the current sentence.
for j in range(len(tokenized_sent)):
    word_idx_to_sent.append((i, j))

In [None]:
# Display the (sentence index, token index) pairs recorded so far.
word_idx_to_sent

In [None]:
import torch
# Tokenize and encode the first sentence, then run it through the model.
sent = list(doc.sents)[0]
tokenized_sent = flaubert_summ.tokenizer.tokenize(sent.text)
print(tokenized_sent)
# NOTE(review): `tokenized_sent` already holds sub-word tokens; with
# is_split_into_words=True each of them may be tokenized a second time —
# compare the lengths printed by the next cell.
encoded_sentence = tokenizer.encode(tokenized_sent, is_split_into_words=True)
print(encoded_sentence)
# Inference only: skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    res = model(torch.tensor([encoded_sentence]))

In [None]:
# Compare the number of ids, the length of the spaCy sentence, and the shape
# of the model's last hidden state.
print(len(encoded_sentence))
print(len(sent))
print(res.last_hidden_state.shape)

In [None]:
# Collect the embeddings of every sentence of the article, together with the
# (sentence index, token index) pair of every token.
# Both accumulators are reset here: earlier cells also append to
# `word_idx_to_sent`, and stale entries would leave it out of step with
# `embeddings`.
embeddings = []
word_idx_to_sent = []

for i, sent in enumerate(doc.sents):
    tokenized_sent = flaubert_summ.tokenizer.tokenize(sent.text)

    # Nothing to embed for a sentence that yields no tokens.
    if not tokenized_sent:
        continue

    sentence_embeds = flaubert_summ.get_sent_embeds(tokenized_sent)
    # Split the first item of the result into one tensor per row.
    embeddings.extend(torch.unbind(sentence_embeds[0].detach()))

    # NOTE(review): assumes get_sent_embeds returns exactly one row per entry
    # of tokenized_sent — confirm, otherwise the two lists differ in length.
    word_idx_to_sent.extend((i, j) for j in range(len(tokenized_sent)))

In [None]:
# Print how many embeddings were collected, then display the whole list.
print(len(embeddings))
embeddings

In [None]:
from torch.nn.utils.rnn import pad_sequence
# Stack the collected embeddings into a single tensor and print its shape next
# to the number of recorded (sentence, token) pairs for comparison.
stacked = torch.stack(embeddings)
print(stacked.shape)
print(len(word_idx_to_sent))

In [None]:
# Inspect one sentence of the article: its tokens, their ids and both lengths.
# The index is called `sent_idx` rather than `id`, which would shadow the
# built-in function of that name for the rest of the session.
sent_idx = 5
# NOTE(review): `stacked` appears to hold one row per token of the whole
# article, so this prints the row at position 5, not "sentence 5" — confirm.
print(stacked[sent_idx])
sent = list(doc.sents)[sent_idx]
tokenized_sent = flaubert_summ.tokenizer.tokenize(sent.text)
# NOTE(review): the input is already tokenized; is_split_into_words=True may
# tokenize each entry again — compare the two lengths printed below.
encoded_sentence = tokenizer.encode(tokenized_sent, is_split_into_words=True)
print(encoded_sentence)
for token in tokenized_sent:
    print(token)
print(len(encoded_sentence))
print(len(tokenized_sent))
# from sklearn.cluster import KMeans
# kmeans = KMeans(n_clusters=5).fit(stacked)
# embed_labels = kmeans.labels_
# centroids = kmeans.cluster_centers_
# print(embed_labels, centroids)

In [None]:
# Spot-check positions 2, 7 and 12: the ids of the encoded sentence, followed
# by the rows of `stacked` at the same positions.
print(encoded_sentence[2])
print(encoded_sentence[7])
print(encoded_sentence[12])
print(stacked[2])
print(stacked[7])
print(stacked[12])

In [None]:
# Encode every token on its own, dropping the first and last id of each
# result, wrap the concatenation in the ids 0 and 1, and print it next to the
# encoding of the whole token list obtained in a single call.
print(tokenized_sent)
l = [0] + [
    token_id
    for token in tokenized_sent
    for token_id in tokenizer.encode(token, is_split_into_words=True)[1:-1]
] + [1]
print(l)
print(tokenizer.encode(tokenized_sent, is_split_into_words=True))

In [None]:
# Get the summarizer's embeddings for the current token list.
sent_emb = flaubert_summ.get_sent_embeds(tokenized_sent)

In [None]:
# Compare the shape of the summarizer's embeddings with the ids and tokens of
# the same sentence.
print(sent_emb.shape)
# `tokenized_sent` is already a list of tokens, so encode() is called without
# extra options, as in the earlier `tokenizer.encode(tokenized_sent)` cell.
# `split_into_words` is not an option encode() recognises (the real name is
# `is_split_into_words`), so it is no longer passed.
encoded = tokenizer.encode(tokenized_sent)
print(len(encoded))
print(encoded)
print(tokenized_sent)
print(len(tokenized_sent))

In [None]:
# Encode one sentence and embed it directly with the model.
# The index is called `sent_idx` rather than `id`, which would shadow the
# built-in function of that name.
sent_idx = 5
sent = list(doc.sents)[sent_idx]
tokenized_sent = tokenizer.tokenize(sent.text)
# The input is already a list of tokens, so encode() is called without extra
# options; `split_into_words` is not an option it recognises (the real name is
# `is_split_into_words`), so it is no longer passed.
encoded = tokenizer.encode(tokenized_sent)
print(encoded)
print(len(encoded))
# Inference only: skip autograd bookkeeping; [0] selects the first element of
# the model output.
with torch.no_grad():
    sent_emb = model(torch.tensor([encoded]))[0]
print(sent_emb.shape)
print(sent_emb)