### Imports (only run once)

In [40]:
# MLSUM Corpus
from datasets import load_dataset

# SpaCy model for segmentation, tokenization, stopwords and stemming
import spacy
from spacy.lang.fr.stop_words import STOP_WORDS

# For the first execution, you will need to uncomment this line
# to download the SpaCy model. Then you can comment it back
# !python -m spacy download fr_core_news_sm

# Models for Latent Semantic Indexing
from gensim import corpora
from gensim import models
from gensim.models import LsiModel
from gensim.models.coherencemodel import CoherenceModel

# Evaluation
from rouge_score import rouge_scorer
from tqdm import tqdm

# Loading article data
import json

dataset = load_dataset('mlsum', 'fr')
nlp = spacy.load("fr_core_news_sm") # Model trained on French News



  0%|          | 0/3 [00:00<?, ?it/s]

### Source functions (only run once)

In [53]:
def get_top_sentences(num_topics, top_scores, article_size):
    """
        Picks the ordered list of indices of the best sentences to summarize the text
        Arguments:
            `num_topics`   The number of topics used in the LSI model
                           Example: 2
            `top_scores`   An array containing the top sentences (index and score) for each model
                           Example: [[(3, 0.5), (4, 0.35), (1, 0.15)], [(6, 0.75), (1, 0.45), (2, 0.3)]]
            `article_size` The number of sentences in the original article
        Returns:
            A list of the indices of the top sentences
    """
    # Algorithm: First choose the best sentence of each topic
    # Then choose the second best sentence of each topic, then the third...
    # Keep going until the desired number of sentences has been reached
    top_sentences = []
    for i in range(article_size):
        for j in range(num_topics):
            if i >= len(top_scores[j]):
                continue
            if top_scores[j][i][0] not in top_sentences:
                top_sentences.append(top_scores[j][i][0])
    return top_sentences

def get_summary(article):
    """
        Computes the optimal summary of an article using Latent Semantic Analysis
        Arguments:
            `article` The raw text content of the original article (without title)
        Returns a tuple containing:
            - The generated summary in text form
            - A keywords-only version of the generated summary
    """
    
    doc = nlp(article)

    # Split the text into sentences, remove stopwords, stem words and remove punctuation
    sentences = []
    cur_sentence = []
    for sent in doc.sents:
        for token in sent:
            if not token.text.lower() in STOP_WORDS and not token.is_punct:
                cur_sentence.append(token.lemma_)
        sentences.append(cur_sentence)
        cur_sentence = []

    # Convert sentences to bags of words
    dictionary = corpora.Dictionary(sentences)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in sentences]
    
    # Create a TF-IDF model that gives each word in each sentence a frequency score
    tfidf = models.TfidfModel(doc_term_matrix)
    sentences_tfidf = tfidf[doc_term_matrix]

    # Try to find the optimal number of topics for Latent Semantic Indexing
    # For that, we try using 2, 3, ..., 10 topics and we compute the coherence values
    # of the model for each number of topics.
    coherence_values = []
    model_list = []
    for num_topics in range(2, 10):
        model = LsiModel(sentences_tfidf, num_topics=num_topics, id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=sentences, dictionary=dictionary)
        coherence_values.append(coherencemodel.get_coherence())

    # Pick the number of topics that gives the highest coherence values
    max_coherence = coherence_values.index(max(coherence_values))
    num_topics = 2 + max_coherence
    model = model_list[max_coherence]
    
    # Apply the LSI model to our corpus
    corpus_lsi = model[doc_term_matrix]

    # Compute and store the scores of each sentence for each topic
    top_scores = [[] for i in range(num_topics)]
    for i, scores in enumerate(corpus_lsi):
        for j, score in scores:
            top_scores[j].append((i, abs(score)))

    # Sort the tables so that they contain the sentences in decreasing score order
    for topic in top_scores:
        topic.sort(reverse=True, key=lambda x: x[1])
    
    # Get a list of all sentences in decreasing order of importance
    sents = list(doc.sents)
    top_sentences = get_top_sentences(num_topics, top_scores, len(sents) + 1)
    
    # Try to add each sentence to the summary, starting from the best one
    # and making sure to not go over a tweet's length
    sents_to_add = []
    summary_size = 0
    for i in top_sentences:
        full_sent = sents[i].text
        new_size = summary_size + len(full_sent)
        if summary_size + new_size <= 280:
            sents_to_add.append(i)
            summary_size += len(full_sent) + 1 # +1 because of the space/newline between sentences
    
    # Now that we have the optimal list of sentences,
    # build the actual summary as well as the keyword-only version
    summary = ''
    keyword_summary = ''
    for sent_idx in sents_to_add:
        keyword_sent = ' '.join(word for word in sentences[sent_idx])
        full_sent = sents[sent_idx].text
        keyword_summary += keyword_sent + '\n'
        summary += full_sent + '\n'
    
    # Remove the final space/newline
    if summary:
        summary = summary[:-1]
        keyword_summary = keyword_summary[:-1]
    return summary, keyword_summary

def evaluate_rouge(summary, long_summ, short_summ):
    """
        Computes the ROUGE-L score corresponding to the evaluation of a generated summary
        (in two versions: full text and keyword-only version) with a reference one
        Arguments:
            `summary`    The reference summary of the article
            `long_summ`  The generated summary (full text version)
            `short_summ` The generated summary (keywords-only version)
        Returns a tuple containing:
            - The scores of the full text summary
            - The scores of the keyword-only summary
    """
    
    # Process the reference summary (segment it)
    # Also make a copy that is stemmed and has no stopwords to compare it with the
    # keyword-only generated summary
    summ = nlp(summary)
    summ_sentences = []
    summ_cur_sentence = []
    for sent in summ.sents:
        for token in sent:
            if not token.text.lower() in STOP_WORDS and not token.is_punct:
                summ_cur_sentence.append(token.lemma_)
        summ_sentences.append(summ_cur_sentence)
        summ_cur_sentence = []

    # Creates the instance that allows us to evaluate our summaries
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=False)

    # Put the summaries together using newlines (required by RougeScorer)
    ref_summary = '\n'.join([sent.text for sent in summ.sents])
    keyword_ref_summary = '\n'.join([' '.join(sent) for sent in summ_sentences])
    
    # Compute and return the scores
    scores = scorer.score(ref_summary, long_summ)
    scores_keyword = scorer.score(keyword_ref_summary, short_summ)
    return scores, scores_keyword

def sum_loop(article_text, article_sum, num_articles=None):
    """
        Evaluates the summarization process for all articles in a set
        Arguments:
            `article_text` A list containing the raw text of each article
            `article_sum`  A list containing the reference summaries of each article
            `num_articles` The number of articles to evaluate (default: all)
        Returns a tuple containing:
            - The evaluation scores for all full generated summaries
            - The evaluation scores for all keywords-only generated summaries
    """
    if num_articles is None:
        num_articles = len(article_text)
    
    long_eval_list = []
    keyword_eval_list = []
    
    for x in tqdm(range(num_articles)):
        get_summary(article_text[x])
        long_summ, short_summ = get_summary(article_text[x])
        long_eval, keyword_eval = evaluate_rouge(article_sum[x], long_summ, short_summ)
        long_eval_list.append(long_eval)
        keyword_eval_list.append(keyword_eval)
    return long_eval_list, keyword_eval_list
    
def eval_loop(long_eval_list, keyword_eval_list):
    """
        Computes the average evaluation scores from a list
        Arguments:
            `long_eval_list`    A list containing all evaluation scores for full generated summaries
            `keyword_eval_list` A list containing all evaluation scores for keyword-only generated summaries
        Returns a dict containing the average precision, recall and f1-score
            for both full and keyword_only generated summaries
    """
    # Values from long_summ data
    precision_list = []
    recall_list = []
    fmeasure_list = []
    
    # Values from short_summ data
    precision_list2 = []
    recall_list2 = []
    fmeasure_list2 = []
    
    for x in range(len(long_eval_list)): # Goes through each element in evaluation list (element = rougeL set)
        for y in range(3): # Goes through each item within a single rougeL set, precision[0], recall[1], fmeasure[2]
            value = long_eval_list[x]['rougeL'][y]
            if y == 0:
                precision_list.append(value)
            if y == 1:
                recall_list.append(value)
            if y == 2:
                fmeasure_list.append(value)
                    
                    
    for x in range(len(keyword_eval_list)):
        for y in range(3):
            value = keyword_eval_list[x]['rougeL'][y]
            if y == 0:
                precision_list2.append(value)
            if y == 1:
                recall_list2.append(value)
            if y == 2:
                fmeasure_list2.append(value)

    list_length = len(precision_list)
    
    results = {}
    results["Long precision avg"] = sum(precision_list) / list_length
    results["Keyword precision avg"] = sum(precision_list2) / list_length
    results["Long recall avg"] = sum(recall_list) / list_length
    results["Keyword recall avg"] = sum(recall_list2) / list_length
    results["Long F1-score avg"] = sum(fmeasure_list) / list_length
    results["Keyword F1-score avg"] = sum(fmeasure_list2) / list_length
    
    return results

### Run the whole program on a single article

In [39]:
# Pick an article and its reference summary
article = dataset['train']['text'][4]
summary = dataset['train']['summary'][4]

# Computes the summary and evaluation
long_summ, short_summ = get_summary(article)
scores1, scores2 = evaluate_rouge(summary, long_summ, short_summ)
print(long_summ)
print()
print(short_summ)
print()
print(summary)
print()
print(scores1)
print(scores2)

Les secours ont été prévenus vers 5 heures du matin, mais "l'incendie avait déjà bien démarré", a-t-il expliqué.

secours être prévenir 5 heure matin incendie bien démarrer -t il expliquer

Cinq personnes sont mortes, et treize autres ont été blessées à Nîmes, dans le Gard, dans un incendie qui s'est déclenché vendredi 1er janvier au petit matin.

{'rougeL': Score(precision=0.16666666666666666, recall=0.13333333333333333, fmeasure=0.14814814814814814)}
{'rougeL': Score(precision=0.21428571428571427, recall=0.2, fmeasure=0.20689655172413796)}


### Run the whole program on a series of articles

In [38]:
article_text = dataset['test']['text']
article_sum = dataset['test']['summary']

# Here we pick 5 articles
scores1, scores2 = sum_loop(article_text, article_sum, 5)

results = eval_loop(scores1, scores2)
for k, v in results.items():
    print(k.ljust(25), round(v*100, 3), '%')

Long precision avg        19.747 %
Keyword precision avg     16.677 %
Long recall avg           28.736 %
Keyword recall avg        24.778 %
Long F1-score avg         23.085 %
Keyword F1-score avg      19.648 %


### Run the whole program on scraped articles

In [None]:
with open('actu-preliminary.json', 'r', encoding='utf-8') as jsonfile:
    data = json.load(jsonfile)
data_text = [article['text'] for article in data]
data_summary = [article['summary'] for article in data]

scores1, scores2 = sum_loop(data_text, data_summary)

results = eval_loop(scores1, scores2)
for k, v in results.items():
    print(k.ljust(25), round(v*100, 3), '%')

In [48]:
# Pick an article and its reference summary
article = data_text[1]
summary = data_summary[1]

# Computes the summary and evaluation
long_summ, short_summ = get_summary(article)
scores1, scores2 = evaluate_rouge(summary, long_summ, short_summ)
print(long_summ)
print()
print(short_summ)
print()
print(summary)
print()
print(scores1)
print(scores2)

Le masque ne sera plus obligatoire à compter du 18 octobre 2021 dans les stades, les marchés et brocantes de la Loire du moment qu'ils sont en extérieur dans la Loire.

masque obligatoire compter 18 octobre 2021 stade marché brocante Loire moment extérieur Loire

Le masque ne sera plus obligatoire à compter du 18 octobre 2021 dans les stades, les marchés et brocantes de la Loire du moment qu'ils sont en extérieur dans la Loire.

{'rougeL': Score(precision=1.0, recall=1.0, fmeasure=1.0)}
{'rougeL': Score(precision=1.0, recall=1.0, fmeasure=1.0)}
