In [1]:
from datasets import load_dataset
import spacy
from spacy.lang.fr.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora
from gensim import models
from gensim.models import LsiModel
from gensim.models.coherencemodel import CoherenceModel
from rouge_score import rouge_scorer

dataset = load_dataset('mlsum', 'fr')
nlp = spacy.load("fr_core_news_sm")

Reusing dataset mlsum (/Users/josephkeenan/.cache/huggingface/datasets/mlsum/fr/1.0.0/77f23eb185781f439927ac2569ab1da1083195d8b2dab2b2f6bbe52feb600688)


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [66]:
def get_top_sentences(num_topics, top_scores, summary_size=5):
    top_sentences = set()
    count = 0
    for i in range(summary_size):
        for j in range(num_topics):
            if i >= len(top_scores[j]):
                continue
            top_sentences.add(top_scores[j][i][0])
            if len(top_sentences) == summary_size:
                return sorted(top_sentences)
    return sorted(top_sentences)

def get_summary(article):
    doc = nlp(article)

    sentences = []
    cur_sentence = []
    for sent in doc.sents:
        for token in sent:
            if not token.text.lower() in STOP_WORDS and not token.is_punct:
                cur_sentence.append(token.lemma_)
        sentences.append(cur_sentence)
        cur_sentence = []

    dictionary = corpora.Dictionary(sentences)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in sentences]
    tfidf = models.TfidfModel(doc_term_matrix)
    sentences_tfidf = tfidf[doc_term_matrix]

    coherence_values = []
    model_list = []
    for num_topics in range(2, 10):
        model = LsiModel(sentences_tfidf, num_topics=num_topics, id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=sentences, dictionary=dictionary)
        coherence_values.append(coherencemodel.get_coherence())

    max_coherence = coherence_values.index(max(coherence_values))
    num_topics = 2 + max_coherence
    model = model_list[max_coherence]
    corpus_lsi = model[doc_term_matrix]

    top_scores = [[] for i in range(num_topics)]
    for i, scores in enumerate(corpus_lsi):
        for j, score in scores:
            top_scores[j].append((i, abs(score)))

    for topic in top_scores:
        topic.sort(reverse=True, key=lambda x: x[1])
        
    sents = list(doc.sents)
    summary = ''
    keyword_summary = ''
    added_sents = set()
    for i in range(1, len(sents) + 1):
        top_sentences = get_top_sentences(num_topics, top_scores, i)
        #print(top_sentences)
        for sent_idx in top_sentences:
            keyword_sent = ' '.join(word for word in sentences[sent_idx])
            full_sent = sents[sent_idx].text
            if sent_idx not in added_sents and len(summary + full_sent + '\n') <= 280 + 1:
                keyword_summary += keyword_sent + '\n'
                summary += full_sent + '\n'
                added_sents.add(sent_idx)
    if summary:
        summary = summary[:-1]
        keyword_summary = keyword_summary[:-1]
    return summary, keyword_summary

def evaluate_rouge(summary, long_summ, short_summ):
    summ = nlp(summary)
    summ_sentences = []
    summ_cur_sentence = []
    for sent in summ.sents:
        for token in sent:
            if not token.text.lower() in STOP_WORDS and not token.is_punct:
                summ_cur_sentence.append(token.lemma_)
        summ_sentences.append(summ_cur_sentence)
        summ_cur_sentence = []

    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=False)

    ref_summary = '\n'.join([sent.text for sent in summ.sents])
    keyword_ref_summary = '\n'.join([' '.join(sent) for sent in summ_sentences])
    
    scores = scorer.score(ref_summary, long_summ)
    scores_keyword = scorer.score(keyword_ref_summary, short_summ)
    return scores, scores_keyword


article = dataset['train']['text'][4]
summary = dataset['train']['summary'][4]

long_summ, short_summ = get_summary(article)
scores1, scores2 = evaluate_rouge(summary, long_summ, short_summ)

print(scores1)
print(scores2)

{'rougeL': Score(precision=0.09090909090909091, recall=0.16666666666666666, fmeasure=0.11764705882352942)}
{'rougeL': Score(precision=0.0625, recall=0.14285714285714285, fmeasure=0.08695652173913043)}


In [3]:
print(long_summ)


Les secours ont été prévenus vers 5 heures du matin, mais "l'incendie avait déjà bien démarré", a-t-il expliqué.
"L'origine de l'incendie est indéterminée mais a priori accidentelle", a déclaré le procureur adjoint de la République de Nîmes, cité par Europe 1.


In [4]:
print(short_summ)


secours prévenir 5 heure matin incendie déjà démarré t il expliquer
origine incendie indéterminer priori accidentel déclarer procureur adjoint république Nîmes citer Europe 1


In [5]:
print(summary)


Cinq personnes sont mortes, et treize autres ont été blessées à Nîmes, dans le Gard, dans un incendie qui s'est déclenché vendredi 1er janvier au petit matin.


In [78]:
def eval_average(article_text, article_sum, num_articles):
    long_eval_list = []
    keyword_eval_list = []

    for x in range(num_articles):
        get_summary(article_text[x])
        long_summ, short_summ = get_summary(article_text[x])
        long_eval, keyword_eval = evaluate_rouge(article_sum[x], long_summ, short_summ)
        long_eval_list.append(long_eval)
        keyword_eval_list.append(keyword_eval)
    
    # contains values from long_summ data
    precision_list = []
    recall_list = []
    fmeasure_list = []
    
    # list2 contains values from the short_summ data
    precision_list2 = []
    recall_list2 = []
    fmeasure_list2 = []
    
    for x in range(len(long_eval_list)): # Goes through each element in evaluation list (element = rougeL set)
        for y in range(3): # Goes through each item within a single rougeL set, precision[0], recall[1], fmeasure[2]
            value = long_eval_list[x]['rougeL'][y]
            if y == 0:
                precision_list.append(value)
            if y == 1:
                recall_list.append(value)
            if y == 2:
                fmeasure_list.append(value)
                    
                    
    for x in range(len(keyword_eval_list)):
        for y in range(3):
            value = keyword_eval_list[x]['rougeL'][y]
            if y == 0:
                precision_list2.append(value)
            if y == 1:
                recall_list2.append(value)
            if y == 2:
                fmeasure_list2.append(value)


    list_length = len(precision_list)
    
    long_precision_avg = sum(precision_list) / list_length
    keyword_precision_avg = sum(precision_list2) / list_length
    long_recall_avg = sum(recall_list) / list_length
    keyword_recall_avg = sum(recall_list2) / list_length
    long_fmeasure_avg = sum(fmeasure_list) / list_length
    keyword_fmeasure_avg = sum(fmeasure_list2) / list_length
    
    return f'long precision average: {long_precision_avg}', f'keyword precision average: {keyword_precision_avg}', f'long recall average: {long_recall_avg}', f'keyword recall average: {keyword_recall_avg}', f'long fmeasure average: {long_fmeasure_avg}', f'keyword fmeasure average: {keyword_fmeasure_avg}'

article_text = dataset['train']['text']
article_sum = dataset['train']['summary']

eval_average(article_text, article_sum, 4)

('long precision average: 0.2132835301952949',
 'keyword precision average: 0.15119047619047618',
 'long recall average: 0.3784968229869546',
 'keyword recall average: 0.24853801169590642',
 'long fmeasure average: 0.2701600965589488',
 'keyword fmeasure average: 0.18794326241134754')