In [1]:
from datasets import load_dataset
import spacy
from spacy.lang.fr.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora
from gensim import models
from gensim.models import LsiModel
from gensim.models.coherencemodel import CoherenceModel
from rouge_score import rouge_scorer

dataset = load_dataset('mlsum', 'fr')
nlp = spacy.load("fr_core_news_sm")

Reusing dataset mlsum (/Users/josephkeenan/.cache/huggingface/datasets/mlsum/fr/1.0.0/77f23eb185781f439927ac2569ab1da1083195d8b2dab2b2f6bbe52feb600688)


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [2]:
def get_top_sentences(num_topics, top_scores, summary_size=5):
    top_sentences = set()
    count = 0
    for i in range(summary_size):
        for j in range(num_topics):
            if i >= len(top_scores[j]):
                continue
            top_sentences.add(top_scores[j][i][0])
            if len(top_sentences) == summary_size:
                return sorted(top_sentences)
    return sorted(top_sentences)

def get_summary(article):
    doc = nlp(article)

    sentences = []
    cur_sentence = []
    for sent in doc.sents:
        for token in sent:
            if not token.text.lower() in STOP_WORDS and not token.is_punct:
                cur_sentence.append(token.lemma_)
        sentences.append(cur_sentence)
        cur_sentence = []

    dictionary = corpora.Dictionary(sentences)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in sentences]
    tfidf = models.TfidfModel(doc_term_matrix)
    sentences_tfidf = tfidf[doc_term_matrix]

    coherence_values = []
    model_list = []
    for num_topics in range(2, 10):
        model = LsiModel(sentences_tfidf, num_topics=num_topics, id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=sentences, dictionary=dictionary)
        coherence_values.append(coherencemodel.get_coherence())

    max_coherence = coherence_values.index(max(coherence_values))
    num_topics = 2 + max_coherence
    model = model_list[max_coherence]
    corpus_lsi = model[doc_term_matrix]

    top_scores = [[] for i in range(num_topics)]
    for i, scores in enumerate(corpus_lsi):
        for j, score in scores:
            top_scores[j].append((i, abs(score)))

    for topic in top_scores:
        topic.sort(reverse=True, key=lambda x: x[1])
        
    sents = list(doc.sents)
    summary = ''
    keyword_summary = ''
    added_sents = set()
    for i in range(1, len(sents) + 1):
        top_sentences = get_top_sentences(num_topics, top_scores, i)
        print(top_sentences)
        for sent_idx in top_sentences:
            keyword_sent = ' '.join(word for word in sentences[sent_idx])
            full_sent = sents[sent_idx].text
            if sent_idx not in added_sents and len(summary + full_sent + '\n') <= 280 + 1:
                keyword_summary += keyword_sent + '\n'
                summary += full_sent + '\n'
                added_sents.add(sent_idx)
    if summary:
        summary = summary[:-1]
        keyword_summary = keyword_summary[:-1]
    return summary, keyword_summary

def evaluate_rouge(summary, long_summ, short_summ):
    summ = nlp(summary)
    summ_sentences = []
    summ_cur_sentence = []
    for sent in summ.sents:
        for token in sent:
            if not token.text.lower() in STOP_WORDS and not token.is_punct:
                summ_cur_sentence.append(token.lemma_)
        summ_sentences.append(summ_cur_sentence)
        summ_cur_sentence = []

    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=False)

    ref_summary = '\n'.join([sent.text for sent in summ.sents])
    keyword_ref_summary = '\n'.join([' '.join(sent) for sent in summ_sentences])
    
    scores = scorer.score(ref_summary, long_summ)
    scores_keyword = scorer.score(keyword_ref_summary, short_summ)
    return scores, scores_keyword


article = dataset['train']['text'][4]
summary = dataset['train']['summary'][4]

long_summ, short_summ = get_summary(article)
scores1, scores2 = evaluate_rouge(summary, long_summ, short_summ)

print(scores1)
print(scores2)

[5]
[5, 8]
[0, 5, 8]
[0, 2, 5, 8]
[0, 2, 4, 5, 8]
[0, 1, 2, 4, 5, 8]
[0, 1, 2, 4, 5, 8]
[0, 1, 2, 4, 5, 8]
[0, 1, 2, 4, 5, 8]
{'rougeL': Score(precision=0.09259259259259259, recall=0.16666666666666666, fmeasure=0.11904761904761904)}
{'rougeL': Score(precision=0.06451612903225806, recall=0.14285714285714285, fmeasure=0.08888888888888889)}


In [3]:
print(long_summ)


Les secours ont été prévenus vers 5 heures du matin, mais "l'incendie avait déjà bien démarré", a-t-il expliqué.
"L'origine de l'incendie est indéterminée mais a priori accidentelle", a déclaré le procureur adjoint de la République de Nîmes, cité par Europe 1.


In [4]:
print(short_summ)


secours prévenir 5 heure matin incendie déjà démarré t il expliquer
origine incendie indéterminer priori accidentel déclarer procureur adjoint république Nîmes citer Europe 1


In [5]:
print(summary)


Cinq personnes sont mortes, et treize autres ont été blessées à Nîmes, dans le Gard, dans un incendie qui s'est déclenché vendredi 1er janvier au petit matin.


In [7]:
evaluation_list = [] #list of evaluation values

for x in range(0,5):
    get_summary(dataset['train']['text'][x])
    long_summ, short_summ = get_summary(dataset['train']['text'][x])
    evaluation = evaluate_rouge(dataset['train']['summary'][x], long_summ, short_summ)
    evaluation_list.append(evaluation)
print(evaluation_list)

[5]
[5, 21]
[5, 21, 25]
[5, 21, 25, 27]
[0, 5, 21, 25, 27]
[0, 5, 18, 21, 25, 27]
[0, 1, 5, 18, 21, 25, 27]
[0, 1, 5, 18, 21, 24, 25, 27]
[0, 1, 5, 7, 18, 21, 24, 25, 27]
[0, 1, 4, 5, 7, 18, 21, 24, 25, 27]
[0, 1, 4, 5, 7, 8, 18, 21, 24, 25, 27]
[0, 1, 4, 5, 7, 8, 18, 21, 22, 24, 25, 27]
[0, 1, 2, 4, 5, 7, 8, 18, 21, 22, 24, 25, 27]
[0, 1, 2, 4, 5, 7, 8, 16, 18, 21, 22, 24, 25, 27]
[0, 1, 2, 4, 5, 7, 8, 16, 18, 20, 21, 22, 24, 25, 27]
[0, 1, 2, 4, 5, 7, 8, 13, 16, 18, 20, 21, 22, 24, 25, 27]
[0, 1, 2, 4, 5, 7, 8, 13, 16, 18, 20, 21, 22, 24, 25, 27, 30]
[0, 1, 2, 4, 5, 7, 8, 12, 13, 16, 18, 20, 21, 22, 24, 25, 27, 30]
[0, 1, 2, 4, 5, 7, 8, 12, 13, 16, 18, 19, 20, 21, 22, 24, 25, 27, 30]
[0, 1, 2, 4, 5, 7, 8, 12, 13, 16, 18, 19, 20, 21, 22, 24, 25, 27, 28, 30]
[0, 1, 2, 4, 5, 7, 8, 9, 12, 13, 16, 18, 19, 20, 21, 22, 24, 25, 27, 28, 30]
[0, 1, 2, 4, 5, 7, 8, 9, 12, 13, 16, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 30]
[0, 1, 2, 4, 5, 6, 7, 8, 9, 12, 13, 16, 18, 19, 20, 21, 22, 23, 24, 25, 2

In [12]:
# super messy, will clean up
precision_list = []
precision_list2 = []
recall_list = []
recall_list2 = []
fmeasure_list = []
fmeasure_list2 = []
# list2 contains values from the short_summ data

# first number refers to the index of article in corpus
# second number, either 0 or 1, refers to the values for the long_summ or the short_summ
# third number, after rougeL, refers to precsion[0], recall[1], or fmeasure[2]

for x in range(len(evaluation_list)):
    precision_val1 = evaluation_list[x][0]['rougeL'][0]
    precision_val2 = evaluation_list[x][1]['rougeL'][0]
    recall_val1 = evaluation_list[x][0]['rougeL'][1]
    recall_val2 = evaluation_list[x][1]['rougeL'][1]
    fmeasure_val1 = evaluation_list[x][0]['rougeL'][2]
    fmeasure_val2 = evaluation_list[x][1]['rougeL'][2]
    precision_list.append(precision_val1)
    precision_list2.append(precision_val2)
    recall_list.append(recall_val1)
    recall_list2.append(recall_val2)
    fmeasure_list.append(fmeasure_val1)
    fmeasure_list2.append(fmeasure_val2)

#print(precision_list)
#print(recall_list)
#print(fmeasure_list)

long_precision_avg = sum(precision_list) / len(precision_list)
short_precision_avg = sum(precision_list2) / len(precision_list2)
long_recall_avg = sum(recall_list) / len(precision_list)
short_recall_avg = sum(recall_list2) / len(precision_list2)
long_fmeasure_avg = sum(fmeasure_list) / len(fmeasure_list)
short_fmeasure_avg = sum(fmeasure_list2) / len(fmeasure_list2)

print(f'long precision average: {long_precision_avg}')
print(f'short precision average: {short_precision_avg}')
print()
print(f'long recall average: {long_recall_avg}')
print(f'short recall average: {short_recall_avg}')
print()
print(f'long fmeasure average: {long_fmeasure_avg}')
print(f'short fmeasure average: {short_fmeasure_avg}')

long precision average: 0.1888086423380541
short precision average: 0.13345238095238093

long recall average: 0.336130791722897
short recall average: 0.22740183792815372

long fmeasure average: 0.23965748901186493
short fmeasure average: 0.16774591427690413
