In [41]:
from datasets import load_dataset
import spacy
from spacy.lang.fr.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora
from gensim import models
from gensim.models import LsiModel
from gensim.models.coherencemodel import CoherenceModel
from rouge_score import rouge_scorer

dataset = load_dataset('mlsum', 'fr')



  0%|          | 0/3 [00:00<?, ?it/s]

In [67]:
def get_top_sentences(num_topics, top_scores, summary_size=5):
    top_sentences = set()
    count = 0
    for i in range(summary_size):
        for j in range(num_topics):
            if i >= len(top_scores[j]):
                continue
            top_sentences.add(top_scores[j][i][0])
            if len(top_sentences) == summary_size:
                return sorted(top_sentences)
    return sorted(top_sentences)

def get_summary(article):
    nlp = spacy.load("fr_core_news_sm")
    doc = nlp(article)

    sentences = []
    cur_sentence = []
    for sent in doc.sents:
        for token in sent:
            if not token.text.lower() in STOP_WORDS and not token.is_punct:
                cur_sentence.append(token.lemma_)
        sentences.append(cur_sentence)
        cur_sentence = []

    dictionary = corpora.Dictionary(sentences)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in sentences]
    tfidf = models.TfidfModel(doc_term_matrix)
    sentences_tfidf = tfidf[doc_term_matrix]

    coherence_values = []
    model_list = []
    for num_topics in range(2, 10):
        model = LsiModel(sentences_tfidf, num_topics=num_topics, id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=sentences, dictionary=dictionary)
        coherence_values.append(coherencemodel.get_coherence())

    max_coherence = coherence_values.index(max(coherence_values))
    num_topics = 2 + max_coherence
    model = model_list[max_coherence]
    corpus_lsi = model[doc_term_matrix]

    top_scores = [[] for i in range(num_topics)]
    for i, scores in enumerate(corpus_lsi):
        for j, score in scores:
            top_scores[j].append((i, abs(score)))

    for topic in top_scores:
        topic.sort(reverse=True, key=lambda x: x[1])
        
    sents = list(doc.sents)
    longest_summary = ''
    longest_keyword_summary = ''
    for i in range(1, len(sents) + 1):
        top_sentences = get_top_sentences(num_topics, top_scores, i)
        summary = ''""''
        keyword_summary = ''
        for sent_idx in top_sentences:
            keyword_summary += ' '.join(word for word in sentences[sent_idx]) + "\n"
            summary += sents[sent_idx].text + "\n"
        if len(summary) > 280:
            break
        longest_summary = summary
        longest_keyword_summary = keyword_summary
    if longest_summary:
        longest_summary = longest_summary[:-1]
        longest_keyword_summary = longest_keyword_summary[:-1]
    return longest_summary, longest_keyword_summary


def evaluate_rouge(summary, long_summ, short_summ):
    summ = nlp(summary)
    summ_sentences = []
    summ_cur_sentence = []
    for sent in summ.sents:
        for token in sent:
            if not token.text.lower() in STOP_WORDS and not token.is_punct:
                summ_cur_sentence.append(token.lemma_)
        summ_sentences.append(summ_cur_sentence)
        summ_cur_sentence = []

    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=False)

    ref_summary = '\n'.join([sent.text for sent in summ.sents])
    keyword_ref_summary = '\n'.join([' '.join(sent) for sent in summ_sentences])
    
    scores = scorer.score(ref_summary, long_summ)
    scores_keyword = scorer.score(keyword_ref_summary, short_summ)
    return scores, scores_keyword


article = dataset['train']['text'][4]
summary = dataset['train']['summary'][4]

long_summ, short_summ = get_summary(article)
scores1, scores2 = evaluate_rouge(summary, long_summ, short_summ)
print(scores1)
print(scores2)


{'rougeL': Score(precision=0.09259259259259259, recall=0.16666666666666666, fmeasure=0.11904761904761904)}
{'rougeL': Score(precision=0.0967741935483871, recall=0.2, fmeasure=0.13043478260869568)}
