Basic steps of the LSA Model:
1. Load data
2. Preprocess data (via stop word removal and stemming)
3. Instantiate TFIDF model, which gives a value to each word based on frequency, but normalizes over all sentences (i.e. if a word is common in one sentence, it is deemed important, but if it’s also common in all sentences, it’s value decreases)
4. Calculate coherence values in order to figure out ideal number of topics to split article into
5. For each sentence, calculate how relevant that sentence is to each topic (via corpus_lsi model and doc_term_matrix)
6. Order sentences from most to least relevant for each topic (via top_scores)
7. Going through each topic, picking the top sentence from each topic, and perhaps the second best sentence and so on if needed, until you have reached the summary length limit and sort these sentences according to their original indexes (via get_top_sentences)
8. Transform sentences back into original word-forms from the digital forms that we’ve been working with to end up with the summary
9. Evaluation

In [1]:
from datasets import load_dataset
import spacy
from spacy.lang.fr.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora
from gensim import models
from gensim.models import LsiModel
from gensim.models.coherencemodel import CoherenceModel
from rouge_score import rouge_scorer

In [2]:
dataset = load_dataset('mlsum', 'fr')

Reusing dataset mlsum (/Users/josephkeenan/.cache/huggingface/datasets/mlsum/fr/1.0.0/77f23eb185781f439927ac2569ab1da1083195d8b2dab2b2f6bbe52feb600688)


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [3]:
article = dataset['train']['text'][4]
summary = dataset['train']['summary'][4]
title = dataset['train']['title'][4]

In [5]:
nlp = spacy.load("fr_core_news_sm")
doc = nlp(article)
summ = nlp(summary)

sentences = []
cur_sentence = []

# This loop is used to get rid of stop words, then to create a list of all of the lemmas for the remaining words
for sent in doc.sents: # For each sentence in the article
    for token in sent: # For each token in the sentence
        if not token.text.lower() in STOP_WORDS and not token.is_punct:
            cur_sentence.append(token.lemma_)
    sentences.append(cur_sentence)
    cur_sentence = []

for s in sentences:
    print(s)
    print()

['personne', 'mourir', 'blesser', 'Nîmes', 'Gard', 'incendie', 'déclencher', 'vendredi', 'premier', 'janvier', 'petit', 'matin']

['feu', 'ignore', 'origine', 'instant', 'prendre', 'étage', 'immeuble']

['arrivée', 'pompier', 'personne', 'décéder', 'appartement', 'appartement', 'voisin', 'intoxication', 'expliquer', 'Télé', 'directeur', 'cabinet', 'préfet', 'Gard']

['dénombre', 'également', 'blessé', 'grave']

['coma', 'transférer', 'Marseille', 't', 'il', 'ajouter']

['secours', 'prévenir', '5', 'heure', 'matin', 'incendie', 'déjà', 'démarré', 't', 'il', 'expliquer']

['France']

['Info', 'précise', 'victime', 'adulte', 'enfant']

['origine', 'incendie', 'indéterminer', 'priori', 'accidentel', 'déclarer', 'procureur', 'adjoint', 'république', 'Nîmes', 'citer', 'Europe', '1']



In [6]:
# Creation of the TF-IDF scores for each word
# Essentially calculates a value for each word based on it's frequency in each sentence 
## and also takes into account frequency of word in document overall

dictionary = corpora.Dictionary(sentences)
print(dictionary)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in sentences]
tfidf = models.TfidfModel(doc_term_matrix)
sentences_tfidf = tfidf[doc_term_matrix]
for sent in sentences_tfidf:
    print(sent)

Dictionary(62 unique tokens: ['Gard', 'Nîmes', 'blesser', 'déclencher', 'incendie']...)
[(0, 0.22661814957458093), (1, 0.22661814957458093), (2, 0.3310540860350354), (3, 0.3310540860350354), (4, 0.1655270430175177), (5, 0.3310540860350354), (6, 0.22661814957458093), (7, 0.3310540860350354), (8, 0.22661814957458093), (9, 0.3310540860350354), (10, 0.3310540860350354), (11, 0.3310540860350354)]
[(12, 0.39318346293947504), (13, 0.39318346293947504), (14, 0.39318346293947504), (15, 0.39318346293947504), (16, 0.2691478902490874), (17, 0.39318346293947504), (18, 0.39318346293947504)]
[(0, 0.18035474530072435), (8, 0.18035474530072435), (19, 0.26347040375935565), (20, 0.5269408075187113), (21, 0.26347040375935565), (22, 0.26347040375935565), (23, 0.26347040375935565), (24, 0.26347040375935565), (25, 0.18035474530072435), (26, 0.26347040375935565), (27, 0.26347040375935565), (28, 0.26347040375935565), (29, 0.26347040375935565)]
[(30, 0.5), (31, 0.5), (32, 0.5), (33, 0.5)]
[(34, 0.45004989627853

In [13]:
# Calculation of coherence values, which tells us the optimal number of topics to separate article into

coherence_values = []
model_list = []
for num_topics in range(2, 10): # The range indicates the numbers of topics we are considering; here we are considering between 2 and 9 topics
    model = LsiModel(sentences_tfidf, num_topics=num_topics, id2word=dictionary)
    model_list.append(model)
    coherencemodel = CoherenceModel(model=model, texts=sentences, dictionary=dictionary)
    coherence_values.append(coherencemodel.get_coherence())

print(coherence_values)
max_coherence = coherence_values.index(max(coherence_values))
num_topics = 2 + max_coherence
model = model_list[max_coherence]
print(f'optimal number of topics: {num_topics}')


[0.4045484405709039, 0.366378392464812, 0.40398161136138316, 0.39120445759058003, 0.4078682168757249, 0.37040766661676466, 0.33697431194923594, 0.31537311193802886]
optimal number of topics: 6


In [14]:
print(model.print_topics(num_topics=num_topics)) # prints each topic, which is essentially a set of words and a number indicating their relation to the topic
print()
corpus_lsi = model[doc_term_matrix]

# This loop prints each sentence along with the values indicating how much the sentence relates to each topic
## Some sentences have no relation to a topic, which is why some sentences are missing a score for certain topics
for score, text in zip(corpus_lsi, doc.sents):
    print(score, text)


[(0, '0.263*"t" + 0.263*"il" + 0.235*"matin" + 0.208*"incendie" + 0.200*"heure" + 0.200*"5" + 0.200*"démarré" + 0.200*"secours" + 0.200*"déjà" + 0.200*"prévenir"'), (1, '0.239*"coma" + 0.239*"transférer" + 0.239*"ajouter" + 0.239*"Marseille" + 0.232*"il" + 0.232*"t" + -0.191*"origine" + -0.188*"Nîmes" + -0.146*"priori" + -0.146*"procureur"'), (2, '-0.286*"appartement" + 0.243*"origine" + 0.235*"feu" + 0.235*"immeuble" + 0.235*"étage" + 0.235*"ignore" + 0.235*"instant" + 0.235*"prendre" + -0.161*"personne" + -0.161*"Gard"'), (3, '0.448*"grave" + 0.448*"dénombre" + 0.448*"blessé" + 0.448*"également" + 0.303*"France" + 0.144*"Info" + 0.144*"enfant" + 0.144*"précise" + 0.144*"victime" + 0.144*"adulte"'), (4, '-0.895*"France" + 0.187*"Info" + 0.187*"précise" + 0.187*"enfant" + 0.187*"victime" + 0.187*"adulte" + 0.076*"dénombre" + 0.076*"grave" + 0.076*"également" + 0.076*"blessé"'), (5, '-0.380*"Info" + -0.380*"victime" + -0.380*"précise" + -0.380*"enfant" + -0.380*"adulte" + -0.327*"France

In [15]:
# Sorts top sentence indices for each topic

top_scores = [[] for i in range(num_topics)]

# This loop helps us find the top scores
for i, scores in enumerate(corpus_lsi): # i = sentence index in corpus
    for j, score in scores: # j = topic index
        top_scores[j].append((i, abs(score)))

# This loop sorts the top scores in descending order for each topic
for topic in top_scores:
    topic.sort(reverse=True, key=lambda x: x[1])

print()
for v in top_scores:
    print(v)



[(5, 2.356155775860713), (0, 1.8987248960778664), (2, 1.49967951720882), (4, 1.2681047078648622), (8, 1.1445606469371634), (1, 0.22053925371658237)]
[(8, 1.9257494885437403), (0, 1.4604938317813678), (4, 1.4188621942924309), (2, 1.0774367414658732), (1, 0.9888033474091669), (5, 0.9812048869645829)]
[(2, 2.2611627453875016), (1, 1.6561184541998684), (8, 1.48756539981413), (0, 0.9734427754384644), (4, 0.6630774409809027), (5, 0.22518166721222574)]
[(3, 1.7936007796425586), (7, 0.7200911509287125), (6, 0.30338557593800464)]
[(7, 0.9369718039156227), (6, 0.8951026397998952), (3, 0.30468365205031966)]
[(7, 1.8983025504437556), (3, 0.8307611663040635), (6, 0.32672382303104996)]


In [16]:
# This function takes a summary size (i.e. how many sentences we want in our summary)
# Then it takes the top score from each topic and inserts the sentence index into our list 'top_sentences'

def get_top_sentences(summary_size=5):
    top_sentences = set()
    count = 0
    for i in range(summary_size): 
        for j in range(num_topics):
            if i >= len(top_scores[j]):
                continue
            top_sentences.add(top_scores[j][i][0])
            if len(top_sentences) == summary_size:
                return sorted(top_sentences)
            
top_sentences = get_top_sentences()
print(top_sentences)

[2, 3, 5, 7, 8]


In [19]:
# This block converts the top_sentences indicies back into their actual sentences and returns a summary that is 280 words or less
# Thus all of the top_sentences may not always be used

sents = list(doc.sents)
longest_summary = ''
for i in range(1, len(sents) + 1): # i = summary_size (i.e. number of sentences)
    top_sentences = get_top_sentences(i)
    summary = ""
    for sent_idx in top_sentences: # joins words back together into strings and adds a new line between each sentence (this new line format was necessary to use the ROUGE evaluation)
        #summary += sents[sent_idx].text + "\n"
        summary += ' '.join(word for word in sentences[sent_idx]) + "\n"
    if len(summary) > 280:
        break
    longest_summary = summary
if longest_summary: # Omits las character in summary, which would be a new line ('\n')
    longest_summary = longest_summary[:-1]

print(longest_summary)

5
secours prévenir 5 heure matin incendie déjà démarré t il expliquer
origine incendie indéterminer priori accidentel déclarer procureur adjoint république Nîmes citer Europe 1


In [29]:
# ROUGE Evaluation

summ_sentences = []
summ_cur_sentence = []

# This loop takes the words in the reference summary (from the corpus) to remove stop words and stem the remaining keywords
for sent in summ.sents: # For each sentence in the summary
    for token in sent: # For each word in a sentence
        if not token.text.lower() in STOP_WORDS and not token.is_punct:
            summ_cur_sentence.append(token.lemma_)
    summ_sentences.append(summ_cur_sentence)
    summ_cur_sentence = []

# Instantiate ROUGE evaluation
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=False)

# joins elements in original article into a string from a list
original_article = '\n'.join([sent.text for sent in doc.sents])
scores_original = scorer.score(original_article, longest_summary)

# joins elements in reference summary into a string from a list
#reference_summary = '\n'.join([sent.text for sent in summ.sents])
reference_summary = '\n'.join([' '.join(sent) for sent in summ_sentences])
print(summ_sentences)
print(reference_summary)

# The ROUGE score evaluates the reference summary against our generated summary (longest_summary)
scores_reference = scorer.score(reference_summary, longest_summary)

#print()
#print(longest_summary)
#print()
#print(original_article)
#print()
#print(reference_summary)
#print()
print(scores_original)
#print()
print(scores_reference)

[['personne', 'mourir', 'blesser', 'Nîmes', 'Gard', 'incendie', 'déclencher', 'vendredi', 'premier', 'janvier', 'petit', 'matin']]
personne mourir blesser Nîmes Gard incendie déclencher vendredi premier janvier petit matin
{'rougeL': Score(precision=0.7741935483870968, recall=0.13114754098360656, fmeasure=0.22429906542056074)}
{'rougeL': Score(precision=0.06451612903225806, recall=0.14285714285714285, fmeasure=0.08888888888888889)}
