Basic steps of the LSA Model:
1. Load data
2. Preprocess data (via stop word removal and stemming)
3. Instantiate TFIDF model, which gives a value to each word based on frequency, but normalizes over all sentences (i.e. if a word is common in one sentence, it is deemed important, but if it’s also common in all sentences, it’s value decreases)
4. Calculate coherence values in order to figure out ideal number of topics to split article into
5. For each sentence, calculate how relevant that sentence is to each topic (via corpus_lsi model and doc_term_matrix)
6. Order sentences from most to least relevant for each topic (via top_scores)
7. Going through each topic, picking the top sentence from each topic, and perhaps the second best sentence and so on if needed, until you have reached the summary length limit and sort these sentences according to their original indexes (via get_top_sentences)
8. Transform sentences back into original word-forms from the digital forms that we’ve been working with to end up with the summary
9. Evaluation

In [1]:
# Dataset
from datasets import load_dataset

# Tokenization & Stemming
import spacy
from spacy.lang.fr.stop_words import STOP_WORDS

# Steps needed to instantiate Latent Semantic Analysis and LSA model itself
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora
from gensim import models
from gensim.models import LsiModel
from gensim.models.coherencemodel import CoherenceModel

# Evaluation
from rouge_score import rouge_scorer

In [5]:
# Loading French dataset
dataset = load_dataset('mlsum', 'fr')

Reusing dataset mlsum (/Users/josephkeenan/.cache/huggingface/datasets/mlsum/fr/1.0.0/77f23eb185781f439927ac2569ab1da1083195d8b2dab2b2f6bbe52feb600688)


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [4]:
# Loading English dataset
dataset = load_dataset('cnn_dailymail', '3.0.0')

Reusing dataset cnn_dailymail (/Users/josephkeenan/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [16]:
# Choosing an article & reference summary
article = dataset['train']['text'][11]
# summary = dataset['train']['summary'][11]

# This is how you would choose a title (although it is not needed for our program)
# title = dataset['train']['title'][11]

In [17]:
# nlp() function tokenizes the text
nlp = spacy.load("fr_core_news_sm")
doc = nlp(article)
summ = nlp(summary)

en = spacy.load('en_core_web_sm')
en_stopwords = en.Defaults.stop_words

# sentences is a list that contains each tokenized and stemmed sentence
sentences = []

# cur_sentence is used to put the stemmed sentence into a list, so we can then transfer it to our main sentences list
cur_sentence = []

# Loop is used to get rid of stop words, then to create a list of all of the lemmas for the remaining words
for sent in doc.sents: # For each sentence in the article
    for token in sent: # For each token in the sentence
        if not token.text.lower() in STOP_WORDS and not token.is_punct:
            cur_sentence.append(token.lemma_)
    sentences.append(cur_sentence)
    cur_sentence = []

for s in sentences:
    print(s)
    print()

['immeuble', 'jouxter', 'siège', 'social', 'APTN', 'Winnipeg', 'travaille', 'arrache', 'pied', 'transformer', 'intérieur', 'ancien', 'banque', 'studio', 'télévision', 'ultramoderne']

['retransmettre', '12', '28', 'février', 'émission', 'consacrer', 'jeu', 'olympique', 'hiver', 'lieu', 'Vancouver', 'Whistler', 'Côte', 'ouest', 'Canada']

['aptn', 'diffuseur', 'officiel', 'sein', 'consortium', 'chaîne', 'canadien', 'oeuvrer', 'manière', 'grand', 'partie', 'langue', 'autochtone']

['désignation', 'grand', 'réussite', 'réjouir', 'Jean', 'laros', 'directeur', 'général', 'chaîne', 'nommer', 'radiodiffuseur', 'autochtone', 'officiel', 'jeu', 'hiver']

['réseau', 'proposer', 'couverture', 'direct', 'haute', 'définition', 'compétition', 'cérémonie', 'reportage', 'émission', 'information', 'olympique']

['programmation', 'spécial', 'heure', 'jour', 'structurer', 'bloc', 'linguistique', 'heure', 'anglais', 'autant', 'français', 'couvertur', 'vraiment', 'national', 'français', 'autant', 'langue',

In [18]:
# Creation of the TF-IDF scores for each word
# Essentially calculates a value for each word based on it's frequency in each sentence 
## and also takes into account frequency of word in document overall

# Converts our preproccesed sentences into a bag of words
dictionary = corpora.Dictionary(sentences)
print(dictionary)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in sentences]

# Instantiate TF-IDF Model
tfidf = models.TfidfModel(doc_term_matrix)
sentences_tfidf = tfidf[doc_term_matrix]
for sent in sentences_tfidf:
    print(sent)

Dictionary(136 unique tokens: ['APTN', 'Winnipeg', 'ancien', 'arrache', 'banque']...)
[(0, 0.25), (1, 0.25), (2, 0.25), (3, 0.25), (4, 0.25), (5, 0.25), (6, 0.25), (7, 0.25), (8, 0.25), (9, 0.25), (10, 0.25), (11, 0.25), (12, 0.25), (13, 0.25), (14, 0.25), (15, 0.25)]
[(16, 0.27937211830783126), (17, 0.27937211830783126), (18, 0.20952908873087345), (19, 0.27937211830783126), (20, 0.27937211830783126), (21, 0.27937211830783126), (22, 0.27937211830783126), (23, 0.27937211830783126), (24, 0.20952908873087345), (25, 0.20952908873087345), (26, 0.27937211830783126), (27, 0.20952908873087345), (28, 0.27937211830783126), (29, 0.27937211830783126), (30, 0.20952908873087345)]
[(31, 0.24523470628015032), (32, 0.13717382358909), (33, 0.32697960837353374), (34, 0.24523470628015032), (35, 0.32697960837353374), (36, 0.32697960837353374), (37, 0.19741700393039868), (38, 0.11567210183701522), (39, 0.32697960837353374), (40, 0.32697960837353374), (41, 0.24523470628015032), (42, 0.32697960837353374), (43

In [19]:
# Calculation of coherence values, which tells us the optimal number of topics to separate article into

coherence_values = []
model_list = []

# The range indicates the number of topics we are considering; here we are considering between 2 and 9 topics
for num_topics in range(2, 10): 
    model = LsiModel(sentences_tfidf, num_topics=num_topics, id2word=dictionary)
    model_list.append(model)
    coherencemodel = CoherenceModel(model=model, texts=sentences, dictionary=dictionary)
    coherence_values.append(coherencemodel.get_coherence())

print(coherence_values)


max_coherence = coherence_values.index(max(coherence_values))
num_topics = 2 + max_coherence
model = model_list[max_coherence]
print(f'optimal number of topics: {num_topics}')


[0.2341532253965724, 0.39774293065833694, 0.40715853269905955, 0.40793151643258607, 0.3815894394757713, 0.3781185300425466, 0.3719331925296573, 0.41920108542439904]
optimal number of topics: 9


In [20]:
# prints each topic, which is essentially a set of words and a number indicating their relation to the topic
for topic in model.print_topics(num_topics=num_topics):
    print(topic)
print()


corpus_lsi = model[doc_term_matrix]

# This loop prints the values of our corpus_lsi model
## text = each sentence
### score = the values indicating how much the sentence relates to each topic
#### Some sentences have no relation to a topic, which is why some sentences are missing a score for certain topics
for score, text in zip(corpus_lsi, doc.sents):
    print(score, text)


(0, '-0.217*"autochtone" + -0.210*"langue" + -0.188*"autant" + -0.175*"officiel" + -0.175*"chaîne" + -0.173*"grand" + -0.173*"pouvoir" + -0.160*"heure" + -0.144*"professionnel" + -0.144*"colorer"')
(1, '0.249*"sportif" + 0.217*"journaliste" + -0.206*"pouvoir" + -0.184*"autant" + 0.157*"déner" + 0.157*"innus" + 0.157*"explique" + 0.157*"m." + 0.157*"avoir" + 0.157*"cris"')
(2, '-0.265*"terme" + -0.228*"trouver" + -0.221*"ensemble" + -0.221*"venir" + -0.221*"enrichir" + 0.172*"jeu" + 0.172*"hiver" + 0.138*"chaîne" + 0.138*"officiel" + -0.132*"exister"')
(3, '-0.206*"compétition" + -0.181*"olympique" + -0.181*"émission" + -0.166*"pouvoir" + 0.164*"chaîne" + 0.164*"officiel" + 0.152*"aptn" + -0.152*"direct" + -0.152*"haute" + -0.152*"cérémonie"')
(4, '-0.195*"trouver" + 0.181*"journaliste" + 0.171*"m." + 0.171*"avoir" + 0.171*"explique" + 0.171*"innus" + 0.171*"déner" + 0.171*"cris" + -0.165*"émission" + -0.165*"olympique"')
(5, '0.256*"terme" + -0.209*"équipe" + -0.192*"aide" + 0.183*"tli

In [15]:
# Sorts top sentence indices for each topic

top_scores = [[] for i in range(num_topics)]

# This loop helps us find the top scores
# i = sentence index in corpus
# j = topic index
for i, scores in enumerate(corpus_lsi): 
    for j, score in scores: 
        top_scores[j].append((i, abs(score)))

# This loop sorts the top scores in descending order for each topic
for topic in top_scores:
    topic.sort(reverse=True, key=lambda x: x[1])

print()
for v in top_scores:
    print(v)



[(5, 2.356155775860713), (0, 1.8987248960778664), (2, 1.49967951720882), (4, 1.2681047078648622), (8, 1.1445606469371634), (1, 0.22053925371658237)]
[(8, 1.9257494885437403), (0, 1.4604938317813678), (4, 1.4188621942924309), (2, 1.0774367414658732), (1, 0.9888033474091669), (5, 0.9812048869645829)]
[(2, 2.2611627453875016), (1, 1.6561184541998684), (8, 1.48756539981413), (0, 0.9734427754384644), (4, 0.6630774409809027), (5, 0.22518166721222574)]
[(3, 1.7936007796425586), (7, 0.7200911509287125), (6, 0.30338557593800464)]
[(7, 0.9369718039156227), (6, 0.8951026397998952), (3, 0.30468365205031966)]
[(7, 1.8983025504437556), (3, 0.8307611663040635), (6, 0.32672382303104996)]


In [16]:
# This function takes a summary size (i.e. how many sentences we want in our summary)
# Then it takes the top score from each topic and inserts the sentence index into our list 'top_sentences'

def get_top_sentences(summary_size=5):
    top_sentences = set()
    count = 0
    for i in range(summary_size): 
        for j in range(num_topics):
            if i >= len(top_scores[j]):
                continue
            top_sentences.add(top_scores[j][i][0])
            if len(top_sentences) == summary_size:
                return sorted(top_sentences)
            
top_sentences = get_top_sentences()
print(top_sentences)

[2, 3, 5, 7, 8]


In [19]:
# This block converts the top_sentences indicies back into their actual sentences and returns a summary that is 280 words or less
# Thus all of the top_sentences may not always be used

sents = list(doc.sents)
longest_summary = ''
for i in range(1, len(sents) + 1): # i = summary_size (i.e. number of sentences)
    top_sentences = get_top_sentences(i)
    summary = ""
    
    # joins words back together into strings and adds a new line between each sentence (this new line format was necessary to use the ROUGE evaluation)
    for sent_idx in top_sentences: 
        #summary += sents[sent_idx].text + "\n"
        summary += ' '.join(word for word in sentences[sent_idx]) + "\n"
    if len(summary) > 280:
        break
    longest_summary = summary
if longest_summary: # Omits last character in summary, which would be a new line ('\n')
    longest_summary = longest_summary[:-1]

print(longest_summary)

5
secours prévenir 5 heure matin incendie déjà démarré t il expliquer
origine incendie indéterminer priori accidentel déclarer procureur adjoint république Nîmes citer Europe 1


In [29]:
# ROUGE Evaluation

summ_sentences = []
summ_cur_sentence = []

# This loop takes the words in the reference summary (from the corpus) to remove stop words and stem the remaining keywords
for sent in summ.sents: # For each sentence in the summary
    for token in sent: # For each word in a sentence
        if not token.text.lower() in STOP_WORDS and not token.is_punct:
            summ_cur_sentence.append(token.lemma_)
    summ_sentences.append(summ_cur_sentence)
    summ_cur_sentence = []

# Instantiate ROUGE evaluation
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=False)

# joins elements in original article into a string from a list
original_article = '\n'.join([sent.text for sent in doc.sents])
scores_original = scorer.score(original_article, longest_summary)

# joins elements in reference summary into a string from a list
#reference_summary = '\n'.join([sent.text for sent in summ.sents])
reference_summary = '\n'.join([' '.join(sent) for sent in summ_sentences])
print(summ_sentences)
print(reference_summary)

# The ROUGE score evaluates the reference summary against our generated summary (longest_summary)
# For this test we evaluated the lemmas in the reference to the lemmas in our generated topic
## In our more compact LSA-Run file we compare the scores when we evaluate lemmas vs when we evaluate full sentences (un-stemmed & stop words included)
scores_reference = scorer.score(reference_summary, longest_summary)

#print()
#print(longest_summary)
#print()
#print(original_article)
#print()
#print(reference_summary)
#print()
print(scores_original)
#print()
print(scores_reference)

[['personne', 'mourir', 'blesser', 'Nîmes', 'Gard', 'incendie', 'déclencher', 'vendredi', 'premier', 'janvier', 'petit', 'matin']]
personne mourir blesser Nîmes Gard incendie déclencher vendredi premier janvier petit matin
{'rougeL': Score(precision=0.7741935483870968, recall=0.13114754098360656, fmeasure=0.22429906542056074)}
{'rougeL': Score(precision=0.06451612903225806, recall=0.14285714285714285, fmeasure=0.08888888888888889)}
