In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
import wordcloud

import spacy
#spacy.cli.download("it_core_news_sm")
ITA_tokenizer = spacy.load("it_core_news_sm", disable=['parser', 'ner'])

num_topics = 4

it_stop_words = []#stopwords.words('italian')
custom_stop_words = ['me','xe', 'el', 'lettera', 'schmitz','ettore', 'signora', 'signore', 'scrivere', 'sapere','fare', 'cosa','essere']
it_stop_words.extend(custom_stop_words)
full_corpus = pd.read_csv('carteggio_svevo3.csv', sep=';')
ita_corpus = full_corpus[full_corpus['mainLanguage'] == 'ITA'].reset_index()
corpus_for_tfidf = full_corpus[full_corpus['mainLanguage'] == 'ITA'].reset_index()

Following the text of the project proposal the goal is to find:
- which are the main topics of discussion in the corpus, who are the people which each topic is more associated with, how does the interest on different topics evolve over the time

In [2]:
def pre_proc(text, pos = ['PROPN', 'NOUN', 'VERB'], stop_words = it_stop_words ):
    #for the tokenization part we used a piece of software from spacy that performs tokenization, lemmatization
    #and pos tagging in one shot
    
    text = text.lower()
    doc = ITA_tokenizer(text)
    tokenized_text = []
    #tokenizing and lemmatizing on one shot
    for token in doc:
        if token.lemma_ not in stop_words and token.pos_ in pos and not token.is_stop and token.is_alpha:
            tokenized_text.append(token.lemma_)
            #print(token.pos_)
    return tokenized_text

def extract_topic_keywords(model, num_words, plot = True):
    #extracting for each topic the keywords and making some plots
    num_topics = len(model.print_topics())
    if plot:
        for t,w  in lda_model_tfidf.show_topics(formatted=False, num_topics=num_topics, num_words=num_words):
            
            print("Topic {} keywords:\n\n format['word','frequency']\n\n {}".format(t,w))

            w = dict(w)
            width = 800
            height = 600

            wcloud = wordcloud.WordCloud(width = width, height = height, background_color='White').generate_from_frequencies(w)
            plt.figure(dpi=150)
            plt.imshow(wcloud)
            plt.axis('off')
            plt.show()
    
    
    return [[tup2[0] for tup2 in tup[1]]for tup in model.show_topics(formatted=False, num_topics=num_topics, num_words=num_words)]

def get_topic_scores(model, bow_corpus):
    #getting for each topic the score assigned for each topic
    corpus_scores = []
    for letter in bow_corpus:
        score = [0 for i in range(num_topics)]
        model_output =  model[letter]
        
        for out in model_output:
            i = out[0]
            score[i] = out[1]
        
        corpus_scores.append(score)
    return corpus_scores

def get_coherence(model, corpus, dictionary):
    
    c_av = models.CoherenceModel(model=model, corpus = bow,  dictionary=dictionary, coherence='u_mass').get_coherence()
    c_per_topic = models.CoherenceModel(model=model, corpus = bow,  dictionary=dictionary, coherence='u_mass').get_coherence_per_topic()
    return c_av,c_per_topic

In [3]:
sender = ita_corpus['sender']
recipient = ita_corpus['recipient']
important_people = ['Livia Veneziani','Eugenio Montale','Paul Henri Michel','Valerio Jahier','Marieanne Crémieux Comnène','James Joyce','Giuseppe Prezzolini','Ferdinando Pasini','Valéry Larbaud']
sr = []
for i in range(len(sender)):
    if sender[i] != 'Ettore Schmitz' and sender[i] in important_people:
        sr.append(sender[i])
    elif recipient[i] in important_people:
        sr.append(recipient[i])
    else:
        sr.append('OTHER')
ita_corpus['SR'] = sr  
ita_corpus['SR'].value_counts()

Livia Veneziani               608
OTHER                          63
Eugenio Montale                62
Paul Henri Michel              17
Valerio Jahier                 15
Marieanne Crémieux Comnène     14
James Joyce                    13
Giuseppe Prezzolini            12
Ferdinando Pasini              11
Valéry Larbaud                 11
Name: SR, dtype: int64

In [4]:
sender = corpus_for_tfidf['sender']
recipient = corpus_for_tfidf['recipient']
important_people = ['Livia Veneziani','Eugenio Montale','Paul Henri Michel','Valerio Jahier','Marieanne Crémieux Comnène','James Joyce','Giuseppe Prezzolini','Ferdinando Pasini','Valéry Larbaud']
sr = []
for i in range(len(sender)):
    if sender[i] != 'Ettore Schmitz' and sender[i] in important_people:
        sr.append(sender[i])
    elif recipient[i] in important_people:
        sr.append(recipient[i])
    else:
        sr.append('OTHER')
corpus_for_tfidf['SR'] = sr  
corpus_for_tfidf['SR'].value_counts()

Livia Veneziani               608
OTHER                          63
Eugenio Montale                62
Paul Henri Michel              17
Valerio Jahier                 15
Marieanne Crémieux Comnène     14
James Joyce                    13
Giuseppe Prezzolini            12
Ferdinando Pasini              11
Valéry Larbaud                 11
Name: SR, dtype: int64

In [5]:
processed_letters = []
for letter in ita_corpus['text']:
    

    tokenized_letter = pre_proc(letter,pos = ['PROPN', 'NOUN', 'VERB'])
    processed_letters.append(tokenized_letter)
  

In [13]:
import gensim
pure_dictionary = gensim.corpora.Dictionary(processed_letters)

dictionary = gensim.corpora.Dictionary(processed_letters)
dictionary.filter_extremes(no_below=5, no_above=0.5)

dictionary_for_tfidf = gensim.corpora.Dictionary(processed_letters)

pruned_dictionary_for_tfidf = gensim.corpora.Dictionary(processed_letters)
pruned_dictionary_for_tfidf.filter_extremes(no_below=5, no_above=0.5)

In [15]:
pure_bow = [pure_dictionary.doc2bow(doc) for doc in processed_letters] 

bow = [dictionary.doc2bow(doc) for doc in processed_letters]

bow_for_tfidf=[dictionary_for_tfidf.doc2bow(doc) for doc in processed_letters]

bow_for_pruned_tfidf=[pruned_dictionary_for_tfidf.doc2bow(doc) for doc in processed_letters]

from gensim import corpora, models

tfidf = models.TfidfModel(bow_for_tfidf)
corpus_for_tfidf = tfidf[bow_for_tfidf]

pruned_tfidf = models.TfidfModel(bow_for_pruned_tfidf)
corpus_for_pruned_tfidf = tfidf[bow_for_pruned_tfidf]

In [None]:
max_n_topics = 4

coherency_scores = np.zeros([max_n_topics,4])

for n in range(1,max_n_topics + 1):
    print("Calculating coherence scores for {} topics".format(n))
    #learn a model without pruning the dictionary
    lda_model_unpruned = gensim.models.LdaMulticore(pure_bow, num_topics=n, id2word=pure_dictionary, passes=10)
    #learn a model pruning the dictionary
    lda_model_pruned = gensim.models.LdaMulticore(bow, num_topics=n, id2word=dictionary, passes=10)
    #learn a model with tf_idf
    lda_model_tfidf = gensim.models.LdaMulticore(corpus_for_tfidf, num_topics=n, id2word=dictionary_for_tfidf, passes=10)
    #learn a model with tf_idf on a pruned dictionary
    lda_model_pruned_tfidf = gensim.models.LdaMulticore(corpus_for_pruned_tfidf, num_topics=n, id2word=pruned_dictionary_for_tfidf, passes=10)

    c1, _ = get_coherence(lda_model_unpruned, pure_bow, pure_dictionary)
    c2, _ = get_coherence(lda_model_pruned, bow, pure_dictionary)
    c3, _ = get_coherence(lda_model_tfidf, corpus_for_tfidf, dictionary_for_tfidf)
    c4, _ = get_coherence(lda_model_pruned_tfidf, corpus_for_pruned_tfidf, pruned_dictionary_for_tfidf)
    print([c1,c2,c3,c4])
    coherency_scores[n - 1,:] = np.array([c1,c2,c3,c4])
  

Calculating coherence scores for 1 topics
[-9.249616584548876, -1.1106195357148791, -8.633821591997556, -1.2399792263626386]
Calculating coherence scores for 2 topics


In [None]:
model_names = ['classical bow no pruning', 'classical bow pruned dictionary', 'tfidf', 'tfidf pruned']
n_topics = [i for i in range(1,max_n_topics+1)]
np.shape(coherency_scores)
plt.figure(dpi=120)
for i in range(4):
    plt.plot(n_topics, coherency_scores[:,i], '.-', label = model_names[i])
    
_ = plt.legend()
_ = plt.show

In [None]:
lda_model = gensim.models.LdaMulticore(bow, num_topics=num_topics, id2word=dictionary, passes=10)

In [None]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=num_topics, id2word=dictionary_for_tfidf, passes=10)


In [None]:
key_words = extract_topic_keywords(lda_model, 20)

for i in range(len(key_words)):
    print("Topic {}:\n\n{}\n".format(i,key_words[i]))

In [None]:
scores = get_topic_scores(lda_model, bow)
letter_topics = list(map(np.argmax, scores))
ita_corpus['topic_scores'] = scores
ita_corpus['topic'] = letter_topics

In [None]:
people = []
important_people = ['Livia Veneziani','Eugenio Montale','Paul Henri Michel','Valerio Jahier','Marieanne Crémieux Comnène','James Joyce','Giuseppe Prezzolini','Ferdinando Pasini','Valéry Larbaud']

people = important_people
people.append('OTHER')

for person in people:
    sub_data_index = ita_corpus['SR'] == person
    sub_data = ita_corpus[sub_data_index]
    val_count = sub_data['topic'].value_counts()
    print("Person: {}".format(person))
    print("Topics")
    print(val_count)
    print("--- --- --- --- \n")

In [None]:
year = ita_corpus['year']
min_y = min(year)
max_y = max(year)

In [None]:
topic_per_year = []
for y in range(min_y,max_y+1):
    counts = [0 for i in range(num_topics)]
    tpy = ita_corpus[ita_corpus['year'] == y]['topic']
    for topic in tpy:
        counts[topic] += 1 
    topic_per_year.append(counts)
    
        
    

In [None]:
yy = [y for y in range(min_y,max_y+1)]
topic_per_year = np.array(topic_per_year, dtype = float)
plt.figure(dpi = 100)
for topic in range(num_topics):
    plt.plot(yy, topic_per_year[:,topic], '.-', label = "Topic" + str(topic))
plt.legend()
plt.figure(dpi = 100)

topic_per_year_norm = np.zeros_like(topic_per_year)

for i in range(len(topic_per_year)):
    if np.sum(topic_per_year[i,:]) != 0:
       
        topic_per_year_norm[i,:] = topic_per_year[i,:]/np.sum(topic_per_year[i,:])
        #print(np.sum(topic_per_year[i,:]), topic_per_year_norm[i,:])



for topic in range(num_topics):
    plt.plot(yy, topic_per_year_norm[:,topic], '.-', label = "Topic" + str(topic))
_ = plt.legend()

Are the identified topics understandable?

Are the topics coherent?

Does the topic model serve the purpose it is being used for?


In [None]:
key_words = extract_topic_keywords(lda_model_tfidf, 20)

for i in range(len(key_words)):
    print("Topic {}:\n\n{}\n".format(i,key_words[i]))

In [None]:
scores = get_topic_scores(lda_model_tfidf, bow_for_tfidf)
letter_topics = list(map(np.argmax, scores))
corpus_for_tfidf['topic_scores'] = scores
corpus_for_tfidf['topic'] = letter_topics

In [None]:
topic_per_year = []
for y in range(min_y,max_y+1):
    counts = [0 for i in range(num_topics)]
    tpy = corpus_for_tfidf[corpus_for_tfidf['year'] == y]['topic']
    for topic in tpy:
        counts[topic] += 1 
    topic_per_year.append(counts)


yy = [y for y in range(min_y,max_y+1)]
topic_per_year = np.array(topic_per_year, dtype = float)
plt.figure(dpi = 100)
for topic in range(num_topics):
    plt.plot(yy, topic_per_year[:,topic], '.-', label = "Topic" + str(topic))
plt.legend()
plt.figure(dpi = 100)

topic_per_year_norm = np.zeros_like(topic_per_year)

for i in range(len(topic_per_year)):
    if np.sum(topic_per_year[i,:]) != 0:
       
        topic_per_year_norm[i,:] = topic_per_year[i,:]/np.sum(topic_per_year[i,:])
        #print(np.sum(topic_per_year[i,:]), topic_per_year_norm[i,:])



for topic in range(num_topics):
    plt.plot(yy, topic_per_year_norm[:,topic], '.-', label = "Topic" + str(topic))
_ = plt.legend()

In [None]:
people = []
important_people = ['Livia Veneziani','Eugenio Montale','Paul Henri Michel','Valerio Jahier','Marieanne Crémieux Comnène','James Joyce','Giuseppe Prezzolini','Ferdinando Pasini','Valéry Larbaud']

people = important_people
people.append('OTHER')

for person in people:
    sub_data_index = corpus_for_tfidf['SR'] == person
    sub_data = corpus_for_tfidf[sub_data_index]
    val_count = sub_data['topic'].value_counts()
    print("Person: {}".format(person))
    print("Topics")
    print(val_count)
    print("--- --- --- --- \n")

In [None]:
models.CoherenceModel(model=lda_model, corpus = bow,  dictionary=dictionary, coherence='u_mass').get_coherence()


In [None]:
models.CoherenceModel(model=lda_model_tfidf, corpus = bow_for_tfidf,  dictionary=dictionary_for_tfidf, coherence='u_mass').get_coherence()


In [None]:
models.CoherenceModel(model=lda_model, corpus = bow,  dictionary=dictionary, coherence='u_mass').get_coherence_per_topic()


In [None]:
models.CoherenceModel(model=lda_model_tfidf, corpus = bow_for_tfidf,  dictionary=dictionary_for_tfidf, coherence='u_mass').get_coherence_per_topic()
