In [None]:
import numpy as np
import pandas as pd
from top2vec import Top2Vec
from ast import literal_eval
import time
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from utils import topic_diversity

In [None]:
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import strip_tags
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel

# Load the data

In [None]:
# Read the raw corpus; its 'lemmas' column is parsed in the filtering cell further down.
# The path is relative to the notebook's working directory.
df = pd.read_csv('../data/lemmas.csv')
df

# Select a subsample of the data

In [None]:
# frac=1.0 keeps every row, so this is a reproducible shuffle (seed 42) of the whole
# frame; lower frac to work on an actual subsample.
df_sub = df.sample(frac=1.0, random_state=42).reset_index(drop=True)
# Alternative without shuffling:
# df_sub = df

In [None]:
# (rows, columns) of the working frame
df_sub.shape

In [None]:
# Preview the first rows of the working frame
df_sub.head()

# Filter the data
Filter out all POS but Nouns (N), Adjectives (A) and Verbs (V)

Filter out frequent (stop)words that do not carry any extra semantic information

In [None]:
# Czech lemmas dropped during filtering: mít (have), jít (go), být (be), dát (give), moci (can)
stopwords = ['mít', 'jít', 'být', 'dát', 'moci']

In [None]:
%%time

# One list of kept lemmas per poem. Each 'lemmas' cell is the string form of a list of
# (lemma, POS) pairs; only N/A/V lemmas that are not stopwords survive.
lemmas_filtered_column = []

for index, raw_lemmas in df_sub['lemmas'].items():
    # lightweight progress indicator, printed every 1000 rows
    if index % 1000 == 0:
        print(index, end=' ')

    lemmas_filtered_column.append([
        lemma
        for lemma, pos in literal_eval(raw_lemmas)
        if pos in ['N', 'A', 'V'] and lemma not in stopwords
    ])

In [None]:
# Attach the filtered lemma lists; they are in the same row order as df_sub.
df_sub['lemmas_filtered'] = lemmas_filtered_column

# Create a dictionary

In [None]:
# Frequency threshold shared by the dictionary filter (no_below) and Top2Vec (min_count)
min_frequency = 20

In [None]:
# Build the gensim vocabulary from the filtered lemma lists
dictionary = Dictionary(df_sub['lemmas_filtered'])

In [None]:
# First 30 entries returned by most_common(), inspected before filtering
dictionary.most_common()[:30]

In [None]:
# Filter out rare words (threshold: min_frequency), modifying the dictionary in place.
# NOTE(review): only no_below is set, so filter_extremes' defaults for its other
# parameters presumably still apply — confirm against the gensim documentation.
dictionary.filter_extremes(no_below=min_frequency)

In [None]:
# Vocabulary size after filtering
print('Number of unique tokens: %d' % len(dictionary))

# Join preprocessed tokens into a single string

In [None]:
# Rebinds df_sub to the saved training data, replacing the frame built in the cells above.
# The next cell handles 'lemmas_filtered' from this file as a string, not as a list.
df_sub = pd.read_csv('../data/training_data_top2vec.csv')
df_sub.head()

In [None]:
# Rebuild each document as one space-separated string of lemmas.
# After the CSV round-trip 'lemmas_filtered' holds the string form of a Python list, so
# parse it with literal_eval rather than slicing off the brackets and splitting on "', '":
# that approach mis-parses any lemma containing an apostrophe (repr() wraps those in
# double quotes). Values that are already lists are joined as they are.
df_sub['texts'] = df_sub['lemmas_filtered'].apply(
    lambda x: ' '.join(literal_eval(x) if isinstance(x, str) else x)
)
df_sub

# Train the model
API: https://top2vec.readthedocs.io/en/latest/api.html

Example: https://towardsdatascience.com/how-to-perform-topic-modeling-with-top2vec-1ae9bb4e89dc

In [None]:
# Every embedding backend considered in the comparison. Kept under its own name so the
# list is not silently discarded by the selection below.
available_embedding_models = [
    'doc2vec',
    'universal-sentence-encoder',
    'universal-sentence-encoder-multilingual',
    'distiluse-base-multilingual-cased',
    'all-MiniLM-L6-v2',
    'paraphrase-multilingual-MiniLM-L12-v2'
]

# Models actually trained in this run; use available_embedding_models to train them all.
embedding_models = [
    'doc2vec'
]

# Passed to Top2Vec as umap_args (dimensionality reduction settings)
umap_args = {'n_neighbors': 15,
             'n_components': 5,
             'metric': 'cosine'}

# Passed to Top2Vec as hdbscan_args (clustering settings)
hdbscan_args = {'min_cluster_size': 15,
                'metric': 'euclidean',
                'cluster_selection_method': 'leaf'}

# Passed to Top2Vec as speed
speed = 'deep-learn'

In [None]:
def custom_tokenizer(document):
    """Tokenize one document for Top2Vec.

    The text is passed through gensim's strip_tags and then simple_preprocess
    with deacc=False.

    Parameters
    ----------
    document : str
        Raw document text.

    Returns
    -------
    The token sequence produced by simple_preprocess.
    """
    without_tags = strip_tags(document)
    tokens = simple_preprocess(without_tags, deacc=False)
    return tokens

In [None]:
%%time

# One entry per trained model: (embedding_model, num_topics, training_time_in_minutes)
results = []

for embedding_model in embedding_models:
    start = time.time()

    # train the model
    model = Top2Vec(documents=df_sub['texts'].values,
                    speed=speed,
                    min_count=min_frequency,
                    umap_args=umap_args,
                    hdbscan_args=hdbscan_args,
                    tokenizer=custom_tokenizer,
                    embedding_model=embedding_model)

    # inspect the words of the discovered topics
    print(model.topic_words)

    # elapsed wall-clock time in minutes
    training_time = (time.time() - start) / 60

    # Coherence is not computed in this loop (the CoherenceModel call is disabled), so it
    # is not stored here; referencing it raised a NameError on a fresh kernel.
    results.append((embedding_model, model.get_num_topics(), training_time))

In [None]:
# Record the most recent run as (embedding_model, num_topics, training_time_in_minutes).
# Guarded so that re-running this cell does not append the same entry again.
latest_run = (embedding_model, model.get_num_topics(), training_time)
if latest_run not in results:
    results.append(latest_run)
results

In [None]:
# Show the words of the topic at index 4, then persist the trained model.
# Note: the "Load the best model" section loads '../results/top2vec/top2vec_doc2vec.model',
# which is a different file name from the one written here.
print(model.topic_words[4])
model.save('../results/top2vec/top2vec_doc2vec_clean.model')

In [None]:
# Scores that accompany model.topic_words (same topic/word positions)
model.topic_word_scores

# Plot the results

In [None]:
# Load previously saved comparison results. This rebinds `results`, which was a list of
# tuples in the training cells above, to a DataFrame; the plotting cells below read the
# columns embedding_model, coherence, num_topics and diversity from it.
results = pd.read_csv('../results/top2vec/results_top2vec.csv')
results

In [None]:
# Pull the plotted series out of the results table as NumPy arrays:
# x = model names, y1 = coherence, y2 = number of topics, y3 = diversity
x, y1, y2, y3 = (
    results[column].values
    for column in ('embedding_model', 'coherence', 'num_topics', 'diversity')
)

# Set font size
plt.rcParams.update({'font.size': 14})

In [None]:
# Bar chart of topic coherence per embedding model (single axes)
fig, ax1 = plt.subplots(figsize=(10, 10))

# Explicit numeric bar positions: tick locations are fixed with set_xticks before the
# labels are assigned, and repeated model names cannot collapse onto one categorical tick.
positions = np.arange(len(x))

ax1.set_xlabel('Embedding Model')
ax1.set_ylabel('Topic Coherence Score')
ax1.bar(positions, y1)
ax1.set_xticks(positions)
ax1.set_xticklabels(x, rotation=90)
ax1.tick_params(axis='y')

# Add value labels above the bars
for pos, value in zip(positions, y1):
    ax1.text(pos, value, str(round(value, 2)), ha='center', va='bottom')

fig.tight_layout()
plt.savefig('top2vec_coherence.png')
plt.show()

In [None]:
# Set font size
plt.rcParams.update({'font.size': 14})

# Bar chart of the number of topics found per embedding model (single axes)
fig, ax2 = plt.subplots(figsize=(10, 10))

# Explicit numeric bar positions: tick locations are fixed with set_xticks before the
# labels are assigned, and repeated model names cannot collapse onto one categorical tick.
positions = np.arange(len(x))

ax2.set_ylabel('Number of Topics Found')
ax2.set_xlabel('Embedding Model')
ax2.bar(positions, y2, color='tab:red')
ax2.set_xticks(positions)
ax2.set_xticklabels(x, rotation=45, ha='right')
ax2.tick_params(axis='y')

# Add value labels above the bars
for pos, value in zip(positions, y2):
    ax2.text(pos, value, str(value), ha='center', va='bottom')

fig.tight_layout()
plt.savefig('top2vec_num_topics.png', dpi=400, bbox_inches='tight', pad_inches=0)
plt.show()

In [None]:
# Set font size
plt.rcParams.update({'font.size': 14})

# Bar chart of topic diversity per embedding model (single axes)
fig, ax2 = plt.subplots(figsize=(10, 10))

# Explicit numeric bar positions: tick locations are fixed with set_xticks before the
# labels are assigned, and repeated model names cannot collapse onto one categorical tick.
positions = np.arange(len(x))

ax2.set_ylabel('Topic Diversity Score')
ax2.set_xlabel('Embedding Model')
ax2.bar(positions, y3, color='tab:green')
ax2.set_xticks(positions)
ax2.set_xticklabels(x, rotation=90)
ax2.tick_params(axis='y')

# Add value labels above the bars
for pos, value in zip(positions, y3):
    ax2.text(pos, value, str(round(value, 2)), ha='center', va='bottom')

fig.tight_layout()
plt.savefig('top2vec_diversity.png')
plt.show()

In [None]:
# Extract the series for the combined chart from the results table:
# x = model names, y1 = coherence, y2 = number of topics, y3 = diversity
x, y1, y2, y3 = (
    results[column].values
    for column in ('embedding_model', 'coherence', 'num_topics', 'diversity')
)

# Set font size
plt.rcParams.update({'font.size': 14})

In [None]:
import matplotlib.pyplot as plt
import numpy as np

plt.rcParams.update({'font.size': 18})

# Coherence and diversity per embedding model, rounded to two decimals for the bar labels
data = {
    "Coherence": [round(i, 2) for i in y1],
    "Diversity": [round(i, 2) for i in y3]
}
result = pd.DataFrame(data, index=results.embedding_model.values)

# Bar positions get their own name so the model labels held in `x` are not overwritten;
# overwriting them would break the single-metric plot cells if they were run again.
positions = np.arange(len(result.index))
width = 0.4  # width of bars

fig, ax = plt.subplots(figsize=(14,8))
rects1 = ax.bar(positions - width/2, result["Coherence"], width, label="Coherence")
rects2 = ax.bar(positions + width/2, result["Diversity"], width, label="Diversity")

# add values on top of the bars of both series
for rect in [*rects1, *rects2]:
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2, height, str(height),
            ha='center', va='bottom')

# axis label and one tick (with the model name) per bar group
ax.set_ylabel("Score")
ax.set_xticks(positions)
ax.set_xticklabels(result.index, rotation=45, ha='right')
ax.legend()

plt.savefig('top2vec-coherence-diversity.png', dpi=400, bbox_inches='tight', pad_inches=0)

# display the plot
plt.show()

# Load the best model

In [None]:
# Load a previously saved Top2Vec model from disk (no retraining)
trained_model = Top2Vec.load('../results/top2vec/top2vec_doc2vec.model')

In [None]:
# Topic sizes as reported by the loaded model
trained_model.get_topic_sizes()

In [None]:
# {topic index: [(word, score), ...]} holding the first ten words of every topic
top_n_words = {
    topic_idx: [
        (trained_model.topic_words[topic_idx][rank],
         trained_model.topic_word_scores[topic_idx][rank])
        for rank in range(10)
    ]
    for topic_idx in range(trained_model.get_num_topics())
}

In [None]:
%%time

# One entry per reduction target: (n_topics, coherence, diversity)
results = []

# CoherenceModel needs tokenised documents (lists of tokens). When df_sub has been read
# back from CSV, 'lemmas_filtered' holds the string form of each list, so parse those
# once up front instead of passing raw strings on every iteration.
tokenized_texts = [
    literal_eval(doc) if isinstance(doc, str) else doc
    for doc in df_sub['lemmas_filtered']
]

# Start at 165 as before, but never at or above the model's own topic count: a reduction
# has to target fewer topics than the model currently has.
largest_step_below = ((trained_model.get_num_topics() - 1) // 5) * 5
first_n = min(165, largest_step_below)

for n in range(first_n, 4, -5):
    # reduce number of topics
    topic_mapping = trained_model.hierarchical_topic_reduction(n)
    reduced_topic_words = trained_model.topic_words_reduced

    # compute coherence of the model
    cm = CoherenceModel(topics=reduced_topic_words, topn=10, dictionary=dictionary,
                        texts=tokenized_texts, coherence='c_v')
    coherence = cm.get_coherence()

    # compute diversity of the model
    diversity = topic_diversity(reduced_topic_words, top_n=10)

    results.append((n, coherence, diversity))
    print(n, coherence, diversity)

In [None]:
# Load saved results from a text file for inspection.
# NOTE(review): resultstxt is not referenced by any later cell shown in this notebook;
# the plot below reads the in-memory `results` list instead.
resultstxt = pd.read_csv('../results/top2vec/top2vec.txt')
resultstxt

In [None]:
import matplotlib.pyplot as plt

# Set font size
plt.rcParams.update({'font.size': 14})

# Split the (n_topics, coherence, diversity) tuples into one list per quantity
x_values, y1_values, y2_values = (
    [record[column] for record in results] for column in range(3)
)

# Single axes holding both score curves
fig, ax1 = plt.subplots(figsize=(10, 6))

ax1.set_xlabel('Number of Topics')
ax1.set_ylabel('Score')
for series, series_label in ((y1_values, 'Coherence'), (y2_values, 'Diversity')):
    ax1.plot(x_values, series, label=series_label)
ax1.tick_params(axis='y')
# topic counts shrink from left to right
ax1.invert_xaxis()

# faint major grid lines on both axes
for grid_axis in ("x", "y"):
    ax1.grid(axis=grid_axis, which='major', color="black", alpha=.1, linewidth=.5)

fig.tight_layout()
plt.legend()
plt.savefig('top2vec_reduction.png', dpi=400, bbox_inches='tight', pad_inches=0)
plt.show()

In [None]:
# Reduce the model to 35 topics; the return value is kept as topic_mapping.
topic_mapping = trained_model.hierarchical_topic_reduction(35)

In [None]:
# NOTE(review): topic_mapping was just built for 35 reduced topics. If it holds one entry
# per reduced topic (confirm against the Top2Vec API), index -39 is out of range here.
topic_mapping[-39]

In [None]:
# {topic index: [(word, score), ...]} holding the first ten words of every original topic
top_n_words = {
    topic_idx: [
        (trained_model.topic_words[topic_idx][rank],
         trained_model.topic_word_scores[topic_idx][rank])
        for rank in range(10)
    ]
    for topic_idx in range(trained_model.get_num_topics())
}

In [None]:
# {reduced topic index: [(word, score), ...]} holding the first ten words of every
# reduced topic; there is one key per entry of topic_mapping
top_n_words_reduced = {
    topic_idx: [
        (trained_model.topic_words_reduced[topic_idx][rank],
         trained_model.topic_word_scores_reduced[topic_idx][rank])
        for rank in range(10)
    ]
    for topic_idx in range(len(topic_mapping))
}

In [None]:
# (word, score) pairs of the reduced topic with key 5
top_n_words_reduced[5]

In [None]:
def word_cloud(top_n_words, topic):
    """Display a word cloud for one topic.

    Parameters
    ----------
    top_n_words : mapping
        Maps a topic key to an iterable of (word, score) pairs.
    topic : hashable
        Key of the topic to draw.
    """
    # word -> score, used as the frequencies that size the words
    frequencies = dict(top_n_words[topic])

    cloud = WordCloud(background_color='white', width=800, height=400)
    cloud.generate_from_frequencies(frequencies)

    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

In [None]:
# Re-run the reduction with 150 target topics; this rebinds topic_mapping.
# NOTE(review): top_n_words_reduced is not rebuilt after this call in the cells shown,
# so it still holds the entries created for the earlier reduction — confirm that the
# reduced-topic lookups below refer to the intended reduction.
topic_mapping = trained_model.hierarchical_topic_reduction(150)

In [None]:
# Inspect the value returned by the reduction above
topic_mapping

In [None]:
# Word cloud of original topic 132
word_cloud(top_n_words, 132)

In [None]:
# Word cloud of original topic 149
word_cloud(top_n_words, 149)

In [None]:
# Word cloud of reduced topic 111.
# NOTE(review): key 111 only exists if top_n_words_reduced was built from a reduction
# with at least 112 topics — rebuild it after the 150-topic reduction before running this.
word_cloud(top_n_words_reduced, 111)

In [None]:
def word_cloud_save(top_n_words, name):
    """Draw a word cloud for one topic, save it to a file and display it.

    Parameters
    ----------
    top_n_words : iterable
        (word, score) pairs of a single topic.
    name : str
        Output path passed to plt.savefig.
    """
    # word -> score, used as the frequencies that size the words
    frequencies = dict(top_n_words)

    cloud = WordCloud(background_color='white', width=800, height=400)
    cloud.generate_from_frequencies(frequencies)

    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    # save before show so the figure is written while it is still current
    plt.savefig(name, bbox_inches='tight', pad_inches=0, dpi=400)
    plt.show()

In [None]:
# Save the word cloud of original topic 132 to the working directory
word_cloud_save(top_n_words[132], 'top2vec_132.png')

In [None]:
# Save the word cloud of original topic 149 to the working directory
word_cloud_save(top_n_words[149], 'top2vec_149.png')

In [None]:
# Save the word cloud of reduced topic 111.
# NOTE(review): key 111 only exists if top_n_words_reduced was built from a reduction
# with at least 112 topics — confirm it was rebuilt after the 150-topic reduction.
word_cloud_save(top_n_words_reduced[111], 'top2vec_reduced_111.png')

In [None]:
# Show a word cloud for every reduced topic, preceded by its 1-based number
for topic_key in top_n_words_reduced.keys():
    print(topic_key + 1)
    word_cloud(top_n_words_reduced, topic_key)

In [None]:
# Save a word cloud for every reduced topic; file names carry the 1-based topic number
for topic_key, word_scores in top_n_words_reduced.items():
    topic_number = topic_key + 1
    print(topic_number)
    word_cloud_save(word_scores, 'top2vec_35_topics_{}.png'.format(topic_number))

In [None]:
import pandas as pd

# Summary table of all methods: one row per method, indexed by (category, method).
# Note: this rebinds `df`, which held the raw corpus at the top of the notebook.
categories = ['', 'Unsupervised', 'Unsupervised', 'Unsupervised', 'Unsupervised', 'Unsupervised', 'Supervised', 'Semi--Supervised']
methods = ['Annotated data', 'LDA (unigrams)', 'LDA (bigrams)', 'Top2Vec', 'Top2Vec (reduced)', 'K--means', 'SVM', 'c--TF--IDF']
num_topics = [25, 40, 30, 168, 35, 55, 25, 25]
coherences = [0.4241, 0.4513, 0.4512, 0.5614, 0.6169, 0.5397, 0.4510, 0.4716]
diversities = [0.7440, 0.5275, 0.5500, 0.8375, 0.9600, 0.6345, 0.6400, 0.8520]

# two-level row index: category first, method second
idx = [categories, methods]

# Build the frame column by column; the lists are aligned by position
df = pd.DataFrame(
    {
        'Number of topics': num_topics,
        'Coherence': coherences,
        'Diversity': diversities,
    },
    index=idx,
)
df

In [None]:
# Emit the summary table as LaTeX with floats printed to four decimals.
# NOTE(review): the formatters key "name" matches none of the columns defined in the
# table cell above ('Number of topics', 'Coherence', 'Diversity'), so it presumably has
# no effect — confirm and drop it if so.
print(df.to_latex(formatters={"name": str.upper},
            float_format="{:.4f}".format))

In [None]:
import pandas as pd

# Comparison table of one sea/ship-themed topic per method, indexed by (category, method)
categories = ['Unsupervised', 'Unsupervised', 'Unsupervised', 'Unsupervised', 'Unsupervised', 'Supervised', 'Semi--Supervised']
methods = ['LDA (unigrams)', 'LDA (bigrams)', 'Top2Vec', 'Top2Vec (reduced)', 'K--means', 'SVM', 'c--TF--IDF']
# only the last two methods carry a topic name; the others are marked '-'
topic_names = ['-',
               '-',
               '-',
               '-',
               '-',
               'Exotics/Travel',
               'Exotics/Travel']
words = ['moře, vlna, loď, břeh, voda, plout, bouře, mořský, skála, veslo',
         'moře, vlna, břeh, loď, voda, hvězda, hora, slunce, zlatý, plout',
         'loď, plavec, plachta, přístav, koráb, člun, příď, paluba, stožár, stěžeň',
         'loď, plavec, člun, plachta, přístav, koráb, příď, plout, stožár, vlna',
         'člověk, rád, nebe, vědět, svět, čas, píseň, život, rok, bůh',
         'moře, loď, vlna, břeh, plout, voda, vlak, loďka, noc, dálka',
         'loď, vlna, moře, břeh, plout, koráb, dálka, noc, vlak, loďka']

# two-level row index: category first, method second
idx = [categories, methods]

# Build the frame column by column; the lists are aligned by position
df1 = pd.DataFrame(
    {
        'Topic name': topic_names,
        'Top 10 words of the topic': words,
    },
    index=idx,
)
df1

In [None]:
# Emit the topic-words table as LaTeX.
# NOTE(review): the formatters key "name" matches neither 'Topic name' nor
# 'Top 10 words of the topic', so it presumably has no effect — confirm and drop it if so.
print(df1.to_latex(formatters={"name": str.upper}))

In [None]:
# Print each method's top-10 word string on its own line
for top_words in df1['Top 10 words of the topic']:
    print(top_words)