### Imports

In [None]:
import math
import numpy as np
import pandas as pd
from top2vec import Top2Vec
import pandas as pd
from csv import writer
import umap
import umap.plot
import matplotlib.pyplot as plt
import csv

#### Parameters

In [None]:
# Corpus selection: these pieces are combined into the input file path.
path = "data/lem"
year = "2010_2019"
category = "all"
file_name = f"{year}_{category}_no_names_beletrie"
file_path = f"{path}/{file_name}.txt"

# Splitting strategy: `divide` names the mode, `chunk` is the chunk size
# used when the mode is "chunk".
chunk = 2000
divide = "chunk"

### Load documents

Three loading options: fixed-size chunks, newline-delimited blocks, or each whole document.

In [None]:

def _parse_doc_attributes(header):
    """Parse the attribute text of a ``<doc ...>`` tag into a dict.

    Args:
        header: The text between ``<doc `` and the closing ``>``, for
            example ``title="A" author="B"``.

    Returns:
        A dict mapping attribute names to their unquoted values.
    """
    attributes = {}
    for item in header.strip().split('" '):
        # Split on the first '=' only, so a value may itself contain '='.
        key, separator, value = item.partition('=')
        if not separator:
            continue  # not a key="value" pair
        attributes[key.strip()] = value.strip('"')
    return attributes


def _iter_docs(file_path):
    """Yield ``(attributes, text)`` for every ``<doc title="...">...</doc>``.

    The file is read in full as UTF-8. ``text`` is the element content with
    surrounding whitespace stripped. Scanning stops at the first opening tag
    that has no matching ``</doc>``.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    start_tag = '<doc title="'
    end_tag = '</doc>'
    attributes_offset = len('<doc ')
    position = 0

    while True:
        doc_start = content.find(start_tag, position)
        if doc_start == -1:
            return

        doc_end = content.find(end_tag, doc_start)
        if doc_end == -1:
            return
        position = doc_end + len(end_tag)

        # The header ends at the first '>' of the element; the '>' itself
        # is excluded so it cannot leak into the last attribute value.
        header_end = content.find('>', doc_start, doc_end)
        if header_end == -1:
            continue  # opening tag is never closed before </doc>

        attributes = _parse_doc_attributes(
            content[doc_start + attributes_offset:header_end])
        yield attributes, content[header_end + 1:doc_end].strip()


def load_books_chunks_from_document(CONST, file_path):
    """Load every book and cut it into chunks of ``CONST`` tokens.

    Tokens are obtained by splitting the book text on single spaces. The
    last chunk of a book holds the remaining tokens and may be shorter.

    Args:
        CONST: Maximum number of space-separated tokens per chunk (>= 1).
        file_path: Path to the UTF-8 corpus file.

    Returns:
        ``(books, books_info)``: the chunk texts and, index-aligned, the
        metadata dict of the book each chunk came from. All chunks of one
        book share the same dict object.

    Raises:
        ValueError: If ``CONST`` is smaller than 1.
    """
    if CONST < 1:
        raise ValueError("CONST must be a positive chunk size")

    books = []
    books_info = []
    for book_info_dict, book_content in _iter_docs(file_path):
        tokens = book_content.split(' ')
        for start in range(0, len(tokens), CONST):
            # Slicing clamps at the end of the list, so the final (possibly
            # shorter) chunk keeps the book's last token.
            books.append(" ".join(tokens[start:start + CONST]))
            books_info.append(book_info_dict)
    return books, books_info


def load_books_from_document(file_path):
    """Load every book as one whole document.

    Args:
        file_path: Path to the UTF-8 corpus file.

    Returns:
        ``(books, books_info)``: the book texts and, index-aligned, their
        metadata dicts.
    """
    books = []
    books_info = []
    for book_info_dict, book_content in _iter_docs(file_path):
        books.append(book_content)
        books_info.append(book_info_dict)
    return books, books_info


def load_books_blocks_from_document(file_path):
    """Load every book and split it into newline-delimited blocks.

    Args:
        file_path: Path to the UTF-8 corpus file.

    Returns:
        ``(books, books_info)``: the block texts and, index-aligned, the
        metadata dict of the book each block came from. All blocks of one
        book share the same dict object.
    """
    books = []
    books_info = []
    for book_info_dict, book_content in _iter_docs(file_path):
        for book_block in book_content.split('\n'):
            books.append(book_block)
            books_info.append(book_info_dict)
    return books, books_info


# Load the corpus with the splitting strategy selected by `divide`:
# "chunk" uses fixed-size chunks, anything else uses newline blocks.
if divide != "chunk":
    books, books_info = load_books_blocks_from_document(file_path)
else:
    books, books_info = load_books_chunks_from_document(chunk, file_path)

In [None]:
# Model files live under data/models/<split description>/, where the
# description is "<chunk size> chunk" in chunk mode and "blocks" otherwise.
if divide != "chunk":
    save_path = "data/models/blocks/top2vec_" + file_name
else:
    d = f"{chunk} {divide}"
    save_path = f"data/models/{d}/top2vec_" + file_name

Save books info 

In [None]:


# with open('data\\books_info_{date}.csv'.format(date = year), 'w', newline='') as f:
#     writer = csv.writer(f)
#     writer.writerows(books_info)

### Create top2vec model

In [None]:

import os

# Create the output directory before training, so the trained model
# cannot be lost to a missing-directory error when it is saved.
save_dir = os.path.dirname(save_path)
if save_dir:
    os.makedirs(save_dir, exist_ok=True)

# Create the Top2Vec model
top2vec_model = Top2Vec(documents=books)

top2vec_model.save(save_path)


In [None]:
# Reload the saved model and report how many topics it holds.
top2vec_model = Top2Vec.load(save_path)
topic_sizes, topic_nums = top2vec_model.get_topic_sizes()
num_topics = top2vec_model.get_num_topics()
print("Number of topics: ", num_topics)
original_num_topics = num_topics

# Reduce the topics hierarchically to a fixed number; every later cell
# queries the reduced topics via `reduced`.
reduced = True
num_topics = 10
ret = top2vec_model.hierarchical_topic_reduction(num_topics=num_topics)
print("Number of topics: ", top2vec_model.get_num_topics(reduced=reduced))
print(ret)

# Report the size of each reduced topic.
topic_sizes, topic_nums = top2vec_model.get_topic_sizes(reduced=reduced)
for topic_num, topic_size in zip(topic_nums[:num_topics], topic_sizes[:num_topics]):
    print(f"Topic Num {topic_num} has {topic_size} documents.")


In [None]:
# Fetch the words, word scores and topic numbers of the reduced topics.
topic_words, word_scores, topics = top2vec_model.get_topics(
    num_topics=num_topics,
    reduced=reduced,
)

# Position of the first topic to print; 0 prints all of them.
topic_number = 0

selected_topics = zip(
    topic_words[topic_number:],
    word_scores[topic_number:],
    topics[topic_number:],
)
for words, scores, num in selected_topics:
    print(f"Topic {num}")
    for word_and_score in zip(words, scores):
        print(*word_and_score)


In [None]:
# Show the ten documents returned for reduced topic 1, each with its id
# and score, framed by separator lines.
documents, document_scores, document_ids = top2vec_model.search_documents_by_topic(
    topic_num=1,
    num_docs=10,
    reduced=reduced,
)

separator = "-----------"
for doc, score, doc_id in zip(documents, document_scores, document_ids):
    header = f"Document: {doc_id}, Score: {score}"
    # The trailing empty string produces the blank line after each entry.
    print(header, separator, doc, separator, "", sep="\n")
 

### Create a DataFrame from the book metadata

In [None]:
# Tabulate the per-document metadata, then show the raw records followed
# by the resulting table.
df_books_info = pd.DataFrame(books_info)
print(books_info)
print(df_books_info)

In [None]:
import os
from collections import Counter

# Build a per-book topic profile: for each reduced topic, sum the scores of
# the book's documents returned for that topic, then divide by the number
# of documents the book was split into.

# Number of loaded documents (chunks/blocks) per book title. Counted from
# books_info directly so the result does not depend on the current state of
# df_books_info.
pieces_per_title = Counter(info['title'] for info in books_info)

data = {}
for i in range(num_topics):
    # Scores and ids of the documents of topic i (topic_sizes[i] of them).
    _, document_scores, document_ids = top2vec_model.search_documents_by_topic(
        topic_num=i, num_docs=topic_sizes[i], reduced=reduced)

    for score, doc_id in zip(document_scores, document_ids):
        book_title = books_info[doc_id]['title']
        # One accumulator slot per topic, created on first sight of a book.
        data.setdefault(book_title, [0] * num_topics)[i] += score

td_dict = {
    title: [total / pieces_per_title[title] for total in totals]
    for title, totals in data.items()
}

print(td_dict)
df_save = pd.DataFrame.from_dict(td_dict)

# Forward slashes match the other paths in this notebook and are accepted
# on Windows as well as POSIX systems.
topics_dir = "data/topics/top2vec"
os.makedirs(topics_dir, exist_ok=True)
df_save.to_excel(f"{topics_dir}/books_topic_cosine_distance_{year}.xlsx")

In [None]:
import os

print(top2vec_model.doc_top)

ret = top2vec_model.hierarchical_topic_reduction(num_topics=num_topics)

# Label every document with its reduced topic, numbered from 1; documents
# not returned for any topic keep the label 0.
doc_top_reduced = [0] * len(books_info)
for topic_num in range(num_topics):
    documents, document_scores, document_ids = top2vec_model.search_documents_by_topic(
        topic_num=topic_num, num_docs=topic_sizes[topic_num], reduced=reduced)
    print(document_ids)
    for document_id in document_ids:
        doc_top_reduced[document_id] = topic_num + 1
        # NOTE: the loaders append one shared metadata dict per book, so
        # this entry ends up holding the topic written last for that book.
        books_info[document_id]['topic'] = topic_num + 1
doc_top_reduced = np.array(doc_top_reduced)

df_books_info = pd.DataFrame(books_info)

# Forward slashes match the other paths in this notebook and are accepted
# on Windows as well as POSIX systems.
books_info_dir = "data/books info"
os.makedirs(books_info_dir, exist_ok=True)
df_books_info.to_excel(f"{books_info_dir}/books_info_{year}.xlsx")

print(type(top2vec_model.doc_top))

In [None]:
import os

# One row per distinct metadata record. 'title' stays a regular column
# because the merge below joins on it through left_on.
df_books_info = pd.DataFrame(books_info).drop_duplicates()

# Transpose so that book titles become the index (one row per book).
df_save_T = df_save.transpose()

df_books_info_distance = df_books_info.merge(
    df_save_T, left_on='title', right_index=True, how='outer')

# Forward slashes match the other paths in this notebook and are accepted
# on Windows as well as POSIX systems.
topics_dir = "data/topics/top2vec"
os.makedirs(topics_dir, exist_ok=True)
df_books_info_distance.to_excel(
    f"{topics_dir}/books_info_topic_distance_{year}.xlsx")

In [None]:
num_topics = 10

# Reload the saved model and redo the topic reduction.
top2vec_model = Top2Vec.load(save_path)
ret = top2vec_model.hierarchical_topic_reduction(num_topics=num_topics)

# Project the document vectors to two dimensions for plotting; the fixed
# random_state keeps the layout reproducible between runs.
umap_args_model = {
    "n_neighbors": 200,
    "n_components": 2,
    "metric": "cosine",
    "min_dist": 0.5,
    "spread": 1,
    "random_state": 42,
}
umap_model = umap.UMAP(**umap_args_model).fit(top2vec_model.document_vectors)

# Colour every document by its reduced topic label.
umap.plot.points(umap_model, labels=doc_top_reduced)

# Derive the title from the `year` parameter ("2010_2019" gives
# "Topics 2010 - 2019") so it always matches the loaded corpus.
plt.title(f"Topics {year.replace('_', ' - ')}")

plt.show()

In [None]:
# Draw the word cloud of one reduced topic on a white background.
wordcloud_topic = 7
top2vec_model.generate_topic_wordcloud(
    wordcloud_topic,
    background_color='white',
    reduced=reduced,
)