#### IMPORT

In [7]:
from nltk.tokenize import word_tokenize
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary
from csv import writer
from top2vec import Top2Vec
import math

#### LOAD BOOKS 

In [8]:
def load_books_chunks_from_document(CONST, file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    books = []
    books_info = []
    start_tag = '<doc title="'
    end_tag = '</doc>'
    start_index = 0

    while True:
        book_start = content.find(start_tag, start_index)
        if book_start == -1:
            break

        book_end = content.find(end_tag, book_start)
        if book_end == -1:
            break

        book_text = content[book_start:book_end + len(end_tag)]
        book_info = book_text.strip()[5:book_text.index('>') + 1]  # Remove '<doc' and '</doc>'

        book_info_list = book_info.split('" ')
        book_info_dict = {}

        for item in book_info_list:
            key, value = item.split('=')
            book_info_dict[key.strip()] = value.strip('"')

        book_content = book_text[book_text.index('>') + 1:-len(end_tag)].strip()
        book_content = book_content.split(' ')
        length = len(book_content)
        for i in range(math.ceil(length/CONST)):
            i = i*CONST
            end = i+CONST if i+CONST < length-1 else length-1
            books.append(" ".join(book_content[i:end]))
            books_info.append(book_info_dict)

        start_index = book_end + len(end_tag)

    return books, books_info


def load_books_from_document(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    books = []
    books_info = []
    start_tag = '<doc title="'
    end_tag = '</doc>'
    start_index = 0

    while True:
        book_start = content.find(start_tag, start_index)
        if book_start == -1:
            break

        book_end = content.find(end_tag, book_start)
        if book_end == -1:
            break

        book_text = content[book_start:book_end + len(end_tag)]
        book_info = book_text.strip()[5:book_text.index('>') + 1]  # Remove '<doc' and '</doc>'  # Remove '<doc' and '</doc>'

        book_info_list = book_info.split('" ')
        book_info_dict = {}

        for item in book_info_list:
            key, value = item.split('=')
            book_info_dict[key.strip()] = value.strip('"')

        book_content = book_text[book_text.index('>') + 1:-len(end_tag)].strip()        
        books.append(book_content)

        books_info.append(book_info_dict)

        start_index = book_end + len(end_tag)

    return books, books_info



def load_books_blocks_from_document(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    books = []
    books_info = []
    start_tag = '<doc title="'
    end_tag = '</doc>'
    start_index = 0

    while True:
        book_start = content.find(start_tag, start_index)
        if book_start == -1:
            break

        book_end = content.find(end_tag, book_start)
        if book_end == -1:
            break

        book_text = content[book_start:book_end + len(end_tag)]
        book_info = book_text.strip()[5:book_text.index('>') + 1]  # Remove '<doc' and '</doc>'  # Remove '<doc' and '</doc>'

        book_info_list = book_info.split('" ')
        book_info_dict = {}

        for item in book_info_list:
            key, value = item.split('=')
            book_info_dict[key.strip()] = value.strip('"')

        book_content = book_text[book_text.index('>') + 1:-len(end_tag)].strip()
        book_content = book_content.split('\n')
        for book_block in book_content:
            books.append(book_block)
            books_info.append(book_info_dict)

        start_index = book_end + len(end_tag)

    return books, books_info




### TOPIC COHERENCE SCORE

In [9]:
path = "data/lem"
year = "1990_2019"
category = "all"
file_name = year+"_"+category+"_no_names_beletrie"
file_path = path+"/"+file_name + ".txt"
chunk = 2000
divide = "chunk"


for year in ["1990_1999", "2000_2009", "2010_2019"]:
    for divide in ["chunk"]: 
            for chunk in [2000]:
                file_name = year+"_"+category+"_no_names_beletrie_with_stopwords"
                file_path = path+"/"+file_name + ".txt"
                if divide == "chunk":
                    books, books_info = load_books_chunks_from_document(chunk, file_path)
                    d = str(chunk) + " " + divide
                    save_path = "data/models/{divide}/top2vec__with_stopwords".format(divide = d) + file_name
                else:  
                    file_name = year+"_"+category+"_no_names_beletrie"
                    save_path = "data/models/{divide}/top2vec__with_stopwords".format(divide = "blocks") + file_name    
                    
                    file_name = year+"_"+category+"_no_names_blocks_beletrie"
                    file_path = path+"/"+file_name + ".txt" 
                    books, books_info = load_books_blocks_from_document(file_path)
                     

                top2vec_model = Top2Vec.load(save_path)

                topic_sizes, topic_nums = top2vec_model.get_topic_sizes()

                num_topics = top2vec_model.get_num_topics()

                print("Number of topics: ", num_topics)

                original_num_topics = num_topics

                for num_topics in [5, 10, 20, 30,  original_num_topics]:

                #for topic_size, topic_num in zip(topic_sizes[:num_topics], topic_nums[:num_topics]):
                #    print(f"Topic Num {topic_num} has {topic_size} documents.")
                    if num_topics == original_num_topics: 
                        reduced = False
                    else:
                        reduced = True    
                        ret = top2vec_model.hierarchical_topic_reduction(num_topics=num_topics)

                    print("Number of topics: ", top2vec_model.get_num_topics(reduced=reduced))

                    # Get the topics and their document clusters
                    topic_words, word_scores, topics = top2vec_model.get_topics(num_topics=num_topics, reduced=reduced) 

                    topic_sizes, topic_nums = top2vec_model.get_topic_sizes(reduced=reduced)

                    # Tokenize the documents
                    tokenized_data = [word_tokenize(doc) for doc in books]

                    dictionary = Dictionary(tokenized_data)

                    # Calculate topic coherence using Gensim's CoherenceModel
                    coherence_model = CoherenceModel(
                        topics=topic_words,
                        texts=tokenized_data,
                        dictionary=dictionary,
                        coherence='c_v'  # You can use other coherence measures as well
                    )
                    coherence_score = coherence_model.get_coherence()

                    print("Topic Coherence Score:", coherence_score)

    

Number of topics:  61
Number of topics:  5
Topic Coherence Score: 0.4944872879976783
Number of topics:  10
Topic Coherence Score: 0.4417502225503015
Number of topics:  20
Topic Coherence Score: 0.44478408784843076
Number of topics:  30
Topic Coherence Score: 0.42588145785895165
Number of topics:  61
Topic Coherence Score: 0.41091343826656324
Number of topics:  76
Number of topics:  5
Topic Coherence Score: 0.38490202226310133
Number of topics:  10
Topic Coherence Score: 0.38467773855381304
Number of topics:  20
Topic Coherence Score: 0.3841448364774397
Number of topics:  30
Topic Coherence Score: 0.3900326360022047
Number of topics:  76
Topic Coherence Score: 0.40496879995123164
Number of topics:  93
Number of topics:  5
Topic Coherence Score: 0.4335416382679271
Number of topics:  10
Topic Coherence Score: 0.4247315894192266
Number of topics:  20
Topic Coherence Score: 0.4064873652164122
Number of topics:  30
Topic Coherence Score: 0.40947265281006334
Number of topics:  93
Topic Cohere