In [1]:
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from read_corpus_functions import load_books_blocks_from_document, load_books_from_document, load_books_chunks_from_document, load_books_from_document_without_residual_ner
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import tempfile
import pickle
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import strip_tags
from gensim.models.phrases import Phrases
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [2]:
# Corpus location on disk; the file read is <path>/<file_name>.txt.
# Other corpus names noted here earlier: "jerome_synv9_id_no_names_blocks",
# "jerome_synv9_id" (and the suffix _NOV_MEM_COL_deep_learn).
path = "data/lem/all"
file_name = "jerome_synv9_all"
file_path = "{}/{}.txt".format(path, file_name)

# Passed to the loader as its text-type selection; left empty here.
# NOTE(review): presumably an empty list means "no filtering" — confirm in read_corpus_functions.
txtype_select = []

# `books` is later fed to the sentence encoder and BERTopic; `books_info` supplies
# per-book metadata (title, author, srclang, txtype) for the result tables.
books, books_info = load_books_from_document_without_residual_ner(
    file_path,
    txtype_select,
    'ner_combined.obj',
)

In [3]:
def heat_map_visualization(df):
    """Compute, per topic, the share of books by source-language group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``topic`` column and a ``srclang`` column.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_topics, 3)``. Rows follow the sorted unique
        values of ``df.topic``. Columns are, in order: the fraction of the
        topic's rows whose ``srclang`` is ``'cs: čeština'``, the fraction whose
        ``srclang`` is ``'en: angličtina'``, and the fraction with any other
        non-missing ``srclang``.

    Notes
    -----
    Rows with a missing ``srclang`` count toward the topic total (the
    denominator) but toward none of the three columns, so a row of the result
    sums to less than 1 when the topic contains such rows.
    """
    czech_label = 'cs: čeština'
    english_label = 'en: angličtina'

    topics = np.unique(df.topic.unique())
    heat_mat = np.zeros(shape=(len(topics), 3))

    # Build the three language-group masks once instead of re-filtering the
    # whole frame for every (topic, language) pair.
    is_czech = df.srclang.isin([czech_label]).to_numpy()
    is_english = df.srclang.isin([english_label]).to_numpy()
    is_other = df.srclang.notna().to_numpy() & ~is_czech & ~is_english
    group_masks = (is_czech, is_english, is_other)

    topic_values = df.topic.to_numpy()
    for row, topic in enumerate(topics):
        in_topic = topic_values == topic
        # Each topic comes from the column itself, so it has at least one row.
        topic_size = in_topic.sum()
        group_counts = np.array([(in_topic & mask).sum() for mask in group_masks])
        heat_mat[row] = group_counts / topic_size
    return heat_mat


def plot_heat_map(heat_mat, df, save_name):
    """Draw the topic-by-language heat map and save it to disk.

    Parameters
    ----------
    heat_mat : array-like
        Matrix with one row per topic and three columns, as returned by
        ``heat_map_visualization``.
    df : pandas.DataFrame
        Frame with a ``topic`` column; its unique topic ids label the rows.
    save_name : str
        File name (without directory) used under ``plots/whole books/bertopic/``.

    The figure is written with ``plt.savefig`` and left open.
    """
    plt.figure(figsize=(15, 12))
    image = plt.imshow(heat_mat, cmap='viridis', interpolation='nearest')
    plt.xticks(range(3), ['cz','en', 'other'])
    # Label rows with the actual topic ids in sorted order (the same order in
    # which heat_map_visualization lays out its rows) rather than assuming the
    # ids are exactly -1, 0, 1, ...: that assumption mislabels every row when
    # no -1 topic is present.
    topic_ids = sorted(df.topic.unique())
    y_ticks = ['Topic ' + str(topic_id) for topic_id in topic_ids]
    plt.yticks(range(len(topic_ids)), y_ticks)
    plt.colorbar(image)
    plt.savefig("plots/whole books/bertopic/{}".format(save_name))



In [None]:
# Grid search: for each sentence-embedding model and each HDBSCAN
# min_cluster_size, fit a BERTopic model on the whole books, then save the
# model, its topic words, the per-book topic assignments and a heat map.

# Per-book metadata does not depend on any model setting, so build it once
# here instead of inside the innermost loop.
titles = [book_info['title'] for book_info in books_info]
authors = [book_info['author'] for book_info in books_info]
src_langs = [book_info['srclang'] for book_info in books_info]
txtypes = [book_info['txtype'] for book_info in books_info]

for embedding_name in ['paraphrase-multilingual-MiniLM-L12-v2', 'distiluse-base-multilingual-cased-v2']:
    embedding_model = SentenceTransformer(embedding_name)
    # Encode once per embedding model; the same vectors are reused for every
    # clustering configuration below.
    embeddings = embedding_model.encode(books, show_progress_bar=True)
    for u_map_n_neighbours in range(4, 5, 1):
        umap_model = UMAP(n_neighbors=u_map_n_neighbours, n_components=5, metric='cosine', random_state=42)
        for hdbscan_min_cluster_size in range(3, 12, 1):
            hdbscan_model = HDBSCAN(
                min_cluster_size=hdbscan_min_cluster_size,
                metric='euclidean',
                cluster_selection_method='eom',
                prediction_data=True,
            )
            topic_model = BERTopic(
                # Pipeline models
                umap_model=umap_model,
                hdbscan_model=hdbscan_model,
                embedding_model=embedding_model,
                # Hyperparameters
                top_n_words=10,
                verbose=True,
            )

            topics, probs = topic_model.fit_transform(books, embeddings)

            # One name per configuration, shared by every artefact written below.
            save_name = 'bert_umap_n_n{}_hdbscan_m_c_s{}_{}'.format(
                u_map_n_neighbours, hdbscan_min_cluster_size, embedding_name
            )
            topic_model.save('models/whole books/bertopic/{}'.format(save_name), serialization="pickle")

            # Topic words for this configuration.
            df_topics = pd.DataFrame(topic_model.get_topics())
            df_topics.to_excel('data/topic words/BERTopic/whole books/{}.xlsx'.format(save_name))

            # Topic assigned to each book, alongside the book's metadata.
            df = pd.DataFrame({'topic': topics, 'title': titles, 'author': authors, 'srclang': src_langs, 'txtype': txtypes})
            df.to_excel('data/results/BERTopic/whole books/{}.xlsx'.format(save_name))

            heat_mat = heat_map_visualization(df)
            plot_heat_map(heat_mat, df, save_name)
            
 