### INSTALL PACKAGES

In [None]:
!pip install top2vec
!pip install umap-learn[plot]

### IMPORT PACKAGES

In [None]:
from top2vec import Top2Vec
from os import listdir
from os.path import isfile, join
import umap.plot
from umap.umap_ import UMAP
import matplotlib.pyplot as plt
import re

### LOAD FILES

In [None]:
# NOTE(review): `n` is not referenced by any of the notebook cells visible
# here -- confirm whether it is still needed.
n = 3

def load_files(num_parts=1):
    """Read the story files of volumes 1-12 into a corpus dictionary.

    Every regular file inside the directories ``Volume_1`` ... ``Volume_12``
    (relative to the current working directory) is read as UTF-8 and split
    into ``num_parts`` pieces by ``split_story_content``; each piece becomes
    one document of the corpus.

    Args:
        num_parts: Number of pieces each story is split into. The default
            of 1 keeps every story as a single document.

    Returns:
        A dict holding three parallel lists: ``"volume"`` (the volume
        number), ``"title"`` (the file name without its last four
        characters, suffixed with ``_part<k>``) and ``"text"`` (the text of
        that part).

    Raises:
        FileNotFoundError: If one of the volume directories is missing.
    """
    corpus_dict = {"volume": [], "title": [], "text": []}

    for volume in range(1, 13):
        # The directory number has to match the volume number stored in the
        # corpus; reading `Volume_{volume + 1}` would skip Volume_1 and look
        # for a Volume_13.
        path = f"Volume_{volume}"

        # listdir() returns entries in arbitrary order; sorting keeps the
        # document order (and therefore document ids) reproducible.
        files = sorted(f for f in listdir(path) if isfile(join(path, f)))

        for file in files:
            with open(join(path, file), 'r', encoding='utf-8') as f:
                content = f.read()

            story_parts = split_story_content(content, num_parts)

            # Drops the last four characters of the file name -- presumably
            # a ".txt" extension; confirm against the data directory.
            stem = file[:-4]

            # One corpus entry per part, numbered from 1.
            for part_num, part_content in enumerate(story_parts, 1):
                corpus_dict['volume'].append(volume)
                corpus_dict['title'].append(f"{stem}_part{part_num}")
                corpus_dict['text'].append(part_content)

    return corpus_dict


def split_story_content(content, num_parts):
    """Split a story into consecutive, sentence-aligned parts.

    The text is cut into sentences at whitespace that follows a ``.`` or
    ``?`` (two negative look-behinds in the pattern suppress a cut after
    certain abbreviation-like sequences). The sentences are then distributed
    over the parts as evenly as possible, earlier parts receiving the extra
    sentences when the count does not divide evenly.

    Args:
        content: The full text of the story.
        num_parts: Desired number of parts; must be at least 1.

    Returns:
        A list of strings, each the sentences of one part joined by single
        spaces. The list has ``num_parts`` entries, or one entry per
        sentence when the story has fewer sentences than ``num_parts``.

    Raises:
        ValueError: If ``num_parts`` is smaller than 1.
    """
    if num_parts < 1:
        raise ValueError("num_parts must be at least 1")

    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', content)

    # re.split always yields at least one element, so the part count stays
    # positive; capping it avoids empty parts (and a zero chunk size) for
    # stories shorter than the requested number of parts.
    num_parts = min(num_parts, len(sentences))
    base_size, remainder = divmod(len(sentences), num_parts)

    story_parts = []
    start = 0
    for part_index in range(num_parts):
        # The first `remainder` parts take one extra sentence each so that
        # exactly `num_parts` parts come out, never more.
        end = start + base_size + (1 if part_index < remainder else 0)
        story_parts.append(' '.join(sentences[start:end]))
        start = end

    return story_parts

# Build the corpus when this cell runs; the model cell further down reads
# corpus_dict["text"].
corpus_dict = load_files()

### PARAMETERS 

In [None]:
# Print the Top2Vec class docstring as a reference for its parameters.
print(Top2Vec.__doc__)

### CREATE TOP2VEC MODEL

In [None]:
# Train a Top2Vec model on the corpus texts using doc2vec embeddings.
# NOTE(review): per the Top2Vec documentation `speed` trades training time
# against quality and `min_count` ignores words below that total frequency --
# confirm the chosen values suit this corpus.
top2vec_model = Top2Vec(corpus_dict["text"], embedding_model="doc2vec", speed='learn', min_count=5)

In [None]:
# Report how many documents were assigned to each topic the model found.
num_topics = top2vec_model.get_num_topics()

# Sizes and topic numbers come back as two parallel sequences, so a single
# call and a plain zip cover every topic.
topic_sizes, topic_nums = top2vec_model.get_topic_sizes()
for topic_size, topic_num in zip(topic_sizes, topic_nums):
    print(f"Topic Num {topic_num} has {topic_size} documents.")

In [None]:
# Fetch the words, word scores and topic numbers for all topics.
topic_words, word_scores, topics = top2vec_model.get_topics(num_topics=num_topics)

# Walk the three parallel results once so that every topic is printed
# exactly once, followed by its words and their scores.
for words, scores, num in zip(topic_words, word_scores, topics):
    print(f"Topic {num}")
    for word, score in zip(words, scores):
        print(word, score)

### GENERATE WORD CLOUD

In [None]:
# Draw the word cloud for topic 0 on a white background.
top2vec_model.generate_topic_wordcloud(0, background_color='white')

### UMAP PLOT

In [None]:
# Keyword arguments for the 2-component UMAP model fitted below.
umap_args_model = dict(
    n_neighbors=2,
    n_components=2,
    metric="cosine",
    min_dist=0.9,
    spread=1,
    random_state=42,
)

# Fit UMAP on the model's document vectors.
document_vectors = top2vec_model.document_vectors
umap_model = umap.UMAP(**umap_args_model).fit(document_vectors)

# Plot the fitted embedding, passing each document's topic (doc_top) as its
# label, then display the figure.
umap.plot.points(umap_model, labels=top2vec_model.doc_top)
plt.show()