# Labeling Topics

Let's practice labeling topics!

In [2]:
import little_mallet_wrapper

import glob
from pathlib import Path

Run this cell to load the previous topic model results.

In [9]:
directory = "../NYT-Obituaries"
filenames = glob.glob(f"{directory}/*.txt")

# NOTE(review): get_top_docs pairs these documents with the rows of
# topic_distributions purely by position (via zip), so this file order
# presumably has to match the order used when the model was trained — confirm.

# Make a list of all original NYT obituaries (not pre-processed)
original_texts = []

for file in filenames:
    # The context manager closes each file as soon as it has been read,
    # instead of leaving one open handle per obituary for the garbage collector.
    with open(file) as obituary_file:
        text = obituary_file.read()
    original_texts.append(text)

# File name without directory or extension serves as the obituary's title
obit_titles = [Path(file).stem for file in filenames]

# Locations of the saved MALLET output for the 15-topic model
num_topics = 15
output_directory_path = 'topic-model-output'
path_to_topic_keys              = f"{output_directory_path}/mallet.topic_keys.{str(num_topics)}"
path_to_topic_distributions     = f"{output_directory_path}/mallet.topic_distributions.{str(num_topics)}"

topics = little_mallet_wrapper.load_topic_keys(path_to_topic_keys)
topic_distributions = little_mallet_wrapper.load_topic_distributions(path_to_topic_distributions)

from IPython.display import Markdown, display
import re

def make_md(string):
    """Render a value as Markdown in the notebook output.

    The value is converted with ``str()`` first, wrapped in a ``Markdown``
    object, and handed to ``display`` so that bolding and emojis are
    formatted rather than shown as raw text.
    """
    markdown_object = Markdown(str(string))
    display(markdown_object)

def get_top_docs(docs, topic_distributions, topic_index, n=5):
    """Display the ``n`` documents with the highest probability for one topic.

    Each document is paired by position with its topic distribution, the
    pairs are ranked from highest to lowest probability for ``topic_index``,
    and the topic's words followed by the top ``n`` documents are rendered
    as Markdown. Every occurrence of a topic word in a displayed document is
    bolded; matching is case-insensitive, restricted to whole words, and the
    document's own capitalization is kept.

    The topic words are read from the module-level ``topics`` list and the
    output is rendered through ``make_md``.

    Args:
        docs: Strings to display (full texts or titles), in the same order
            as ``topic_distributions``.
        topic_distributions: One sequence of per-topic probabilities per
            document.
        topic_index: Index of the topic to rank the documents by.
        n: How many top documents to show (default 5).

    Returns:
        None. The results are displayed, not returned.
    """
    ranked_docs = sorted(
        [
            (distribution[topic_index], document)
            for distribution, document in zip(topic_distributions, docs)
        ],
        reverse=True,
    )
    topic_words = topics[topic_index]
    make_md(f"### ✨Topic {topic_index}✨\n\n{topic_words}\n\n---")

    # Build one alternation pattern for all topic words. Words are escaped so
    # regex metacharacters in a word are matched literally, and the flag is
    # passed by keyword: the fourth positional argument of re.sub is `count`,
    # so passing re.IGNORECASE there silently limited the number of
    # replacements instead of enabling case-insensitive matching.
    escaped_words = [re.escape(word) for word in topic_words if word]
    topic_word_pattern = None
    if escaped_words:
        topic_word_pattern = re.compile(
            r"\b(?:" + "|".join(escaped_words) + r")\b",
            flags=re.IGNORECASE,
        )

    for probability, doc in ranked_docs[:n]:
        # Make topic words bolded; \g<0> re-inserts the matched text so the
        # document's original capitalization is preserved.
        if topic_word_pattern is not None:
            doc = topic_word_pattern.sub(r"**\g<0>**", doc)
        make_md(f'✨  \n**Topic Probability**: {probability}  \n**Document**: {doc}\n\n')

## Topic 2

In [None]:
# Top 5 obituary titles (file names) ranked by probability for topic 2
get_top_docs(
    obit_titles,
    topic_distributions,
    topic_index=2,
    n=5,
)

In [None]:
# Top 5 original obituary texts ranked by probability for topic 2
get_top_docs(
    original_texts,
    topic_distributions,
    topic_index=2,
    n=5,
)

**Topic 2 Label**: 

## Topic 11

In [None]:
# Top 5 obituary titles (file names) ranked by probability for topic 11
get_top_docs(
    obit_titles,
    topic_distributions,
    topic_index=11,
    n=5,
)

In [None]:
# Top 5 original obituary texts ranked by probability for topic 11
get_top_docs(
    original_texts,
    topic_distributions,
    topic_index=11,
    n=5,
)

**Topic 11 Label**: 