### Topic Modeling Using Mallet

In [None]:
path_to_mallet = '/Users/owenmonroe/Desktop/mallet-2.0.8/bin/mallet'

In [None]:
import little_mallet_wrapper
import seaborn
import glob
from pathlib import Path

#### Establishing Files

In [None]:
texts_directory = 'inauguration_speeches'
files = glob.glob(f"{texts_directory}/*.txt")
files

#### Setting Training Data

In [87]:
training_data = []
for file in files:
    text = open(file, encoding='utf-8').read()
    processed_text = little_mallet_wrapper.process_string(text, numbers='remove')
    training_data.append(processed_text)

#### Keeping Original Texts

In [None]:
original_texts = []
for file in files:
    text = open(file, encoding='utf-8').read()
    original_texts.append(text)

#### Getting Speech Titles

In [88]:
speech_titles = [Path(file).stem for file in files]
speech_titles

# this is a random order? (unsure why) Check the text editor list for doing topic distributions, but it starts at 0!

['1961Kennedy',
 '1857Buchanan',
 '1805Jefferson',
 '2005GW_Bush',
 '1965L_Johnson',
 '1917Wilson',
 '1885Cleveland',
 '1881Garfield',
 '2013Obama',
 '1889B_Harrison',
 '1813Madison',
 '1833Jackson',
 '1973Nixon',
 '1793Washington',
 '1981Reagan',
 '2021Biden',
 '1977Carter',
 '1877Hayes',
 '1989Bush',
 '1841WH_Harrison',
 '1905T_Roosevelt',
 '1817Monroe',
 '1969Nixon',
 '1901McKinley',
 '1865Lincoln',
 '1789Washington',
 '2009Obama',
 '1993Clinton',
 '1897McKinley',
 '2017Trump',
 '1825Quincy_Adams',
 '1913Wilson',
 '2001GW_Bush',
 '1837Van_Buren',
 '1925Coolidge',
 '1849Taylor',
 '1821Monroe',
 '1937F_Roosevelt',
 '1921Harding',
 '1853Pierce',
 '1869Grant',
 '1909Taft',
 '1953Eisenhower',
 '1941F_Roosevelt',
 '1957Eisenhower',
 '1801Jefferson',
 '1845Polk',
 '1949Truman',
 '1929Hoover',
 '1893Cleveland',
 '1985Reagan',
 '1809Madison',
 '1797Adams',
 '1829Jackson',
 '1997Clinton',
 '1945F_Roosevelt',
 '1873Grant',
 '1933F_Roosevelt',
 '1861Lincoln']

In [None]:
little_mallet_wrapper.print_dataset_stats(training_data)

In [None]:
num_topics = 12

In [None]:
training_data = training_data

In [None]:
#Change to your desired output directory
output_directory_path = 'output_data'

#No need to change anything below here
Path(f"{output_directory_path}").mkdir(parents=True, exist_ok=True)

path_to_training_data           = f"{output_directory_path}/training.txt"
path_to_formatted_training_data = f"{output_directory_path}/mallet.training"
path_to_model                   = f"{output_directory_path}/mallet.model.{str(num_topics)}"
path_to_topic_keys              = f"{output_directory_path}/mallet.topic_keys.{str(num_topics)}"
path_to_topic_distributions     = f"{output_directory_path}/mallet.topic_distributions.{str(num_topics)}"

In [None]:
little_mallet_wrapper.quick_train_topic_model(path_to_mallet, output_directory_path, num_topics, training_data)

In [None]:
topics = little_mallet_wrapper.load_topic_keys(path_to_topic_keys)

for topic_number, topic in enumerate(topics):
    print(f"✨Topic {topic_number}✨\n\n{topic}\n")

In [82]:
topic_distributions = little_mallet_wrapper.load_topic_distributions(path_to_topic_distributions)


In [84]:
speech_to_check = "2017Trump"

speech_number = speech_titles.index(speech_to_check)

print(f"Topic Distributions for {speech_titles[speech_number]}\n")
for topic_number, (topic, topic_distribution) in enumerate(zip(topics, topic_distributions[speech_number])):
    print(f"✨Topic {topic_number} {topic[:6]} ✨\nProbability: {round(topic_distribution, 3)}\n")

speech_number

Topic Distributions for 2017Trump

✨Topic 0 ['law', 'business', 'must', 'american', 'labor', 'tariff'] ✨
Probability: 0.023

✨Topic 1 ['confidence', 'nations', 'happiness', 'gratitude', 'press', 'sentiments'] ✨
Probability: 0.0

✨Topic 2 ['jobs', 'back', 'going', 'heroes', 'tax', 'women'] ✨
Probability: 0.202

✨Topic 3 ['power', 'one', 'constitution', 'would', 'might', 'spirit'] ✨
Probability: 0.0

✨Topic 4 ['union', 'would', 'duties', 'war', 'fellow', 'limits'] ✨
Probability: 0.0

✨Topic 5 ['great', 'nation', 'must', 'shall', 'free', 'peace'] ✨
Probability: 0.246

✨Topic 6 ['know', 'life', 'things', 'democracy', 'man', 'change'] ✨
Probability: 0.026

✨Topic 7 ['civilization', 'order', 'problems', 'task', 'counsel', 'republic'] ✨
Probability: 0.0

✨Topic 8 ['government', 'people', 'states', 'country', 'public', 'upon'] ✨
Probability: 0.028

✨Topic 9 ['union', 'constitution', 'opinion', 'upon', 'slavery', 'authority'] ✨
Probability: 0.0

✨Topic 10 ['america', 'world', 'people', 'new', '

29

In [90]:
topic_distributions[58]

[0.051358929854168026,
 0.003205758608039765,
 4.7557105658577006e-05,
 0.04826014016002757,
 0.023616827861104448,
 0.260956934967566,
 0.0021188228598594404,
 9.327513762416246e-05,
 0.24688264926899392,
 0.3555560033815954,
 0.007721802837726113,
 0.0001812979576365834]