### Topic Modeling Using Mallet

In [59]:
path_to_mallet = '/Users/owenmonroe/Desktop/mallet-2.0.8/bin/mallet'

In [60]:
import little_mallet_wrapper
import seaborn
import glob
from pathlib import Path

#### Establishing Files

In [61]:
texts_directory = 'inauguration_speeches'
files = glob.glob(f"{texts_directory}/*.txt")
files

['inauguration_speeches/1961Kennedy.txt',
 'inauguration_speeches/1857Buchanan.txt',
 'inauguration_speeches/1805Jefferson.txt',
 'inauguration_speeches/2005GW_Bush.txt',
 'inauguration_speeches/1965L_Johnson.txt',
 'inauguration_speeches/1917Wilson.txt',
 'inauguration_speeches/1885Cleveland.txt',
 'inauguration_speeches/1881Garfield.txt',
 'inauguration_speeches/2013Obama.txt',
 'inauguration_speeches/1889B_Harrison.txt',
 'inauguration_speeches/1813Madison.txt',
 'inauguration_speeches/1833Jackson.txt',
 'inauguration_speeches/1973Nixon.txt',
 'inauguration_speeches/1793Washington.txt',
 'inauguration_speeches/1981Reagan.txt',
 'inauguration_speeches/2021Biden.txt',
 'inauguration_speeches/1977Carter.txt',
 'inauguration_speeches/1877Hayes.txt',
 'inauguration_speeches/1989Bush.txt',
 'inauguration_speeches/1841WH_Harrison.txt',
 'inauguration_speeches/1905T_Roosevelt.txt',
 'inauguration_speeches/1817Monroe.txt',
 'inauguration_speeches/1969Nixon.txt',
 'inauguration_speeches/1901M

#### Setting Training Data

In [62]:
training_data = []
for file in files:
    text = open(file, encoding='utf-8').read()
    processed_text = little_mallet_wrapper.process_string(text, numbers='remove')
    training_data.append(processed_text)

#### Keeping Original Texts

In [63]:
original_texts = []
for file in files:
    text = open(file, encoding='utf-8').read()
    original_texts.append(text)

#### Getting Speech Titles

In [64]:
speech_titles = [Path(file).stem for file in files]
speech_titles

['1961Kennedy',
 '1857Buchanan',
 '1805Jefferson',
 '2005GW_Bush',
 '1965L_Johnson',
 '1917Wilson',
 '1885Cleveland',
 '1881Garfield',
 '2013Obama',
 '1889B_Harrison',
 '1813Madison',
 '1833Jackson',
 '1973Nixon',
 '1793Washington',
 '1981Reagan',
 '2021Biden',
 '1977Carter',
 '1877Hayes',
 '1989Bush',
 '1841WH_Harrison',
 '1905T_Roosevelt',
 '1817Monroe',
 '1969Nixon',
 '1901McKinley',
 '1865Lincoln',
 '1789Washington',
 '2009Obama',
 '1993Clinton',
 '1897McKinley',
 '2017Trump',
 '1825Quincy_Adams',
 '1913Wilson',
 '2001GW_Bush',
 '1837Van_Buren',
 '1925Coolidge',
 '1849Taylor',
 '1821Monroe',
 '1937F_Roosevelt',
 '1921Harding',
 '1853Pierce',
 '1869Grant',
 '1909Taft',
 '1953Eisenhower',
 '1941F_Roosevelt',
 '1957Eisenhower',
 '1801Jefferson',
 '1845Polk',
 '1949Truman',
 '1929Hoover',
 '1893Cleveland',
 '1985Reagan',
 '1809Madison',
 '1797Adams',
 '1829Jackson',
 '1997Clinton',
 '1945F_Roosevelt',
 '1873Grant',
 '1933F_Roosevelt',
 '1861Lincoln']

In [65]:
little_mallet_wrapper.print_dataset_stats(training_data)

Number of Documents: 59
Mean Number of Words per Document: 1103.2
Vocabulary Size: 9025


In [66]:
num_topics = 12

In [67]:
training_data = training_data

In [68]:
#Change to your desired output directory
output_directory_path = 'output_data'

#No need to change anything below here
Path(f"{output_directory_path}").mkdir(parents=True, exist_ok=True)

path_to_training_data           = f"{output_directory_path}/training.txt"
path_to_formatted_training_data = f"{output_directory_path}/mallet.training"
path_to_model                   = f"{output_directory_path}/mallet.model.{str(num_topics)}"
path_to_topic_keys              = f"{output_directory_path}/mallet.topic_keys.{str(num_topics)}"
path_to_topic_distributions     = f"{output_directory_path}/mallet.topic_distributions.{str(num_topics)}"

In [69]:
little_mallet_wrapper.quick_train_topic_model(path_to_mallet, output_directory_path, num_topics, training_data)

Importing data...
Complete
Training topic model...


Mallet LDA: 12 topics, 4 topic bits, 1111 topic mask
Data loaded.
max tokens: 3797
total tokens: 65089
<10> LL/token: -9.16891
<20> LL/token: -8.9163
<30> LL/token: -8.83091
<40> LL/token: -8.78407

0	0.41667	upon law much business self political men hope action also resources believe labor control defense shall trade desire means increase 
1	0.41667	country nations support might duty foreign civil ever prosperity people interest fair possible never effect hand give patriotism produce promote 
2	0.41667	one man president fellow things liberty citizens first common children earth done act others fear hopes friends stand called ever 
3	0.41667	great upon united executive free present people rights commerce principles well principle shall purposes happiness within liberty less therefore regard 
4	0.41667	public citizens laws peace interests administration national policy institutions country rights revenue confidence duty whose governments best political foreign measures 
5	0.41667	power 

Complete


([['law',
   'business',
   'must',
   'american',
   'labor',
   'tariff',
   'laws',
   'legislation',
   'congress',
   'make',
   'proper',
   'policy',
   'education',
   'trade',
   'south',
   'secure',
   'property',
   'race',
   'consideration',
   'taken'],
  ['confidence',
   'nations',
   'happiness',
   'gratitude',
   'press',
   'sentiments',
   'councils',
   'left',
   'examples',
   'partial',
   'various',
   'blessings',
   'tranquillity',
   'military',
   'feel',
   'mind',
   'rights',
   'advancement',
   'arduous',
   'certainly'],
  ['jobs',
   'back',
   'going',
   'heroes',
   'tax',
   'women',
   'prayer',
   'watching',
   'steps',
   'founding',
   'budget',
   'get',
   'streets',
   'workers',
   'alliances',
   'price',
   'breeze',
   'song',
   'night',
   'putting'],
  ['power',
   'one',
   'constitution',
   'would',
   'might',
   'spirit',
   'however',
   'citizens',
   'members',
   'control',
   'granted',
   'exist',
   'circumstances',
 

In [70]:
topics = little_mallet_wrapper.load_topic_keys(path_to_topic_keys)

for topic_number, topic in enumerate(topics):
    print(f"✨Topic {topic_number}✨\n\n{topic}\n")

✨Topic 0✨

['law', 'business', 'must', 'american', 'labor', 'tariff', 'laws', 'legislation', 'congress', 'make', 'proper', 'policy', 'education', 'trade', 'south', 'secure', 'property', 'race', 'consideration', 'taken']

✨Topic 1✨

['confidence', 'nations', 'happiness', 'gratitude', 'press', 'sentiments', 'councils', 'left', 'examples', 'partial', 'various', 'blessings', 'tranquillity', 'military', 'feel', 'mind', 'rights', 'advancement', 'arduous', 'certainly']

✨Topic 2✨

['jobs', 'back', 'going', 'heroes', 'tax', 'women', 'prayer', 'watching', 'steps', 'founding', 'budget', 'get', 'streets', 'workers', 'alliances', 'price', 'breeze', 'song', 'night', 'putting']

✨Topic 3✨

['power', 'one', 'constitution', 'would', 'might', 'spirit', 'however', 'citizens', 'members', 'control', 'granted', 'exist', 'circumstances', 'character', 'respective', 'executive', 'confederacy', 'given', 'instrument', 'department']

✨Topic 4✨

['union', 'would', 'duties', 'war', 'fellow', 'limits', 'equal', 'co

In [71]:
topic_distributions = little_mallet_wrapper.load_topic_distributions(path_to_topic_distributions)

IndexError: list index out of range