In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from hmmlearn import hmm
import string
from gensim import corpora
from gensim.utils import simple_preprocess

# Speaker identification

In [2]:
# convert a talk into an array of integers using the dictionary provided
def prep_text(text, dictionary):
    X = dictionary.doc2idx(text)
    return np.array(X).reshape(-1, 1)

In [3]:
# read in all talks for the 20 most frequent speakers
n_speakers = 20
summary = pd.read_json("../merged_summary_topics.json")
top_speakers = summary["Speaker"].value_counts()[:n_speakers].index.to_list()

talks = {}
for name in top_speakers:
    talks[name] = []
    for filename in summary[summary["Speaker"] == name]["File"]:
        with open("../" + filename, "r") as f:
            text = f.read()
            processed = simple_preprocess(text)
            if len(text):
                talks[name].append(processed)

In [4]:
# concatenate the first 10 of President Monson's talks
name = 'Thomas S. Monson'
text = sum(talks[name][:10], start=[])

# for training on the vocabulary of every talk in the dataset
"""corpus = sum(talks.values(), start=[])
dictionary = corpora.Dictionary(corpus)"""

# I was getting errors when training on the entire vocabulary (maybe it's
# too big?) so I switched to training on just 10 of Monson's talks
dictionary = corpora.Dictionary([text])

In [5]:
model = hmm.MultinomialHMM(n_components=5, n_iter=100)
model.fit(prep_text(text, dictionary))

MultinomialHMM(n_components=5, n_iter=100,
               random_state=RandomState(MT19937) at 0x7F1E1016AA40)

In [6]:
# find the talk with the highest log probability
print("Score for a talk from Monson in the training data:",
     model.score(prep_text(talks[name][0], dictionary))
)
print("Score for a talk from Monson not in training data:",
      model.score(prep_text(talks[name][11], dictionary)))

max_score, max_name = -np.inf, None
for name in list(talks.keys()):
    text = talks[name][0]
    score = model.score(prep_text(text, dictionary))
    print(name, ":", score)
    
    if score > max_score:
        max_score, max_name = score, name

print(
    f"\nSpeaker with the maximum score: {max_name} with score = {max_score}"
)

Score for a talk from Monson in the training data: -12386.70507160791
Score for a talk from Monson not in training data: -20892.051520507233
Gordon B. Hinckley : -20600.303374539784
Thomas S. Monson : -12386.70507160791
James E. Faust : -6163.611542611331
Boyd K. Packer : -13074.549244157624
Henry B. Eyring : -14038.864281241078
L. Tom Perry : -1871.889773193568
M. Russell Ballard : -3694.441395052454
Russell M. Nelson : -12396.947800096525
Dallin H. Oaks : -5430.404540700939
Spencer W. Kimball : -28802.738669790207
Ezra Taft Benson : -21426.328398671427
Richard G. Scott : -3854.391043258016
David B. Haight : -16531.660183070388
Dieter F. Uchtdorf : -9338.943582325177
Robert D. Hales : -10760.134888782677
Marion G. Romney : -20532.55945078576
Joseph B. Wirthlin : -6465.7266629871965
Jeffrey R. Holland : -19028.73808139474
Howard W. Hunter : -16914.098636919363
Neal A. Maxwell : -5328.623126015953

Speaker with the maximum score: L. Tom Perry with score = -1871.889773193568


# Text generation

In [7]:
# using the previously trained model, sample 100 words
" ".join([dictionary[i] for i in model.sample(100)[0].flatten()])

'to may christ as to put battalions lee to his glad entitled his soul that or child often have have christal mother held between go to were to of gently years out the glorious come ever each therefore betraying good low testament held the thoughts the problem peter to monson whispered matt the most man ye shalt himself to marked her plight brought the york frequently of the toast taught walk the time such with beyond their doors of my or the important for beings in that chime glorified the gate he who does walk his message this master along'

# Character-level text generation

In [8]:
#
# Character-level utility functions from homework 9.5
#

def vec_translate(a, my_dict):
    # translate array from symbols to state numbers or -vice versa
    return np.vectorize(my_dict.__getitem__)(a)


def prep_data(filename):
    # Get the data as a single string
    with open(filename) as f:
        data=f.read().lower() #read and convert to -lower case
    # remove punctuation and newlines
    remove_punct = {ord(char): None for char in string.punctuation+"\n\r"}
    data = data.translate(remove_punct)
    # make a list of the symbols in the data
    symbols = sorted(list(set(data)))
    # convert the data to a NumPy array of symbols
    a = np.array(list(data))
    #make a conversion dict from symbols to state -numbers
    symbols_to_obs = {x:i for i,x in enumerate(symbols)}
    #convert the symbols in a to state numbers
    obs_sequence = vec_translate(a,symbols_to_obs)
    return symbols, obs_sequence

In [9]:
symbols, obs = prep_data("../data/2000.txt")

model = hmm.MultinomialHMM(n_components=50, n_iter=20)
model.fit(obs.reshape(-1, 1))
X, _ = model.sample(100)
X = X.flatten()
"".join([symbols[i] for i in X])

'thsere oniwrs of—go tonisahe hos auto bupns ty aioelinuto aerh w thue ons tny cpouctl he tnac bracre'