In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from hmmlearn import hmm
import string
from gensim import corpora
from gensim.utils import simple_preprocess
from time import time
from joblib import Parallel, delayed
from collections import Counter
from functools import partial

from utils import prep_text, prep_data, vec_translate

In [2]:
# read in all talks for the 20 most frequent speakers
n_speakers = 20
summary = pd.read_json("../merged_summary_topics.json")
top_speakers = summary["Speaker"].value_counts()[:n_speakers].index.to_list()

talks = {}
for name in top_speakers:
    talks[name] = []
    for filename in summary[summary["Speaker"] == name]["File"]:
        with open("../" + filename, "r") as f:
            text = f.read()
            processed = simple_preprocess(text)
            if len(text):
                talks[name].append(processed)

# Speaker identification with one HMM

In [3]:
# concatenate the first 10 of President Monson's talks
name = 'Thomas S. Monson'
text = sum(talks[name][:10], start=[])

# for training on the vocabulary of every talk in the dataset
"""corpus = sum(talks.values(), start=[])
dictionary = corpora.Dictionary(corpus)"""

# I was getting errors when training on the entire vocabulary (maybe it's
# too big?) so I switched to training on just 10 of Monson's talks
dictionary = corpora.Dictionary([text])

In [4]:
model = hmm.MultinomialHMM(n_components=5, n_iter=100)
model.fit(prep_text(text, dictionary))

MultinomialHMM(n_components=5, n_iter=100,
               random_state=RandomState(MT19937) at 0x7F560404B740)

In [5]:
# find the talk with the highest log probability
print("Score for a talk from Monson in the training data:",
     model.score(prep_text(talks[name][0], dictionary))
)
print("Score for a talk from Monson not in training data:",
      model.score(prep_text(talks[name][11], dictionary)))

max_score, max_name = -np.inf, None
for name in list(talks.keys()):
    score = model.score(prep_text(talks[name][0], dictionary))
    print(name, ":", score)
    
    if score > max_score:
        max_score, max_name = score, name

print(
    f"\nSpeaker with the maximum score: {max_name} with score = {max_score}"
)

Score for a talk from Monson in the training data: -12452.11667042868
Score for a talk from Monson not in training data: -18530.168671097912
Thomas S. Monson : -12452.11667042868
Gordon B. Hinckley : -18278.154742686434
James E. Faust : -5338.276916207026
Boyd K. Packer : -11903.825848543132
Henry B. Eyring : -12740.307297212075
L. Tom Perry : -1362.8999517689779
M. Russell Ballard : -2605.5008951763675
Russell M. Nelson : -10634.035300107926
Dallin H. Oaks : -4334.680069573178
Spencer W. Kimball : -28589.126197459103
Ezra Taft Benson : -19480.948601500815
Dieter F. Uchtdorf : -9006.251824468758
Richard G. Scott : -3480.3292300853864
David B. Haight : -15472.878548559429
Robert D. Hales : -9508.757406609
Marion G. Romney : -17526.393818300054
Joseph B. Wirthlin : -5728.436822951388
Howard W. Hunter : -15744.028627505291
Jeffrey R. Holland : -17431.3873426812
Neal A. Maxwell : -5531.255997349309

Speaker with the maximum score: L. Tom Perry with score = -1362.8999517689779


# Text generation

In [6]:
# using the previously trained model, sample 100 words
" ".join([dictionary[i] for i in model.sample(100)[0].flatten()])

'us saw surrender to learn winds drew world our statement on he above shall me that master to on he of their me that you by each amen stooped for have bus the hope of such hand years they for death of faith who the night of replace nose he time back parables one tree and clean that but of still into the perfection its by the waters to the are eternal will scant into handsome whom of love never the best will giant depart and the mark of my what he mother for moment witness him train to hall'

# Character-level text generation

In [7]:
symbols, obs = prep_data("../data/2000.txt")

model = hmm.MultinomialHMM(n_components=50, n_iter=20)
model.fit(obs.reshape(-1, 1))
X, _ = model.sample(100)
X = X.flatten()
"".join([symbols[i] for i in X])

'isms ony t isg donmaonsat on the to oice is tlisesncome theawe hale on purepeas of wrass to the fand'

# Speaker identification with multiple models

In [8]:
def train_model(name, training_size):
    text = sum(talks[name][:training_size], start=[])
    dictionary = corpora.Dictionary([text])
    
    model = hmm.MultinomialHMM(n_components=10, n_iter=100)
    model.fit(prep_text(text, dictionary))
    return model, dictionary

In [9]:
# train 20 models, one for each speaker
training_size = 40
speakers = list(talks.keys())

models = Parallel(n_jobs=-1, verbose=20)(
    delayed(
        partial(train_model, training_size=training_size)
    )(name) for name in speakers
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done   7 out of  20 | elapsed:  7.7min remaining: 14.3min
[Parallel(n_jobs=-1)]: Done   9 out of  20 | elapsed: 14.4min remaining: 17.6min
[Parallel(n_jobs=-1)]: Done  11 out of  20 | elapsed: 15.0min remaining: 12.2min
[Parallel(n_jobs=-1)]: Done  13 out of  20 | elapsed: 15.4min remaining:  8.3min
[Parallel(n_jobs=-1)]: Done  15 out of  20 | elapsed: 15.9min remaining:  5.3min
[Parallel(n_jobs=-1)]: Done  17 out of  20 | elapsed: 19.1min remaining:  3.4min
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed: 20.2min finished


In [12]:
# evaluate the trained models on two speakers

top_n_speakers = 2

p_s = np.array([len(talks[name]) for name in speakers], dtype=float)
p_s /= np.sum(p_s)
log_p_s = np.log(p_s[:top_n_speakers])

def eval_model(name, training_size):
    counts = Counter()
    
    for talk in talks[name][training_size:]:
        max_score = -np.inf
        closest_speaker = None
        
        scores = np.array([model.score(prep_text(talk, dictionary)) for model, dictionary in models[:top_n_speakers]])
        
        # normalize using Bayes rule
        p_s_t = scores + log_p_s
        
        closest_speaker = speakers[np.argmax(p_s_t)]
        counts[closest_speaker] += 1
        
    accuracy = counts[name] / sum(counts.values())
        
    return counts, accuracy, scores


results = Parallel(n_jobs=-1, verbose=20)(
    delayed(
        partial(eval_model, training_size=training_size)
    )(name) for name in speakers[:top_n_speakers]
)

total_correct = 0
total = 0

for speaker, (counts, accuracy, _scores)in zip(speakers, results):
    total_correct += counts[speaker]
    total += sum(counts.values())
    
    print(f"{speaker}: {counts}; % correct = {accuracy}")
    print()

print(f"Overall accuracy: {total_correct / total}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    7.1s


Thomas S. Monson: Counter({'Thomas S. Monson': 111, 'Gordon B. Hinckley': 56}); % correct = 0.6646706586826348

Gordon B. Hinckley: Counter({'Gordon B. Hinckley': 160, 'Thomas S. Monson': 7}); % correct = 0.9580838323353293

Overall accuracy: 0.811377245508982


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    8.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    8.3s finished


In [14]:
# test on training data

top_n_speakers = 2

p_s = np.array([len(talks[name]) for name in speakers], dtype=float)
p_s /= np.sum(p_s)
log_p_s = np.log(p_s[:top_n_speakers])

def eval_model_train_set(name, training_size):
    counts = Counter()
    
    for talk in talks[name][:training_size]:
        max_score = -np.inf
        closest_speaker = None
        
        scores = np.array([model.score(prep_text(talk, dictionary)) for model, dictionary in models[:top_n_speakers]])
        
        # normalize using Bayes rule
        p_s_t = scores + log_p_s
        
        closest_speaker = speakers[np.argmax(p_s_t)]
        counts[closest_speaker] += 1
        
    accuracy = counts[name] / sum(counts.values())
        
    return counts, accuracy, scores


results = Parallel(n_jobs=-1, verbose=20)(
    delayed(
        partial(eval_model_train_set, training_size=training_size)
    )(name) for name in speakers[:top_n_speakers]
)

total_correct = 0
total = 0

for speaker, (counts, accuracy, _scores)in zip(speakers, results):
    total_correct += counts[speaker]
    total += sum(counts.values())
    
    print(f"{speaker}: {counts}; % correct = {accuracy}")
    print(_scores)
    print()

print(f"Overall accuracy: {total_correct / total}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    3.3s


Thomas S. Monson: Counter({'Thomas S. Monson': 40}); % correct = 1.0
[-16406.1503697  -25088.75081924]

Gordon B. Hinckley: Counter({'Gordon B. Hinckley': 38, 'Thomas S. Monson': 2}); % correct = 0.95
[-22884.90250718 -17270.91751101]

Overall accuracy: 0.975


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    4.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    4.5s finished


In [22]:
# test on other sets of two speakers

top_n_speakers = 2

p_s = np.array([len(talks[name]) for name in speakers], dtype=float)
p_s /= np.sum(p_s)
log_p_s = np.log(p_s[1:top_n_speakers+1])

def eval_model(name, training_size):
    counts = Counter()
    
    for talk in talks[name][training_size:]:
        max_score = -np.inf
        closest_speaker = None
        
        scores = np.array([model.score(prep_text(talk, dictionary)) for model, dictionary in models[1:top_n_speakers+1]])
        
        # normalize using Bayes rule
        p_s_t = scores + log_p_s
        
        closest_speaker = speakers[np.argmax(p_s_t)+1]
        counts[closest_speaker] += 1
        
    accuracy = counts[name] / sum(counts.values())
        
    return counts, accuracy, scores


results = Parallel(n_jobs=-1, verbose=20)(
    delayed(
        partial(eval_model, training_size=training_size)
    )(name) for name in speakers[1:top_n_speakers+1]
)

total_correct = 0
total = 0

for speaker, (counts, accuracy, _scores)in zip(
    speakers[1:top_n_speakers+1], results):
    total_correct += counts[speaker]
    total += sum(counts.values())
    
    print(f"{speaker}: {counts}; % correct = {accuracy}")
    print()

print(f"Overall accuracy: {total_correct / total}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    5.6s


Gordon B. Hinckley: Counter({'Gordon B. Hinckley': 100, 'James E. Faust': 67}); % correct = 0.5988023952095808

James E. Faust: Counter({'James E. Faust': 55, 'Gordon B. Hinckley': 2}); % correct = 0.9649122807017544

Overall accuracy: 0.6919642857142857


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    7.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    7.4s finished


In [23]:
# test on yet another set of two speakers

top_n_speakers = 2

p_s = np.array([len(talks[name]) for name in speakers], dtype=float)
p_s /= np.sum(p_s)
log_p_s = np.log(p_s[2:top_n_speakers+2])

def eval_model(name, training_size):
    counts = Counter()
    
    for talk in talks[name][training_size:]:
        max_score = -np.inf
        closest_speaker = None
        
        scores = np.array([model.score(prep_text(talk, dictionary)) for model, dictionary in models[2:top_n_speakers+2]])
        
        # normalize using Bayes rule
        p_s_t = scores + log_p_s
        
        closest_speaker = speakers[np.argmax(p_s_t)+2]
        counts[closest_speaker] += 1
        
    accuracy = counts[name] / sum(counts.values())
        
    return counts, accuracy, scores


results = Parallel(n_jobs=-1, verbose=20)(
    delayed(
        partial(eval_model, training_size=training_size)
    )(name) for name in speakers[2:top_n_speakers+2]
)

total_correct = 0
total = 0

for speaker, (counts, accuracy, _scores)in zip(
    speakers[2:top_n_speakers+2], results):
    total_correct += counts[speaker]
    total += sum(counts.values())
    
    print(f"{speaker}: {counts}; % correct = {accuracy}")
    print()

print(f"Overall accuracy: {total_correct / total}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    4.5s


James E. Faust: Counter({'James E. Faust': 52, 'Boyd K. Packer': 5}); % correct = 0.9122807017543859

Boyd K. Packer: Counter({'Boyd K. Packer': 26, 'James E. Faust': 20}); % correct = 0.5652173913043478

Overall accuracy: 0.7572815533980582


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    4.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    4.9s finished


In [21]:
# evaluate the trained models on all the speakers

top_n_speakers = 20

p_s = np.array([len(talks[name]) for name in speakers], dtype=float)
p_s /= np.sum(p_s)
log_p_s = np.log(p_s[:top_n_speakers])

def eval_model(name, training_size):
    counts = Counter()
    
    for talk in talks[name][training_size:]:
        max_score = -np.inf
        closest_speaker = None
        
        scores = np.array([model.score(prep_text(talk, dictionary)) for model, dictionary in models[:top_n_speakers]])
        
        # normalize using Bayes rule
        p_s_t = scores + log_p_s
        
        closest_speaker = speakers[np.argmax(p_s_t)]
        counts[closest_speaker] += 1
        
    accuracy = counts[name] / sum(counts.values())
        
    return counts, accuracy, scores


results = Parallel(n_jobs=-1, verbose=20)(
    delayed(
        partial(eval_model, training_size=training_size)
    )(name) for name in speakers[:top_n_speakers]
)

total_correct = 0
total = 0

for speaker, (counts, accuracy, _scores)in zip(speakers, results):
    total_correct += counts[speaker]
    total += sum(counts.values())
    
    print(f"{speaker}: {counts}; % correct = {accuracy}")
    print()

print(f"Overall accuracy: {total_correct / total}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   25.4s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   29.2s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   31.5s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   33.8s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   35.7s
[Parallel(n_jobs=-1)]: Done   7 out of  20 | elapsed:   43.6s remaining:  1.3min
[Parallel(n_jobs=-1)]: Done   9 out of  20 | elapsed:   45.3s remaining:   55.4s
[Parallel(n_jobs=-1)]: Done  11 out of  20 | elapsed:   46.8s remaining:   38.3s
[Parallel(n_jobs=-1)]: Done  13 out of  20 | elapsed:   52.0s remaining:   28.0s
[Parallel(n_jobs=-1)]: Done  15 out of  20 | elapsed:   54.0s remaining:   18.0s
[Parallel(n_jobs=-1)]: Done  17 out of  20 | elapsed:   54.7s remaining:    9.6s


Thomas S. Monson: Counter({'Marion G. Romney': 73, 'Dieter F. Uchtdorf': 21, 'Dallin H. Oaks': 15, 'Thomas S. Monson': 11, 'L. Tom Perry': 9, 'M. Russell Ballard': 8, 'James E. Faust': 6, 'David B. Haight': 6, 'Richard G. Scott': 5, 'Howard W. Hunter': 3, 'Gordon B. Hinckley': 3, 'Henry B. Eyring': 3, 'Ezra Taft Benson': 2, 'Jeffrey R. Holland': 1, 'Joseph B. Wirthlin': 1}); % correct = 0.0658682634730539

Gordon B. Hinckley: Counter({'Marion G. Romney': 65, 'Gordon B. Hinckley': 48, 'Dallin H. Oaks': 20, 'David B. Haight': 7, 'L. Tom Perry': 7, 'Dieter F. Uchtdorf': 5, 'Ezra Taft Benson': 5, 'Richard G. Scott': 3, 'James E. Faust': 3, 'Jeffrey R. Holland': 1, 'M. Russell Ballard': 1, 'Boyd K. Packer': 1, 'Howard W. Hunter': 1}); % correct = 0.2874251497005988

James E. Faust: Counter({'Marion G. Romney': 28, 'Dallin H. Oaks': 10, 'Dieter F. Uchtdorf': 7, 'James E. Faust': 5, 'David B. Haight': 3, 'Richard G. Scott': 2, 'Jeffrey R. Holland': 1, 'M. Russell Ballard': 1}); % correct = 0.

[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  1.3min finished
