# Setup and read in talks

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from hmmlearn import hmm
import string
from gensim import corpora
from gensim.utils import simple_preprocess
from time import time
from joblib import Parallel, delayed
from collections import Counter
from functools import partial

from utils import prep_text, prep_data, vec_translate

In [2]:
# read in all talks for the 20 most frequent speakers
n_speakers = 20
summary = pd.read_json("../merged_summary_topics.json")
top_speakers = summary["Speaker"].value_counts()[:n_speakers].index.to_list()

talks = {}
for name in top_speakers:
    talks[name] = []
    for filename in summary[summary["Speaker"] == name]["File"]:
        with open("../" + filename, "r") as f:
            text = f.read()
            processed = simple_preprocess(text)
            if len(text):
                talks[name].append(processed)

# Speaker identification with one HMM

In [3]:
# concatenate the first 10 of President Monson's talks
name = 'Thomas S. Monson'
text = sum(talks[name][:10], start=[])

# for training on the vocabulary of every talk in the dataset
"""corpus = sum(talks.values(), start=[])
dictionary = corpora.Dictionary(corpus)"""

# I was getting errors when training on the entire vocabulary (maybe it's
# too big?) so I switched to training on just 10 of Monson's talks
dictionary = corpora.Dictionary([text])

In [4]:
model = hmm.MultinomialHMM(n_components=5, n_iter=100)
model.fit(prep_text(text, dictionary))

MultinomialHMM(n_components=5, n_iter=100,
               random_state=RandomState(MT19937) at 0x7F560404B740)

In [5]:
# find the talk with the highest log probability
print("Score for a talk from Monson in the training data:",
     model.score(prep_text(talks[name][0], dictionary))
)
print("Score for a talk from Monson not in training data:",
      model.score(prep_text(talks[name][11], dictionary)))

max_score, max_name = -np.inf, None
for name in list(talks.keys()):
    score = model.score(prep_text(talks[name][0], dictionary))
    print(name, ":", score)
    
    if score > max_score:
        max_score, max_name = score, name

print(
    f"\nSpeaker with the maximum score: {max_name} with score = {max_score}"
)

Score for a talk from Monson in the training data: -12452.11667042868
Score for a talk from Monson not in training data: -18530.168671097912
Thomas S. Monson : -12452.11667042868
Gordon B. Hinckley : -18278.154742686434
James E. Faust : -5338.276916207026
Boyd K. Packer : -11903.825848543132
Henry B. Eyring : -12740.307297212075
L. Tom Perry : -1362.8999517689779
M. Russell Ballard : -2605.5008951763675
Russell M. Nelson : -10634.035300107926
Dallin H. Oaks : -4334.680069573178
Spencer W. Kimball : -28589.126197459103
Ezra Taft Benson : -19480.948601500815
Dieter F. Uchtdorf : -9006.251824468758
Richard G. Scott : -3480.3292300853864
David B. Haight : -15472.878548559429
Robert D. Hales : -9508.757406609
Marion G. Romney : -17526.393818300054
Joseph B. Wirthlin : -5728.436822951388
Howard W. Hunter : -15744.028627505291
Jeffrey R. Holland : -17431.3873426812
Neal A. Maxwell : -5531.255997349309

Speaker with the maximum score: L. Tom Perry with score = -1362.8999517689779


# Text generation

In [6]:
# using the previously trained model, sample 100 words
" ".join([dictionary[i] for i in model.sample(100)[0].flatten()])

'us saw surrender to learn winds drew world our statement on he above shall me that master to on he of their me that you by each amen stooped for have bus the hope of such hand years they for death of faith who the night of replace nose he time back parables one tree and clean that but of still into the perfection its by the waters to the are eternal will scant into handsome whom of love never the best will giant depart and the mark of my what he mother for moment witness him train to hall'

# Character-level text generation

In [7]:
symbols, obs = prep_data("../data/2000.txt")

model = hmm.MultinomialHMM(n_components=50, n_iter=20)
model.fit(obs.reshape(-1, 1))
X, _ = model.sample(100)
X = X.flatten()
"".join([symbols[i] for i in X])

'isms ony t isg donmaonsat on the to oice is tlisesncome theawe hale on purepeas of wrass to the fand'

# Speaker identification with multiple models

In [8]:
def train_model(name, training_size):
    text = sum(talks[name][:training_size], start=[])
    dictionary = corpora.Dictionary([text])
    
    model = hmm.MultinomialHMM(n_components=10, n_iter=100)
    model.fit(prep_text(text, dictionary))
    return model, dictionary

In [9]:
# train 20 models, one for each speaker
training_size = 48

speakers = list(talks.keys())
models = Parallel(n_jobs=-1, verbose=20)(
    delayed(
        partial(train_model, training_size=training_size)
    )(name) for name in speakers
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Done   7 out of  20 | elapsed:  9.2min remaining: 17.0min
[Parallel(n_jobs=-1)]: Done   9 out of  20 | elapsed: 17.3min remaining: 21.2min
[Parallel(n_jobs=-1)]: Done  11 out of  20 | elapsed: 18.0min remaining: 14.8min
[Parallel(n_jobs=-1)]: Done  13 out of  20 | elapsed: 18.9min remaining: 10.2min
[Parallel(n_jobs=-1)]: Done  15 out of  20 | elapsed: 19.4min remaining:  6.5min
[Parallel(n_jobs=-1)]: Done  17 out of  20 | elapsed: 23.6min remaining:  4.2min
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed: 24.8min finished


In [11]:
def run_single_model(name, training_size, log_speaker_probs, speakers_to_classify, speaker_indices):
    counts = Counter()
    
    for talk in talks[name][training_size:]:
        max_score = -np.inf
        closest_speaker = None
        
        scores = np.array([
            model.score(prep_text(talk, dictionary)) for model, dictionary in [models[i] for i in speaker_indices]])
        
        # normalize using Bayes rule
        p_s_t = scores + log_speaker_probs
        
        closest_speaker = speakers_to_classify[np.argmax(p_s_t)]
        counts[closest_speaker] += 1
        
    accuracy = counts[name] / sum(counts.values())
        
    return counts, accuracy, scores


def eval_model(speakers_to_classify, training_size, quiet=False):
    # get normalization probabilities
    p_s = np.array([
        len(talks[name]) for name in speakers_to_classify
    ], dtype=float)
    log_speaker_probs = np.log(p_s / np.sum(p_s))
    
    speaker_indices = [
        speakers.index(speaker) for speaker in speakers_to_classify
    ]
    
    verbosity = 0 if quiet else 20
    results = Parallel(n_jobs=-1, verbose=verbosity)(
        delayed(
            partial(run_single_model, training_size=training_size,
                    log_speaker_probs=log_speaker_probs,
                    speakers_to_classify=speakers_to_classify,
                    speaker_indices=speaker_indices)
        )(name) for name in speakers_to_classify
    )

    total_correct = 0
    total = 0

    for speaker, (counts,accuracy,_) in zip(speakers_to_classify, results):
        total_correct += counts[speaker]
        total += sum(counts.values())

        if not quiet:
            print(f"{speaker}: {counts}; % correct = {accuracy}")
            print()

    print(f"Overall accuracy: {total_correct / total}")

In [13]:
for i in range(n_speakers - 2):
    speakers_to_classify = speakers[i:i+2]
    print(speakers_to_classify, end=": ")
    eval_model(speakers_to_classify, training_size, quiet=True)
    print()

['Thomas S. Monson', 'Gordon B. Hinckley']: Overall accuracy: 0.7264150943396226

['Gordon B. Hinckley', 'James E. Faust']: Overall accuracy: 0.33653846153846156

['James E. Faust', 'Boyd K. Packer']: Overall accuracy: 0.7931034482758621

['Boyd K. Packer', 'Henry B. Eyring']: Overall accuracy: 0.7428571428571429

['Henry B. Eyring', 'L. Tom Perry']: Overall accuracy: 0.9565217391304348

['L. Tom Perry', 'M. Russell Ballard']: Overall accuracy: 0.5692307692307692

['M. Russell Ballard', 'Russell M. Nelson']: Overall accuracy: 0.5833333333333334

['Russell M. Nelson', 'Dallin H. Oaks']: Overall accuracy: 0.6585365853658537

['Dallin H. Oaks', 'Spencer W. Kimball']: Overall accuracy: 0.6666666666666666

['Spencer W. Kimball', 'Ezra Taft Benson']: Overall accuracy: 0.45454545454545453

['Ezra Taft Benson', 'Dieter F. Uchtdorf']: Overall accuracy: 0.9523809523809523

['Dieter F. Uchtdorf', 'David B. Haight']: Overall accuracy: 0.5

['David B. Haight', 'Richard G. Scott']: Overall accuracy:

In [14]:
for i in range(n_speakers - 3):
    speakers_to_classify = speakers[i:i+3]
    print(speakers_to_classify, end=": ")
    eval_model(speakers_to_classify, training_size, quiet=True)
    print()

['Thomas S. Monson', 'Gordon B. Hinckley', 'James E. Faust']: Overall accuracy: 0.43869209809264303

['Gordon B. Hinckley', 'James E. Faust', 'Boyd K. Packer']: Overall accuracy: 0.3617886178861789

['James E. Faust', 'Boyd K. Packer', 'Henry B. Eyring']: Overall accuracy: 0.7647058823529411

['Boyd K. Packer', 'Henry B. Eyring', 'L. Tom Perry']: Overall accuracy: 0.7383177570093458

['Henry B. Eyring', 'L. Tom Perry', 'M. Russell Ballard']: Overall accuracy: 0.6804123711340206

['L. Tom Perry', 'M. Russell Ballard', 'Russell M. Nelson']: Overall accuracy: 0.49411764705882355

['M. Russell Ballard', 'Russell M. Nelson', 'Dallin H. Oaks']: Overall accuracy: 0.4057971014492754

['Russell M. Nelson', 'Dallin H. Oaks', 'Spencer W. Kimball']: Overall accuracy: 0.532258064516129

['Dallin H. Oaks', 'Spencer W. Kimball', 'Ezra Taft Benson']: Overall accuracy: 0.6296296296296297

['Spencer W. Kimball', 'Ezra Taft Benson', 'Dieter F. Uchtdorf']: Overall accuracy: 0.5238095238095238

['Ezra Taft

In [15]:
# evaluate the models on all the speakers
eval_model(speakers, training_size, quiet=True)

Overall accuracy: 0.21036106750392464


In [14]:
# test on training data for the first two speakers

top_n_speakers = 2

p_s = np.array([len(talks[name]) for name in speakers], dtype=float)
p_s /= np.sum(p_s)
log_p_s = np.log(p_s[:top_n_speakers])

def eval_model_train_set(name, training_size):
    counts = Counter()
    
    for talk in talks[name][:training_size]:
        max_score = -np.inf
        closest_speaker = None
        
        scores = np.array([model.score(prep_text(talk, dictionary)) for model, dictionary in models[:top_n_speakers]])
        
        # normalize using Bayes rule
        p_s_t = scores + log_p_s
        
        closest_speaker = speakers[np.argmax(p_s_t)]
        counts[closest_speaker] += 1
        
    accuracy = counts[name] / sum(counts.values())
        
    return counts, accuracy, scores


results = Parallel(n_jobs=-1, verbose=20)(
    delayed(
        partial(eval_model_train_set, training_size=training_size)
    )(name) for name in speakers[:top_n_speakers]
)

total_correct = 0
total = 0

for speaker, (counts, accuracy, _scores)in zip(speakers, results):
    total_correct += counts[speaker]
    total += sum(counts.values())
    
    print(f"{speaker}: {counts}; % correct = {accuracy}")
    print(_scores)
    print()

print(f"Overall accuracy: {total_correct / total}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    3.3s


Thomas S. Monson: Counter({'Thomas S. Monson': 40}); % correct = 1.0
[-16406.1503697  -25088.75081924]

Gordon B. Hinckley: Counter({'Gordon B. Hinckley': 38, 'Thomas S. Monson': 2}); % correct = 0.95
[-22884.90250718 -17270.91751101]

Overall accuracy: 0.975


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    4.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    4.5s finished


In [18]:
# use a large training set for the 2 most frequent speakers

training_size = 100
models = Parallel(n_jobs=-1, verbose=20)(
    delayed(
        partial(train_model, training_size=training_size)
    )(name) for name in speakers[:2]
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed: 10.8min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed: 10.8min finished


In [19]:
eval_model(speakers[:2], training_size)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    4.9s


Thomas S. Monson: Counter({'Thomas S. Monson': 97, 'Gordon B. Hinckley': 10}); % correct = 0.9065420560747663

Gordon B. Hinckley: Counter({'Gordon B. Hinckley': 92, 'Thomas S. Monson': 15}); % correct = 0.8598130841121495

Overall accuracy: 0.883177570093458


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    6.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    6.1s finished
