In [33]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from hmmlearn import hmm
import string
from gensim import corpora
from gensim.utils import simple_preprocess
from time import time
from joblib import Parallel, delayed
from collections import Counter

from utils import prep_text, prep_data, vec_translate

# Speaker identification

In [2]:
# read in all talks for the 20 most frequent speakers
n_speakers = 20
summary = pd.read_json("../merged_summary_topics.json")
top_speakers = summary["Speaker"].value_counts()[:n_speakers].index.to_list()

talks = {}
for name in top_speakers:
    talks[name] = []
    for filename in summary[summary["Speaker"] == name]["File"]:
        with open("../" + filename, "r") as f:
            text = f.read()
            processed = simple_preprocess(text)
            if len(text):
                talks[name].append(processed)

In [10]:
# concatenate the first 10 of President Monson's talks
name = 'Thomas S. Monson'
text = sum(talks[name][:10], start=[])

# for training on the vocabulary of every talk in the dataset
"""corpus = sum(talks.values(), start=[])
dictionary = corpora.Dictionary(corpus)"""

# I was getting errors when training on the entire vocabulary (maybe it's
# too big?) so I switched to training on just 10 of Monson's talks
dictionary = corpora.Dictionary([text])

In [11]:
model = hmm.MultinomialHMM(n_components=5, n_iter=100)
model.fit(prep_text(text, dictionary))

MultinomialHMM(n_components=5, n_iter=100,
               random_state=RandomState(MT19937) at 0x7FE364551740)

In [12]:
# find the talk with the highest log probability
print("Score for a talk from Monson in the training data:",
     model.score(prep_text(talks[name][0], dictionary))
)
print("Score for a talk from Monson not in training data:",
      model.score(prep_text(talks[name][11], dictionary)))

max_score, max_name = -np.inf, None
for name in list(talks.keys()):
    score = model.score(prep_text(talks[name][0], dictionary))
    print(name, ":", score)
    
    if score > max_score:
        max_score, max_name = score, name

print(
    f"\nSpeaker with the maximum score: {max_name} with score = {max_score}"
)

Score for a talk from Monson in the training data: -12428.838065393451
Score for a talk from Monson not in training data: -19814.614154689396
Thomas S. Monson : -12428.838065393451
Gordon B. Hinckley : -19542.79811896064
James E. Faust : -6044.043404057589
Boyd K. Packer : -13173.54626312474
L. Tom Perry : -1671.975462819844
Henry B. Eyring : -13349.163098859106
M. Russell Ballard : -3148.2043479518215
Russell M. Nelson : -inf
Dallin H. Oaks : -4915.360349456056
Spencer W. Kimball : -28470.24038003811
Ezra Taft Benson : -21211.719588135784
Richard G. Scott : -3772.9260529611183
Dieter F. Uchtdorf : -9729.56067646322
David B. Haight : -16613.942504813876
Robert D. Hales : -10272.435007644102
Marion G. Romney : -19821.63431490711
Joseph B. Wirthlin : -6244.144251418623
Jeffrey R. Holland : -19461.97952079512
Howard W. Hunter : -15937.20690994325
Neal A. Maxwell : -inf

Speaker with the maximum score: L. Tom Perry with score = -1671.975462819844


# Text generation

In [6]:
# using the previously trained model, sample 100 words
" ".join([dictionary[i] for i in model.sample(100)[0].flatten()])

'be the work heard saints the frequently to nominate had spacious meetings in driving but you earth of the inward of for the he of philippines to of my point generation the eyes in truth the to will fame for it new the way of with you of in day of fame scriptures of with better of wonder the moment form neglected morn for and mother returning not the hour armentieres him receive lord can joseph daily an god we ago paul treat day parents wanted instant thee but be don and which and so character jesus of me was'

# Character-level text generation

In [None]:
symbols, obs = prep_data("../data/2000.txt")

model = hmm.MultinomialHMM(n_components=50, n_iter=20)
model.fit(obs.reshape(-1, 1))
X, _ = model.sample(100)
X = X.flatten()
"".join([symbols[i] for i in X])

# Speaker identification with multiple models

In [42]:
def train_model(name):
    text = sum(talks[name][:20], start=[])
    dictionary = corpora.Dictionary([text])
    
    model = hmm.MultinomialHMM(n_components=10, n_iter=100)
    model.fit(prep_text(text, dictionary))
    return model, dictionary

In [43]:
# train 20 models, one for each speaker

speakers = list(talks.keys())
models = Parallel(n_jobs=-1, verbose=20)(
    delayed(train_model)(s) for s in speakers
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done   7 out of  20 | elapsed:  3.5min remaining:  6.5min
[Parallel(n_jobs=-1)]: Done   9 out of  20 | elapsed:  5.8min remaining:  7.1min
[Parallel(n_jobs=-1)]: Done  11 out of  20 | elapsed:  6.4min remaining:  5.2min
[Parallel(n_jobs=-1)]: Done  13 out of  20 | elapsed:  6.7min remaining:  3.6min
[Parallel(n_jobs=-1)]: Done  15 out of  20 | elapsed:  7.0min remaining:  2.3min
[Parallel(n_jobs=-1)]: Done  17 out of  20 | elapsed:  8.4min remaining:  1.5min
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  8.9min finished


In [44]:
# vanilla version

results = {speaker: Counter() for speaker in speakers}

for sample_name in speakers:
    
    match_counts = results[sample_name]
    
    for sample_talk in talks[sample_name][21:]:
        max_score = -np.inf
        closest_speaker = None
        for name, (model, dictionary) in zip(speakers, models):
            score = model.score(prep_text(sample_talk, dictionary))
            if score > max_score:
                max_score = score
                closest_speaker = name

        match_counts[closest_speaker] += 1
        
    print(f"{sample_name}: {match_counts}")

Thomas S. Monson: Counter({'Dieter F. Uchtdorf': 142, 'James E. Faust': 34, 'Thomas S. Monson': 5, 'Jeffrey R. Holland': 2, 'Joseph B. Wirthlin': 1, 'Henry B. Eyring': 1, 'Howard W. Hunter': 1})
Gordon B. Hinckley: Counter({'Dieter F. Uchtdorf': 149, 'James E. Faust': 22, 'Jeffrey R. Holland': 11, 'Henry B. Eyring': 2, 'Dallin H. Oaks': 1, 'Gordon B. Hinckley': 1})
James E. Faust: Counter({'Dieter F. Uchtdorf': 50, 'James E. Faust': 24, 'Dallin H. Oaks': 1, 'Jeffrey R. Holland': 1})
Boyd K. Packer: Counter({'Dieter F. Uchtdorf': 47, 'James E. Faust': 17, 'Jeffrey R. Holland': 1})
L. Tom Perry: Counter({'Dieter F. Uchtdorf': 50, 'James E. Faust': 7, 'Dallin H. Oaks': 4, 'Jeffrey R. Holland': 2, 'Henry B. Eyring': 1})
Henry B. Eyring: Counter({'Dieter F. Uchtdorf': 31, 'Henry B. Eyring': 27, 'James E. Faust': 1})
M. Russell Ballard: Counter({'Dieter F. Uchtdorf': 40, 'James E. Faust': 11, 'Jeffrey R. Holland': 4})
Russell M. Nelson: Counter({'Dieter F. Uchtdorf': 35, 'James E. Faust': 7,

In [51]:
# normalize the scores

results = {speaker: Counter() for speaker in speakers}

means = []
for i, speaker in enumerate(speakers):
    model, dictionary = models[i]
    scores = np.array([
        model.score(
            prep_text(sample_talk, dictionary)
        ) for sample_talk in talks[speaker][21:31]
    ])
    means.append(
        np.mean(scores[~np.isinf(scores)])
    )

print(means)

for sample_name in speakers:
    match_counts = results[sample_name]
    
    for sample_talk in talks[sample_name][31:]:
        max_score = -np.inf
        closest_speaker = None
        for i, (name, (model, dictionary)) in enumerate(zip(speakers, models)):
            score = model.score(prep_text(sample_talk, dictionary))
            
            # normalize the score
            score /= -means[i]
            
            if score > max_score:
                max_score = score
                closest_speaker = name

        match_counts[closest_speaker] += 1
        
    print(f"{sample_name}: {match_counts}")

[-16699.7608529009, -15264.880597051262, -16095.86670151952, -27018.362954403994, -17585.406658368818, -14708.7570076111, -22283.04027984301, -18282.18885105787, -16469.2796031591, -18989.52470941419, -16445.169897710388, -14173.090340209752, -14356.711871361362, -18550.862931306132, -19286.384240623545, -14376.90994249893, -20595.448855517425, -13561.742910434996, -12340.712832566376, -17514.095134405645]
Thomas S. Monson: Counter({'Russell M. Nelson': 116, 'Boyd K. Packer': 24, 'James E. Faust': 17, 'Joseph B. Wirthlin': 9, 'Thomas S. Monson': 8, 'M. Russell Ballard': 2})
Gordon B. Hinckley: Counter({'Russell M. Nelson': 115, 'Boyd K. Packer': 35, 'Joseph B. Wirthlin': 13, 'James E. Faust': 12, 'David B. Haight': 1})
James E. Faust: Counter({'Russell M. Nelson': 54, 'James E. Faust': 7, 'Boyd K. Packer': 4, 'Joseph B. Wirthlin': 1})
Boyd K. Packer: Counter({'Boyd K. Packer': 32, 'Russell M. Nelson': 22, 'James E. Faust': 1})
L. Tom Perry: Counter({'Russell M. Nelson': 33, 'Boyd K. Pa

In [69]:
# run it for just the top n speakers and see if it can distinguish them
n_speakers = slice(2)

results = {speaker: Counter() for speaker in speakers}

total_correct = 0
total = 0

for sample_name in speakers[n_speakers]:
    
    match_counts = results[sample_name]
    
    for sample_talk in talks[sample_name][21:]:
        max_score = -np.inf
        closest_speaker = None
        for name, (model, dictionary) in list(zip(speakers, models))[n_speakers]:
            score = model.score(prep_text(sample_talk, dictionary))
            if score > max_score:
                max_score = score
                closest_speaker = name

        match_counts[closest_speaker] += 1

    total_correct += match_counts[sample_name]
    total += sum(match_counts.values())
    accuracy = match_counts[sample_name] / sum(match_counts.values())
        
    print(f"{sample_name}: {match_counts}; % correct = {accuracy}")

print(f"Overall accuracy: {total_correct / total}")

Thomas S. Monson: Counter({'Thomas S. Monson': 150, 'Gordon B. Hinckley': 35, None: 1}); % correct = 0.8064516129032258
Gordon B. Hinckley: Counter({'Gordon B. Hinckley': 157, 'Thomas S. Monson': 29}); % correct = 0.8440860215053764
Overall accuracy: 0.8252688172043011
