In [5]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from hmmlearn import hmm
import string
from gensim import corpora
from gensim.utils import simple_preprocess
from utils import prep_text, vec_translate, prep_data
import pickle

In [24]:
def go_through_topics(short=False):
    data = pd.read_csv("../merged_summary_topics.csv")
    total = data.shape[0]
    topic_list = [column for column in data.columns if column not in 
                  ['Year', 'Speaker', 'Title', 'File', 'Month', 'topic_lists', 'Train', 
                   'Month_letter', 'Month', 'Kicker', 'Unnamed: 0', 'Unnamed: 0.1']]
    topics_used = []
    dictionaries = []
    test_data = None
    for topic in topic_list:
        file_name = ('_').join(topic.split(' '))
        df = pd.read_csv(f"../Topic_Data/{file_name}.csv")
        #use only topics that have more than 50 talks 
        if df.shape[0] >=50:
            topics_used.append((topic, df.shape[0]))
        else:
            continue
        
        df_train = df[df['Train'] == 1]
        df_test = df[df['Train'] == 0]
        if test_data is None:
            test_data = df_test
        else:
            test_data.append(df_test)
        #read in all the talks on topic topic
        df_talks = []
        #go through all the talks in the training set
        for filename in df_train["File"]:
            with open("../" + filename, "r") as f:
                text = f.read()
                processed = simple_preprocess(text)
                if len(text):
                    df_talks.append(processed)
        #concatenate the talks
        df_text = sum(df_talks, start=[])

        #create the dictionary
        dictionary = corpora.Dictionary([df_text])
        dictionaries.append(dictionary)
        if short:
            continue
        else:
            #minimize the aic to choose the optimal number of components
            components, AIC = hyperparameter_states(df_text, dictionary, np.arange(2, 6), df_talks)

            #create the best model
            best_model = hmm.MultinomialHMM(n_components=components, n_iter=100)
            #train the model
            best_model.fit(prep_text(df_text, dictionary))

            #save the model
            with open(f"{topic}bestModel", 'wb') as file:
                pickle.dump(best_model, file)
        #unindent this once if I want to run more than one topic
    return topics_used, test_data, dictionaries, total

In [25]:
def hyperparameter_states(text, dictionary, list_of_states, talks, n=50):
    def calculate_aic(K, score):
        aic = 2*K - 2*score
        return aic
    best_aic = np.inf
    best_state = None
    for num in list_of_states:
        model = hmm.MultinomialHMM(n_components=num, n_iter=100, tol=1e-3)
        model.fit(prep_text(text, dictionary))
        score = model.score(prep_text(talks[-1], dictionary))
        new_aic = calculate_aic(num, score)
        if new_aic < best_aic:
            best_aic = new_aic
            best_state = num
    return best_state, best_aic

In [26]:
topics_used, test_data, dictionaries, total = go_through_topics(short=True)

In [20]:
topics_used

['Aaronic Priesthood',
 'Adam and Eve',
 'Articles of Faith',
 'Atonement',
 'Bible',
 'Book of Mormon',
 'Brigham Young',
 'Christianity',
 'Christmas',
 'Church activity',
 'Church attendance',
 'Church callings',
 'Church doctrine',
 'Church growth',
 'Church history',
 'Church leaders',
 'Church leadership',
 'Church meetings',
 'Church membership',
 'Church organization',
 'Creation',
 'Doctrine and Covenants',
 'Easter',
 'Ezra Taft Benson',
 'Fall',
 'First Presidency',
 'First Vision',
 'God the Father',
 'Godhead',
 'Gordon B. Hinckley',
 'Harold B. Lee',
 'Heavenly Father',
 'Holy Ghost',
 'Howard W. Hunter',
 'Jesus Christ',
 'Joseph Smith',
 'Light of Christ',
 'Melchizedek Priesthood',
 'Native Americans',
 'New Testament',
 'Passover',
 'Primary',
 'Quorum of the Twelve Apostles',
 'Quorums of Seventy',
 'Relief Society',
 'Restoration',
 'Resurrection',
 'Sabbath',
 'Satan',
 'Scouting',
 'Second Coming',
 'Spencer W. Kimball',
 'Sunday School',
 'Tabernacle Choir',
 'Te

In [27]:
def load_models(topics_used):
    models = []
    for (topic, num) in topics_used:
        with open(f"{topic}bestModel", 'rb') as file:
            model = pickle.load(file)
            models.append((model, topic, num))
    return models
models = load_models(topics_used)

In [28]:
def score_on_test(models, dictionaries, test_data, total=3465):
    test_text = []
    #collect a list of processed test talks

    for filename, topic_list, year, speaker, title in zip(test_data['File'], test_data['topic_lists'], 
                                                          test_data['Year'], test_data['Speaker'], test_data['Title']):
        with open("../" + filename, "r") as f:
            text = f.read()
            processed = simple_preprocess(text)
            if len(text):
                test_text.append((processed, topic_list, year, speaker, title))
    #for each talk, score each model and then
    best = {}
    for x_test, topic_list, year, speaker, title in test_text:
        max_score = -np.inf
        best_model = None
        the_topic = None
        correct = None
        for (model, topic, num), dictionary in zip(models, dictionaries):
            score = model.score(prep_text(x_test, dictionary)) + np.log((num/total))
            if score > max_score:
                max_score = score
                best_model = model
                the_topic = topic
                if the_topic in topic_list:
                    correct = True
                else:
                    correct = False
        best[('_').join([str(year), speaker, title])] = [the_topic, max_score, best_model, correct, topic_list]
    return best
                

In [29]:
best = score_on_test(models, dictionaries, test_data)

In [30]:
for key, val in zip(best.keys(), best.values()):
    print(key, ": ", val)
    print('\n')

1971_Joseph Fielding Smith_Out of the Darkness :  ['Restoration', -6908.2457368760915, MultinomialHMM(n_components=2, n_iter=100,
               random_state=RandomState(MT19937) at 0x7F908F8EF940), True, "['Restoration', 'Church doctrine', 'Atonement', 'plan of salvation']"]


1975_Gordon B. Hinckley_The Symbol of Christ :  ['Resurrection', -12428.288917161492, MultinomialHMM(n_components=2, n_iter=100,
               random_state=RandomState(MT19937) at 0x7F908F8EF840), True, "['Jesus Christ', 'spirituality', 'Resurrection', 'Atonement']"]


1977_Boyd K. Packer_The Mediator :  ['discipleship', -13983.146842017379, MultinomialHMM(n_components=2, n_iter=100,
               random_state=RandomState(MT19937) at 0x7F908D36C840), False, "['Jesus Christ', 'mercy', 'justice', 'Atonement']"]


1986_Henry B. Eyring_The Spark of Faith :  ['discipleship', -10567.02946149647, MultinomialHMM(n_components=2, n_iter=100,
               random_state=RandomState(MT19937) at 0x7F908D36C840), False, "['

In [31]:
num_correct = sum([val[3] for val in best.values()])

In [32]:
acc = num_correct / len(best)
acc

0.19718309859154928