In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from hmmlearn import hmm
import string
from gensim import corpora
from gensim.utils import simple_preprocess
from utils import prep_text, vec_translate, prep_data
import pickle

In [50]:
def go_through_topics():
    data = pd.read_csv("../merged_summary_topics.csv")
    topic_list = [column for column in data.columns if column not in 
                  ['Year', 'Speaker', 'Title', 'File', 'Month', 'topic_lists', 'Train', 
                   'Month_letter', 'Month', 'Kicker', 'Unnamed: 0', 'Unnamed: 0.1']]
    topics_used = []
    dictionaries = []
    test_data = None
    for topic in topic_list:
        print("open")
        file_name = ('_').join(topic.split(' '))
        df = pd.read_csv(f"../Topic_Data/{file_name}.csv")
        #use only topics that have more than 50 talks 
        if df.shape[0] >=50:
            topics_used.append(topic, df.shape[0])
        else:
            continue
        
        df_train = df[df['Train'] == 1]
        df_test = df[df['Train'] == 0]
        if test_data is None:
            test_data = df_test
        else:
            test_data.append(df_test)
        #read in all the talks on topic topic
        df_talks = []
        #go through all the talks in the training set
        for filename in df_train["File"]:
            with open("../" + filename, "r") as f:
                text = f.read()
                processed = simple_preprocess(text)
                if len(text):
                    df_talks.append(processed)
        #concatenate the talks
        df_text = sum(df_talks, start=[])

        #create the dictionary
        dictionary = corpora.Dictionary([df_text])
        dictionaries.append(dictionary)
        print("minimize")
        #minimize the aic to choose the optimal number of components
        components, AIC = hyperparameter_states(df_text, dictionary, np.arange(2, 6), df_talks)
        
        #create the best model
        best_model = hmm.MultinomialHMM(n_components=components, n_iter=100)
        print("fit")
        #train the model
        best_model.fit(prep_text(df_text, dictionary))
        
        #save the model
        with open(f"{topic}bestModel", 'wb') as file:
            pickle.dump(best_model, file)
        #unindent this once if I want to run more than one topic
        return topic_list, test_data, dictionaries

In [51]:
def hyperparameter_states(text, dictionary, list_of_states, talks, n=50):
    def calculate_aic(n, mse, num_params):
        aic = n * mse + 2 * num_params
        return aic
    best_aic = np.inf
    best_state = None
    for num in list_of_states:
        model = hmm.MultinomialHMM(n_components=num, n_iter=100, tol=1e-3)
        model.fit(prep_text(text, dictionary))
        score = model.score(prep_text(talks[-1], dictionary))
        if score < best_aic:
            best_aic = score
            best_state = num
    return best_state, best_aic

In [46]:
data = pd.read_csv("../merged_summary_topics.csv")
total = data.shape[0]
print(total)
topic_list = [column for column in data.columns if column not in 
              ['Year', 'Speaker', 'Title', 'File', 'Month', 'topic_lists', 'Train', 
               'Month_letter', 'Month', 'Kicker', 'Unnamed: 0', 'Unnamed: 0.1']]
topics_used = []
dictionaries = []
test_data = None
for topic in topic_list:
    file_name = ('_').join(topic.split(' '))
    df = pd.read_csv(f"../Topic_Data/{file_name}.csv")
    #use only topics that have more than 50 talks 
    if df.shape[0] >=50:
        topics_used.append((topic, df.shape[0]))
    else:
        continue

    df_train = df[df['Train'] == 1]
    df_test = df[df['Train'] == 0]
    if test_data is None:
        test_data = df_test
    else:
        test_data.append(df_test)
    #read in all the talks on topic topic
    df_talks = []
    #go through all the talks in the training set
    for filename in df_train["File"]:
        with open("../" + filename, "r") as f:
            text = f.read()
            processed = simple_preprocess(text)
            if len(text):
                df_talks.append(processed)
    #concatenate the talks
    df_text = sum(df_talks, start=[])

    #create the dictionary
    dictionary = corpora.Dictionary([df_text])
    dictionaries.append(dictionary)

3465


In [48]:
def load_models(topics_used):
    models = []
    for (topic, num) in topics_used:
        with open(f"{topic}bestModel", 'rb') as file:
            model = pickle.load(file)
            models.append((model, topic, num))
    return models
models = load_models(topics_used)

In [49]:
def score_on_test(models, dictionaries, test_data, total=3465):
    test_text = []
    #collect a list of processed test talks

    for filename, topic_list, year, speaker, title in zip(df_train['File'], df_train['topic_lists'], 
                                                          df_train['Year'], df_train['Speaker'], df_train['Title']):
        with open("../" + filename, "r") as f:
            text = f.read()
            processed = simple_preprocess(text)
            if len(text):
                test_text.append((processed, topic_list, year, speaker, title))
    #for each talk, score each model and then
    best = {}
    for x_test, topic_list, year, speaker, title in test_text:
        max_score = -np.inf
        best_model = None
        the_topic = None
        correct = None
        for (model, topic, num), dictionary in zip(models, dictionaries):
            score = model.score(prep_text(x_test, dictionary)) + np.log((num/total))
            if score > max_score:
                max_score = score
                best_model = model
                the_topic = topic
                if the_topic in topic_list:
                    correct = True
                else:
                    correct = False
        best[('_').join([str(year), speaker, title])] = [the_topic, max_score, best_model, correct, topic_list]
    return best
                

In [50]:
best = score_on_test(models, dictionaries, test_data)

In [51]:
for key, val in zip(best.keys(), best.values()):
    print(key, ": ", val)
    print('\n')

1971_Paul H. Dunn_Young People—Learn Wisdom in Thy Youth :  ['unity', -14310.182667352825, MultinomialHMM(n_components=2, n_iter=100,
               random_state=RandomState(MT19937) at 0x7F2F30531A40), False, "['obedience', 'youth']"]


1971_William H. Bennett_Help Needed in the Shaded Areas :  ['discipleship', -10467.027690875671, MultinomialHMM(n_components=2, n_iter=100,
               random_state=RandomState(MT19937) at 0x7F2F2A448440), False, "['spirituality', 'Holy Ghost', 'youth']"]


1971_Ezra Taft Benson_Satan’s Thrust—Youth :  ['family history', -18758.66862110726, MultinomialHMM(n_components=2, n_iter=100,
               random_state=RandomState(MT19937) at 0x7F2F2A448C40), False, "['music', 'youth', 'Satan']"]


1972_Vaughn J. Featherstone_A Challenge to the Priesthood :  ['prophets', -7113.644156639271, MultinomialHMM(n_components=2, n_iter=100,
               random_state=RandomState(MT19937) at 0x7F2F2FBFF040), True, "['prophets', 'priesthood', 'youth']"]


1972_Gordon

In [52]:
num_correct = sum([val[3] for val in best.values()])

In [53]:
acc = num_correct / len(best)
acc

0.38317757009345793