In [31]:
import numpy as np
import pandas as pd
import ast

from sklearn.decomposition import NMF
from sklearn.metrics import mean_squared_error

#from nmf_recommender import NMFRecommender

# Directory (relative to this notebook) that holds the input CSV files.
# Keep the trailing slash: file paths are built from it by plain string concatenation.
DATA_ROOT = "../data/"

# Demonstration

Currently I build the nonnegative matrix with rows as talks (features) and columns as topics (users/viewers). The entries are either 1 or 0 based on whether or not the talk has the topic. I could try something slightly more advanced and weight the topic by how many times the topic word appears in the talk.

I also built a topics by speakers matrix, where the topics are the features and the columns are the speakers. In this case the topics are summed by how many talks the speaker gave with the topic. The recommender uses NMF to recommend new topics to the speakers.

In [51]:
# Execute nmf_recommender.py inside this notebook's namespace so that
# NMFRecommender is defined (the explicit import in the first cell is commented out).
%run nmf_recommender
recommender = NMFRecommender()
# talk_data is the talk metadata table that later cells index with .loc to display recommendations.
talk_data = recommender.load_data()
# Fit the topics-vs-talks factorization; with verbose=True the target residual and the
# rmse reached at each rank are printed.
# NOTE(review): rtol presumably scales the target residual - confirm in nmf_recommender.py.
recommender.train_topics_vs_talks(rtol=1e-3, verbose=True)

Attempting to converge to residual less than 0.119507
Rank=3 factorization rmse=0.1079761664850063
Found a tolerable NMF factorization with rank=3


True

In [86]:
# Interpret the groups produced by the factorization: each topic is assigned to the
# component (row of H) in which it has its largest weight.
# NOTE(review): assumes H has one column per topic - confirm in nmf_recommender.py.
belong = recommender.talk_nmf.H.argmax(axis=0)

# One list of topic names per component, in component order.
topic_groups = [
    [recommender.topic_lookup[column] for column, owner in enumerate(belong) if owner == component]
    for component in range(recommender.talk_nmf.rank)
]

# Report the groups with 1-based numbering.
for group_number, group in enumerate(topic_groups, start=1):
    print(f"Group {group_number}")
    print(group)
    print()

Group 1
['Restoration', 'Church doctrine', 'Atonement', 'plan of salvation', 'repentance', 'Resurrection', 'forgiveness', 'Jesus Christ', 'sin', 'Creation', 'holy land', 'judging', 'criticism', 'dispensations', 'U.S. Constitution', 'Book of Mormon', 'Christianity', 'languages', 'God the Father', 'literature', 'individual worth', 'sacrament', 'death', 'apostasy', 'consecration', 'Godhead', 'Passover', 'mercy', 'justice', 'foreordination', 'healing', 'discipleship', 'Native Americans', 'New Testament', 'good Samaritan', 'name of Church', 'offense', 'premortal existence', 'Church membership', 'institute', 'seminary', 'grace', 'Howard W. Hunter', 'Gordon B. Hinckley', 'Bible', 'Fall', 'covetousness', 'environment', 'religious freedom', 'addiction', 'Light of Christ', 'baptism', 'meekness', 'Second Coming', 'confidence', 'young single adults', 'Heavenly Father', 'confirmation', 'Easter', 'Church activity', 'promptings', 'holiness', 'modesty', 'Church leadership', 'gathering']

Group 2
['fai

In [78]:
# Every topic name known to the recommender. To try a random topic instead of the
# fixed one below, draw one with np.random.choice(topics, 1).
topics = list(recommender.topic_indices.keys())

# Fixed topic used for this demonstration.
topic = "Church doctrine"
talk_ids = recommender.recommend_talks(topic)

# Only the identifying columns are shown for the recommended talks.
shown_columns = ["Year", "Month", "Speaker", "Title"]

print(f"For the topic '{topic}', we recommend the following talks")
talk_data.loc[talk_ids, shown_columns]

For the topic 'Church doctrine', we recommend the following talks


Unnamed: 0,Year,Month,Speaker,Title
3697,2001,10,James E. Faust,The Atonement: Our Greatest Hope
3342,1997,10,Robert D. Hales,In Remembrance of Jesus
3568,2000,4,Gary J. Coleman,“Are You Still Here?”
3603,2000,10,Robert D. Hales,The Covenant of Baptism: To Be in the Kingdom ...
3604,2000,10,D. Todd Christofferson,The Redemption of the Dead and the Testimony o...
3403,1998,4,Henry B. Eyring,That We May Be One
3357,1997,10,Jeffrey R. Holland,“He Hath Filled the Hungry with Good Things”
3294,1997,4,Joseph B. Wirthlin,“True to the Truth”
3563,2000,4,Dallin H. Oaks,Resurrection
3701,2001,10,"Christoffel Golden, Jr.",Our Father’s Plan


In [79]:
# Recommend talks for a second, narrower topic.
topic = "sealings"
talk_ids = recommender.recommend_talks(topic)

# Only the identifying columns are shown for the recommended talks.
shown_columns = ["Year", "Month", "Speaker", "Title"]

print(f"For the topic '{topic}', we recommend the following talks")
talk_data.loc[talk_ids, shown_columns]

For the topic 'sealings', we recommend the following talks


Unnamed: 0,Year,Month,Speaker,Title
3397,1998,4,Gordon B. Hinckley,Living Worthy of the Girl You Will Someday Marry
3384,1998,4,James E. Faust,"“Search Me, O God, and Know My Heart”"
3570,2000,4,M. Russell Ballard,“How Is It with Us?”
3572,2000,4,Richard G. Scott,The Sanctity of Womanhood
3731,2001,10,Sheri L. Dew,Are We Not All Mothers?
3705,2001,10,Robert D. Hales,Fulfilling Our Duty to God
3460,1998,10,Virginia U. Jensen,Come to Relief Society
3374,1997,10,Thomas S. Monson,The Mighty Strength of the Relief Society
3676,2001,4,Joseph B. Wirthlin,The Law of the Fast
3482,1999,4,Russell M. Nelson,Our Sacred Duty to Honor Women


Interesting results. Currently, the recommender is not allowed to recommend talks from the actual topic, only from similar topics. Maybe that is not the best way to interpret the results.

From this factorization we could give candidates for talks that should have been given other topic labels but weren't for some reason.

We could also look at a talk and see which topic is most likely to be its main topic: "Another way to interpret W H is to look at a feature and determine who is most likely to buy that item." ([NMF lab](http://acme.byu.edu/wp-content/uploads/2020/11/NMF.pdf), page 4). Or we could flip our framing from talks as features and topics as users to talks as users and topics as features, then look at a topic and see which talk is most likely to belong to that topic, or in other words, best represents that topic.

In [53]:
# Fit the speakers-vs-topics factorization. Judging by the verbose log, ranks are tried in
# increasing order until the rmse falls below the printed target residual.
# NOTE(review): rtol presumably scales that target and max_iter presumably caps the
# solver iterations per rank - confirm both in nmf_recommender.py.
recommender.train_speakers_vs_topics(rtol=1e-3, max_iter=2000, verbose=True)

Attempting to converge to residual less than 0.285065
Rank=3 factorization rmse=0.42330226218652445
Rank=4 factorization rmse=0.40218199074151284
Rank=5 factorization rmse=0.38284798101031153
Rank=6 factorization rmse=0.3670122813000323
Rank=7 factorization rmse=0.35556857046074747
Rank=8 factorization rmse=0.3431658432432702
Rank=9 factorization rmse=0.3322378841704212
Rank=10 factorization rmse=0.32231325103603886
Rank=11 factorization rmse=0.31395411748869945
Rank=12 factorization rmse=0.3064281886561987
Rank=13 factorization rmse=0.2987362220855996
Rank=14 factorization rmse=0.2911767530848876
Rank=15 factorization rmse=0.28513999867430784
Rank=16 factorization rmse=0.2783153994810441
Found a tolerable NMF factorization with rank=16


True

In [49]:
# Inspect the speaker-name -> integer-index mapping held by the recommender.
recommender.speaker_indices

{'Gordon B. Hinckley': 0,
 'Thomas S. Monson': 1,
 'James E. Faust': 2,
 'Boyd K. Packer': 3,
 'L. Tom Perry': 4,
 'Henry B. Eyring': 5,
 'M. Russell Ballard': 6,
 'Russell M. Nelson': 7,
 'Dallin H. Oaks': 8,
 'Spencer W. Kimball': 9,
 'Ezra Taft Benson': 10,
 'Marion G. Romney': 11,
 'Dieter F. Uchtdorf': 12,
 'David B. Haight': 13,
 'Richard G. Scott': 14,
 'Robert D. Hales': 15,
 'Neal A. Maxwell': 16,
 'Joseph B. Wirthlin': 17,
 'Jeffrey R. Holland': 18,
 'Howard W. Hunter': 19,
 'N. Eldon Tanner': 20,
 'Marvin J. Ashton': 21,
 'Bruce R. McConkie': 22,
 'David A. Bednar': 23,
 'D. Todd Christofferson': 24,
 'Victor L. Brown': 25,
 'Mark E. Petersen': 26,
 'Quentin L. Cook': 27,
 'Neil L. Andersen': 28,
 'LeGrand Richards': 29,
 'Marion D. Hanks': 30,
 'H. Burke Peterson': 31,
 'Barbara B. Smith': 32,
 'Vaughn J. Featherstone': 33,
 'Loren C. Dunn': 34,
 'Franklin D. Richards': 35,
 'A. Theodore Tuttle': 36,
 'Hartman Rector, Jr.': 37,
 'Harold B. Lee': 38,
 'Elaine S. Dalton': 39,

In [76]:
# For a handful of speakers, list the topics they have already spoken on and the new
# topics the speakers-vs-topics model recommends for them.
truncate = True
# Maximum number of past topics listed per speaker when `truncate` is on.
# 11 reproduces the original cutoff (`j > 10` fired right after the 11th printed topic).
max_topics_shown = 11

# "Kazuhiko  Yamashita" is written with two spaces because that is how the name is
# spelled in the dataset's Speaker column.
for speaker in ["Chieko N. Okazaki", "Kazuhiko  Yamashita", "Gary E. Stevenson", "Russell M. Nelson",
               "Gerrit W. Gong"]:
    sp_topics = recommender.speaker_topics[speaker]
    # Topics the speaker has actually used, in the mapping's own order.
    spoken = [(topic, sp_topics[topic]) for topic in sp_topics if sp_topics[topic] > 0]
    shown = spoken[:max_topics_shown] if truncate else spoken

    print(f"In the past, {speaker} has spoken on the following topics:")
    for topic, count in shown:
        print(f"\t{topic: <24}\t{count} time(s)")
    # Only signal truncation when topics were really left out; previously the marker
    # also appeared when the speaker had exactly as many topics as the cutoff.
    if len(shown) < len(spoken):
        print("\t...")
    print()
    print("Thus we recommend the following new topics based on the choices of other speakers:")
    for topic in recommender.recommend_topics_to_speaker(speaker):
        print(f"\t{topic}")
    print()
    print("----------------------------------------------------------------------------------")
    print()

In the past, Chieko N. Okazaki has spoken on the following topics:
	Atonement               	1 time(s)
	faith                   	3 time(s)
	education               	2 time(s)
	agency                  	1 time(s)
	family                  	1 time(s)
	love                    	3 time(s)
	spirituality            	1 time(s)
	Holy Ghost              	2 time(s)
	charity                 	1 time(s)
	Jesus Christ            	3 time(s)
	service                 	3 time(s)
	...

Thus we recommend the following new topics based on the choices of other speakers:
	adversity
	testimony
	priesthood
	repentance
	discipleship
	prayer
	covenants
	obedience
	home
	children

----------------------------------------------------------------------------------

In the past, Kazuhiko  Yamashita has spoken on the following topics:
	conversion              	1 time(s)
	missionary work         	2 time(s)
	love                    	1 time(s)
	example                 	1 time(s)
	disabilities            	1 time(s)
	service

The first two speakers have given fewer talks over the years, so for them it's interesting to see how the recommended topics relate to the topics they've already spoken on.

It should be noted for Gary E Stevenson, we recommended the new topic "spirituality". In 2019 he gave a talk (not in our dataset) called "Your Priesthood Playbook" with the topics "Aaronic Priesthood", "spirituality", and "discipleship". 

For Gerrit W Gong, the topic "Resurrection" was recommended and in April 2020 (again, not in our dataset) he gave a talk titled "Hosanna and Hallelujah" which had "Resurrection" as a topic.

For the most part the recommendations don't line up. This is probably because speakers are willing to revisit previously spoken on topics, and because they are directed by revelation and the changing needs of the church, not by how other speakers have spoken previously.

# Testing my code

This code is also found in the `nmf_recommender.py` file.

In [5]:
# Load the talk metadata, discard talks that have no topics, and turn each
# stringified topic list into a real Python list (the column dtype stays object).
topics_csv_path = DATA_ROOT + "caleb_merged_topics.csv"
meta_data = (
    pd.read_csv(topics_csv_path, index_col=0)
    .dropna(subset=["Topics"])
    .assign(Topics=lambda frame: frame["Topics"].map(ast.literal_eval))
)
meta_data

Unnamed: 0,Year,Month,Speaker,Title,File,Kicker,Topics
939,1971,4,Joseph Fielding Smith,Out of the Darkness,data/2000.txt,Address delivered at general conference Saturd...,"[Restoration, Church doctrine, Atonement, plan..."
940,1971,4,Harold B. Lee,The Iron Rod,data/2001.txt,Address delivered at general conference Sunday...,"[faith, education, testimony, obedience, conve..."
941,1971,4,N. Eldon Tanner,“Choose You This Day”,data/2002.txt,Address delivered at general conference Sunday...,"[repentance, obedience, plan of salvation, age..."
942,1971,4,Spencer W. Kimball,"Voices of the Past, of the Present, of the Future",data/2003.txt,Address delivered at general conference Saturd...,"[morality, marriage]"
943,1971,4,Marvin J. Ashton,Love of the Right,data/2006.txt,,"[Word of Wisdom, family]"
...,...,...,...,...,...,...,...
5180,2018,10,Matthew L. Carpenter,Wilt Thou Be Made Whole?,data/8357.txt,,"[Jesus Christ, repentance, adversity, healing]"
5181,2018,10,Dale G. Renlund,Choose You This Day,data/8358.txt,,"[Jesus Christ, repentance, plan of salvation, ..."
5182,2018,10,Jack N. Gerard,Now Is the Time,data/8359.txt,,"[perspective, truth, priorities, revelation]"
5184,2018,10,Gary E. Stevenson,Shepherding Souls,data/8360.txt,,"[Jesus Christ, ministering, activation]"


In [6]:
# Confirm the parsed topic lists are stored as generic Python objects.
meta_data.loc[:, "Topics"].dtype

dtype('O')

In [24]:
# Tally how many times each topic appears across all talks. Keys are inserted in
# order of first appearance, which fixes the topic numbering below.
topic_count = {}
for talk_topics in meta_data["Topics"]:
    for name in talk_topics:
        topic_count[name] = topic_count.get(name, 0) + 1

# Two-way mapping between a topic name and its integer position.
topic_lookup = dict(enumerate(topic_count))
topic_indices = {name: position for position, name in topic_lookup.items()}

print(topic_count)

{'Restoration': 127, 'Church doctrine': 9, 'Atonement': 254, 'plan of salvation': 231, 'faith': 474, 'education': 87, 'testimony': 309, 'obedience': 384, 'conversion': 118, 'evil': 32, 'repentance': 251, 'agency': 158, 'morality': 166, 'marriage': 125, 'Word of Wisdom': 58, 'family': 392, 'Resurrection': 82, 'temptation': 71, 'Satan': 69, 'anger': 10, 'forgiveness': 89, 'missionary work': 322, 'Quorums of Seventy': 4, 'success': 16, 'tithing': 53, 'wealth': 5, 'prophecy': 30, 'priesthood': 374, 'preparation': 85, 'teaching': 119, 'character': 30, 'leadership': 70, 'fellowshipping': 49, 'youth': 141, 'brotherhood': 14, 'fasting': 30, 'love': 348, 'Church organization': 45, 'respect': 23, 'home': 117, 'goals': 29, 'spirituality': 266, 'Holy Ghost': 306, 'religion': 7, 'temple work': 79, 'charity': 87, 'Jesus Christ': 783, 'last days': 15, 'example': 160, 'parents': 33, 'disabilities': 18, 'elderly': 13, 'service': 413, 'house of Israel': 13, 'patriarchal blessings': 11, 'honesty': 80, 'g

In [23]:
# Number of talks per speaker.
speakers = meta_data["Speaker"].value_counts()

# Check how many talks Elder Kazuhiko Yamashita has, then report the total speaker count.
yama_mask = speakers.index.str.contains("Yama")
print(speakers[yama_mask])
print(len(speakers))

# The threshold is 2 so that my favorite Japanese General Authority,
# Elder Kazuhiko Yamashita, is included.
min_talks = 2
has_enough_talks = speakers >= min_talks
top_speakers = speakers[has_enough_talks]
print(len(top_speakers))
top_speakers

Kazuhiko  Yamashita    2
Name: Speaker, dtype: int64
458
306


Gordon B. Hinckley    211
Thomas S. Monson      207
James E. Faust        100
Boyd K. Packer         89
L. Tom Perry           87
                     ... 
David R. Stone          2
Dale E. Miller          2
Erich W. Kopischke      2
James B. Martino        2
Kevin W. Pearson        2
Name: Speaker, Length: 306, dtype: int64

In [30]:
# FYI I flipped how I was doing this because I want the topics to be the features and
# the speakers to be the 'users'. This factorization recommends topics for speakers
# to speak on, based on what they have spoken on previously.
#
# V_speakers[i, j] is the number of times speaker i's talks were tagged with topic j.
# Rows follow top_speakers.index; columns follow topic_indices.
nrows = len(top_speakers.index)
ncols = len(topic_count)

V_speakers = np.zeros((nrows, ncols))

# Two-way mapping between a speaker name and its row in V_speakers.
speaker_indices = {speaker: i for i, speaker in enumerate(top_speakers.index)}
speaker_lookup = {i: speaker for i, speaker in enumerate(top_speakers.index)}

# Single pass over the talks instead of re-filtering the whole frame once per speaker;
# the resulting counts are identical.
for speaker, topics in zip(meta_data["Speaker"], meta_data["Topics"]):
    row = speaker_indices.get(speaker)
    if row is None:
        # Speaker is not in top_speakers, so they have no row in the matrix.
        continue
    for topic in topics:
        V_speakers[row, topic_indices[topic]] += 1

print(V_speakers)
# Frobenius norm of the count matrix.
print(np.linalg.norm(V_speakers))

[[14.  1.  9. ...  0.  0.  0.]
 [ 2.  0.  8. ...  0.  0.  0.]
 [ 3.  0.  7. ...  0.  0.  0.]
 ...
 [ 0.  0.  0. ...  0.  0.  0.]
 [ 0.  0.  0. ...  0.  0.  0.]
 [ 0.  0.  0. ...  0.  0.  0.]]
285.0649048900969
