In [1]:
import pandas as pd
from math import nan
import re

In [2]:
summary = pd.read_csv("../data/summary.csv", index_col=0)
topic_data = pd.read_csv("../data/topic_data.csv")

In [3]:
# Recode the month letters to numbers: 'A' -> 4 and 'O' -> 10 (presumably the
# April / October conferences). Any other value becomes NaN, as before.
# The identity entries for 4 and 10 make this cell safe to re-run: without
# them a second execution would map every already-converted month to NaN.
MONTH_CODES = {"A": 4, "O": 10, 4: 4, 10: 10}

# Assign the column directly rather than through `.loc[:, "Month"]`: since
# pandas 2.0 the `.loc[:, col] = ...` form tries to write into the existing
# column in place, which can keep the original object dtype instead of
# producing the integer column the later merge on "Month" relies on.
summary["Month"] = summary["Month"].map(MONTH_CODES)
summary

Unnamed: 0,Year,Month,Speaker,Title,File,Kicker
1,1942,4,Heber J. Grant,Personal Testimony of the Lord’s Providence,data/1.txt,
10,1942,4,LeGrand Richards,Leadership in the Aaronic Priesthood,data/10.txt,
100,1943,10,"J. Reuben Clark, Jr.",Untitled,data/100.txt,
1000,1959,4,Harold B. Lee,"The Gospel, a Solid Wall of Truth",data/1000.txt,
1001,1959,4,David O. McKay,Training of Youth,data/1001.txt,
...,...,...,...,...,...,...
995,1959,4,Stephen L Richards,What It Means To Be a Christian,data/995.txt,
996,1959,4,Henry D. Taylor,Gratitude,data/996.txt,
997,1959,4,S. Dilworth Young,Heed the Whisperings of the Spirit,data/997.txt,
998,1959,4,Sterling W. Sill,Show Us the Father,data/998.txt,


In [4]:
topic_data

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,13,14,15,16,17,18,19,20,21,22
0,Finding Joy in Christ:2020/10,Jesus Christ,joy,service,sacrament,,,,,,...,,,,,,,,,,
1,The Culture of Christ:2020/10,Jesus Christ,conversion,Church membership,,,,,,,...,,,,,,,,,,
2,We Talk of Christ:2020/10,Jesus Christ,missionary work,Second Coming,,,,,,,...,,,,,,,,,,
3,The Exquisite Gift of the Son:2020/10,Jesus Christ,repentance,adversity,Atonement,,,,,,...,,,,,,,,,,
4,"Tested, Proved, and Polished:2020/10",Jesus Christ,faith,adversity,Atonement,plan of salvation,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3687,The Family—A Divine Blessing:1974/04,family,,,,,,,,,...,,,,,,,,,,
3688,Prepare the Heart of Your Son:1982/10,fatherhood,,,,,,,,,...,,,,,,,,,,
3689,Boys Need Men:1974/04,fatherhood,,,,,,,,,...,,,,,,,,,,
3690,Selflessness: A Pattern for Happiness:1985/04,generosity,,,,,,,,,...,,,,,,,,,,


In [5]:
def parse_topic_data_row(row):
    """Split one row of the topic table into title, date and topic list.

    Parameters
    ----------
    row : pandas.Series or sequence
        The first element is a string of the form ``"<title>:<year>/<month>"``.
        All remaining elements are topic names; missing values (NaN / None)
        are treated as padding and dropped.

    Returns
    -------
    tuple
        ``(title, year, month, topic_list)`` where ``year`` and ``month`` are
        ``int`` and ``topic_list`` is a list of the non-missing topics in
        their original order.
    """
    # Work on a plain list so every access below is strictly positional.
    # Indexing a string-labelled Series with `row[0]` only works through
    # pandas' deprecated "integer key falls back to position" behaviour.
    cells = list(row)

    # Titles may themselves contain ':', so split on the last one only.
    pieces = cells[0].rsplit(":", 1)
    title = pieces[0].replace("&amp;", "&")

    year_text, month_text = pieces[1].split("/")
    topic_list = [topic for topic in cells[1:] if pd.notna(topic)]
    return title, int(year_text), int(month_text), topic_list
    
topics = {}   # topic name -> number of times it occurs across all rows
entries = []  # one (title, year, month, topic_list) tuple per row of topic_data

# Iterate over the rows themselves. The earlier form took labels from
# `topic_data.index` and passed them to `.iloc`, which is only correct while
# the frame has a default 0..n-1 index.
for _, topic_row in topic_data.iterrows():
    title, year, month, topic_list = parse_topic_data_row(topic_row)
    for topic in topic_list:
        topics[topic] = topics.get(topic, 0) + 1
    entries.append((title, year, month, topic_list))
    
#summary.loc["Title"] = summary["Title"].str.replace('“', '"')
#summary.loc["Title"] = summary["Title"].str.replace('”', '"')

In [6]:
topics

{'Jesus Christ': 817,
 'joy': 148,
 'service': 423,
 'sacrament': 71,
 'conversion': 124,
 'Church membership': 12,
 'missionary work': 331,
 'Second Coming': 5,
 'repentance': 260,
 'adversity': 282,
 'Atonement': 265,
 'faith': 500,
 'plan of salvation': 239,
 'spirituality': 285,
 'Holy Ghost': 312,
 'scripture study': 117,
 'revelation': 24,
 'temple work': 85,
 'Church organization': 51,
 'Restoration': 141,
 'belonging': 2,
 'discipleship': 80,
 'name of Church': 6,
 'fasting': 31,
 'example': 163,
 'Resurrection': 85,
 'temple': 173,
 'temples': 173,
 'family': 403,
 'peace': 115,
 'covenants': 180,
 'Book of Mormon': 131,
 'commandments': 117,
 'commitment': 65,
 'humility': 86,
 'love': 357,
 'activation': 84,
 'opposition': 20,
 'holiness': 2,
 'First Vision': 3,
 'general conference': 63,
 'gathering': 4,
 'Zion': 12,
 'God the Father': 90,
 'priesthood': 381,
 'ministering': 22,
 'scriptures': 108,
 'sacrifice': 88,
 'Church meetings': 15,
 'reverence': 21,
 'hope': 62,
 'g

In [7]:
merge_data = pd.DataFrame.from_records(entries, columns =['Title', 'Year', 'Month', 'Topics']) 

In [8]:
merge_data

Unnamed: 0,Title,Year,Month,Topics
0,Finding Joy in Christ,2020,10,"[Jesus Christ, joy, service, sacrament]"
1,The Culture of Christ,2020,10,"[Jesus Christ, conversion, Church membership]"
2,We Talk of Christ,2020,10,"[Jesus Christ, missionary work, Second Coming]"
3,The Exquisite Gift of the Son,2020,10,"[Jesus Christ, repentance, adversity, Atonement]"
4,"Tested, Proved, and Polished",2020,10,"[Jesus Christ, faith, adversity, Atonement, pl..."
...,...,...,...,...
3687,The Family—A Divine Blessing,1974,4,[family]
3688,Prepare the Heart of Your Son,1982,10,[fatherhood]
3689,Boys Need Men,1974,4,[fatherhood]
3690,Selflessness: A Pattern for Happiness,1985,4,[generosity]


In [9]:
def find_mismatched_titles(merge_data, summary, last_crawled_year=2018):
    """Return the talks in ``merge_data`` whose title is absent from ``summary``.

    Parameters
    ----------
    merge_data : pandas.DataFrame
        Must provide the columns ``"Title"``, ``"Year"`` and ``"Month"``.
    summary : pandas.DataFrame
        Must provide the column ``"Title"``.
    last_crawled_year : int, optional
        Talks after this year are never reported. Defaults to 2018 because
        the webcrawl from scriptures.byu.edu doesn't have the latest talks.

    Returns
    -------
    list of tuple
        ``(title, year, month)`` for every unmatched talk, in the row order
        of ``merge_data``. Titles are compared by exact equality only.
    """
    # One set lookup per talk instead of rescanning every summary title.
    # Missing titles are dropped so that a NaN can never count as a match.
    known_titles = set(summary["Title"].dropna())

    mismatched = []
    # Walk the columns directly so the result does not depend on what kind
    # of index merge_data carries.
    for title, year, month in zip(merge_data["Title"], merge_data["Year"], merge_data["Month"]):
        year, month = int(year), int(month)
        if title not in known_titles and year <= last_crawled_year:
            mismatched.append((title, year, month))
    return mismatched

def report_mismatched(merge_data, summary):
    """Print the title mismatches between the two frames and return them.

    The header line reports the total number of mismatches against the
    number of rows in ``merge_data``. The individual listing leaves out any
    title containing one of the markers below, but the returned list still
    holds every mismatch.
    """
    skipped_markers = (
        "Statistical Report",
        "Church Auditing",
        "The Sustaining of Church Officers",
    )
    all_mismatches = find_mismatched_titles(merge_data, summary)
    total_rows = len(merge_data.index)

    print(f"Mismatched titles:{len(all_mismatches)}/{total_rows}")
    print()
    for record in all_mismatches:
        # record[0] is the title; skip it when any marker occurs in it.
        if any(marker in record[0] for marker in skipped_markers):
            continue
        print(record)
    return all_mismatches
        
mismatched = report_mismatched(merge_data, summary)

Mismatched titles:81/3692

('Faith—the Choice is Yours', 2010, 10)
('“The Great and Wonderful Love”', 2006, 10)
('Be Prepared … Be Ye Strong from Henceforth', 2005, 10)
('On Zion’s Hill', 2005, 10)
('“By What Power … Have Ye Done This?”', 1998, 10)
('Pioneers of the Future: “Be Not Afraid, Only Believe”', 1997, 10)
('“Remember … Thy Church, O Lord”', 1996, 4)
('“Come unto Christ, and Be Perfected in Him”', 1988, 4)
('The Refiner’s Fire', 1979, 4)
('“If Christ Had My Opportunities …”', 2005, 10)
('“These … Were Our Examples”', 1991, 10)
('“She Is Not Afraid of the Snow for Her Household …”', 1976, 10)
('By Divine Design', 2017, 10)
('Do Unto Others …', 1977, 4)
('“In … Counsellors There Is Safety”', 1990, 10)
('“Crickets” Can Be Destroyed through Spirituality', 1990, 10)
('But If Not …', 2004, 4)
('“Behold, the Enemy Is Combined” (D&C 38:12)', 1993, 4)
('But Be Ye Doers of the Word', 1977, 4)
('“There Is the Light”', 1976, 10)
('God Moves in a Mysterious Way His Wonders to Perform', 197

In [10]:
#export the mismatched titles so I can find the corresponding titles and add them to the file
"""
if not os.path.exists("data/title_renames.tsv"):
    with open("data/title_renames.tsv", 'w') as f:
        for mismatch in mismatched:
            title = mismatch[0]
            if "Statistical Report" not in title and "Church Auditing" not in title \
            and "The Sustaining of Church Officers" not in title:
                f.write(title+'\n')
"""
#add talk years

#read in lines
with open("data/title_renames.tsv", 'r') as f:
    rows = []
    for i, line in enumerate(f):
        # Named `fields`, not `entries`: `entries` is the list of talk tuples
        # that merge_data is built from, and overwriting it here would break
        # that cell if it were run again.
        fields = line.strip().split('\t')
        # Only mismatches from position i onward are compared with line i.
        # NOTE(review): this presumably relies on the file keeping the same
        # relative order as `mismatched` -- confirm before reordering the file.
        for mismatch in mismatched[i:]:
            if mismatch[0] == fields[0]:
                # Prefix the line with the talk's year (mismatch[1]).
                rows.append([str(mismatch[1])] + fields)
#write out lines
with open("data/title_renames2.tsv", 'w') as f:
    for row in rows:
        f.write("\t".join(row) +'\n')

In [11]:
# Each line of the rename file holds four tab-separated fields:
#   year, title as in the topic data, title as in summary, preferred index
# where the preferred index is 0 (keep the topic-data title) or 1 (keep the
# summary title). Any other index leaves both frames unchanged.
with open("data/title_renames2.tsv") as f:
    for line_number, line in enumerate(f, start=1):
        # Named `fields`, not `entries`: `entries` is the list of talk tuples
        # that merge_data is built from and must not be overwritten here.
        fields = line.strip().split('\t')
        if len(fields) < 4:
            # Fail with the offending line instead of a bare IndexError.
            raise ValueError(
                f"data/title_renames2.tsv line {line_number}: expected 4 "
                f"tab-separated fields, got {len(fields)}"
            )
        year = int(fields[0])
        topic_title, summary_title = fields[1:3]
        #Which title is correct
        pref_index = int(fields[3])

        if pref_index == 0: #topic data title is correct
            summary.loc[(summary["Title"] == summary_title) & (summary["Year"] == year), "Title"] = topic_title
        elif pref_index == 1: #summary data title is correct
            merge_data.loc[(merge_data["Title"] == topic_title) & (merge_data["Year"] == year), "Title"] = summary_title

#Check to see if mismatch was fixed
mismatched = report_mismatched(merge_data, summary)

Mismatched titles:32/3692

('“She Is Not Afraid of the Snow for Her Household …”', 1976, 10)
('The Family: A Proclamation to the World', 1995, 10)
('The Reconstitution of the First Quorum of the Seventy', 1976, 10)


In [12]:
# Unconditional stop: this cell always raises, so a "Run All" halts here and
# the cells below only execute when they are run by hand.
# NOTE(review): presumably a guard to keep the merge/export from running
# before the title renames have been checked -- confirm, and remove this cell
# once the notebook is meant to run end to end.
raise ValueError("break here")

ValueError: break here

In [13]:
for colname in ["Year", "Month", "Title"]:
    print(colname, summary[colname].dtype, merge_data[colname].dtype)

Year int64 int64
Month int64 int64
Title object object


In [17]:
merged = summary.merge(merge_data, on=["Year", "Month", "Title"], how='left')
merged

Unnamed: 0,Year,Month,Speaker,Title,File,Kicker,Topics
0,1942,4,Heber J. Grant,Personal Testimony of the Lord’s Providence,data/1.txt,,
1,1942,4,LeGrand Richards,Leadership in the Aaronic Priesthood,data/10.txt,,
2,1943,10,"J. Reuben Clark, Jr.",Untitled,data/100.txt,,
3,1959,4,Harold B. Lee,"The Gospel, a Solid Wall of Truth",data/1000.txt,,
4,1959,4,David O. McKay,Training of Youth,data/1001.txt,,
...,...,...,...,...,...,...,...
5360,1959,4,Stephen L Richards,What It Means To Be a Christian,data/995.txt,,
5361,1959,4,Henry D. Taylor,Gratitude,data/996.txt,,
5362,1959,4,S. Dilworth Young,Heed the Whisperings of the Spirit,data/997.txt,,
5363,1959,4,Sterling W. Sill,Show Us the Father,data/998.txt,,


In [18]:
merged[merged["Year"] == 2018]

Unnamed: 0,Year,Month,Speaker,Title,File,Kicker,Topics
5109,2018,4,Henry B. Eyring,Solemn Assembly,data/8293.txt,,"[Church organization, prophets]"
5110,2018,4,M. Russell Ballard,Precious Gifts from God,data/8294.txt,,"[Jesus Christ, faith, prophets, sabbath, servi..."
5111,2018,4,Brian K. Taylor,Am I a Child of God?,data/8295.txt,,"[Jesus Christ, divine nature, God the Father, ..."
5112,2018,4,Larry J. Echo Hawk,"Even as Christ Forgives You, So Also Do Ye",data/8296.txt,,"[Jesus Christ, forgiveness]"
5113,2018,4,Gary E. Stevenson,The Heart of a Prophet,data/8297.txt,,"[Church organization, prophets, teaching, Firs..."
...,...,...,...,...,...,...,...
5180,2018,10,Matthew L. Carpenter,Wilt Thou Be Made Whole?,data/8357.txt,,"[Jesus Christ, repentance, adversity, healing]"
5181,2018,10,Dale G. Renlund,Choose You This Day,data/8358.txt,,"[Jesus Christ, repentance, plan of salvation, ..."
5182,2018,10,Jack N. Gerard,Now Is the Time,data/8359.txt,,"[perspective, truth, priorities, revelation]"
5184,2018,10,Gary E. Stevenson,Shepherding Souls,data/8360.txt,,"[Jesus Christ, ministering, activation]"


In [19]:
merged.to_csv("data/caleb_merged_topics.csv")

Code for finding the mismatched titles

In [None]:
def search_title(df, title, regex=False):
    """Return the rows of ``df`` whose "Title" contains ``title``, ignoring case.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``"Title"``.
    title : str
        Text to look for. Treated as a literal substring unless ``regex`` is
        true, in which case it is a regular expression.
    regex : bool, optional
        Whether ``title`` is a regular expression. Defaults to ``False``.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df``. Rows with a missing title never match.
    """
    # na=False: a missing title would otherwise put NaN into the mask, and
    # boolean indexing with such a mask raises. case=False already covers
    # case-insensitivity for both literal and regex searches, so no extra
    # regex flag is needed.
    matches = df["Title"].str.contains(title, regex=regex, case=False, na=False)
    return df[matches]

In [None]:
#Not surprisingly, the statistical reports are not included
search_title(summary, "Statistical Report")

In [None]:
search_title(summary, "the Choice is Yours")

In [None]:
search_title(summary, "The Great and Wonderful Love")

In [None]:
search_title(summary, "Be Ye Strong from Henceforth")

In [None]:
search_title(summary, "On Zion")

In [None]:
search_title(summary, "Have Ye Done This?")

In [None]:
search_title(summary, "Pioneers of the Future:")["Title"].item()

In [None]:
search_title(summary, "Thy Church, O Lord")

In [None]:
search_title(summary, "Come unto Christ, and be perfected in Him")

In [None]:
search_title(summary, "The Refiner")

In [None]:
search_title(summary, "If Christ Had My Opportunities")

In [None]:
search_title(summary, "Were Our Examples")

In [None]:
search_title(summary, "She Is Not Afraid of the Snow for Her Household")

In [None]:
search_title(summary, "Divine Design")

In [None]:
search_title(summary, "Do Unto Others")

In [None]:
search_title(summary, "Counsellors There Is Safety")

In [None]:
search_title(summary, "Can Be Destroyed through Spirituality")

In [None]:
search_title(summary, "But If Not")

In [None]:
search_title(summary, "Behold, the Enemy Is Combined")

In [None]:
search_title(summary, "But Be Ye Doers of the Word")

In [None]:
search_title(summary, "There Is the Light")

In [None]:
search_title(summary, "God Moves in a Mysterious Way His Wonders to Perform")["Title"].item()

In [None]:
search_title(summary, "Salvation for the Dead")

In [None]:
search_title(summary, "To Be Learned Is Good If")

In [None]:
search_title(summary, "Judge Not, That Ye Be Not Judged")

In [None]:
search_title(summary, "The Standard of Truth Has Been Erected")

In [None]:
search_title(summary, "Ponder the Path of Thy Feet")

In [None]:
search_title(summary, "Draw Near unto Me")

In [None]:
search_title(summary, "Pour You Out a Blessing")

In [None]:
search_title(summary, "Mother Heart")

In [None]:
search_title(summary, "Church Welfare Services Basic Principles")

In [None]:
search_title(summary, "Relief Societys Role in Welfare Services")

In [None]:
search_title(summary, "A Proclamation to the World")

In [None]:
search_title(summary, "Be Ye Clean that Bear the Vessels of the Lord")

In [None]:
search_title(summary, "Thou Mayest Choose for Thyself")

In [None]:
search_title(summary, "Give Heed unto the Prophet")

In [None]:
search_title(summary, "My Words")

In [None]:
search_title(summary, "Seeing the Five A")

In [None]:
search_title(summary, "By Way of Invitation")

In [None]:
search_title(summary, "Then Teach")

In [None]:
search_title(summary, "Did I tell You")

In [None]:
search_title(summary, "That I May Heal You")

In [None]:
search_title(summary, "Solutions from the Scriptures")

In [None]:
search_title(summary, "Repent of")

In [None]:
search_title(summary, "I Will Go and Do…")

In [None]:
search_title(summary, "Even As I Also Overcame")

In [None]:
search_title(summary, "What Went Ye Out… to See?")

In [None]:
search_title(summary, "Except the Lord Build the House…")

In [None]:
search_title(summary, "Report")
summary.loc[2838, "Title"]

In [None]:
search_title(summary, "The Reconstitution")

In [None]:
search_title(summary, "for Boys")

In [None]:
search_title(summary, "Anxiously Engaged")

In [None]:
search_title(summary, "Her Children Arise Up")

In [None]:
#%run preprocess.py

In [None]:
#create_train_test_split()

In [None]:
testdf = pd.DataFrame.from_dict({"a":["hello", "world"], "b":["engaged", "redeemer"]})
testdf

In [None]:
testdf.loc[testdf["a"] == "hello", "a"] = "hi"

In [None]:
testdf

In [None]:
(testdf["a"]  == "world") & (testdf["b"]  == "redeemer")

In [None]:
testdf.loc[(testdf["a"]  == "world") & (testdf["b"]  == "redeemer")]