In [1]:
import pandas as pd
import numpy as np
import os,glob
from matplotlib import pyplot as plt

In [5]:
# list the entries of the local ../web/scriptures.byu.edu/ directory
os.listdir('../web/scriptures.byu.edu/')

['content', 'citation_index']

In [227]:
def modify_topics(t):
    '''
    Expand the raw topics table into an analysis-ready frame.

    The first column of ``t`` must hold strings of the form
    ``"<talk title>:YYYY/MM"``; every other column is treated as a topic slot
    (a topic name, or NaN when the slot is unused).

    The returned frame contains the original columns plus:
    - ``topic_lists``: the list of topics for that talk
    - ``Title``, ``Year``, ``Month``: the first column split into its parts
    - one float column per distinct topic (sorted by name) holding how many
      times the topic is listed for the talk, so it becomes easy to search
      for specific topics, or topics for specific speakers

    The ``temples`` indicator column is dropped when present.

    Parameters
    ----------
    t : pandas.DataFrame
        Raw topics table. It is not modified; a new frame is returned.

    Returns
    -------
    pandas.DataFrame
        The expanded frame, carrying the same index as ``t``.

    Raises
    ------
    AssertionError
        If any entry of the first column is not a string with ':' as its
        8th character from the end.
    '''
    # Work on a copy so the caller's frame is left untouched and the calling
    # cell can be re-run on the same raw data.
    t = t.copy()
    key_column = t.columns[0]

    # Per-talk list of topics, skipping the unused (NaN) slots. Every column
    # after the first is a topic slot, however many there are.
    topic_lists = [
        [topic for topic in row if pd.notna(topic)]
        for row in t.iloc[:, 1:].to_numpy(dtype=object)
    ]
    t['topic_lists'] = pd.Series(topic_lists, index=t.index, dtype=object)

    # Split "<title>:YYYY/MM" into its parts. The date suffix has a fixed
    # width, so the colon must sit exactly 8 characters from the end.
    keys = list(t[key_column])
    well_formed = [isinstance(k, str) and len(k) >= 8 and k[-8] == ':' for k in keys]
    assert all(well_formed), 'there are places where the colon isnt the -8th element in string'
    t['Title'] = [k[:-8] for k in keys]
    t['Year'] = [int(k[-7:-3]) for k in keys]
    t['Month'] = [int(k[-2:]) for k in keys]

    # Create the one hot encoding for the topics: one column per distinct
    # topic, filled by walking each talk's topic list once.
    all_topics = sorted({topic for topics in topic_lists for topic in topics})
    column_of = {topic: j for j, topic in enumerate(all_topics)}
    counts = np.zeros((t.shape[0], len(all_topics)))
    for i, topics in enumerate(topic_lists):
        for topic in topics:
            counts[i, column_of[topic]] += 1
    # Reuse t's index so concat pairs rows up even for a non-default index.
    one_hot = pd.DataFrame(counts, index=t.index, columns=all_topics)
    t = pd.concat([t, one_hot], axis=1)

    # Note from the original analysis: 'temples' and 'temple' have the exact
    # same talks associated, so it's useless to keep both. Tolerate frames
    # where the 'temples' topic never occurs.
    t = t.drop(columns='temples', errors='ignore')

    return t
    

In [228]:
def cols(frame=None):
    '''
    Print the column names of a DataFrame as a plain Python list.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame whose column names are printed. When omitted, the
        notebook-level topics frame ``t`` is used, as before.

    Returns
    -------
    None
    '''
    if frame is None:
        # Looked up at call time, so the global ``t`` only has to exist
        # when ``cols()`` is called without an argument.
        frame = t
    print(list(frame.columns))

In [220]:
# t for topics: first the raw table read from topic_data.csv, then the
# expanded frame returned by modify_topics
t = pd.read_csv('../data/topic_data.csv')
print(t.shape)
print(t.columns)
# adds topic_lists, Title/Year/Month and one indicator column per topic
t = modify_topics(t)
print('t shape after one hot encoding topics',t.shape)
# t.tail(10)

(3692, 24)
Index(['Unnamed: 0', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
       '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22'],
      dtype='object')


In [221]:
# t.head()

In [223]:
# Spot-check four topic indicator columns on the first row
t.iloc[0].loc[
    ['Jesus Christ', 'joy', 'service', 'sacrament']
]

Jesus Christ    1.0
joy             1.0
service         1.0
sacrament       1.0
Name: 0, dtype: object

# Note that many rows are sustainings rather than talks, e.g. "Sustaining of General Authorities, Area Seventies, and General Officers"
- `t.iloc[1274]['Unnamed: 0']`

In [95]:
# Count the rows whose Title matches each of the two sustaining titles
print((t['Title'] == 'The Sustaining of Church Officers').sum())
print((t['Title'] == 'Sustaining of General Authorities, Area Seventies, and General Officers').sum())


16
2


In [52]:
# Raw "<title>:YYYY/MM" key of the row at position 1274
t['Unnamed: 0'].iloc[1274]

'Sustaining of General Authorities, Area Seventies, and General Officers:2020/10'

In [37]:
# Last five rows, restricted to the three right-most columns
t.loc[:, t.columns[-3:]].tail()

Unnamed: 0,Title,Year,Month
3687,The Family—A Divine Blessing,1974,4
3688,Prepare the Heart of Your Son,1982,10
3689,Boys Need Men,1974,4
3690,Selflessness: A Pattern for Happiness,1985,4
3691,“A Meaningful Celebration”,1987,10


# LOOK AT SUMMARY.CSV

In [8]:
# list the CSV files available in ../data/
glob.glob('../data/*.csv')
# os.listdir('../data/')

['../data/summary.csv', '../data/topic_data.csv']

In [69]:
# s for summary: one row per entry of summary.csv
s = pd.read_csv('../data/summary.csv', index_col=0)
print(s.shape)
# Keep the original letter code under its own name ...
s = s.rename(columns={'Month': 'Month_letter'})
print(s.columns)
# ... and derive a numeric Month from it ('A' -> 4, 'O' -> 10)
month_number_by_letter = {'A': 4, 'O': 10}
s['Month'] = s['Month_letter'].map(month_number_by_letter)

(4409, 6)
Index(['Year', 'Month_letter', 'Speaker', 'Title', 'File', 'Kicker'], dtype='object')


In [72]:
# last five rows of the summary frame, including the derived Month column
s.tail()

Unnamed: 0,Year,Month_letter,Speaker,Title,File,Kicker,Month
995,1959,A,Stephen L Richards,What It Means To Be a Christian,data/995.txt,,4
996,1959,A,Henry D. Taylor,Gratitude,data/996.txt,,4
997,1959,A,S. Dilworth Young,Heed the Whisperings of the Spirit,data/997.txt,,4
998,1959,A,Sterling W. Sill,Show Us the Father,data/998.txt,,4
999,1959,A,Thorpe B. Isaacson,Responsibility for the Guidance of Youth,data/999.txt,,4


In [73]:
# column dtypes of the summary frame (checks that Month came out numeric)
s.dtypes

Year             int64
Month_letter    object
Speaker         object
Title           object
File            object
Kicker          object
Month            int64
dtype: object

# NOTE THAT THE SUMMARY ONLY GOES UP TO OCTOBER 2018
- `print(s.Year.max())`

In [79]:
# Rows of the topics frame for Year 2018, Month 10
temp = t.loc[(t['Year'] == 2018) & (t['Month'] == 10)]
print(temp.shape)
print('there is more because it includes sustainings')
temp.loc[:, ['Title', 'Month']]

(34, 27)
there is more because it includes sustainings


Unnamed: 0,Title,Month
32,The Correct Name of the Church,10
33,Our Campfire of Faith,10
34,Wounded,10
35,Choose You This Day,10
36,All Must Take upon Them the Name Given of the ...,10
37,Gather Together in One All Things in Christ,10
38,The Vision of the Redemption of the Dead,10
39,Taking upon Ourselves the Name of Jesus Christ,10
40,Firm and Steadfast in the Faith of Christ,10
41,Becoming a Shepherd,10


In [89]:
# s.loc[(s.Year == 2018) & (s.Month_letter == 'O') & (s.Speaker == 'Henry B. Eyring')]

# thus 7% of the talks in summary.csv have an incorrect title

In [97]:
# Number of titles examined by the comma check. Counted from t directly so
# this cell runs on a fresh kernel: `comma` is only defined in a later cell.
len(t['Title'])

3692

In [96]:
# Flag every title that contains a comma
comma = np.array([',' in talk_title for talk_title in t['Title']])
# fraction of flagged titles: only 0.07
comma.mean()

0.07313109425785481

In [77]:
# Rows of the summary frame for Year 2018 with month letter 'O'
temp2 = s.loc[(s['Year'] == 2018) & (s['Month_letter'] == 'O')]
print(temp2.shape)
temp2.loc[:, ['Speaker', 'Title']]

(33, 7)


Unnamed: 0,Speaker,Title
8329,Russell M. Nelson,Opening Remarks
8330,Quentin L. Cook,Deep and Lasting Conversion to Heavenly Father...
8331,M. Joseph Brough,Lift Up Your Head and Rejoice
8332,Steven R. Bangerter,Laying the Foundation of a Great Work
8333,Ronald A. Rasband,Be Not Troubled
8334,David A. Bednar,Gather Together in One All Things in Christ
8335,Dallin H. Oaks,Truth and the Plan
8336,D. Todd Christofferson,Firm and Steadfast in the Faith of Christ
8337,Dean M. Davies,Come
8338,Ulisses Soares,One in Christ


# one hot encode the topics

In [98]:
# first five rows of the topics frame
t.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,16,17,18,19,20,21,22,Title,Year,Month
0,Finding Joy in Christ:2020/10,Jesus Christ,joy,service,sacrament,,,,,,...,,,,,,,,Finding Joy in Christ,2020,10
1,The Culture of Christ:2020/10,Jesus Christ,conversion,Church membership,,,,,,,...,,,,,,,,The Culture of Christ,2020,10
2,We Talk of Christ:2020/10,Jesus Christ,missionary work,Second Coming,,,,,,,...,,,,,,,,We Talk of Christ,2020,10
3,The Exquisite Gift of the Son:2020/10,Jesus Christ,repentance,adversity,Atonement,,,,,,...,,,,,,,,The Exquisite Gift of the Son,2020,10
4,"Tested, Proved, and Polished:2020/10",Jesus Christ,faith,adversity,Atonement,plan of salvation,,,,,...,,,,,,,,"Tested, Proved, and Polished",2020,10


In [109]:
# just to verify I'm selecting the appropriate topic column names
topic_column_names = t.iloc[:,range(1,24)].columns
topic_column_names = [int(x) for x in topic_column_names]
topic_column_names == [x for x in range(0,23)]

True

In [134]:
# Build the per-talk topic lists by dropping the NaN (unused) slots.
# Only the 23 topic-slot columns (positions 1..23, named '0'..'22') are read:
# taking every column after the first would also sweep up Title/Year/Month
# and any other derived columns once t has been extended.
topics_columns = t.iloc[:, 1:24].to_numpy(dtype=object)
topics_columns = [[x for x in row if pd.notna(x)] for row in topics_columns]
t['topic_lists'] = pd.Series(topics_columns, index=t.index, dtype=object)

In [137]:
# Strip any remaining 0 placeholders from each talk's topic list
topics_columns = [
    [topic for topic in talk_topics if topic != 0]
    for talk_topics in topics_columns
]

In [1]:
# topics_columns