NB: This notebook can only be run if you have the required dataset, which is not public.
For that reason, the original dataset is not included.

In [1]:
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

# NB: reading the .xls dataset with pandas requires the xlrd package, so install it first

In [2]:
# Load the "etcbc/bhsa" corpus through the Text-Fabric app interface.
from tf.app import use
# hoist=globals() is passed so that Text-Fabric can place its API objects in this
# notebook's global namespace; later cells use T, F and L without defining them
# (presumably they are hoisted here — confirm against the Text-Fabric docs).
A = use("etcbc/bhsa", hoist=globals())

**Locating corpus resources ...**

Name,# of nodes,# slots/node,% coverage
book,39,10938.21,100
chapter,929,459.19,100
lex,9230,46.22,100
verse,23213,18.38,100
half_verse,45179,9.44,100
sentence,63717,6.7,100
sentence_atom,64514,6.61,100
clause,88131,4.84,100
clause_atom,90704,4.7,100
phrase,253203,1.68,100


In [3]:
# Load the verbs dataset from the local Excel file. The file is not public, so
# this cell and every cell that uses verbs_dataset only run if you have your own copy.
verbs_dataset = pd.read_excel("verbs_dataset.xls")

In [4]:
# Preview the first five rows (five is the default for head()).
verbs_dataset.head()

Unnamed: 0,id,book,info_refer,info_reads,info_means,texttype,syn_vbinit,syn_gcb4vb,syn_realis,syn_affneg,...,soc_north,gc2,book2,vb_aspect,obj_def2,gc_samesame2,gc_parsame2,verse,gc3binary,unique
0,1.0,genesis,Gen 01:09,?el ma:qom exad,to one place,narrative,verb first,GC not before vb,irrealis,affirm,...,not yet,prep or prep plus prep,genesis,imperfective,,,,prose,1,unique or representative
1,2.0,genesis,Gen 02:19,?el ha?adam,to the man,narrative,verb first,GC not before vb,realis,affirm,...,not yet,prep or prep plus prep,genesis,perfective,not explicit,,,prose,1,unique or representative
2,3.0,genesis,Gen 02:22,?el ha?adam,to the man,narrative,verb first,GC not before vb,realis,affirm,...,not yet,prep or prep plus prep,genesis,perfective,pronoun,,,prose,1,unique or representative
3,4.0,genesis,Gen 04:03,le-YHWH,to YHWH,narrative,verb first,GC not before vb,realis,affirm,...,not yet,prep or prep plus prep,genesis,perfective,indef NP,,,prose,1,unique or representative
4,5.0,genesis,Gen 06:18,?el hate:bah,to the ark,n/sp,verb first,GC not before vb,irrealis,affirm,...,not yet,prep or prep plus prep,genesis,imperfective,,,,prose,1,unique or representative


In [7]:
# Display the verb-root column of the dataset.
verbs_dataset["info_verbroot"]

# To list the distinct verb roots instead:
# verbs_dataset["info_verbroot"].unique()

0               q.w.h "gather"
1                 b.w.? "come"
2                 b.w.? "come"
3                 b.w.? "come"
4                 b.w.? "come"
                 ...          
3120              b.w.? "come"
3121              b.w.? "come"
3122              .l.h "go up"
3123              b.w.? "come"
3124    g.l.h "carry to exile"
Name: info_verbroot, Length: 3125, dtype: object

In [16]:
# Count how often each verb root occurs and write the counts to a
# semicolon-separated CSV file.
verbroot_counts = verbs_dataset["info_verbroot"].value_counts()
verbroot_counts.to_csv("verbs_counts.csv", sep=";")

# Read the file back so the notebook displays what was written.
pd.read_csv("verbs_counts.csv", sep=";")

Unnamed: 0.1,Unnamed: 0,info_verbroot
0,"b.w.? ""come""",1177
1,"h.l.k ""go""",290
2,"sh.w.b ""return""",248
3,".l.h ""go up""",242
4,"sh.l.x ""send""",193
...,...,...
75,"sh.x.t ""press""",1
76,"l.q.t ""gather""",1
77,"d.r.k ""tread""",1
78,"p.sh.t ""raid""",1


In [17]:
# bcv stands for book, chapter, verse

# One row per verb root, holding the first non-null value of every column for
# that root. The grouping is computed once and reused, so the references and the
# roots are guaranteed to come from the same result and stay aligned.
first_per_verbroot = verbs_dataset.groupby("info_verbroot").first()

# sample reference (e.g. "Gen 01:09") for every verb root
bcv_verbs = first_per_verbroot.info_refer.values
# the verb roots themselves, in the same order as bcv_verbs
info_verbroot = first_per_verbroot.index

In [18]:
# How to retrieve book, chapter and verse for each verb

# Map the book abbreviations used in the dataset's info_refer column
# (e.g. "Gen" in "Gen 01:09") to the ETCBC book names used to build sections.
# Entries marked "absent from the dataset" are listed although no reference
# in the dataset uses them.
book_names = {
    '1Ch':'1_Chronicles',
    '1Kgs':'1_Kings',
    '1Sam':'1_Samuel',
    '2Ch':'2_Chronicles',
    '2Kgs':'2_Kings',
    '2Sam':'2_Samuel',
    'Amos':'Amos',
    'Dan':'Daniel',
    'Deut':'Deuteronomy',
    'Qoh':'Ecclesiastes', 
    'Est':'Esther',
    'Ex':'Exodus',
    'Ezek':'Ezekiel',
    'Ezra':'Ezra',
    'Gen':'Genesis',
    'Hab':'Habakkuk', # absent from the dataset
    'Hag':'Haggai',
    'Hosea':'Hosea', # absent from the dataset
    'Isa':'Isaiah',
    'Jer':'Jeremiah',
    'Job':'Job',
    'Joel':'Joel',
    'Jonah':'Jonah',
    'Josh':'Joshua',
    'Jud':'Judges',
    'Lam':'Lamentations', # absent from the dataset
    'Lev':'Leviticus',
    'Mal':'Malachi',
    'Micah':'Micah', # absent from the dataset
    'Nahum':'Nahum', # absent from the dataset
    'Neh':'Nehemiah',
    'Num':'Numbers',
    'Obad':'Obadiah', # absent from the dataset
    'Prov':'Proverbs', # absent from the dataset
    'Ps':'Psalms',
    'Rut':'Ruth',
    'Song_of_songs':'Song_of_songs', # absent from the dataset
    'Zech':'Zechariah',
    'Zephaniah':'Zephaniah', # absent from the dataset
}
# sections holds one [book, chapter, verse] list per reference in bcv_verbs,
# e.g. ['Genesis', 12, 6]

def parse_reference(reference, names):
    """Convert a reference such as "Gen 01:09" into ['Genesis', 1, 9].

    Parameters
    ----------
    reference : str
        Book abbreviation and "chapter:verse", separated by whitespace.
    names : dict
        Maps book abbreviations to the full book names to use in the result.

    Returns
    -------
    list
        [book name, chapter as int, verse as int]

    Raises
    ------
    ValueError
        If the reference is not of the form "<book> <chapter>:<verse>".
    KeyError
        If the book abbreviation is not a key of `names`.
    """
    try:
        # split() without arguments tolerates repeated or surrounding whitespace
        abbreviation, chapter_verse = reference.split()
        chapter, verse = chapter_verse.split(":")
        chapter, verse = int(chapter), int(verse)
    except ValueError as err:
        raise ValueError(
            f"cannot parse reference {reference!r}; expected '<book> <chapter>:<verse>'"
        ) from err

    if abbreviation not in names:
        raise KeyError(
            f"unknown book abbreviation {abbreviation!r} in reference {reference!r}"
        )

    # kept as a list (not a tuple) so the printed and exported form is unchanged
    return [names[abbreviation], chapter, verse]


# replace the abbreviation by the ETCBC book name and turn chapter/verse into integers
sections = [parse_reference(reference, book_names) for reference in bcv_verbs]

print(sections)

# define a function to retrieve the lexemes from a section (book, chapter, verse)
def verse_lex(section):
    """Return the lexemes of all words of one verse as a space-separated string.

    Relies on the Text-Fabric API objects T, F and L being available as globals
    (presumably hoisted by `use(..., hoist=globals())` — confirm).

    Parameters
    ----------
    section : sequence
        Book name, chapter number and verse number, e.g. ['Isaiah', 6, 6].

    Returns
    -------
    str
        The value of the `lex` feature of every word in the verse, joined by
        single spaces.

    Raises
    ------
    ValueError
        If the section does not resolve to a node in the corpus.
    """
    verse = T.nodeFromSection(section)
    if verse is None:
        # Fail with the offending section instead of passing None on to L.d().
        raise ValueError(f"no verse found in the corpus for section {section!r}")
    return " ".join(F.lex.v(w) for w in L.d(verse, "word"))

# lexemes of the verse behind every entry of sections, in the same order
verses_lexemes = [verse_lex(verse_section) for verse_section in sections]

[['Genesis', 12, 6], ['Genesis', 13, 1], ['Genesis', 24, 20], ['Genesis', 12, 8], ['Isaiah', 6, 6], ['Genesis', 6, 21], ['1_Chronicles', 12, 9], ['2_Kings', 3, 26], ['Genesis', 27, 43], ['Genesis', 2, 19], ['Judges', 20, 45], ['Judges', 20, 43], ['Esther', 6, 12], ['1_Kings', 18, 42], ['2_Kings', 15, 29], ['Joshua', 10, 18], ['Genesis', 11, 31], ['Judges', 7, 13], ['Ezekiel', 39, 28], ['Judges', 11, 3], ['Genesis', 12, 15], ['Judges', 1, 34], ['Genesis', 18, 6], ['Genesis', 19, 17], ['Judges', 4, 6], ['Leviticus', 5, 9], ['Zechariah', 14, 4], ['Numbers', 5, 23], ['Deuteronomy', 30, 1], ['Daniel', 9, 21], ['2_Samuel', 14, 14], ['Genesis', 27, 22], ['Daniel', 8, 4], ['Deuteronomy', 4, 27], ['1_Samuel', 2, 14], ['Genesis', 14, 10], ['Genesis', 12, 9], ['Genesis', 43, 34], ['1_Samuel', 14, 26], ['Genesis', 38, 1], ['Exodus', 9, 33], ['Joshua', 4, 18], ['Genesis', 14, 10], ['Genesis', 24, 27], ['2_Kings', 9, 33], ['Genesis', 24, 49], ['Ezekiel', 17, 21], ['Genesis', 28, 14], ['Judges', 20, 

In [19]:
# Export one row per verb root: the root, its section and the lexemes of that verse.
# "bhsa_lex" is written as an empty column
# (NOTE(review): presumably meant to be filled in afterwards — confirm).
verbroot_lexemes = pd.DataFrame(
    {
        "info_verbroot": info_verbroot,
        "bhsa_lex": [""] * len(info_verbroot),
        "sections": sections,
        "verses_lexemes": verses_lexemes,
    }
)
verbroot_lexemes.to_csv("info_verbroot_with_lexemes.csv", sep=";", index=False)

In [20]:
# Try out the verse_lex function on a single section (book, chapter, verse);
# the result is displayed as the cell output.
verse_lex(['Isaiah', 6, 6])

'W <WP[ >L >XD/ MN H FRP/ W B JD/ RYPH/ B MLQXJM/ LQX[ MN <L H MZBX/'

In [21]:
# Input: verb occurrence counts per BHSA lexeme. No cell visible in this notebook
# writes this file, so it has to be supplied next to the notebook
# (NOTE(review): presumably prepared from "info_verbroot_with_lexemes.csv" — confirm).
try:
    df = pd.read_csv("verb_occ_count_bhsa_lex.csv", sep=";")
except FileNotFoundError as err:
    # Same exception type as before, but tell the reader what is missing and why.
    raise FileNotFoundError(
        "'verb_occ_count_bhsa_lex.csv' was not found. This input is not created "
        "by the notebook; place it in the working directory before running this cell."
    ) from err

# sort the verb_occurrences values in descending order and save the result
(
    df[["bhsa_lex", "verb_occurrences"]]
    .sort_values("verb_occurrences", ascending=False)
    .to_csv("verb_count_etcbc_lex.csv", sep=";", index=False)
)

FileNotFoundError: [Errno 2] No such file or directory: 'verb_occ_count_bhsa_lex.csv'