In [38]:
from pprint import pprint
import pandas as pd

In [13]:

from tf.app import use

A = use("etcbc/bhsa", hoist=globals())
Fmt = F
Lmt = L
Tmt = T

B = use("etcbc/dss", checkout="clone", version="1.8", hoist=globals())
Fdss = F
Ldss = L
Tdss = T

# here we rename the functions F L and T so the DSS functions does not overwrite the BHSA functions
# checkout="clone" is to look for the data set in the folder ~/github

**Locating corpus resources ...**

Name,# of nodes,# slots/node,% coverage
,,,
book,39.0,10938.21,100.0
chapter,929.0,459.19,100.0
lex,9230.0,46.22,100.0
verse,23213.0,18.38,100.0
half_verse,45179.0,9.44,100.0
sentence,63717.0,6.7,100.0
sentence_atom,64514.0,6.61,100.0
clause,88131.0,4.84,100.0
clause_atom,90704.0,4.7,100.0


**Locating corpus resources ...**

Name,# of nodes,# slots/node,% coverage
,,,
scroll,1001.0,1428.81,100.0
lex,10450.0,129.14,94.0
fragment,11182.0,127.91,100.0
line,52895.0,27.04,100.0
clause,125.0,12.85,0.0
cluster,101099.0,6.68,47.0
phrase,315.0,5.1,0.0
word,500995.0,2.81,99.0
sign,1430241.0,1.0,100.0


What we are going to do now: for each occurrence of halak, we want to collect tf ID, book, chapter, verse, scroll name, stem, consonant rep. of the word, person number gender

otype: specifies the object type that we are looking for

In [59]:
halak_info = {}

book_names = set()

for word in Fmt.otype.s("word"):
    if Fmt.lex.v(word) == "HLK[":
        book, chapter, verse = Tmt.sectionFromNode(word)
        book_names.add(book)
        stem = Fmt.vs.v(word)
        verbal_tense = Fmt.vt.v(word)
        consonant_rep = Fmt.g_cons.v(word)
        language = Fmt.language.v(word)
        features = [word, book, str(chapter), str(verse), "MT", stem, verbal_tense, consonant_rep, language]
        halak_info[word] = features

Structure in the DSS dataset: instead of book chpter verse being object typem they are features of word
book is a feature: word level feature, give the biblical book where the word occurs

In [61]:
for word in Fdss.otype.s("word"):
    if Fdss.lex_etcbc.v(word) == "HLK[":
        book = Fdss.book_etcbc.v(word)
        chapter = Fdss.chapter.v(word)
        verse = Fdss.verse.v(word)
        scroll = Ldss.u(word, "scroll")[0]
        scroll_name = Tdss.scrollName(scroll)
        stem = Fdss.vs_etcbc.v(word)
        verbal_tense = Fdss.vt_etcbc.v(word)
        consonant_rep = Fdss.g_cons.v(word)
        language = Fdss.lang_etcbc.v(word)
        if book and book in book_names:   
            features = [word, book, chapter, verse, scroll_name, stem, verbal_tense, consonant_rep, language]
            halak_info[word] = features


In [62]:
pprint(halak_info)

{935: [935, 'Genesis', '2', '14', 'MT', 'qal', 'ptca', 'HLK', 'Hebrew'],
 1321: [1321, 'Genesis', '3', '8', 'MT', 'hit', 'ptca', 'MTHLK', 'Hebrew'],
 1449: [1449, 'Genesis', '3', '14', 'MT', 'qal', 'impf', 'TLK', 'Hebrew'],
 2424: [2424, 'Genesis', '5', '22', 'MT', 'hit', 'wayq', 'JTHLK', 'Hebrew'],
 2455: [2455, 'Genesis', '5', '24', 'MT', 'hit', 'wayq', 'JTHLK', 'Hebrew'],
 2781: [2781, 'Genesis', '6', '9', 'MT', 'hit', 'perf', 'HTHLK', 'Hebrew'],
 3407: [3407, 'Genesis', '7', '18', 'MT', 'qal', 'wayq', 'TLK', 'Hebrew'],
 3598: [3598, 'Genesis', '8', '3', 'MT', 'qal', 'infa', 'HLWK', 'Hebrew'],
 3634: [3634, 'Genesis', '8', '5', 'MT', 'qal', 'infa', 'HLWK', 'Hebrew'],
 4451: [4451, 'Genesis', '9', '23', 'MT', 'qal', 'wayq', 'JLKW', 'Hebrew'],
 5460: [5460, 'Genesis', '11', '31', 'MT', 'qal', 'infc', 'LKT', 'Hebrew'],
 5489: [5489, 'Genesis', '12', '1', 'MT', 'qal', 'impv', 'LK', 'Hebrew'],
 5532: [5532, 'Genesis', '12', '4', 'MT', 'qal', 'wayq', 'JLK', 'Hebrew'],
 5540: [5540, 'Genes

In [63]:
len(halak_info)

2280

In [64]:
halak_dataframe = pd.DataFrame(halak_info).T

In [65]:
halak_dataframe.columns = ["word", "book", "chapter", "verse", "scroll_name", "stem", "verbal_tense", "consonant_rep", "language"]

In [66]:
halak_dataframe.head()

Unnamed: 0,word,book,chapter,verse,scroll_name,stem,verbal_tense,consonant_rep,language
935,935,Genesis,2,14,MT,qal,ptca,HLK,Hebrew
1321,1321,Genesis,3,8,MT,hit,ptca,MTHLK,Hebrew
1449,1449,Genesis,3,14,MT,qal,impf,TLK,Hebrew
2424,2424,Genesis,5,22,MT,hit,wayq,JTHLK,Hebrew
2455,2455,Genesis,5,24,MT,hit,wayq,JTHLK,Hebrew


In [67]:
halak_dataframe.stem.value_counts()

qal     2067
hit       93
hif       72
piel      38
peal       4
nif        2
haf        2
pael       2
Name: stem, dtype: int64

In [68]:
halak_dataframe.language.value_counts()

Hebrew     2272
Aramaic       8
Name: language, dtype: int64

In [69]:
pd.crosstab(halak_dataframe.stem, halak_dataframe.language)

language,Aramaic,Hebrew
stem,Unnamed: 1_level_1,Unnamed: 2_level_1
haf,2,0
hif,0,72
hit,0,93
nif,0,2
pael,2,0
peal,4,0
piel,0,38
qal,0,2067


In [72]:
halak_dataframe.to_csv("halak_data.csv", sep="\t", index=False)

In [74]:
pd.read_csv("halak_data.csv", sep="\t")

Unnamed: 0,word,book,chapter,verse,scroll_name,stem,verbal_tense,consonant_rep,language
0,935,Genesis,2,14,MT,qal,ptca,HLK,Hebrew
1,1321,Genesis,3,8,MT,hit,ptca,MTHLK,Hebrew
2,1449,Genesis,3,14,MT,qal,impf,TLK,Hebrew
3,2424,Genesis,5,22,MT,hit,wayq,JTHLK,Hebrew
4,2455,Genesis,5,24,MT,hit,wayq,JTHLK,Hebrew
...,...,...,...,...,...,...,...,...,...
2275,2107819,Judges,4,8,Xjudges,qal,perf,HLKTJ,Hebrew
2276,2107823,Judges,4,8,Xjudges,qal,impf,TLKJ,Hebrew
2277,2107826,Judges,4,8,Xjudges,qal,impf,>LK,Hebrew
2278,2107830,Judges,4,9,Xjudges,qal,infa,HLK,Hebrew
