In [9]:
from tf.app import use
# Load the BHSA corpus through Text-Fabric. Passing hoist=globals() asks TF to
# place its API handles in this notebook's namespace; the later cells rely on
# the bare names F, L and T being available that way.
A = use("etcbc/bhsa", hoist=globals())

**Locating corpus resources ...**

Name,# of nodes,# slots/node,% coverage
book,39,10938.21,100
chapter,929,459.19,100
lex,9230,46.22,100
verse,23213,18.38,100
half_verse,45179,9.44,100
sentence,63717,6.7,100
sentence_atom,64514,6.61,100
clause,88131,4.84,100
clause_atom,90704,4.7,100
phrase,253203,1.68,100


In [10]:
from pprint import pprint
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

In [11]:
# Books included in the study. Each name is later passed to get_genre(), so it
# must also be a key of the genre table defined there.
studied_books = [
    "Genesis",
    "Esther",
    "Joshua",
    "Psalms",
    "Job",
    "Jonah",
    "Song_of_songs",
]

In [12]:
# Distinct book names in the corpus, taken from the first element of each
# book node's section tuple.
set(T.sectionFromNode(book_node)[0] for book_node in F.otype.s("book"))

{'1_Chronicles',
 '1_Kings',
 '1_Samuel',
 '2_Chronicles',
 '2_Kings',
 '2_Samuel',
 'Amos',
 'Daniel',
 'Deuteronomy',
 'Ecclesiastes',
 'Esther',
 'Exodus',
 'Ezekiel',
 'Ezra',
 'Genesis',
 'Habakkuk',
 'Haggai',
 'Hosea',
 'Isaiah',
 'Jeremiah',
 'Job',
 'Joel',
 'Jonah',
 'Joshua',
 'Judges',
 'Lamentations',
 'Leviticus',
 'Malachi',
 'Micah',
 'Nahum',
 'Nehemiah',
 'Numbers',
 'Obadiah',
 'Proverbs',
 'Psalms',
 'Ruth',
 'Song_of_songs',
 'Zechariah',
 'Zephaniah'}

In [13]:
# Every distinct value of the `vt` feature across all word nodes.
set(map(F.vt.v, F.otype.s("word")))

{'NA', 'impf', 'impv', 'infa', 'infc', 'perf', 'ptca', 'ptcp', 'wayq'}

In [14]:
# The `vt` values that get_verbal_tense() accepts as a tense; "NA" is left out
# because that function uses it as its fallback.
verbal_tenses = [
    "impf", "impv", "infa", "infc",
    "perf", "ptca", "ptcp", "wayq",
]

In [15]:
# Order of the one-hot tense values written into each clause row; "NA" first.
column_vt = [
    "NA",
    "impf", "impv", "infa", "infc",
    "perf", "ptca", "ptcp", "wayq",
]

In [16]:
# Collects one feature row per clause, keyed by the clause node.
clause_dict = dict()

def get_genre(book_name):
    """Return the genre label assigned to a book.

    The label is one of ``"prose"``, ``"poetry"`` or ``"indef"``.
    A book that is not listed raises ``KeyError``.
    """
    books_by_genre = {
        "prose": ("Genesis", "Esther", "Joshua"),
        "poetry": ("Psalms", "Job"),
        "indef": ("Jonah", "Song_of_songs"),
    }
    # Invert the grouping into a direct book -> genre lookup.
    genre_of = {
        title: label
        for label, titles in books_by_genre.items()
        for title in titles
    }
    return genre_of[book_name]

def get_clause_length(clause_id):
    """Return how many word nodes ``L.d`` finds under the given clause."""
    return len(L.d(clause_id, "word"))

def get_verbal_tense(clause_id):
    """Return the verbal tense recorded for a clause.

    Looks at the ``vt`` value of every word in the clause and keeps those
    listed in ``verbal_tenses``. When several words qualify, the last one in
    the clause is returned; when none does, the result is ``"NA"``.
    """
    word_tenses = [F.vt.v(word) for word in L.d(clause_id, "word")]
    recognised = [tense for tense in word_tenses if tense in verbal_tenses]
    return recognised[-1] if recognised else "NA"

def get_average_phrase_length(clause_id):
    """Return the mean number of words per phrase in a clause.

    Parameters
    ----------
    clause_id
        Clause node whose phrases are looked up with ``L.d``.

    Returns
    -------
    float
        Sum of the phrase word counts divided by the number of phrases, or
        ``0.0`` when the clause contains no phrases. Without this guard an
        empty clause raises ``ZeroDivisionError``.
    """
    phrase_lengths = [
        len(L.d(phrase_id, "word"))
        for phrase_id in L.d(clause_id, "phrase")
    ]
    if not phrase_lengths:
        # Nothing to average: avoid dividing by zero.
        return 0.0
    return sum(phrase_lengths) / len(phrase_lengths)

# Build one feature row for each clause that belongs to a studied book.
for clause in F.otype.s("clause"):
    book, chapter, verse = T.sectionFromNode(clause)
    if book not in studied_books:
        continue
    tense = get_verbal_tense(clause)
    # One-hot encode the tense in the column order given by column_vt.
    one_hot = [int(tense == label) for label in column_vt]
    clause_dict[clause] = [
        clause,
        book,
        chapter,
        verse,
        get_clause_length(clause),
        get_average_phrase_length(clause),
        get_genre(book),
    ] + one_hot
   

In [17]:
# Build the frame row-wise: each dict value is already one row, so
# orient="index" avoids creating a frame with one column per clause and then
# transposing it, and lets pandas infer a dtype per column instead of leaving
# every column as object.
clause_df = pd.DataFrame.from_dict(clause_dict, orient="index")

In [18]:
# The tense labels come from column_vt, the same list that fixes the order of
# the one-hot values in each row, so labels and values cannot drift apart.
clause_df.columns = [
    "clause", "book", "chapter", "verse",
    "clause_length", "av_len_phr", "genre",
] + column_vt

In [19]:
# Preview the first rows to check the column layout of the feature table.
clause_df.head()

Unnamed: 0,clause,book,chapter,verse,clause_length,av_len_phr,genre,NA,impf,impv,infa,infc,perf,ptca,ptcp,wayq
427559,427559,Genesis,1,1,11,2.75,prose,0,0,0,0,0,1,0,0,0
427560,427560,Genesis,1,2,7,1.75,prose,0,0,0,0,0,1,0,0,0
427561,427561,Genesis,1,2,5,1.666667,prose,1,0,0,0,0,0,0,0,0
427562,427562,Genesis,1,2,8,2.0,prose,0,0,0,0,0,0,1,0,0
427563,427563,Genesis,1,3,3,1.0,prose,0,0,0,0,0,0,0,0,1


In [20]:
# Distinct genre labels present in the feature table.
set(clause_df["genre"])

{'indef', 'poetry', 'prose'}

In [21]:
# Clauses from the books whose genre is labelled "indef"; these are the ones
# to be classified.
indef_books = clause_df.loc[clause_df["genre"] == "indef"]

In [22]:
# Preview the first clauses of the "indef" subset.
indef_books.head()

Unnamed: 0,clause,book,chapter,verse,clause_length,av_len_phr,genre,NA,impf,impv,infa,infc,perf,ptca,ptcp,wayq
487411,487411,Jonah,1,1,8,2.0,indef,0,0,0,0,0,0,0,0,1
487412,487412,Jonah,1,1,2,2.0,indef,0,0,0,0,1,0,0,0,0
487413,487413,Jonah,1,2,1,1.0,indef,0,0,1,0,0,0,0,0,0
487414,487414,Jonah,1,2,7,3.5,indef,0,0,1,0,0,0,0,0,0
487415,487415,Jonah,1,2,3,1.0,indef,0,0,1,0,0,0,0,0,0


In [23]:
# Clauses with a known genre (everything not labelled "indef") are the
# training data.
training_set = clause_df.loc[clause_df["genre"] != "indef"]

# Target labels first, then the feature matrix without identifiers and target.
y = training_set["genre"]
training_set_2 = training_set.drop(
    columns=["clause", "book", "chapter", "verse", "genre"]
)

print(y)

427559    prose
427560    prose
427561    prose
427562    prose
427563    prose
          ...  
506006    prose
506007    prose
506008    prose
506009    prose
506010    prose
Name: genre, Length: 19312, dtype: object


In [24]:
# Show the feature matrix used for training (clause lengths plus one-hot tenses).
training_set_2

Unnamed: 0,clause_length,av_len_phr,NA,impf,impv,infa,infc,perf,ptca,ptcp,wayq
427559,11,2.75,0,0,0,0,0,1,0,0,0
427560,7,1.75,0,0,0,0,0,1,0,0,0
427561,5,1.666667,1,0,0,0,0,0,0,0,0
427562,8,2.0,0,0,0,0,0,0,1,0,0
427563,3,1.0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
506006,9,2.25,1,0,0,0,0,0,0,0,0
506007,5,1.666667,1,0,0,0,0,0,0,0,0
506008,5,1.666667,0,0,0,0,0,0,0,1,0
506009,4,1.333333,0,0,0,0,0,0,1,0,0


In [25]:
# Random forest with trees capped at depth 4 and a fixed random_state so that
# repeated runs build the same model.
# NOTE(review): the model is fitted on all labelled clauses; the cells shown
# here make no held-out split, so its accuracy is not measured in this view.
clf = RandomForestClassifier(max_depth=4, random_state=0)
clf.fit(training_set_2, y)

In [26]:
# Feature importances of the fitted forest. The values are unlabeled; they
# follow the column order of training_set_2 (clause_length, av_len_phr, then
# the tense columns).
importances = clf.feature_importances_
print(importances)

[2.99409086e-01 9.39136253e-02 4.58275519e-03 1.48569449e-01
 7.82358484e-03 3.74907665e-04 3.07710291e-02 5.94780971e-03
 5.62260904e-03 1.04021967e-04 4.02881123e-01]


In [27]:
# Reduce the "indef" clauses to the same feature columns the model was
# trained on.
indef_books_2 = indef_books.drop(
    columns=["clause", "book", "chapter", "verse", "genre"]
)

In [28]:
# Show the feature matrix of the clauses to be classified.
indef_books_2

Unnamed: 0,clause_length,av_len_phr,NA,impf,impv,infa,infc,perf,ptca,ptcp,wayq
487411,8,2.0,0,0,0,0,0,0,0,0,1
487412,2,2.0,0,0,0,0,1,0,0,0,0
487413,1,1.0,0,0,1,0,0,0,0,0,0
487414,7,3.5,0,0,1,0,0,0,0,0,0
487415,3,1.0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
503752,1,1.0,0,0,1,0,0,0,0,0,0
503753,1,1.0,0,0,1,0,0,0,0,0,0
503754,1,1.0,1,0,0,0,0,0,0,0,0
503755,5,1.25,0,0,1,0,0,0,0,0,0


In [29]:
# Predict a genre label for every clause of the "indef" books; the result has
# one entry per row of indef_books_2, in row order.
prediction_genre = clf.predict(indef_books_2)

In [30]:
# Book name of each "indef" clause, in row order, for pairing with the
# predictions.
book_list = indef_books["book"].tolist()

In [31]:
# Count predictions per (book, predicted genre) pair.
tally = {
    ("Jonah", "poetry"): 0,
    ("Jonah", "prose"): 0,
    ("Song_of_songs", "poetry"): 0,
    ("Song_of_songs", "prose"): 0,
}

for pred, book in zip(prediction_genre, book_list):
    pair = (book, pred)
    # Pairs outside the four tracked combinations are not counted.
    if pair in tally:
        tally[pair] += 1

jonah_poetry = tally[("Jonah", "poetry")]
jonah_prose = tally[("Jonah", "prose")]
song_poetry = tally[("Song_of_songs", "poetry")]
song_prose = tally[("Song_of_songs", "prose")]

In [32]:
# Printed order: Jonah poetry, Jonah prose, Song of Songs poetry, Song of Songs prose.
print(jonah_poetry, jonah_prose, song_poetry, song_prose)

111 124 382 74
