In [1]:
import pandas as pd
from tfob import TFOb, get_bhsa

BHSA = get_bhsa()

**Locating corpus resources ...**

Name,# of nodes,# slots/node,% coverage
book,39,10938.21,100
chapter,929,459.19,100
lex,9230,46.22,100
verse,23213,18.38,100
half_verse,45179,9.44,100
sentence,63717,6.7,100
sentence_atom,64514,6.61,100
clause,88131,4.84,100
clause_atom,90704,4.7,100
phrase,253203,1.68,100


### 1.Create two lists with the verses from MT Isaiah and 1QIsaa

#### 1.1 List of wanted motion verbs

In [2]:
motion_verbs = ['BW>[','HLK[','JY>[','JRD[','<BR[',
                '<LH[','CWB[','>TH[','BRX[','GJX[',
                'GLH[','GLL[','DXP[','DLG[','HWH[',
                'XWC[','XLP[','XSH[','VB<[','VWF[',
                'MHR[','MWC[','NGC[','NHR[','NWX[',
                'NWS[','NXT[','NVP[','NS<[','NPL[',
                'NTK[','SBB[','SWR[','SLQ[','<WZ[',
                '<WP[','PNH[','PF<[','YWP[','Y<D[',
                'QHL[','QPY[','QRB[','RWY[','FVH[',
                'CWX[','CWR=[','CVP[','CQQ[','T>R[',
                'T<H[']

freq_verbs = ['BW>[','HLK[','JY>[','JRD[','<BR[',
                '<LH[','CWB[','>TH[','BRX[','GLH[',
                'HWH[','XLP[','XSH[','MHR[','NGC[',
                'NWX[','NWS[','NS<[','NPL[','NTK[',
                'SBB[','SWR[','<WP[','PNH[','PC<[',
                'QHL[','QRB[','RWY[','CVP[','T<H[']

In [4]:
#set(motion_verbs) - set(freq_verbs )

#### 1.2 List of occurrences of the motion verbs (MT Isaiah)

In [3]:
verbs_bhsa = TFOb.all("word", BHSA).filter_in(lex=motion_verbs)
verbs_bhsa

<word_10909 "TWY> J<WPP TWY> J<LH JY> SBB SWBB HLK JNXHW JB> JPL JB>H MTHLK TLK CWBK TCWB JB> HBJ> JPLW NPLW [...] JY> HSB JB> H<BJRWNJ J<BJRHW JWLJKHW JSJRHW JSB JBJ>HW <LH HLJKW HBJ> JB>HW CWB <LWT J<L HBJ> JGL J<BR J<L">

In [13]:
#TESTING ZONE 
#verbs_bhsa[0]
#TFOb(215297, BHSA).to_verses.text
#TFOb(215297, BHSA).to_verses.pretty()

### 2. Important function

In [7]:
# Create a dataset with the occurrences

def find_complements(verb):
    """Find the complement of a verb. If no match, returns None"""
    if verb.source.name == "BHSA":
        complements = verb.to_clauses.to_phrases.filter(function="Cmpl")
        return complements
    
    # if the verb is not BHSA, it's DSS
    verb_bhsa = find_bhsa_verb(verb)

    # Check if verses are identical
    if verb_bhsa and is_lex_identical(verb_bhsa.to_verses, find_dss_verse(verb)):
        return find_complements(verb_bhsa)
        

### 3. Generate dataset

In [8]:
items = [] # create an empty list to store all the information for each occ.

       
for verb in verbs_bhsa:
    
    # Add MT as "scroll" for the BHSA
    scroll = "MT"
    verse = verb.to_verses
    dir_he_dss_verse = ""
    sign_info = ""
    
    
    # Filling in the complement column
    complements = find_complements(verb)
    if not complements:
        complements = ["no complement"]

    for complement in complements:        
        if complement == "no complement":
            dir_he = ""
        else: 
            dir_he = int("H" in complement.to_words.uvf)
               
    
        # Collect information about the following variables:    
        item = {
            "verb_id": verb.ids[0], 
            "lex": verb.lex[0], 
            "scroll": scroll,
            "book": verb.book[0], 
            "chapter": verb.chapter[0], 
            "verse_num": verb.verse[0],
            "gcons_verb": verb.g_cons[0],
            "gcons_verse": str(verse),
            "gcons_clause": str(verb.to_clauses),
            "subject": str(verb.to_clauses.to_phrases.filter(function="Subj")),
            "complement": complement,
            "dir_he": dir_he,
            "dir_he_dss": dir_he_dss_verse,
            "sign_info": sign_info,
            "stem": verb.vs[0],
            "tense": verb.vt[0],
        }
        
        if complement != "no complement":
            prepositions = complement.to_words.filter(sp="prep")
            n = 0
            for preposition in prepositions:
                n += 1
                item[f"preposition_{n}"] = str(preposition)

        items.append(item)

### 4. Create the dataframe with pandas

In [14]:
df = pd.DataFrame(items).fillna("")
df.sort_values(["book", "chapter", "verse_num"], ascending=[True, True, True], ignore_index=True, inplace=True)

In [18]:
df

Unnamed: 0,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,gcons_clause,subject,...,sign_info,stem,tense,preposition_1,preposition_2,preposition_3,preposition_4,preposition_5,preposition_6,preposition_7
0,391367,JY>[,MT,1_Chronicles,1,12,JY>W,W >T PTRSJM W >T KSLXJM >CR JY>W M CM PLCTJM W...,>CR JY>W M CM PLCTJM,PLCTJM,...,,qal,perf,M,,,,,,
1,396716,NWS[,MT,1_Chronicles,10,1,JNS,W PLCTJM NLXMW B JFR>L W JNS >JC JFR>L M PNJ P...,W JNS >JC JFR>L M PNJ PLCTJM,>JC JFR>L,...,,qal,wayq,M,,,,,,
2,396723,NPL[,MT,1_Chronicles,10,1,JPLW,W PLCTJM NLXMW B JFR>L W JNS >JC JFR>L M PNJ P...,W JPLW XLLJM B HR GLB<,,...,,qal,wayq,,,,,,,
3,396944,BW>[,MT,1_Chronicles,10,12,JBJ>WM,W JQWMW KL >JC XJL W JF>W >T GWPT C>WL W >T GW...,W JBJ>WM JBJCH,,...,,hif,wayq,,,,,,,
4,396991,SBB[,MT,1_Chronicles,10,14,JSB,W L> DRC B JHWH W JMJTHW W JSB >T H MLWKH L DW...,W JSB >T H MLWKH L DWJD BN JCJ,,...,,hif,wayq,L,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11232,303999,SWR[,MT,Zephaniah,3,15,HSJR,HSJR JHWH MCPVJK PNH >JBK MLK JFR>L JHWH B QRB...,HSJR JHWH MCPVJK,JHWH,...,,hif,perf,,,,,,,
11233,304002,PNH[,MT,Zephaniah,3,15,PNH,HSJR JHWH MCPVJK PNH >JBK MLK JFR>L JHWH B QRB...,PNH >JBK,>JBK,...,,piel,perf,,,,,,,
11234,303776,QRB[,MT,Zephaniah,3,2,QRBH,L> CM<H B QWL L> LQXH MWSR B JHWH L> BVXH >L >...,>L >LHJH L> QRBH,,...,,qal,perf,>L,,,,,,
11235,304089,BW>[,MT,Zephaniah,3,20,>BJ>,B <T H HJ> >BJ> >TKM W B <T QBYJ >TKM KJ >TN >...,B <T H HJ> >BJ> >TKM,,...,,hif,impf,B,,,,,,


### Save the dataframe into a file: the whole BHSA

In [19]:
df.to_csv("data/bhsa_all_verbs.csv", index=False)

### Save another csv file without the occurrences of Isaiah + complement, for easier merging with the Isaiah dataset

In [11]:
df2 = pd.DataFrame(items).fillna("")
df2.sort_values(["book", "chapter", "verse_num"], ascending=[True, True, True], ignore_index=True, inplace=True)

In [12]:
# Drop the rows from Isaiah having a complement different than "no complement" (already present in the other dataset)

for index, row in df2.iterrows():
    if row["complement"] != "no complement" and row["book"] == "Isaiah":
        df2.drop(index, inplace=True)

In [13]:
# Drop the dir_he_dss column (droped in the other dataset)

df2 = df2.drop("dir_he_dss", axis=1)

In [14]:
df2.columns

Index(['verb_id', 'lex', 'scroll', 'book', 'chapter', 'verse_num',
       'gcons_verb', 'gcons_verse', 'gcons_clause', 'subject', 'complement',
       'dir_he', 'sign_info', 'stem', 'tense', 'preposition_1',
       'preposition_2', 'preposition_3', 'preposition_4', 'preposition_5',
       'preposition_6', 'preposition_7'],
      dtype='object')

In [15]:
# Show the occurrences where DSS verse = BHSA verse
# df[(df.scroll != "MT") & (df.complement != "")]
# df[(df.complement != "")]
df2[(df2.book == "Isaiah") & (df2.complement != "no complement")]

Unnamed: 0,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,gcons_clause,subject,...,sign_info,stem,tense,preposition_1,preposition_2,preposition_3,preposition_4,preposition_5,preposition_6,preposition_7


In [16]:
df2.to_csv("data/bhsa_to_merge_with_isaiah.csv", index=False)