In [1]:
import pandas as pd
from tfob import TFOb, get_bhsa

BHSA = get_bhsa()

**Locating corpus resources ...**

Name,# of nodes,# slots/node,% coverage
book,39,10938.21,100
chapter,929,459.19,100
lex,9230,46.22,100
verse,23213,18.38,100
half_verse,45179,9.44,100
sentence,63717,6.7,100
sentence_atom,64514,6.61,100
clause,88131,4.84,100
clause_atom,90704,4.7,100
phrase,253203,1.68,100


In [2]:
from collections import Counter

### 1.Create two lists with the verses from MT Isaiah and 1QIsaa

#### 1.1 List of wanted motion verbs

In [3]:
motion_verbs = ['BW>[','HLK[','JY>[','JRD[','<BR[',
                '<LH[','CWB[','>TH[','BRX[','GJX[',
                'GLH[','GLL[','DXP[','DLG[','HWH[',
                'XWC[','XLP[','XSH[','VB<[','VWF[',
                'MHR[','MWC[','NGC[','NHR[','NWX[',
                'NWS[','NXT[','NVP[','NS<[','NPL[',
                'NTK[','SBB[','SWR[','SLQ[','<WZ[',
                '<WP[','PNH[','PF<[','YWP[','Y<D[',
                'QHL[','QPY[','QRB[','RWY[','FVH[',
                'CWX[','CWR=[','CVP[','CQQ[','T>R[',
                'T<H[']

freq_verbs = ['BW>[','HLK[','JY>[','JRD[','<BR[',
                '<LH[','CWB[','>TH[','BRX[','GLH[',
                'HWH[','XLP[','XSH[','MHR[','NGC[',
                'NWX[','NWS[','NS<[','NPL[','NTK[',
                'SBB[','SWR[','<WP[','PNH[','PC<[',
                'QHL[','QRB[','RWY[','CVP[','T<H[']

#### 1.2 List of occurrences of the motion verbs (MT Isaiah)

In [4]:
verbs_bhsa = TFOb.all("word", BHSA).filter_in(lex=motion_verbs)
verbs_bhsa

<word_10909 "TWY> J<WPP TWY> J<LH JY> SBB SWBB HLK JNXHW JB> JPL JB>H MTHLK TLK CWBK TCWB JB> HBJ> JPLW NPLW [...] JY> HSB JB> H<BJRWNJ J<BJRHW JWLJKHW JSJRHW JSB JBJ>HW <LH HLJKW HBJ> JB>HW CWB <LWT J<L HBJ> JGL J<BR J<L">

In [97]:
#TESTING ZONE 
#verbs_bhsa[0]
#TFOb(215297, BHSA).to_verses.text
#TFOb(215297, BHSA).to_verses.pretty()
verb = verbs_bhsa[29]
cmpl = verb.to_clauses.to_phrases.filter_in(function=["Cmpl", "Loca"])
print(verb.book[0], verb.chapter[0], verb.verse[0])
print(verb, ",", verb.to_clauses)
print(cmpl.text, cmpl.to_words.prs)
print(cmpl.to_words.sp)
print(cmpl.to_phrases.det)
print(cmpl.to_words.nametype)

Genesis 6 20
JB>W , CNJM M KL JB>W >LJK
אֵלֶ֖יךָ  ['K']
['prep']
['det']
[None]


In [10]:
vb = [nametype for nametype in verbs_bhsa.to_verses.to_words.nametype if nametype]
print(len(vb))
print(len(verbs_bhsa.to_verses.to_words.filter_in(sp=["nmpr"])))

13647
12653


In [11]:
vb2 = [(word.sp[0], word.nametype[0]) for word in verbs_bhsa.to_clauses.to_phrases.filter_in(function=["Cmpl", "Loca"]).to_words.filter_in(sp=["subs", "nmpr"])]
Counter(vb2)

Counter({('subs', None): 6116,
         ('nmpr', 'topo'): 1311,
         ('nmpr', 'pers'): 769,
         ('nmpr', 'pers,gens,topo'): 257,
         ('nmpr', 'gens'): 11,
         ('nmpr', 'pers,gens'): 8,
         ('nmpr', 'topo,pers'): 2,
         ('nmpr', 'pers,god'): 1})

In [12]:
vb3 = [(word.sp[0], word.nametype[0]) for word in verbs_bhsa.to_clauses.to_phrases.filter_in(function=["Cmpl", "Loca"]).to_words.filter_in(sp=["subs", "nmpr", "prde", "prps", "prin"])]
Counter(vb3)

Counter({('subs', None): 6116,
         ('nmpr', 'topo'): 1311,
         ('nmpr', 'pers'): 769,
         ('nmpr', 'pers,gens,topo'): 257,
         ('prde', None): 115,
         ('nmpr', 'gens'): 11,
         ('prps', 'ppde'): 10,
         ('prin', None): 9,
         ('nmpr', 'pers,gens'): 8,
         ('nmpr', 'topo,pers'): 2,
         ('nmpr', 'pers,god'): 1})

### 2. Important function

In [221]:
# Create a dataset with the occurrences

def find_complements(verb):
    """Find the complement of a verb. If no match, returns None"""
    if verb.source.name == "BHSA":
        complements = verb.to_clauses.to_phrases.filter_in(function=["Cmpl", "Loca"])
        return complements
    
    # if the verb is not BHSA, it's DSS
    verb_bhsa = find_bhsa_verb(verb)

    # Check if verses are identical
    if verb_bhsa and is_lex_identical(verb_bhsa.to_verses, find_dss_verse(verb)):
        return find_complements(verb_bhsa)
        
def find_cmpl_anim(cmpl):
    """Based on the nametype value, returns 'anim' or 'inanim' for one complement of a given verb. 
    If the nametype does not provide a clear distinction, returns 'check'.
    If there is no complement to the verb, returns None."""
    
    anim = ["pers", "gens", "god"]
    inanim = ["mens", "topo"]

    nouns = cmpl.to_words.filter_in(sp=["subs", "nmpr"])
    nouns_animacy = []
    for noun in nouns:
        nt = noun.nametype[0]
        if nt in anim:
            nouns_animacy.append("anim")
        elif nt in inanim:
            nouns_animacy.append("inanim")
    return " ".join(nouns_animacy)


def find_cmpl_individuation(cmpl):
    """Returns whether the nouns in a cmpl are substantives or proper nouns."""
    cmpl_indiv = []
    for word in cmpl.to_words.filter_in(sp=["subs","nmpr"]):
        cmpl_indiv.append(word.sp[0])
   
    # Handle cases when the individuation is a pronominal suffixe (prsf)    
    simplified_cmpl = []
   
    if len(cmpl.to_words) == 1:
        if cmpl.to_words.sp[0] == "prep":
            if cmpl.to_words.prs not in ["n/a", "absent"]:
                cmpl_indiv.append("prsf")
        else:
            cmpl_indiv.append(cmpl.to_words.sp[0])
        
    # cmpl_indiv empty string if empty    
    if len(cmpl_indiv) == 0:
        cmpl_indiv.append("")
        
    return cmpl_indiv[0]

def find_cmpl_nametype(cmpl):
    words = cmpl.to_words
    cmpl_nt = []
    for word in words:
        if word.nametype[0]:
            nt = word.nametype[0]
            cmpl_nt.append(nt)
            
    if find_cmpl_individuation(cmpl) == "prsf":
        cmpl_nt.append("prsf")
        
    return " ".join(cmpl_nt)


def find_cmpl_individuation(cmpl):
    """Returns whether the nouns in a cmpl are substantives or proper nouns."""
    cmpl_indiv = []
    for word in cmpl.to_words.filter_in(sp=["subs","nmpr"]):
        cmpl_indiv.append(word.sp[0])
   
    # Handle cases when the individuation is a pronominal suffixe (prsf)    
    simplified_cmpl = []
   
    if len(cmpl.to_words) == 1:
        if cmpl.to_words.sp[0] == "prep":
            if cmpl.to_words.prs not in ["n/a", "absent"]:
                cmpl_indiv.append("prsf")
        else:
            cmpl_indiv.append(cmpl.to_words.sp[0])
        
    # cmpl_indiv empty string if empty    
    if len(cmpl_indiv) == 0:
        cmpl_indiv.append("")
        
    return cmpl_indiv[0]

def find_cmpl_construction(cmpl):
    """Returns the construction of the complement: vc (for verbal complement), 
    prep for prepositional complement and dir_he for complement with directive-he 
    (or a combination of those)."""
    
    cmpl_construction = []
    
    for word in cmpl.to_words:
        if word.filter(sp="prep"):
            cmpl_construction.append("prep")
        elif "H" in word.uvf:
            cmpl_construction.append("dir-he")
            
    if cmpl_construction == []:
        return "vc"
    else:
        return " + ".join(cmpl_construction)
    

In [222]:
verb = verbs_bhsa[29]
cmpl = find_complements(verb)
anim = find_cmpl_anim(cmpl)
nt = find_cmpl_nametype(cmpl)
indiv = find_cmpl_individuation(cmpl)
constr = find_cmpl_construction(cmpl)

print(cmpl.to_clauses, cmpl, anim, nt, indiv, constr)
#print(find_cmpl_det(cmpl))

CNJM M KL JB>W >LJK >LJK  prsf prsf prep


In [223]:
len(cmpl.to_words)

1

In [224]:
find_cmpl_individuation(cmpl)

'prsf'

In [225]:
def find_cmpl_complex(cmpl):
    simplified_cmpl = []
    
    #print(cmpl.to_words.prs)
   
    for word in cmpl.to_words:
        if word.sp[0] != "prep" and word.sp[0] != "art":
            simplified_cmpl.append(word)
    
    if len(simplified_cmpl) == 0 or (len(simplified_cmpl) == 1 and simplified_cmpl[0].to_words.prs[0] in ["n/a", "absent"]):
        cmpl_complex = "simple"
    else:
        cmpl_complex = "complex"
        
    return cmpl_complex

In [226]:
find_cmpl_complex(cmpl)
#print(cmpl)

'simple'

In [227]:
cmpl.to_words.filter(sp="prep")

<word_1 ">LJK">

### 3. Generate dataset

In [228]:
items = [] # create an empty list to store all the information for each occ.

       
for verb in verbs_bhsa:
    
    # Add MT as "scroll" for the BHSA
    scroll = "MT"
    verse = verb.to_verses
    dir_he_dss_verse = ""
    sign_info = ""
    
    
    # Filling in the complement column
    complements = find_complements(verb)
    if not complements:
        complements = ["no complement"]
        

    for complement in complements:    
        if complement == "no complement":
            dir_he = cmpl_anim = cmpl_nt = cmpl_det = cmpl_indiv = cmpl_constr = cmpl_complex = ""
            cmpl_lex = "no complement"
          
        else: 
            cmpl_lex = str(" ".join(complement.to_words.lex))
            dir_he = int("H" in complement.to_words.uvf)
            cmpl_anim = find_cmpl_anim(complement)
            cmpl_nt = find_cmpl_nametype(complement)
            cmpl_det = cmpl.det[0]
            cmpl_indiv = find_cmpl_individuation(complement)
            cmpl_constr = find_cmpl_construction(complement)
            cmpl_complex = find_cmpl_complex(complement)
            
    
        # Collect information about the following variables:    
        item = {
            "verb_id": verb.ids[0], 
            "lex": verb.lex[0], 
            "scroll": scroll,
            "book": verb.book[0], 
            "chapter": int(verb.chapter[0]), 
            "verse_num": int(verb.verse[0]),
            "gcons_verb": verb.g_cons[0],
            "gcons_verse": str(verse),
            "gcons_clause": str(verb.to_clauses),
            "subject": str(verb.to_clauses.to_phrases.filter(function="Subj")),
            "complement": str(complement),
            "cmpl_lex": cmpl_lex,
            "dir_he": dir_he,
            "cmpl_constr": cmpl_constr,
            "cmpl_nt": cmpl_nt,
            "cmpl_anim": cmpl_anim,
            "cmpl_det": cmpl_det,
            "cmpl_indiv": cmpl_indiv,
            "cmpl_complex": cmpl_complex,
            "dir_he_dss": dir_he_dss_verse,
            "sign_info": sign_info,
            "stem": verb.vs[0],
            "tense": verb.vt[0],
        }
        
        if complement != "no complement":
            prepositions = complement.to_words.filter(sp="prep").lex
            n = 0
            for preposition in prepositions:
                n += 1
                item[f"preposition_{n}"] = str(preposition)

        items.append(item)

### 4. Create the dataframe with pandas

In [229]:
df = pd.DataFrame(items).fillna("")
df.sort_values(["book", "chapter", "verse_num"], ascending=[True, True, True], ignore_index=True, inplace=True)

In [230]:
pd.set_option('display.max_columns', None)
#df[df.book == "Exodus"]
#df[df.cmpl_complex == "complex"]

df

Unnamed: 0,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,gcons_clause,subject,complement,cmpl_lex,dir_he,cmpl_constr,cmpl_nt,cmpl_anim,cmpl_det,cmpl_indiv,cmpl_complex,dir_he_dss,sign_info,stem,tense,preposition_1,preposition_2,preposition_3,preposition_4,preposition_5,preposition_6,preposition_7
0,391367,JY>[,MT,1_Chronicles,1,12,JY>W,W >T PTRSJM W >T KSLXJM >CR JY>W M CM PLCTJM W...,>CR JY>W M CM PLCTJM,PLCTJM,M CM,MN CM,0,prep,,,det,,simple,,,qal,perf,MN,,,,,,
1,392084,BW>[,MT,1_Chronicles,2,21,B>,W >XR B> XYRWN >L BT MKJR >BJ GL<D W HW> LQXH ...,W >XR B> XYRWN >L BT MKJR >BJ GL<D,XYRWN,>L BT MKJR >BJ GL<D,>L BT/ MKJR/ >B/ GL<D/,0,prep,"pers,gens topo",inanim,det,subs,complex,,,qal,perf,>L,,,,,,
2,392509,JY>[,MT,1_Chronicles,2,53,JY>W,W MCPXWT QRJT_J<RJM H JTRJ W H PWTJ W H CMTJ W...,M >LH JY>W H YR<TJ W H >CT>LJ,H YR<TJ W H >CT>LJ,M >LH,MN >LH,0,prep,,,det,,simple,,,qal,perf,MN,,,,,,
3,392539,BW>[,MT,1_Chronicles,2,55,B>JM,W MCPXWT SPRJM JCBW J<BY TR<TJM CM<TJM FWKTJM ...,H B>JM M XMT >BJ BJT RKB,,M XMT >BJ BJT RKB,MN XMT===/ >B/ BJT/ RKB==/,0,prep,pers pers,anim anim,det,nmpr,complex,,,qal,ptca,MN,,,,,,
4,392967,BW>[,MT,1_Chronicles,4,10,JB>,W JQR> J<BY L >LHJ JFR>L L >MR >M BRK TBRKNJ W...,W JB> >LHJM,>LHJM,no complement,no complement,,,,,,,,,,hif,wayq,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11326,303935,SWR[,MT,Zephaniah,3,11,>SJR,B JWM H HW> L> TBWCJ M KL <LJLTJK >CR PC<T BJ ...,KJ >Z >SJR M QRBK <LJZJ G>WTK,,M QRBK,MN QRB/,0,prep,,,det,subs,complex,,,hif,impf,MN,,,,,,
11327,303958,XSH[,MT,Zephaniah,3,12,XSW,W HC>RTJ B QRBK <M <NJ W DL W XSW B CM JHWH,W XSW B CM JHWH,,B CM JHWH,B CM/ JHWH/,0,prep,pers,anim,det,subs,complex,,,qal,perf,B,,,,,,
11328,303999,SWR[,MT,Zephaniah,3,15,HSJR,HSJR JHWH MCPVJK PNH >JBK MLK JFR>L JHWH B QRB...,HSJR JHWH MCPVJK,JHWH,no complement,no complement,,,,,,,,,,hif,perf,,,,,,,
11329,304002,PNH[,MT,Zephaniah,3,15,PNH,HSJR JHWH MCPVJK PNH >JBK MLK JFR>L JHWH B QRB...,PNH >JBK,>JBK,no complement,no complement,,,,,,,,,,piel,perf,,,,,,,


In [231]:
set(df.cmpl_indiv)

{'', 'adjv', 'advb', 'inrg', 'nmpr', 'prsf', 'subs', 'verb'}

In [232]:
set(df.cmpl_nt)

{'',
 'gens',
 'pers',
 'pers pers',
 'pers pers pers',
 'pers pers pers pers',
 'pers pers pers pers pers',
 'pers pers pers pers pers pers pers pers pers pers pers pers',
 'pers pers pers pers,gens,topo',
 'pers pers pers,gens,topo',
 'pers pers topo',
 'pers pers,gens,topo',
 'pers topo',
 'pers topo topo',
 'pers topo topo topo',
 'pers,gens',
 'pers,gens topo',
 'pers,gens,topo',
 'pers,gens,topo pers',
 'pers,gens,topo pers topo',
 'pers,gens,topo pers,gens pers,gens,topo',
 'pers,gens,topo pers,gens,topo',
 'pers,gens,topo pers,gens,topo pers,gens',
 'pers,gens,topo pers,gens,topo topo',
 'pers,gens,topo topo',
 'pers,gens,topo topo pers,gens,topo',
 'pers,gens,topo topo topo',
 'pers,god',
 'ppde',
 'prsf',
 'topo',
 'topo pers',
 'topo pers pers',
 'topo pers pers,gens,topo',
 'topo pers topo',
 'topo pers,gens,topo',
 'topo pers,gens,topo pers,gens,topo',
 'topo topo',
 'topo topo topo',
 'topo topo topo topo',
 'topo topo topo topo pers',
 'topo topo topo topo topo',
 'topo,

### Save the dataframe into a file: the whole BHSA

In [208]:
df.to_csv("data/bhsa_all_verbs.csv", index=False)

### Save another csv file without the occurrences of Isaiah + complement, for easier merging with the Isaiah dataset

In [215]:
df2 = pd.DataFrame(items).fillna("")
df2.sort_values(["book", "chapter", "verse_num"], ascending=[True, True, True], ignore_index=True, inplace=True)

In [210]:
# Drop the rows from Isaiah having a complement different than "no complement" (already present in the other dataset)

for index, row in df2.iterrows():
    if row["complement"] != "no complement" and row["book"] == "Isaiah":
        df2.drop(index, inplace=True)

In [216]:
# Drop the dir_he_dss column (droped in the other dataset)

df2 = df2.drop("dir_he_dss", axis=1)

In [217]:
df2.columns

Index(['verb_id', 'lex', 'scroll', 'book', 'chapter', 'verse_num',
       'gcons_verb', 'gcons_verse', 'gcons_clause', 'subject', 'complement',
       'cmpl_lex', 'dir_he', 'cmpl_constr', 'cmpl_nt', 'cmpl_anim', 'cmpl_det',
       'cmpl_indiv', 'cmpl_complex', 'sign_info', 'stem', 'tense',
       'preposition_1', 'preposition_2', 'preposition_3', 'preposition_4',
       'preposition_5', 'preposition_6', 'preposition_7'],
      dtype='object')

In [218]:
# Show the occurrences where DSS verse = BHSA verse
# df[(df.scroll != "MT") & (df.complement != "")]
# df[(df.complement != "")]
#df2[(df2.book == "Isaiah") & (df2.complement != "no complement")]
df2[(df2.book == "Isaiah") & (df2.verb_id == 216346)]

Unnamed: 0,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,gcons_clause,subject,complement,cmpl_lex,dir_he,cmpl_constr,cmpl_nt,cmpl_anim,cmpl_det,cmpl_indiv,cmpl_complex,sign_info,stem,tense,preposition_1,preposition_2,preposition_3,preposition_4,preposition_5,preposition_6,preposition_7
6455,216346,<WP[,MT,Isaiah,11,14,<PW,W <PW B KTP PLCTJM JMH JXDW JBZW >T BNJ QDM >D...,W <PW B KTP PLCTJM JMH,,PLCTJM,PLCTJ/,0,vc,,,det,adjv,simple,,qal,perf,,,,,,,
6456,216346,<WP[,MT,Isaiah,11,14,<PW,W <PW B KTP PLCTJM JMH JXDW JBZW >T BNJ QDM >D...,W <PW B KTP PLCTJM JMH,,JMH,JM/,1,dir-he,,,det,subs,simple,,qal,perf,,,,,,,


In [214]:
len(df2)

11072

In [16]:
df2.to_csv("data/bhsa_to_merge_with_isaiah.csv", index=False)