In [1]:
import pandas as pd
from tfob import TFOb, get_bhsa

BHSA = get_bhsa()

**Locating corpus resources ...**

Name,# of nodes,# slots / node,% coverage
book,39,10938.21,100
chapter,929,459.19,100
lex,9230,46.22,100
verse,23213,18.38,100
half_verse,45179,9.44,100
sentence,63717,6.7,100
sentence_atom,64514,6.61,100
clause,88131,4.84,100
clause_atom,90704,4.7,100
phrase,253203,1.68,100


In [2]:
from collections import Counter

### 1.Create two lists with the verses from MT Isaiah and 1QIsaa

#### 1.1 List of wanted motion verbs

In [3]:
motion_verbs = ['BW>[','HLK[','JY>[','JRD[','<BR[',
                '<LH[','CWB[','>TH[','BRX[','GJX[',
                'GLH[','GLL[','DXP[','DLG[','HWH=[',
                'XWC[','XLP[','XSH[','VB<[','VWF[',
                'MHR[','MWC[','NGC[','NHR[','NWX[',
                'NWS[','NXT[','NVP[','NS<[','NPL[',
                'NTK[','SBB[','SWR[','SLQ[','<WZ[',
                '<WP[','PNH[','PF<[','YWP[','Y<D[',
                'QHL[','QPY[','QRB[','RWY[','FVH[',
                'CWX[','CWR=[','CVP[','CQQ[','T>R[',
                'T<H[']

freq_verbs = ['BW>[','HLK[','JY>[','JRD[','<BR[',
                '<LH[','CWB[','>TH[','BRX[','GLH[',
                'HWH[','XLP[','XSH[','MHR[','NGC[',
                'NWX[','NWS[','NS<[','NPL[','NTK[',
                'SBB[','SWR[','<WP[','PNH[','PC<[',
                'QHL[','QRB[','RWY[','CVP[','T<H[']

In [4]:
predicates = ["Pred", "PreO", "PreS"]

#### Get all the occurrences of the verbs in the list, in the BHSA dataset (when their phrase has a predicate function)

In [5]:
verbs_bhsa = TFOb.all("phrase", BHSA).filter_in(function=predicates).to_words.filter_in(lex=motion_verbs)

In [26]:
verbs_bhsa[2]

<word_1 "TWY>">

In [6]:
#TESTING ZONE 
#verbs_bhsa[0]
#TFOb(215297, BHSA).to_verses.text
#TFOb(215297, BHSA).to_verses.pretty()
verb = verbs_bhsa[5]
cmpl = verb.to_clauses.to_phrases.filter(function="Cmpl")
print(verb.book[0], verb.chapter[0], verb.verse[0])
print(verb, ",", verb.to_clauses)
print(cmpl.text, cmpl.to_words.prs)
print(cmpl.to_words.sp)
print(cmpl.to_phrases.det)
print(cmpl.to_words.nametype)

Genesis 2 19
JB> , W JB> >L H >DM
אֶל־הָ֣אָדָ֔ם  ['absent', 'n/a', 'absent']
['prep', 'art', 'subs']
['det']
[None, None, None]


In [7]:
#Counter(lexemes)

In [8]:
vb = [nametype for nametype in verbs_bhsa.to_verses.to_words.nametype if nametype]
print(len(vb))
print(len(verbs_bhsa.to_verses.to_words.filter_in(sp=["nmpr"])))

12606
11749


In [9]:
vb2 = [(word.sp[0], word.nametype[0]) for word in verbs_bhsa.to_clauses.to_phrases.filter_in(function=["Cmpl", "Loca"]).to_words.filter_in(sp=["subs", "nmpr"])]
Counter(vb2)

Counter({('subs', None): 5404,
         ('nmpr', 'topo'): 1213,
         ('nmpr', 'pers'): 705,
         ('nmpr', 'pers,gens,topo'): 236,
         ('nmpr', 'gens'): 11,
         ('nmpr', 'pers,gens'): 7,
         ('nmpr', 'topo,pers'): 2,
         ('nmpr', 'pers,god'): 1})

In [10]:
vb3 = [(word.sp[0], word.nametype[0]) for word in verbs_bhsa.to_clauses.to_phrases.filter_in(function=["Cmpl", "Loca"]).to_words.filter_in(sp=["subs", "nmpr", "prde", "prps", "prin"])]
Counter(vb3)

Counter({('subs', None): 5404,
         ('nmpr', 'topo'): 1213,
         ('nmpr', 'pers'): 705,
         ('nmpr', 'pers,gens,topo'): 236,
         ('prde', None): 97,
         ('nmpr', 'gens'): 11,
         ('prps', 'ppde'): 9,
         ('prin', None): 9,
         ('nmpr', 'pers,gens'): 7,
         ('nmpr', 'topo,pers'): 2,
         ('nmpr', 'pers,god'): 1})

### 2. Important function

In [27]:
# Create a dataset with the occurrences

def find_complements(verb):
    """Find the complement of a verb. If no match, returns None"""
    if verb.source.name == "BHSA":
        complements = verb.to_clauses.to_phrases.filter(function="Cmpl")
        return complements
    
    # if the verb is not BHSA, it's DSS
    verb_bhsa = find_bhsa_verb(verb)

    # Check if verses are identical
    if verb_bhsa and is_lex_identical(verb_bhsa.to_verses, find_dss_verse(verb)):
        return find_complements(verb_bhsa)
        
def find_cmpl_anim(cmpl):
    """Based on the nametype value, returns 'anim' or 'inanim' for one complement of a given verb. 
    If the nametype does not provide a clear distinction, returns 'check'.
    If there is no complement to the verb, returns None."""
    
    anim = ["pers", "gens", "god"]
    inanim = ["mens", "topo"]

    nouns = cmpl.to_words.filter_in(sp=["subs", "nmpr"])
    nouns_animacy = []
    for noun in nouns:
        nt = noun.nametype[0]
        if nt in anim:
            nouns_animacy.append("anim")
        elif nt in inanim:
            nouns_animacy.append("inanim")
    return " ".join(nouns_animacy)


def find_cmpl_individuation(cmpl):
    """Returns whether the nouns in a cmpl are substantives or proper nouns."""
    cmpl_indiv = []
    for word in cmpl.to_words.filter_in(sp=["subs","nmpr"]):
        cmpl_indiv.append(word.sp[0])
   
    # Handle cases when the individuation is a pronominal suffixe (prsf)    
    simplified_cmpl = []
   
    if len(cmpl.to_words) == 1:
        if cmpl.to_words.sp[0] == "prep":
            if cmpl.to_words.prs not in ["n/a", "absent"]:
                cmpl_indiv.append("prsf")
        else:
            cmpl_indiv.append(cmpl.to_words.sp[0])
        
    # cmpl_indiv empty string if empty    
    if len(cmpl_indiv) == 0:
        cmpl_indiv.append("")
        
    return cmpl_indiv[0]

def find_cmpl_nametype(cmpl):
    words = cmpl.to_words
    cmpl_nt = []
    for word in words:
        if word.nametype[0]:
            nt = word.nametype[0]
            cmpl_nt.append(nt)
            
    if find_cmpl_individuation(cmpl) == "prsf":
        cmpl_nt.append("prsf")
        
    return " ".join(cmpl_nt)


def find_cmpl_individuation(cmpl):
    """Returns whether the nouns in a cmpl are substantives or proper nouns."""
    cmpl_indiv = []
    for word in cmpl.to_words.filter_in(sp=["subs","nmpr"]):
        cmpl_indiv.append(word.sp[0])
   
    # Handle cases when the individuation is a pronominal suffixe (prsf)    
    simplified_cmpl = []
   
    if len(cmpl.to_words) == 1:
        if cmpl.to_words.sp[0] == "prep":
            if cmpl.to_words.prs not in ["n/a", "absent"]:
                cmpl_indiv.append("prsf")
        else:
            cmpl_indiv.append(cmpl.to_words.sp[0])
        
    # cmpl_indiv empty string if empty    
    if len(cmpl_indiv) == 0:
        cmpl_indiv.append("")
        
    return cmpl_indiv[0]

def find_cmpl_construction(cmpl):
    """Returns the construction of the complement: vc (for verbal complement), 
    prep for prepositional complement and dir_he for complement with directive-he 
    (or a combination of those)."""
    
    cmpl_construction = []
    
    for word in cmpl.to_words:
        if word.filter(sp="prep"):
            cmpl_construction.append("prep")
        elif "H" in word.uvf:
            cmpl_construction.append("dir-he")
            
    if cmpl_construction == []:
        return "vc"
    else:
        return " + ".join(cmpl_construction)
    
    
def find_cmpl_complex(cmpl):
    simplified_cmpl = []
    
    #print(cmpl.to_words.prs)
   
    for word in cmpl.to_words:
        if word.sp[0] != "prep" and word.sp[0] != "art":
            simplified_cmpl.append(word)
    
    if len(simplified_cmpl) == 0 or (len(simplified_cmpl) == 1 and simplified_cmpl[0].to_words.prs[0] in ["n/a", "absent"]):
        cmpl_complex = "simple"
    else:
        cmpl_complex = "complex"
        
    return cmpl_complex

In [47]:
verb = verbs_bhsa[29]
cmpl = find_complements(verb)
anim = find_cmpl_anim(cmpl)
nt = find_cmpl_nametype(cmpl)
indiv = find_cmpl_individuation(cmpl)
constr = find_cmpl_construction(cmpl)

print(cmpl.to_clauses, cmpl, anim, nt, indiv, constr)

     vc


### 3. Generate dataset

In [36]:
items = [] # create an empty list to store all the information for each occ.

       
for verb in verbs_bhsa:
    
    # Add MT as "scroll" for the BHSA
    scroll = "MT"
    verse = verb.to_verses
    dir_he_dss_verse = ""
    sign_info = ""
    
    
    # Filling in the complement column
    complements = find_complements(verb)
    if not complements:
        complements = ["no complement"]
        

    for complement in complements:    
        if complement == "no complement":
            dir_he = cmpl_anim = cmpl_nt = cmpl_det = cmpl_indiv = cmpl_constr = cmpl_complex = ""
            cmpl_lex = "no complement"
          
        else: 
            cmpl_lex = str(" ".join(complement.to_words.lex))
            dir_he = int("H" in complement.to_words.uvf)
            cmpl_anim = find_cmpl_anim(complement)
            cmpl_nt = find_cmpl_nametype(complement)
            cmpl_det = complement.det[0]
            cmpl_indiv = find_cmpl_individuation(complement)
            cmpl_constr = find_cmpl_construction(complement)
            cmpl_complex = find_cmpl_complex(complement)
            
    
        # Collect information about the following variables:    
        item = {
            "verb_id": verb.ids[0], 
            "lex": verb.lex[0], 
            "scroll": scroll,
            "book": verb.book[0], 
            "chapter": int(verb.chapter[0]), 
            "verse_num": int(verb.verse[0]),
            "gcons_verb": verb.g_cons[0],
            "gcons_verse": str(verse),
            "gcons_clause": str(verb.to_clauses),
            "subject": str(verb.to_clauses.to_phrases.filter(function="Subj")),
            "complement": str(complement),
            "cmpl_lex": cmpl_lex,
            "cmpl_translation": "",
            "dir_he": dir_he,
            "cmpl_constr": cmpl_constr,
            "cmpl_nt": cmpl_nt,
            "cmpl_anim": cmpl_anim,
            "cmpl_det": cmpl_det,
            "cmpl_indiv": cmpl_indiv,
            "cmpl_complex": cmpl_complex,
            "dir_he_dss": dir_he_dss_verse,
            "sign_info": sign_info,
            "stem": verb.vs[0],
            "tense": verb.vt[0],
            "motion_type": "",
            "comments": "",
        }
        
        if complement != "no complement":
            prepositions = complement.to_words.filter(sp="prep").lex
            n = 0
            for preposition in prepositions:
                n += 1
                item[f"preposition_{n}"] = str(preposition)

        items.append(item)

### 4. Create the dataframe with pandas

In [37]:
df = pd.DataFrame(items).fillna("")
df.sort_values(["book", "chapter", "verse_num"], ascending=[True, True, True], ignore_index=True, inplace=True)

In [38]:
pd.set_option('display.max_columns', None)
#df[df.book == "Exodus"]
df[df.cmpl_complex == "complex"]

#df

Unnamed: 0,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,gcons_clause,subject,complement,cmpl_lex,cmpl_translation,dir_he,cmpl_constr,cmpl_nt,cmpl_anim,cmpl_det,cmpl_indiv,cmpl_complex,dir_he_dss,sign_info,stem,tense,motion_type,comments,preposition_1,preposition_2,preposition_3,preposition_4,preposition_5,preposition_6,preposition_7
1,392084,BW>[,MT,1_Chronicles,2,21,B>,W >XR B> XYRWN >L BT MKJR >BJ GL<D W HW> LQXH ...,W >XR B> XYRWN >L BT MKJR >BJ GL<D,XYRWN,>L BT MKJR >BJ GL<D,>L BT/ MKJR/ >B/ GL<D/,,0,prep,"pers,gens topo",inanim,det,subs,complex,,,qal,perf,,,>L,,,,,,
4,393357,HLK[,MT,1_Chronicles,4,39,JLKW,W JLKW L MBW> GDR <D L MZRX H GJ> L BQC MR<H L...,W JLKW L MBW> GDR <D L MZRX H GJ>,,L MBW> GDR,L MBW>/ GDR==/,,0,prep,topo,inanim,det,subs,complex,,,qal,wayq,,,L,,,,,,
6,393437,HLK[,MT,1_Chronicles,4,42,HLKW,W MHM MN BNJ CM<WN HLKW L HR F<JR >NCJM XMC M>...,W MHM MN BNJ CM<WN HLKW L HR F<JR >NCJM XMC M>WT,MHM MN BNJ CM<WN,L HR F<JR,L HR/ F<JR====/,,0,prep,topo,inanim,det,subs,complex,,,qal,perf,,,L,,,,,,
9,393610,NPL[,MT,1_Chronicles,5,10,JPLW,W B JMJ C>WL <FW MLXMH <M H HGR>JM W JPLW B JD...,W JPLW B JDM,,B JDM,B JD/,,0,prep,,,det,subs,complex,,,qal,wayq,,,B,,,,,,
12,393928,BW>[,MT,1_Chronicles,5,26,JBJ>M,W J<R >LHJ JFR>L >T RWX PWL MLK >CWR W >T RWX ...,W JBJ>M L XLX W XBWR W HR> W NHR GWZN <D H JWM...,,L XLX W XBWR W HR> W NHR GWZN,L XLX/ W XBWR/ W HR>/ W NHR/ GWZN/,,0,prep,topo topo topo topo,inanim inanim inanim inanim,det,nmpr,complex,,,hif,wayq,,,L,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9886,309241,<LH[,MT,Zechariah,14,13,<LTH,W HJH B JWM H HW> THJH MHWMT JHWH RBH BHM W HX...,W <LTH JDW <L JD R<HW,JDW,<L JD R<HW,<L JD/ R<=/,,0,prep,,,det,subs,complex,,,qal,perf,,,<L,,,,,,
9899,303776,QRB[,MT,Zephaniah,3,2,QRBH,L> CM<H B QWL L> LQXH MWSR B JHWH L> BVXH >L >...,>L >LHJH L> QRBH,,>L >LHJH,>L >LHJM/,,0,prep,,,det,subs,complex,,,qal,perf,,,>L,,,,,,
9900,303935,SWR[,MT,Zephaniah,3,11,>SJR,B JWM H HW> L> TBWCJ M KL <LJLTJK >CR PC<T BJ ...,KJ >Z >SJR M QRBK <LJZJ G>WTK,,M QRBK,MN QRB/,,0,prep,,,det,subs,complex,,,hif,impf,,,MN,,,,,,
9901,303958,XSH[,MT,Zephaniah,3,12,XSW,W HC>RTJ B QRBK <M <NJ W DL W XSW B CM JHWH,W XSW B CM JHWH,,B CM JHWH,B CM/ JHWH/,,0,prep,pers,anim,det,subs,complex,,,qal,perf,,,B,,,,,,


In [42]:
filtered_df = df[~((df.preposition_1 == "MN") & (df.preposition_2 == "") & (df.preposition_3 == ""))]
filtered_df = filtered_df[~((filtered_df.preposition_1 == "MN") & (filtered_df.preposition_2 == "MN") & (filtered_df.preposition_3 == ""))]
filtered_df = filtered_df[~((filtered_df.preposition_1 == "MN") & (filtered_df.preposition_2 == "MN") & (filtered_df.preposition_3 == "MN"))]
len(filtered_df)
len(filtered_df)

8799

In [45]:
len(df)

9905

### Save the dataframe into a file: the whole BHSA

In [46]:
filtered_df.to_csv("data/bhsa_all_verbs_without_min.csv", index=False)

In [None]:
#set(df.book)