In [1]:
import pandas as pd
from tfob import TFOb, get_bhsa

BHSA = get_bhsa()

**Locating corpus resources ...**

Name,# of nodes,# slots/node,% coverage
book,39,10938.21,100
chapter,929,459.19,100
lex,9230,46.22,100
verse,23213,18.38,100
half_verse,45179,9.44,100
sentence,63717,6.7,100
sentence_atom,64514,6.61,100
clause,88131,4.84,100
clause_atom,90704,4.7,100
phrase,253203,1.68,100


In [2]:
from collections import Counter

### 1.Create two lists with the verses from MT Isaiah and 1QIsaa

#### 1.1 List of wanted motion verbs

In [3]:
motion_verbs = ['BW>[','HLK[','JY>[','JRD[','<BR[',
                '<LH[','CWB[','>TH[','BRX[','GJX[',
                'GLH[','GLL[','DXP[','DLG[','HWH[',
                'XWC[','XLP[','XSH[','VB<[','VWF[',
                'MHR[','MWC[','NGC[','NHR[','NWX[',
                'NWS[','NXT[','NVP[','NS<[','NPL[',
                'NTK[','SBB[','SWR[','SLQ[','<WZ[',
                '<WP[','PNH[','PF<[','YWP[','Y<D[',
                'QHL[','QPY[','QRB[','RWY[','FVH[',
                'CWX[','CWR=[','CVP[','CQQ[','T>R[',
                'T<H[']

freq_verbs = ['BW>[','HLK[','JY>[','JRD[','<BR[',
                '<LH[','CWB[','>TH[','BRX[','GLH[',
                'HWH[','XLP[','XSH[','MHR[','NGC[',
                'NWX[','NWS[','NS<[','NPL[','NTK[',
                'SBB[','SWR[','<WP[','PNH[','PC<[',
                'QHL[','QRB[','RWY[','CVP[','T<H[']

#### 1.2 List of occurrences of the motion verbs (MT Isaiah)

In [4]:
verbs_bhsa = TFOb.all("word", BHSA).filter_in(lex=motion_verbs)
verbs_bhsa

<word_10909 "TWY> J<WPP TWY> J<LH JY> SBB SWBB HLK JNXHW JB> JPL JB>H MTHLK TLK CWBK TCWB JB> HBJ> JPLW NPLW [...] JY> HSB JB> H<BJRWNJ J<BJRHW JWLJKHW JSJRHW JSB JBJ>HW <LH HLJKW HBJ> JB>HW CWB <LWT J<L HBJ> JGL J<BR J<L">

In [5]:
#TESTING ZONE 
#verbs_bhsa[0]
#TFOb(215297, BHSA).to_verses.text
#TFOb(215297, BHSA).to_verses.pretty()
verb = verbs_bhsa[8]
cmpl = verb.to_clauses.to_phrases.filter(function="Loca")
print(verb.book[0], verb.chapter[0], verb.verse[0])
print(cmpl.text)
print(cmpl.to_words.sp)
print(cmpl.to_phrases.det)
print(cmpl.to_words.nametype)

Genesis 2 15

[]
[]
[]


### 2. Important function

In [6]:
# Create a dataset with the occurrences

def find_complements(verb):
    """Find the complement of a verb. If no match, returns None"""
    if verb.source.name == "BHSA":
        complements = verb.to_clauses.to_phrases.filter(function="Loca")
        return complements
    
    # if the verb is not BHSA, it's DSS
    verb_bhsa = find_bhsa_verb(verb)

    # Check if verses are identical
    if verb_bhsa and is_lex_identical(verb_bhsa.to_verses, find_dss_verse(verb)):
        return find_complements(verb_bhsa)
        
def find_cmpl_anim(cmpl):
    """Based on the nametype value, returns 'anim' or 'inanim' for one complement of a given verb. 
    If the nametype does not provide a clear distinction, returns 'check'.
    If there is no complement to the verb, returns None."""
    
    anim = ["pers", "gens", "god"]
    inanim = ["mens", "topo"]

    nouns = cmpl.to_words.filter_in(sp=["subs", "nmpr"])
    nouns_animacy = []
    for noun in nouns:
        nt = noun.nametype[0]
        if nt in anim:
            nouns_animacy.append("anim")
        elif nt in inanim:
            nouns_animacy.append("inanim")
    return " ".join(nouns_animacy)

def find_cmpl_nametype(cmpl):
    words = cmpl.to_words
    cmpl_nt = []
    for word in words:
        if word.nametype[0]:
            nt = word.nametype[0]
            cmpl_nt.append(nt)
    return " ".join(cmpl_nt)


def find_cmpl_individuation(cmpl):
    """Returns whether the nouns in a cmpl are substantives or proper nouns."""
    cmpl_indiv = []
    for word in cmpl.to_words.filter_in(sp=["subs","nmpr"]):
        cmpl_indiv.append(word.sp[0])
        
    if len(cmpl_indiv) == 0:
        cmpl_indiv.append("")
        
    return cmpl_indiv[0]

def find_cmpl_construction(cmpl):
    """Returns the construction of the complement: vc (for verbal complement), 
    prep for prepositional complement and dir_he for complement with directive-he 
    (or a combination of those)."""
    
    cmpl_construction = []
    
    for word in cmpl.to_words:
        if word.filter(sp="prep"):
            cmpl_construction.append("prep")
        elif "H" in word.uvf:
            cmpl_construction.append("dir-he")
            
    if cmpl_construction == []:
        return "vc"
    else:
        return " + ".join(cmpl_construction)
    
def find_cmpl_complex(cmpl):
    simplified_cmpl = []

    for word in cmpl.to_words:
        if word.sp[0] != "prep" and word.sp[0] != "art":
            simplified_cmpl.append(word)

    if len(simplified_cmpl) <= 1:
        cmpl_complex = "simple"
    else:
        cmpl_complex = "complex"
        
    return cmpl_complex

### 3. Generate dataset

In [7]:
items = [] # create an empty list to store all the information for each occ.

       
for verb in verbs_bhsa:
    
    # Add MT as "scroll" for the BHSA
    scroll = "MT"
    verse = verb.to_verses
    dir_he_dss_verse = ""
    sign_info = ""
    
    
    # Filling in the complement column
    complements = find_complements(verb)
    if not complements:
        complements = ["no complement"]
        

    for complement in complements:    
        if complement == "no complement":
            dir_he = cmpl_anim = cmpl_nt = cmpl_det = cmpl_indiv = cmpl_constr = cmpl_complex = ""
            cmpl_lex = "no complement"
          
        else: 
            cmpl_lex = str(" ".join(complement.to_words.lex))
            dir_he = int("H" in complement.to_words.uvf)
            cmpl_anim = find_cmpl_anim(complement)
            cmpl_nt = find_cmpl_nametype(complement)
            cmpl_det = cmpl.det
            cmpl_indiv = find_cmpl_individuation(complement)
            cmpl_constr = find_cmpl_construction(complement)
            cmpl_complex = find_cmpl_complex(complement)
            
    
        # Collect information about the following variables:    
        item = {
            "verb_id": verb.ids[0], 
            "lex": verb.lex[0], 
            "scroll": scroll,
            "book": verb.book[0], 
            "chapter": int(verb.chapter[0]), 
            "verse_num": int(verb.verse[0]),
            "gcons_verb": verb.g_cons[0],
            "gcons_verse": str(verse),
            "gcons_clause": str(verb.to_clauses),
            "subject": str(verb.to_clauses.to_phrases.filter(function="Subj")),
            "complement": str(complement),
            "cmpl_lex": cmpl_lex,
            "dir_he": dir_he,
            "cmpl_constr": cmpl_constr,
            "cmpl_nt": cmpl_nt,
            "cmpl_anim": cmpl_anim,
            "cmpl_det": cmpl_det,
            "cmpl_indiv": cmpl_indiv,
            "cmpl_complex": cmpl_complex,
            "dir_he_dss": dir_he_dss_verse,
            "sign_info": sign_info,
            "stem": verb.vs[0],
            "tense": verb.vt[0],
        }
        
        if complement != "no complement":
            prepositions = complement.to_words.filter(sp="prep").lex
            n = 0
            for preposition in prepositions:
                n += 1
                item[f"preposition_{n}"] = str(preposition)

        items.append(item)

### 4. Create the dataframe with pandas

In [8]:
df = pd.DataFrame(items).fillna("")
df.sort_values(["book", "chapter", "verse_num"], ascending=[True, True, True], ignore_index=True, inplace=True)

In [16]:
pd.set_option('display.max_columns', None)
#df[df.book == "Exodus"]
#df[df.cmpl_complex == "complex"]

df_loca = df[df.complement != "no complement"]
len(df_loca)

183

In [14]:
df_loca[df_loca.book == "Isaiah"]

Unnamed: 0,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,gcons_clause,subject,complement,cmpl_lex,dir_he,cmpl_constr,cmpl_nt,cmpl_anim,cmpl_det,cmpl_indiv,cmpl_complex,dir_he_dss,sign_info,stem,tense,preposition_1,preposition_2,preposition_3,preposition_4,preposition_5,preposition_6
6154,214306,JY>[,MT,Isaiah,7,3,Y>,W J>MR JHWH >L JC<JHW Y> N> L QR>T >XZ >TH W C...,Y> N> >TH W C>R_JCWB BNK >L QYH T<LT H BRKH H ...,>TH W C>R_JCWB BNK,>L QYH T<LT H BRKH H <LJWNH >L MSLT FDH KWBS,>L QYH=/ T<LH/ H BRKH=/ H <LJWN/ >L MSLH/ FDH/...,0,prep + prep,,,[],subs,complex,,,qal,impv,>L,>L,,,,
6204,216346,<WP[,MT,Isaiah,11,14,<PW,W <PW B KTP PLCTJM JMH JXDW JBZW >T BNJ QDM >D...,W <PW B KTP PLCTJM JMH,,JMH,JM/,1,dir-he,,,[],subs,simple,,,qal,perf,,,,,,
6234,217322,XSH[,MT,Isaiah,14,32,JXSW,W MH J<NH ML>KJ GWJ KJ JHWH JSD YJWN W BH JXSW...,W BH JXSW <NJJ <MW,<NJJ <MW,BH,B,0,prep,,,[],,simple,,,qal,impf,B,,,,,
6261,218744,BW>[,MT,Isaiah,21,1,B>,MF> MDBR JM K SWPWT B NGB L XLP M MDBR B> M >R...,K SWPWT B NGB M MDBR B> M >RY NWR>H,,B NGB,B H NGB/,0,prep,,,[],subs,simple,,,qal,perf,B,,,,,
6286,219569,NWX[,MT,Isaiah,23,12,JNWX,W J>MR L> TWSJPJ <WD L <LWZ H M<CQH BTWLT BT Y...,GM CM L> JNWX LK,,GM CM,GM CM,0,vc,,,[],,complex,,,qal,impf,,,,,,
6315,220911,CWB[,MT,Isaiah,28,6,MCJBJ,W L RWX MCPV L JWCB <L H MCPV W L GBWRH MCJBJ ...,MCJBJ MLXMH C<RH,,C<RH,C<R/,1,dir-he,,,[],subs,simple,,,hif,ptca,,,,,,
6359,222571,JRD[,MT,Isaiah,31,4,JRD,KJ KH >MR JHWH >LJ K >CR JHGH H >RJH W H KPJR ...,KN JRD JHWH YB>WT <L HR YJWN W <L GB<TH,JHWH YB>WT,<L HR YJWN W <L GB<TH,<L HR/ YJWN==/ W <L GB<H/,0,prep + prep,topo,inanim,[],subs,complex,,,qal,impf,<L,<L,,,,
6411,224403,NPL[,MT,Isaiah,37,7,HPLTJW,HNNJ NWTN BW RWX W CM< CMW<H W CB >L >RYW W HP...,W HPLTJW B XRB B >RYW,,B >RYW,B >RY/,0,prep,,,[],subs,simple,,,hif,perf,B,,,,,
6451,225609,PNH[,MT,Isaiah,40,3,PNW,QWL QWR> B MDBR PNW DRK JHWH JCRW B <RBH MSLH ...,B MDBR PNW DRK JHWH,,B MDBR,B H MDBR/,0,prep,,,[],subs,simple,,,piel,impv,B,,,,,
6617,231658,<LH[,MT,Isaiah,57,7,<LJT,<L HR GBH W NF> FMT MCKBK GM CM <LJT L ZBX ZBX,GM CM <LJT,,GM CM,GM CM,0,vc,,,[],,complex,,,qal,perf,,,,,,


### Save the dataframe into a file: the whole BHSA

In [11]:
#df_loca.to_csv("data/bhsa_all_verbs_locatives.csv", index=False)

### Save another csv file without the occurrences of Isaiah + complement, for easier merging with the Isaiah dataset

In [24]:
df2 = pd.DataFrame(items).fillna("")
df2.sort_values(["book", "chapter", "verse_num"], ascending=[True, True, True], ignore_index=True, inplace=True)

In [25]:
# Drop the rows from Isaiah having a complement different than "no complement" (already present in the other dataset)

for index, row in df2.iterrows():
    if row["complement"] != "no complement" and row["book"] == "Isaiah":
        df2.drop(index, inplace=True)

In [26]:
# Drop the dir_he_dss column (droped in the other dataset)

df2 = df2.drop("dir_he_dss", axis=1)

In [27]:
df2.columns

Index(['verb_id', 'lex', 'scroll', 'book', 'chapter', 'verse_num',
       'gcons_verb', 'gcons_verse', 'gcons_clause', 'subject', 'complement',
       'cmpl_lex', 'dir_he', 'cmpl_constr', 'cmpl_nt', 'cmpl_anim', 'cmpl_det',
       'cmpl_indiv', 'cmpl_complex', 'sign_info', 'stem', 'tense',
       'preposition_1', 'preposition_2', 'preposition_3', 'preposition_4',
       'preposition_5', 'preposition_6'],
      dtype='object')

In [28]:
# Show the occurrences where DSS verse = BHSA verse
# df[(df.scroll != "MT") & (df.complement != "")]
# df[(df.complement != "")]
df2[(df2.book == "Isaiah") & (df2.complement != "no complement")]

Unnamed: 0,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,gcons_clause,subject,complement,cmpl_lex,dir_he,cmpl_constr,cmpl_nt,cmpl_anim,cmpl_det,cmpl_indiv,cmpl_complex,sign_info,stem,tense,preposition_1,preposition_2,preposition_3,preposition_4,preposition_5,preposition_6
