In [1]:
import pandas as pd

from itertools import chain
from collections import Counter

from tfob import TFOb, get_xb

In [2]:
# Obtain the corpus handle from the tfob helper; later cells pass it to TFOb.all().
XB = get_xb()

   |     0.19s T det                  from ~/text-fabric-data/github/ETCBC/extrabiblical/tf/0.2


In [3]:
# Remove pandas' cap on displayed columns so wide DataFrames are shown in full.
pd.set_option("display.max_columns", None)

### 1. Create a dataset with all motion verbs (in predicative phrases)

#### 1.1 List of wanted motion verbs, adapted for the extrabiblical database

In [34]:
# Motion-verb lexemes, written with a trailing "[" marker. The marker is
# stripped below so the entries can be matched against the `lex` feature
# of the words.
_marked_motion_verbs = """
    BW>[ HLK[ JY>[ JRD[ <BR[
    <LH[ CWB[ >TH[ BRX[ GJX[
    GLH[ GLL[ DXP[ DLG[ HWH=[
    XWC[ XLP[ XSH[ VB<[ VWF[
    MHR[ MWC[ NGC[ NHR[ NWX[
    NWS[ NXT[ NVP[ NS<[ NPL[
    NTK[ SBB[ SWR[ SLQ[ <WZ[
    <WP[ PNH[ PF<[ YWP[ Y<D[
    QHL[ QPY[ QRB[ RWY[ FVH[
    CWX[ CWR=[ CVP[ CQQ[ T>R[
    T<H[
""".split()

motion_verbs = [marked.replace("[", "") for marked in _marked_motion_verbs]

#### 1.2 List of the scrolls I want in the dataset

In [35]:
# Start from every book name in the corpus and drop the two unwanted ones.
# list.remove() raises ValueError if a name is not present.
books = TFOb.all("book", XB).book
for unwanted_book in ("Shirata", "Pirqe"):
    books.remove(unwanted_book)
books

['1QH',
 '1QM',
 '1QS',
 'Kuntillet_Ajrud',
 'Arad',
 'Balaam',
 'Ketef_Hinnom',
 'Lachish',
 'Mesha_Stela',
 'Mesad_Hashavyahu',
 'Siloam']

In [36]:
# Inspect the `to_*` navigation names listed in the object's private `_levels` attribute.
TFOb.all("book", XB)._levels

['to_books',
 'to_chapters',
 'to_half_verses',
 'to_verses',
 'to_sentences',
 'to_sentence_atoms',
 'to_clauses',
 'to_clause_atoms',
 'to_phrases',
 'to_phrase_atoms',
 'to_subphrases',
 'to_words']

In [37]:
# Phrase functions that count as predicates.
predicates = ["Pred", "PreO", "PreS"]

# Restrict to the wanted books, then keep only their predicate phrases.
selected_books = TFOb.all("book", XB).filter_in(book=books)
phrases = selected_books.to_phrases.filter_in(function=predicates)

# Words of those phrases whose lexeme is one of the motion verbs.
predicate_words = phrases.to_words
verbs = predicate_words.filter_in(lex=motion_verbs)

verbs

<word_358 "HTGWLLTJ HTNPL HTHLKW GLJTH JCWB JCWB TPJL TGJCN NGLTH HWGCTJ >GJCN >BJ> NCJB SWR THLK HGJC HLKW TBJ> HTNPL HGJC [...] JCJB JB> JRD B> J<L JB> HCB J> JXLP JCB LK >HLK RD >RD JCB JB> JCB HCB HCBT JLKW">

In [38]:
# Count the selected motion-verb occurrences per book.
Counter(verbs.book)

Counter({'1QH': 119,
         '1QS': 119,
         '1QM': 90,
         'Balaam': 7,
         'Lachish': 7,
         'Mesha_Stela': 7,
         'Mesad_Hashavyahu': 4,
         'Arad': 3,
         'Ketef_Hinnom': 1,
         'Siloam': 1})

### 2. Necessary functions

In [42]:
def clean(g_cons):
    """Tidy a consonantal string.

    Underscores are turned into spaces; the ``׳`` character and the ASCII
    apostrophe are removed. The cleaned string is returned.
    """
    substitutions = (("_", " "), ("׳", ""), ("'", ""))
    cleaned = g_cons
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned


def find_clause(verb):
    """Return the clause(s) reached from ``verb``.

    Follows ``verb.to_clauses`` and applies ``to_clauses`` once more to that
    result; whatever the second step yields is returned unchanged.
    NOTE(review): the second ``to_clauses`` looks redundant — confirm against
    the tfob API.
    """
    return verb.to_clauses.to_clauses


def find_complements(verb):
    """Return the complement phrases of the clause(s) containing ``verb``.

    These are the phrases of ``verb.to_clauses`` whose ``function`` is
    ``"Cmpl"``. The filter result is returned as-is; callers test its
    truthiness to detect the absence of a complement.
    """
    clause_phrases = verb.to_clauses.to_phrases
    return clause_phrases.filter(function="Cmpl")


def find_subject(verb):
    """Return the subject phrase of the clause(s) containing ``verb``.

    Selects the phrases of ``verb.to_clauses`` whose ``function`` is
    ``"Subj"`` and returns the filter result as-is.

    Raises:
        AssertionError: if more than one subject phrase is found.
    """
    clause_phrases = verb.to_clauses.to_phrases
    subjects = clause_phrases.filter(function="Subj")
    # A clause is expected to have at most one subject phrase.
    assert len(subjects) <= 1
    return subjects


def find_cmpl_individuation(cmpl):
    """Return the individuation marker of a complement phrase.

    The result is, in order of precedence:

    * the part of speech (``"subs"`` or ``"nmpr"``) of the first noun in the
      phrase;
    * for a one-word phrase that is a preposition: ``"prsf"`` when the
      preposition carries a pronominal suffix, i.e. its ``prs`` value is
      neither ``"n/a"`` nor ``"absent"``;
    * for any other one-word phrase: the part of speech of that word;
    * ``""`` when none of the above applies.
    """
    words = cmpl.to_words

    # The first substantive / proper noun decides the result when there is one.
    first_noun = next(iter(words.filter_in(sp=["subs", "nmpr"])), None)
    if first_noun is not None:
        return first_noun.sp[0]

    if len(words) == 1:
        part_of_speech = words.sp[0]
        if part_of_speech != "prep":
            return part_of_speech
        # `prs` holds a list of values (find_cmpl_complex indexes it with
        # [0] as well); compare the value itself, not the whole list, with
        # the "no suffix" markers.
        if words.prs[0] not in ("n/a", "absent"):
            return "prsf"

    return ""


def find_cmpl_construction(cmpl):
    """Describe how a complement phrase is constructed.

    Each word of the phrase contributes at most one label, in word order:
    ``"prep"`` if the word is a preposition, otherwise ``"dir-he"`` if
    ``"H"`` occurs in its ``uvf`` values. The labels are joined with
    ``" + "``. When no word contributes a label, ``"vc"`` is returned.
    """
    labels = []

    for word in cmpl.to_words:
        if word.filter(sp="prep"):
            labels.append("prep")
            continue
        if "H" in word.uvf:
            labels.append("dir-he")

    return " + ".join(labels) if labels else "vc"
    

def find_cmpl_complex(cmpl):
    """Classify a complement phrase as ``"simple"`` or ``"complex"``.

    Prepositions and articles are ignored. The phrase is ``"simple"`` when
    no word remains, or when exactly one word remains and its first ``prs``
    value is ``"n/a"`` or ``"absent"``. Every other phrase is ``"complex"``.
    """
    content_words = [
        word for word in cmpl.to_words if word.sp[0] not in ("prep", "art")
    ]

    if not content_words:
        return "simple"

    if len(content_words) == 1:
        only_word = content_words[0]
        if only_word.to_words.prs[0] in ["n/a", "absent"]:
            return "simple"

    return "complex"

### 3. Generate the dataset

In [62]:
# Collect the complement phrases ("Cmpl") found in the clause of each verb,
# leaving out the verbs whose clause has none, then count them.
candidate_complements = (
    verb.to_clauses.to_phrases.filter(function="Cmpl") for verb in verbs
)
complements = [found for found in candidate_complements if found]

len(complements)

203

In [63]:
# Build the dataset records: one dict per (motion verb, complement phrase)
# pair. A verb whose clause has no complement still yields one record, with
# "no complement" in the `complement` and `cmpl_lex` columns and empty
# complement features.
# Animacy and name-type columns (cmpl_anim, cmpl_nt) are not generated here.

NO_COMPLEMENT = "no complement"

items = []  # one dict per future DataFrame row

for verb in verbs:
    scroll = verb.book[0]
    verse = verb.to_verses
    sign_info = ""  # column left empty in this notebook
    subject = find_subject(verb)  # also asserts at most one subject phrase

    complements = find_complements(verb)
    if not complements:
        # Sentinel so that the inner loop still emits one record for the verb.
        complements = [NO_COMPLEMENT]

    for complement in complements:
        # Identity check: the sentinel is the only non-tfob element that can
        # occur, so no tfob object is ever compared with a string.
        has_complement = complement is not NO_COMPLEMENT

        if has_complement:
            cmpl_lex = " ".join(complement.to_words.lex)
            dir_he = int("H" in complement.to_words.uvf)
            cmpl_det = complement.det[0]
            cmpl_indiv = find_cmpl_individuation(complement)
            cmpl_constr = find_cmpl_construction(complement)
            cmpl_complex = find_cmpl_complex(complement)
        else:
            cmpl_lex = NO_COMPLEMENT
            dir_he = cmpl_det = cmpl_indiv = cmpl_constr = cmpl_complex = ""

        # Key order defines the column order of the resulting DataFrame.
        item = {
            "verb_id": verb.ids[0],
            "lex": verb.lex[0],
            "scroll": scroll,
            "book": verb.book[0],
            "chapter": int(verb.chapter[0]),
            "verse_num": int(verb.verse[0]),
            "gcons_verb": verb.g_cons[0],
            "gcons_verse": str(verse),
            "gcons_clause": str(verb.to_clauses),
            "subject": str(subject),
            "complement": str(complement),
            "cmpl_lex": cmpl_lex,
            "cmpl_translation": "",
            "dir_he": dir_he,
            "cmpl_constr": cmpl_constr,
            "cmpl_det": cmpl_det,
            "cmpl_indiv": cmpl_indiv,
            "cmpl_complex": cmpl_complex,
            "sign_info": sign_info,
            "stem": verb.vs[0],
            "tense": verb.vt[0],
            "motion_type": "",
            "comments": "",
        }

        if has_complement:
            # One numbered column per preposition of the complement:
            # preposition_1, preposition_2, ...
            prepositions = complement.to_words.filter(sp="prep").lex
            for position, preposition in enumerate(prepositions, start=1):
                item[f"preposition_{position}"] = str(preposition)

        items.append(item)

### 3.4 Create the dataset with Pandas

In [64]:
# Turn the records into a DataFrame, blank out missing values (e.g. absent
# preposition_N columns) and order the rows by book, chapter and verse with
# a fresh 0..n-1 index.
df = (
    pd.DataFrame(items)
    .fillna("")
    .sort_values(
        by=["book", "chapter", "verse_num"],
        ascending=True,
        ignore_index=True,
    )
)

In [65]:
# Display the assembled dataset.
df

Unnamed: 0,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,gcons_clause,subject,complement,cmpl_lex,cmpl_translation,dir_he,cmpl_constr,cmpl_det,cmpl_indiv,cmpl_complex,sign_info,stem,tense,motion_type,comments,preposition_1,preposition_2,preposition_3,preposition_4
0,206,GLL,1QH,1QH,3,9,HTGWLLTJ,KJ B NDH HTGWLLTJ W M SWD RMH Y>TJ W L> NLWJTJ...,KJ B NDH HTGWLLTJ,,B NDH,B NDH,,0,prep,und,subs,simple,,hit,perf,,,B,,,
1,354,NPL,1QH,1QH,4,1,HTNPL,MZMWR L MFKJL L HTNPL L PNJ >L M<FJ >L W L HBJ...,L HTNPL L PNJ >L,,L PNJ >L,L PNH >L==,,0,prep,und,subs,complex,,hit,infc,,,L,,,
2,376,HLK,1QH,1QH,4,1,HTHLKW,MZMWR L MFKJL L HTNPL L PNJ >L M<FJ >L W L HBJ...,HTHLKW,,no complement,no complement,,,,,,,,hit,perf,,,,,,
3,447,GLH,1QH,1QH,4,4,GLJTH,>TH HW>L QDWCJM W B RZJ PL> K HWD< BWR KBWD K ...,>TH GLJTH DRKJ >MT W M<FJ R< XWKMH W >WLT,>TH,no complement,no complement,,,,,,,,qal,perf,,,,,,
4,709,CWB,1QH,1QH,4,19,JCWB,KJ W DBR K L> JCWB >XWR,W DBR K L> JCWB >XWR,DBR K,>XWR,>XWR,,0,vc,,subs,simple,,qal,impf,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364,24631,JRD,Mesha_Stela,Mesha_Stela,1,37,RD,W XWRNN JCB BT DWD B WQ >C W J>MR L J KMC RD H...,RD,,no complement,no complement,,,,,,,,qal,impv,,,,,,
365,24636,JRD,Mesha_Stela,Mesha_Stela,1,38,>RD,W >RD W>L TXM B QR W >XZ H W JCB B H KMC B JMJ...,W >RD W>L TXM B QR,,B QR,B H QR==,,0,prep,det,subs,simple,,qal,wayq,,,B,,,
366,24646,CWB,Mesha_Stela,Mesha_Stela,1,38,JCB,W >RD W>L TXM B QR W >XZ H W JCB B H KMC B JMJ...,W JCB B H KMC B JMJ,KMC,B H,B H=,,0,prep,det,,simple,,hif,wayq,,,B,,,
367,39838,HLK,Siloam,Siloam,1,1,JLKW,DBR H NQBH W ZH HJH DBR H NQBH B <WD H XYBM MN...,W JLKW H MJM MN H MWY> >L H BRKH B M>TJM W >LP...,H MJM,MN H MWY>,MN H MWY>,,0,prep,det,subs,simple,,qal,wayq,,,MN,,,


### 3.5 Save the dataset in a csv file

In [66]:
# Write the full dataset to CSV, leaving out the DataFrame index.
df.to_csv("data/extrabiblical_all_verbs.csv", index=False)

In [67]:
# Independent copy of the rows belonging to the scrolls 1QH, 1QM and 1QS.
# reset_index() without drop=True keeps the previous row labels in an
# `index` column. NOTE(review): confirm that column is wanted downstream.
selected_scrolls = ["1QH", "1QM", "1QS"]
df_1 = df.loc[df["scroll"].isin(selected_scrolls)].copy().reset_index()

In [68]:
# Write the three-scroll subset to its own CSV, leaving out the DataFrame index.
df_1.to_csv("data/1qs_1qm_1qh_all_verbs.csv", index=False)