In [3]:
import pandas as pd

from itertools import chain

from tfob import TFOb, get_xb

XB = get_xb()

In [4]:
from collections import Counter

### 1.Create a dataset with all motion verbs (in predicative phrases)

#### 1.1 List of wanted motion verbs, adapted for the extrabiblical database

In [5]:
motion_verbs = ['BW>[','HLK[','JY>[','JRD[','<BR[',
                '<LH[','CWB[','>TH[','BRX[','GJX[',
                'GLH[','GLL[','DXP[','DLG[','HWH[',
                'XWC[','XLP[','XSH[','VB<[','VWF[',
                'MHR[','MWC[','NGC[','NHR[','NWX[',
                'NWS[','NXT[','NVP[','NS<[','NPL[',
                'NTK[','SBB[','SWR[','SLQ[','<WZ[',
                '<WP[','PNH[','PF<[','YWP[','Y<D[',
                'QHL[','QPY[','QRB[','RWY[','FVH[',
                'CWX[','CWR=[','CVP[','CQQ[','T>R[',
                'T<H[']

motion_verbs = [verb.replace("[", "") for verb in motion_verbs]
#motion_verbs

#### 1.2 List of the scrolls I want in the dataset

In [6]:
books = TFOb.all("book", XB).book
books.remove("Shirata")
books.remove("Pirqe")
books

['1QH',
 '1QM',
 '1QS',
 'Kuntillet_Ajrud',
 'Arad',
 'Balaam',
 'Ketef_Hinnom',
 'Lachish',
 'Mesha_Stela',
 'Mesad_Hashavyahu',
 'Siloam']

In [7]:
TFOb.all("book", XB)._levels

['to_books',
 'to_chapters',
 'to_half_verses',
 'to_verses',
 'to_sentences',
 'to_sentence_atoms',
 'to_clauses',
 'to_clause_atoms',
 'to_phrases',
 'to_phrase_atoms',
 'to_subphrases',
 'to_words']

In [8]:
# Keep only the phrases being predicates

phrases = TFOb.all("book", XB).filter_in(book=books).to_phrases.filter(function="Pred")
# List of verbs in predicative phrases
verbs = phrases.to_words.filter_in(lex=motion_verbs)

verbs

<word_358 "HTGWLLTJ HTNPL HTHLKW GLJTH JCWB JCWB TPJL TGJCN NGLTH HWGCTJ >GJCN >BJ> NCJB SWR THLK HGJC HLKW TBJ> HTNPL HGJC [...] JCJB JB> JRD B> J<L JB> HCB J> JXLP JCB LK >HLK RD >RD JCB JB> JCB HCB HCBT JLKW">

In [9]:
Counter(verbs.book)

Counter({'1QH': 119,
         '1QS': 119,
         '1QM': 90,
         'Balaam': 7,
         'Lachish': 7,
         'Mesha_Stela': 7,
         'Mesad_Hashavyahu': 4,
         'Arad': 3,
         'Ketef_Hinnom': 1,
         'Siloam': 1})

### 2. Necessary functions

In [30]:
def clean(g_cons):
    return g_cons.replace("_", " ").replace("׳", "").replace("'", "")


def find_clause(verb):
    """Find the complement of a verb. If no match, returns None"""
    clause = verb.to_clauses.to_clauses
    return clause


def find_complements(verb):
    """Find the complement of a verb. If no match, returns None"""
    complements = verb.to_clauses.to_phrases.filter(function="Cmpl")
    return complements


def find_subject(verb):
    """Find the subject of a verb. If no match, returns None"""
    subjects = verb.to_clauses.to_phrases.filter(function="Subj")
    assert len(subjects) <= 1
    return subjects

### 3. Generate the dataset

In [31]:
complements = []

for verb in verbs:
    cmpl = verb.to_clauses.to_phrases.filter(function="Cmpl")
    if cmpl:
        complements.append(cmpl)
        
len(complements)    

203

In [32]:
# Create a dataset with the occurrences


items = [] # create an empty list to store all the information for each occ.

       
for verb in verbs:
    
    # Add the scroll name
    scroll = verb.book[0]
    verse = verb.to_verses
    sign_info = ""
    clause = find_clause(verb)
    subject = find_subject(verb)
    complements = find_complements(verb)
    dir_he_dss_verse = ""
    
    if not complements:
        complements = [""]
        dir_he = ""


    for complement in complements: 
        
        if complement != "":
            dir_he = int("H" in complement.to_words.uvf[0])
    
        # Collect information about the following variables:    
        item = {
            "verb_id": verb.ids[0], 
            "lex": verb.lex[0], 
            "scroll": scroll,
            "book": verb.book[0], 
            "chapter": verb.chapter[0], 
            "verse_num": verb.verse[0],
            "gcons_verb": clean(verb.g_cons[0]),
            "gcons_verse": clean(str(verse)),
            "gcons_clause": clean(str(find_clause(verb))),
            "subject": clean(str(subject)),
            "complement": clean(str(complement)),
            "dir_he": dir_he,
            "sign_info": sign_info,
            "stem": verb.vs[0],
            "tense": verb.vt[0],
        }
        
        if complement != "": 
            prepositions = complement.to_words.filter(sp="prep").lex
            n = 0
            for preposition in prepositions:
                n += 1
                item[f"preposition_{n}"] = str(preposition)
                
        items.append(item)

### 3.4 Create the dataset with Pandas

In [16]:
df = pd.DataFrame(items).fillna("")
df.sort_values(["book", "chapter", "verse_num"], ascending=[True, True, True], ignore_index=True, inplace=True)

In [17]:
df

Unnamed: 0,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,gcons_clause,subject,complement,dir_he,sign_info,stem,tense,preposition_1,preposition_2,preposition_3,preposition_4
0,3132,NPL,1QH,1QH,10,13,NPLW,W PXJM VMNW L NPC J NPLW B M,NPLW B M,,B M,0,,qal,perf,B,,,
1,3058,SBB,1QH,1QH,10,7,SBBW,W >NJ >MRTJ XNW <LJ GBWRJM SBBW M B KL KLJ MLX...,SBBW M B KL KLJ MLXMWT M,,,,,qal,perf,,,,
2,3354,BW>,1QH,1QH,12,5,B>W,KJ> B>W BNJM <D MCBRJ MWT W HRJT GBR HYRH B XB...,KJ> B>W BNJM <D MCBRJ MWT,BNJM,<D MCBRJ MWT,0,,qal,perf,<D,,,
3,3376,GJX,1QH,1QH,12,6,JGJX,KJ> B MCBRJ MWT TMLJV ZKR W B XBLJ C>WL JGJX M...,W B XBLJ C>WL JGJX M KWR HRJH PL> JW<Y <M GBWRT W,PL> JW<Y <M GBWRT W,M KWR HRJH,0,,qal,impf,MN,,,
4,3393,XWC,1QH,1QH,12,8,HXJCW,HXJCW KWL MCBRJM W XBLJ MRY B MWLDJ HM W PLYWT...,HXJCW KWL MCBRJM,KWL MCBRJM,,,,hif,perf,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364,24646,CWB,Mesha_Stela,Mesha_Stela,1,38,JCB,W >RD W>L TXM B QR W >XZ H W JCB B H KMC B JMJ...,W JCB B H KMC B JMJ,KMC,B H,0,,hif,wayq,B,,,
365,24234,XLP,Mesha_Stela,Mesha_Stela,1,6,JXLP,W JXLP H BN H W J>MR GM H> ><NW >T M>B,W JXLP H BN H,,,,,qal,wayq,,,,
366,24284,CWB,Mesha_Stela,Mesha_Stela,1,9,JCB,W JCB B H JM H W XYJ JMJ BN H >RB<N CT W JCB H...,W JCB H KMC B JMJ,KMC,,,,hif,wayq,,,,
367,39838,HLK,Siloam,Siloam,1,1,JLKW,DBR H NQBH W ZH HJH DBR H NQBH B <WD H XYBM MN...,W JLKW H MJM MN H MWY> >L H BRKH B M>TJM W >LP...,H MJM,MN H MWY>,0,,qal,wayq,MN,,,


### 3.5 Save the dataset in a csv file

In [None]:
df.to_csv("data/extrabiblical_all_verbs.csv", index=False)