In [1]:
import pandas as pd

from itertools import chain

from tfob import TFOb, XB

**Locating corpus resources ...**

Name,# of nodes,# slots/node,% coverage
scroll,1001,1428.81,100
lex,10450,129.14,94
fragment,11182,127.91,100
line,52895,27.04,100
clause,125,12.85,0
cluster,101099,6.68,47
phrase,315,5.1,0
word,500995,2.81,99
sign,1430241,1.0,100


**Locating corpus resources ...**

Name,# of nodes,# slots/node,% coverage
book,39,10938.21,100
chapter,929,459.19,100
lex,9230,46.22,100
verse,23213,18.38,100
half_verse,45179,9.44,100
sentence,63717,6.7,100
sentence_atom,64514,6.61,100
clause,88131,4.84,100
clause_atom,90704,4.7,100
phrase,253203,1.68,100


### 1.Create a dataset with all motion verbs (in predicative phrases)

#### 1.1 List of wanted motion verbs

In [2]:
BOOK = "1QM"

motion_verbs = ['BW>[','HLK[','JY>[','JRD[','<BR[',
                '<LH[','CWB[','>TH[','BRX[','GJX[',
                'GLH[','GLL[','DXP[','DLG[','HWH[',
                'XWC[','XLP[','XSH[','VB<[','VWF[',
                'MHR[','MWC[','NGC[','NHR[','NWX[',
                'NWS[','NXT[','NVP[','NS<[','NPL[',
                'NTK[','SBB[','SWR[','SLQ[','<WZ[',
                '<WP[','PNH[','PF<[','YWP[','Y<D[',
                'QHL[','QPY[','QRB[','RWY[','FVH[',
                'CWX[','CWR[','CVP[','CQQ[','T>R[',
                'T<H[']

motion_verbs = [verb.replace("[", "") for verb in motion_verbs]
#motion_verbs

#### 1.2 List of occurrences of the motion verbs (1QS)

In [3]:
# Keep only the phrases being predicates

phrases = TFOb.all("book", XB).filter(book=BOOK).to_phrases.filter(function="Pred")

# List of verbs in predicative phrases
verbs = phrases.to_words.filter_in(lex=motion_verbs)

In [4]:
verbs

<word_90 "CWB J<LW JY> NPL SRH HLWK NPWL JTQRBW Y>T Y>T Y>T CWB HNJS Y>T HPJL JCJB CWB BW> BW> LKT [...] TCJB JLKW HTHLK TCWBW Y>T HPJL NPWL Y>T CWB NGC HTQRB HPJL JNJXW NPLW BW> HSJR HSJR JBW>W NPLW NGC">

In [34]:
TFOb.all("book", XB).book

['1QH',
 '1QM',
 '1QS',
 'Kuntillet_Ajrud',
 'Arad',
 'Balaam',
 'Ketef_Hinnom',
 'Lachish',
 'Mesha_Stela',
 'Mesad_Hashavyahu',
 'Pirqe',
 'Shirata',
 'Siloam']

### 2. Necessary functions

In [10]:
def clean(g_cons):
    return g_cons.replace("_", " ").replace("׳", "").replace("'", "")


def find_clause(verb):
    """Find the complement of a verb. If no match, returns None"""
    clause = verb.to_clauses.to_clauses
    return clause


def find_complements(verb):
    """Find the complement of a verb. If no match, returns None"""
    complements = verb.to_clauses.to_phrases.filter(function="Cmpl")
    return complements


def find_subject(verb):
    """Find the subject of a verb. If no match, returns None"""
    subjects = verb.to_clauses.to_phrases.filter(function="Subj")
    assert len(subjects) <= 1
    return subjects


def is_sign_unc(verse):
    """If a verse contains a missing or uncertain sign, returns True. Else, returns False."""
    
    unc_types = ['missing', 'unc']
    verse_sign_types = []
    
    for sign in verse.to_signs:
        if sign.type[0] in unc_types:
            sign_type = 1
        else:
            sign_type = 0
            
        verse_sign_types.append(sign_type)
    
    return verse_sign_types

### 3. Generate the dataset

In [11]:
complements = []

for verb in verbs:
    cmpl = verb.to_clauses.to_phrases.filter(function="Cmpl")
    if cmpl:
        complements.append(cmpl)
        
len(complements)    

50

In [12]:
# Create a dataset with the occurrences


items = [] # create an empty list to store all the information for each occ.

       
for verb in verbs:
    
    # Add the scroll name
    scroll = BOOK
    verse = verb.to_verses
    sign_info = ""
    clause = find_clause(verb)
    subject = find_subject(verb)
    complements = find_complements(verb)
    dir_he_dss_verse = ""
    
    if not complements:
        complements = [""]


    for complement in complements: 
        
        if complement != "":
            dir_he = int("H" in complement.to_words.uvf[0])
    
        # Collect information about the following variables:    
        item = {
            "verb_id": verb.ids[0], 
            "lex": verb.lex[0], 
            "scroll": scroll,
            "book": verb.book[0], 
            "chapter": verb.chapter[0], 
            "verse_num": verb.verse[0],
            "gcons_verb": clean(verb.g_cons[0]),
            "gcons_verse": clean(str(verse)),
            "gcons_clause": clean(str(find_clause(verb))),
            "subject": clean(str(subject)),
            "complement": clean(str(complement)),
            "dir_he": dir_he,
            "sign_info": sign_info,
            "stem": verb.vs[0],
            "tense": verb.vt[0],
        }
        
        if complement != "": 
            prepositions = complement.to_words.filter(sp="prep").lex
            n = 0
            for preposition in prepositions:
                n += 1
                item[f"preposition_{n}"] = str(preposition)
                
        items.append(item)

### 3.4 Create the dataset with Pandas

In [13]:
df = pd.DataFrame(items).fillna("")
df.sort_values(["book", "chapter", "verse_num"], ascending=[True, True, True], ignore_index=True, inplace=True)

In [14]:
df

Unnamed: 0,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,gcons_clause,subject,complement,dir_he,sign_info,stem,tense,preposition_1,preposition_2
0,11144,SWR,1QM,1QM,1,10,SRH,W SRH MMCLT KTJJM L HKNJ< RC<H L >JN C>RJT,W SRH MMCLT KTJJM,MMCLT KTJJM,,0,,qal,perf,,
1,11168,HLK,1QM,1QM,1,12,HLWK,W BNJ YDQ J>JRW L KWL QYWWT TBL HLWK W >WR <D ...,HLWK,,,0,,qal,infc,,
2,11204,NPL,1QM,1QM,1,14,NPWL,W B JWM NPWL B W KTJJM QRB W NXCJR XZQ L PNJ >...,NPWL B W KTJJM,KTJJM,B W,0,,qal,infc,B,
3,11232,QRB,1QM,1QM,1,15,JTQRBW,B W JTQRBW L NXCJR GDWL <DT >LJM W QHLT >NCJM,B W JTQRBW L NXCJR GDWL <DT >LJM W QHLT >NCJM,<DT >LJM W QHLT >NCJM,,0,,hit,impf,,
4,11060,CWB,1QM,1QM,1,4,CWB,BNJ LWJ W BNJ JHWDH W BNJ BNJMJN GWLT H MDBR J...,B CWB GWLT BNJ >WR M MDBR H <MJM,GWLT BNJ >WR,M MDBR H <MJM,0,,qal,infc,MN,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,15630,CWB,1QM,1QM,9,18,TCJB,W L NKBDJ HM TCJB L BWZ,W L NKBDJ HM TCJB L BWZ,,L NKBDJ HM,0,,hif,impf,L,
89,15630,CWB,1QM,1QM,9,18,TCJB,W L NKBDJ HM TCJB L BWZ,W L NKBDJ HM TCJB L BWZ,,L BWZ,0,,hif,impf,L,
90,15429,CWB,1QM,1QM,9,3,CBW,W CBW >L MQWM <WMD M >CR SDRW CM H M<RKH L PNJ...,W CBW >L MQWM,,>L MQWM,0,,qal,perf,>L,
91,15440,PNH,1QM,1QM,9,3,PNJ,W CBW >L MQWM <WMD M >CR SDRW CM H M<RKH L PNJ...,L PNJ NPWL XLLJ H >WJB,XLLJ H >WJB,,0,,,,,


### 3.5 Save the dataset in a csv file

In [15]:
df.to_csv("data/1qm_all_verbs.csv", index=False)