In [1]:
import pandas as pd

from itertools import chain

from tfob import TFOb, get_xb

# Corpus handle passed to every `TFOb.all(...)` query in this notebook.
# NOTE(review): presumably the extrabiblical Text-Fabric dataset — confirm in `tfob.get_xb`.
XB = get_xb()

In [None]:
from collections import Counter

### 1. Create a dataset with all motion verbs (in predicative phrases)

#### 1.1 List of wanted motion verbs, adapted for the extrabiblical database

In [None]:
# Lexemes of the wanted motion verbs. They are written with a trailing "["
# (presumably the verb marker of the source transcription — TODO confirm),
# which is stripped here so that the values match the `lex` feature used in
# the queries below.
motion_verbs = [
    marked_lexeme.replace("[", "")
    for marked_lexeme in (
        'BW>[', 'HLK[', 'JY>[', 'JRD[', '<BR[',
        '<LH[', 'CWB[', '>TH[', 'BRX[', 'GJX[',
        'GLH[', 'GLL[', 'DXP[', 'DLG[', 'HWH[',
        'XWC[', 'XLP[', 'XSH[', 'VB<[', 'VWF[',
        'MHR[', 'MWC[', 'NGC[', 'NHR[', 'NWX[',
        'NWS[', 'NXT[', 'NVP[', 'NS<[', 'NPL[',
        'NTK[', 'SBB[', 'SWR[', 'SLQ[', '<WZ[',
        '<WP[', 'PNH[', 'PF<[', 'YWP[', 'Y<D[',
        'QHL[', 'QPY[', 'QRB[', 'RWY[', 'FVH[',
        'CWX[', 'CWR=[', 'CVP[', 'CQQ[', 'T>R[',
        'T<H[',
    )
]

#### 1.2 List of the scrolls I want in the dataset

In [None]:
# Start from every scroll name in the corpus, then drop the scrolls that are
# left out of the dataset. `remove` raises ValueError if a name is missing,
# which makes a change in the corpus contents visible immediately.
books = TFOb.all("book", XB).book
for excluded_scroll in ("Shirata", "Pirqe"):
    books.remove(excluded_scroll)
books

In [None]:
# Exploratory check: display the `_levels` attribute of the book-level TFOb object.
# NOTE(review): `_levels` is a private attribute whose meaning is not visible in
# this notebook — confirm it is still needed here.
TFOb.all("book", XB)._levels

In [None]:
# Restrict the corpus to the selected scrolls and keep only the phrases whose
# function is "Pred" (predicate phrases).
phrases = (
    TFOb.all("book", XB)
    .filter_in(book=books)
    .to_phrases
    .filter(function="Pred")
)

# Words of those predicate phrases whose lexeme is one of the motion verbs.
verbs = (
    phrases
    .to_words
    .filter_in(lex=motion_verbs)
)

verbs

In [None]:
# Tally the selected motion-verb occurrences by the values of `verbs.book`.
# assumes `verbs.book` yields one scroll name per verb — TODO confirm
Counter(verbs.book)

### 2. Necessary functions

In [None]:
def clean(g_cons):
    """Normalise a transcribed string for the dataset.

    Underscores become spaces; the Hebrew geresh (׳) and the ASCII apostrophe
    are removed. Everything else is returned unchanged.
    """
    cleaned = g_cons
    for unwanted, substitute in (("_", " "), ("׳", ""), ("'", "")):
        cleaned = cleaned.replace(unwanted, substitute)
    return cleaned


def find_clause(verb):
    """Return the clause-level object reached from ``verb``.

    The value is ``verb.to_clauses.to_clauses``, i.e. the clause navigation is
    applied twice.
    NOTE(review): the second ``.to_clauses`` looks redundant — confirm against
    the TFOb API before simplifying.
    """
    containing_clauses = verb.to_clauses
    return containing_clauses.to_clauses


def find_complements(verb):
    """Return the complement phrases of the clause containing ``verb``.

    These are the phrases of ``verb.to_clauses`` whose ``function`` is
    ``"Cmpl"``. The filter result is returned as is; callers in this notebook
    rely on it being falsy when nothing matches.
    """
    clause_phrases = verb.to_clauses.to_phrases
    return clause_phrases.filter(function="Cmpl")


def find_subject(verb):
    """Return the subject phrase(s) of the clause containing ``verb``.

    These are the phrases of ``verb.to_clauses`` whose ``function`` is
    ``"Subj"``. The filter result is returned as is, so it may be empty.

    Raises:
        AssertionError: if more than one subject phrase is found.
    """
    clause_phrases = verb.to_clauses.to_phrases
    subject_phrases = clause_phrases.filter(function="Subj")
    # The dataset stores a single subject per verb, so several would be a data problem.
    assert len(subject_phrases) <= 1
    return subject_phrases

### 3. Generate the dataset

In [None]:
# Sanity check: for how many of the selected verbs does the clause contain at
# least one complement phrase? The query goes through `find_complements` so
# that this count and the dataset loop use exactly the same definition.
complements = [
    verb_complements
    for verb_complements in (find_complements(verb) for verb in verbs)
    if verb_complements
]

len(complements)

In [None]:
# Create a dataset with the occurrences: one row per (verb, complement) pair.
# A verb whose clause has no complement still gets one row, with the
# complement-related columns left empty.

items = []  # one dict per row of the final dataset

for verb in verbs:

    # Everything that depends only on the verb is computed once, outside the
    # complement loop.
    scroll = verb.book[0]
    verse = verb.to_verses
    clause = find_clause(verb)
    subject = find_subject(verb)

    gcons_verse = clean(str(verse))
    gcons_clause = clean(str(clause))
    subject_text = clean(str(subject))

    # `None` stands in for "no complement" so that such verbs are kept.
    verb_complements = find_complements(verb) or [None]

    for complement in verb_complements:

        if complement is None:
            complement_text = ""
            dir_he = ""
            prepositions = []
        else:
            complement_text = clean(str(complement))
            # 1 if the `uvf` value of the complement's first word contains "H", else 0.
            # NOTE(review): only the first word is inspected; an "H" carried by
            # a later word of the complement is not detected — confirm this is
            # the intended definition of `dir_he`.
            dir_he = int("H" in complement.to_words.uvf[0])
            prepositions = complement.to_words.filter(sp="prep").lex

        # Collect information about the following variables:
        item = {
            "verb_id": verb.ids[0],
            "lex": verb.lex[0],
            "scroll": scroll,
            "book": verb.book[0],
            "chapter": verb.chapter[0],
            "verse_num": verb.verse[0],
            "gcons_verb": clean(verb.g_cons[0]),
            "gcons_verse": gcons_verse,
            "gcons_clause": gcons_clause,
            "subject": subject_text,
            "complement": complement_text,
            "dir_he": dir_he,
            "sign_info": "",  # placeholder column, always empty at this stage
            "stem": verb.vs[0],
            "tense": verb.vt[0],
        }

        # One extra column per preposition of the complement, numbered from 1.
        for n, preposition in enumerate(prepositions, start=1):
            item[f"preposition_{n}"] = str(preposition)

        items.append(item)

### 3.4 Create the dataset with Pandas

In [None]:
# Build the frame in one chain: rows that lack some `preposition_<n>` column
# get an empty string there, then the rows are ordered by book, chapter and
# verse with a fresh 0..n-1 index.
df = (
    pd.DataFrame(items)
    .fillna("")
    .sort_values(
        ["book", "chapter", "verse_num"],
        ascending=[True, True, True],
        ignore_index=True,
    )
)

In [None]:
# Display the assembled dataset for visual inspection.
df

### 3.5 Save the dataset to a CSV file

In [None]:
# Write the dataset to disk without the pandas index column. The path is
# relative to the notebook's working directory.
# NOTE(review): the file name says "all_verbs" although `verbs` was filtered to
# the motion verbs only — confirm the name is intended.
df.to_csv("data/extrabiblical_all_verbs.csv", index=False)