In [6]:
from tfob import TFOb, BHSA, DSS

**Locating corpus resources ...**

Name,# of nodes,# slots/node,% coverage
scroll,1001,1428.81,100
lex,10450,129.14,94
fragment,11182,127.91,100
line,52895,27.04,100
clause,125,12.85,0
cluster,101099,6.68,47
phrase,315,5.1,0
word,500995,2.81,99
sign,1430241,1.0,100


**Locating corpus resources ...**

Name,# of nodes,# slots/node,% coverage
book,39,10938.21,100
chapter,929,459.19,100
lex,9230,46.22,100
verse,23213,18.38,100
half_verse,45179,9.44,100
sentence,63717,6.7,100
sentence_atom,64514,6.61,100
clause,88131,4.84,100
clause_atom,90704,4.7,100
phrase,253203,1.68,100


In [7]:
import pandas as pd
from itertools import chain

### 1.Create two lists with the verses from MT Isaiah and 1QIsaa

#### 1.1 List of wanted motion verbs

In [8]:
motion_verbs = ['BW>[','HLK[','JY>[','JRD[','<BR[',
                '<LH[','CWB[','>TH[','BRX[','GJX[',
                'GLH[','GLL[','DXP[','DLG[','HWH[',
                'XWC[','XLP[','XSH[','VB<[','VWF[',
                'MHR[','MWC[','NGC[','NHR[','NWX[',
                'NWS[','NXT[','NVP[','NS<[','NPL[',
                'NTK[','SBB[','SWR[','SLQ[','<WZ[',
                '<WP[','PNH[','PC<[','YWP[','Y<D[',
                'QHL[','QPY[','QRB[','RWY[','FVH[',
                'CWX[','CWR[','CVP[','CQQ[','T>R[',
                'T<H[']

freq_verbs = ['BW>[','HLK[','JY>[','JRD[','<BR[',
                '<LH[','CWB[','>TH[','BRX[','GLH[',
                'HWH[','XLP[','XSH[','MHR[','NGC[',
                'NWX[','NWS[','NS<[','NPL[','NTK[',
                'SBB[','SWR[','<WP[','PNH[','PC<[',
                'QHL[','QRB[','RWY[','CVP[','T<H[']

In [9]:
set(motion_verbs) - set(freq_verbs )

{'<WZ[',
 'CQQ[',
 'CWR[',
 'CWX[',
 'DLG[',
 'DXP[',
 'FVH[',
 'GJX[',
 'GLL[',
 'MWC[',
 'NHR[',
 'NVP[',
 'NXT[',
 'QPY[',
 'SLQ[',
 'T>R[',
 'VB<[',
 'VWF[',
 'XWC[',
 'Y<D[',
 'YWP['}

#### 1.2 List of occurrences of the motion verbs (MT Isaiah)

In [10]:
verbs_bhsa = TFOb.all("word", BHSA).filter(book="Isaiah").filter_in(lex=motion_verbs)
verbs_bhsa

<word_596 "PC<W TB>W HBJ> HSJRW LKW JBW> >CJBH >SJRH >CJBH CBJH PC<JM NHRW HLKW LKW N<LH NLKH TY> LKW NLKH BW> [...] JRDT HLKJM QRB TGC HWY>TJ HNXTM T<LJNH M<LH >BJ> JBW> CWVP JBW> HCJB B>H B>W HBJ>W JBJ>W JBW> JY>W PC<JM">

In [11]:
#TESTING ZONE 
#verbs_bhsa
#TFOb(215297, BHSA).to_verses.text
#TFOb(215297, BHSA).to_verses.pretty()

#### 1.3 List of occurrences of the motion verbs (1QIsaa)

In [12]:
verbs_dss = TFOb.all("scroll", DSS).filter(scroll="1Qisaa").to_words.filter_in(lex=motion_verbs)

In [13]:
verbs_dss

<word_602 "PC<W TB>W HBJ> HSJRW LKW JBW> HCJB >SJR >CJBH CBJH PWC<JM NHRW HLKW LKW N<LH N>LKH TY> LKW NLKH JXLWPW [...] JRDTH HWLKJM QRB HWYJTJ HNXTMH T<LJN> M<LH >BJ> JBW> CWVP JBW> HCJB JBW> B>W B>W HBJ>W JBJ>W JBW> JY>W PWC<JM">

### 2. Get the verses from both databases

#### 2.1 Clean function to harmonised the DSS verses with the BHSA verses

In [14]:
def clean(g_cons):
    return g_cons.replace("_", " ").replace("׳", "").replace("'", "")

#### 2.2 Creating two lists of verses (BHSA and DSS) containing the motion verbs

In [15]:
# From the list of motion verbs, create two lists (BHSA and DSS) with the verses where the verbs occur

verses_bhsa = []
verses_dss = []

for verb in chain(verbs_bhsa, verbs_dss):
    if verb.source.name == "BHSA":
        scroll = "MT"
        verse = verb.to_verses.to_words
        verses_bhsa.append(verse)
    else:
        scroll = verb.to_scrolls.scroll[0]
        verse = TFOb.section([verb.book[0], verb.chapter[0], verb.verse[0]], DSS, scroll)
        #verse = clean(str(verse))
        verses_dss.append(verse)

In [16]:
print(len(verses_bhsa))
print(len(verses_dss))

596
602


#### 2.3 Finding the clauses in the BHSA verses

In [17]:
clauses_bhsa = []

for verb in verbs_bhsa:
    mv_clause = verb.to_clauses
    clauses_bhsa.append(mv_clause)

# Find the complement in the mv_clause

cmpl_bhsa = []

for clause in clauses_bhsa:
    cmpl = clause.to_phrases.filter(function="Cmpl")
    cmpl_bhsa.append(cmpl)

In [18]:
#cmpl_bhsa

#### 2.4 Find the complements in the BHSA verses

In [19]:
# Filter the clauses without a complement
cmpl_bhsa = [cmpl for cmpl in cmpl_bhsa if cmpl]
cmpl_bhsa      

[<phrase_1 "BJ">,
 <phrase_1 "M NGD <JNJ">,
 <phrase_1 ">LJHM">,
 <phrase_1 "<LJK">,
 <phrase_1 ">LJW">,
 <phrase_1 ">L HR JHWH >L BJT >LHJ J<QB">,
 <phrase_1 "B >RXTJW">,
 <phrase_1 "M YJWN">,
 <phrase_1 "B >WR JHWH">,
 <phrase_1 "B YWR">,
 <phrase_1 "B M<RWT YRJM W B MXLWT <PR">,
 <phrase_1 "B NQRWT H YRJM W B S<PJ H SL<JM">,
 <phrase_1 "M JRWCLM W M JHWDH">,
 <phrase_2 "B MCPV <M ZQNJ <MW W FRJW">,
 <phrase_1 "B FDH">,
 <phrase_1 ">LJ">,
 <phrase_1 "JRWCLM">,
 <phrase_1 "B JHWDH">,
 <phrase_1 "<LJK W <L <MK W <L BJT >BJK">,
 <phrase_1 "M <L JHWDH">,
 <phrase_1 "B NXLJ H BTWT W B NQJQJ H SL<JM W B KL H N<YWYJM W B KL H NHLLJM">,
 <phrase_1 "CMH">,
 <phrase_1 "CMH">,
 <phrase_1 ">L H NBJ>H">,
 <phrase_1 "<LJHM">,
 <phrase_1 "<L KL >PJQJW">,
 <phrase_1 "<L KL GDWTJW">,
 <phrase_1 "B JHWDH">,
 <phrase_1 "B DRK H <M H ZH">,
 <phrase_1 "BH">,
 <phrase_1 "L M<LH">,
 <phrase_1 "B JFR>L">,
 <phrase_1 "M MRXQ">,
 <phrase_1 "<L MJ">,
 <phrase_1 "TXT HRWGJM">,
 <phrase_1 ">L >L GBWR">,
 <phrase

In [20]:
cmpl_bhsa[1].to_words.lex

['MN', 'NGD/', '<JN/']

### 3. Create the dataset

### 3.1 Check correspondances between DSS and BHSA

In [21]:
verbs_dss[0] # this is a TFOb object containing a verb

<word_1 "PC<W">

In [22]:
def find_bhsa_verb(verb_dss):
    """
    Checks if a verb occurring in DSS also occurs in BHSA (same book, chapter, verse, lexeme).
    Else, returns None.
    """
    
    # Get book chapter verse info from a DSS verb
    v_book = verb_dss.book[0]    
    v_chapter = verb_dss.chapter[0]
    v_verse = verb_dss.verse[0]

    # Get the corresponding BHSA verse
    verse_bhsa = TFOb.section((v_book, v_chapter, v_verse), BHSA).to_words
    verb_bhsa = verse_bhsa.filter(lex=verb_dss.lex[0])
    
    # If repetition of verb in same verse: TODO
    if len(verb_bhsa) > 1:
        return # TODO
        scroll = verb_dss.to_scrolls.scroll[0]
        verse_dss = TFOb.section([v_book, v_chapter, v_verse], DSS, scroll)
        print("Verse BHSA:", verse_bhsa)
        print("Verse DSS:", verse_dss)
        
    if verb_bhsa:
        return verb_bhsa

    

#print(v_book, v_chapter, v_verse)
#print(verse_bhsa)

In [23]:
def find_dss_verse(verb_dss):
    """
    Returns the verse from the verb for a DSS occurrence.
    """
    # Extract the book, chapter, verse info form the verb
    book = verb_dss.book[0]    
    chapter = verb_dss.chapter[0]
    verse = verb_dss.verse[0]
    
    # Returns the verse
    scroll = verb_dss.to_scrolls.scroll[0]
    verse_dss = TFOb.section([book, chapter, verse], DSS, scroll)
    return verse_dss

In [24]:
# Test find_dss_verse function
print("BHSA verse: " + str(verbs_bhsa[1].to_verses)) # for comparison
print("DSS verse: " + str(find_dss_verse(verbs_dss[1]))) # function test

BHSA verse: KJ TB>W L R>WT PNJ MJ BQC Z>T M JDKM RMS XYRJ
DSS verse: KJ> TB>W L R>WT PNJ MJ BQC ZW>T M JDKM L RMWS XYRJ


In [25]:
# Testing the find_dss_verse function

#for i in range(0, len(verbs_bhsa)):
    #print(f"\ni = {i}")
    #print(is_lex_identical(verbs_bhsa[i].to_verses, find_dss_verse(verbs_dss[i]))) 

### 3.2 Create a function to test if a DSS verse has the same lexemes as a BHSA verse

In [26]:
def is_lex_identical(verse_bhsa, verse_dss):
    """
    Checks if the verses (i.e. BHSA versus DSS) are identical on the lexeme level.
    """
    return [clean(lex) for lex in verse_bhsa.to_words.lex if lex] == [clean(lex) for lex in verse_dss.to_words.lex if lex]
    #verse_with_spaces = [clean(lex) for lex in verse_bhsa.to_words.lex] == [clean(lex) for lex in verse_dss.to_words.lex]
    #if verse_without_spaces != verse_with_spaces:
        #print(verse_bhsa.to_words.lex)
        #print(verse_dss.to_words.lex)
        #print(verse_bhsa.text)
        #print(verse_dss.text)
        #print(verse_bhsa.to_words[-1].text)
        #print(verse_dss.to_words[-1].text)
        

In [27]:
# Print the results from is_lex_identical

n = 0

for i in range(0, len(verbs_bhsa)):
    if is_lex_identical(verbs_bhsa[i].to_verses, find_dss_verse(verbs_dss[i])):
        n += 1
print(n)


    #print(f"\ni = {i}")
    #print(is_lex_identical(verbs_bhsa[i].to_verses, find_dss_verse(verbs_dss[i])))  
    
# To see the lexemes, add the following lines inside the function is_lex_identical above:
#    print([clean(lex) for lex in verse_bhsa.to_words.lex if lex])
#    print([clean(lex) for lex in verse_dss.to_words.lex if lex])

30


In [28]:
def is_sign_unc(verse):
    """If a verse contains a missing or uncertain sign, returns True. Else, returns False."""
    unc_types = ['missing', 'unc']
    for sign in verse.to_signs:
        if sign.type[0] in unc_types:
            return True
    return False

### 3.3 Find complements

In [35]:
# Create a dataset with the occurrences

def find_complements(verb):
    """Find the complement of a verb. If no match, returns None"""
    if verb.source.name == "BHSA":
        complements = verb.to_clauses.to_phrases.filter(function="Cmpl")
        return complements
    
    # if the verb is not BHSA, it's DSS
    verb_bhsa = find_bhsa_verb(verb)

    # Check if verses are identical
    if verb_bhsa and is_lex_identical(verb_bhsa.to_verses, find_dss_verse(verb)):
        return find_complements(verb_bhsa)
        


items = [] # create an empty list to store all the information for each occ.

       
for verb in chain(verbs_bhsa, verbs_dss):
    
    # Add MT as "scroll" for the BHSA
    if verb.source.name == "BHSA":
        scroll = "MT"
        verse = verb.to_verses
        dir_he_dss_verse = ""
        sign_info = ""

    else:
        scroll = verb.to_scrolls.scroll[0]
        verse = TFOb.section([verb.book[0], verb.chapter[0], verb.verse[0]], DSS, scroll)
        dir_he_dss_verse = int("H" in verse.uvf_etcbc)
        if is_sign_unc(verse):
            sign_info = "unc or missing"
        else:
            sign_info = ""
        
    complements = find_complements(verb)
    
    # If complements is None ==> there was no match between DSS and BHSA verses ==> find complement manually
    if not complements:
        complements = [""]

    for complement in complements:        
        if complement == "":
            dir_he = ""
        else: 
            dir_he = int("H" in complement.to_words.uvf)
               
    
        # Collect information about the following variables:    
        item = {
            "verb_id": verb.ids[0], 
            "lex": verb.lex[0], 
            "scroll": scroll,
            "book": verb.book[0], 
            "chapter": verb.chapter[0], 
            "verse_num": verb.verse[0],
            "gcons_verb": clean(verb.g_cons[0]),
            "gcons_verse": clean(str(verse)),
            "gcons_clause": clean(str(verb.to_clauses)),
            "subject": clean(str(verb.to_clauses.to_phrases.filter(function="Subj"))),
            "complement": clean(str(complement)),
            "dir_he": dir_he,
            "dir_he_dss": dir_he_dss_verse,
            "sign_info": sign_info,
            "stem": verb.vs[0],
            "tense": verb.vt[0],
        }
        
        if complement != "":
            prepositions = complement.to_words.filter(sp="prep")
            n = 0
            for preposition in prepositions:
                n += 1
                item[f"preposition_{n}"] = str(preposition)

        items.append(item)

### 3.4 Create the dataset with Pandas

In [32]:
df = pd.DataFrame(items).fillna("")
df.sort_values(["book", "chapter", "verse_num"], ascending=[True, True, True], ignore_index=True, inplace=True)

In [33]:
# Show the occurrences where DSS verse = BHSA verse
# df[(df.scroll != "MT") & (df.complement != "")]
# df[(df.complement != "")]
df[(df.chapter == "53") & (df.verse_num == "12")]

Unnamed: 0,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,gcons_clause,subject,complement,dir_he,dir_he_dss,sign_info,stem,tense,preposition_1,preposition_2,preposition_3,preposition_4
899,230769,PC<[,MT,Isaiah,53,12,PC<JM,LKN >XLQ LW B RBJM W >T <YWMJM JXLQ CLL TXT >C...,W >T PC<JM NMNH,,>T PC<JM,0.0,,,qal,ptca,>T,,,
900,230779,PC<[,MT,Isaiah,53,12,PC<JM,LKN >XLQ LW B RBJM W >T <YWMJM JXLQ CLL TXT >C...,W L PC<JM JPGJ<,,L PC<JM,0.0,,,qal,ptca,L,,,
901,1914528,PC<[,1Qisaa,Isaiah,53,12,PWC<JM,L KN >XLQ LW B RBJM W >T <YWMJM JXLQ CLL TXT >...,,,,,0.0,,qal,ptca,,,,
902,1914537,PC<[,1Qisaa,Isaiah,53,12,PC<JHMH,L KN >XLQ LW B RBJM W >T <YWMJM JXLQ CLL TXT >...,,,,,0.0,,,,,,,


In [208]:
# Show dataset
df[(df.chapter == "20") & (df.verse_num == "6")]

Unnamed: 0,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,gcons_clause,subject,complement,dir_he,dir_he_dss,sign_info,stem,tense,preposition_1,preposition_2,preposition_3,preposition_4
194,218718,NWS[,MT,Isaiah,20,6,NSNW,W >MR JCB H >J H ZH B JWM H HW> HNH KH MBVNW >...,>CR NSNW CM L <ZRH,,CM,0,,,qal,perf,,,,


In [205]:
df[(df.book == "Isaiah") & (df.chapter == "10") & (df.verse_num == "28")]

Unnamed: 0,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,gcons_clause,subject,complement,dir_he,dir_he_dss,sign_info,stem,tense,preposition_1,preposition_2,preposition_3,preposition_4
34,216009,BW>[,MT,Isaiah,10,28,B>,B> <L <JT <BR B MGRWN L MKMF JPQJD KLJW,B> <L <JT,,<L <JT,0,,,qal,perf,<L,,,
35,216012,<BR[,MT,Isaiah,10,28,<BR,B> <L <JT <BR B MGRWN L MKMF JPQJD KLJW,<BR B MGRWN,,B MGRWN,0,,,qal,perf,B,,,
36,1898948,BW>[,1Qisaa,Isaiah,10,28,B>,B> <L <JTH <BR B MGRWN L MKMC JPQWD KLJW,,,<L <JT,0,1.0,,qal,perf,<L,,,
37,1898951,<BR[,1Qisaa,Isaiah,10,28,<BR,B> <L <JTH <BR B MGRWN L MKMC JPQWD KLJW,,,B MGRWN,0,1.0,,qal,perf,B,,,


In [189]:
is_sign_unc(TFOb(ids, DSS))

True

In [179]:
verse = TFOb(ids, DSS)
verse

<word_9026 "CM<W CMJM W H>ZJNJ H >RY KJ> JHWH DBR BNJM GDLTJ W RWMMTJ W HMH PC<W BJ KJ> TB>W [...] >NCJM H PWC<JM BJ> KJ> TWL<TM LW> TMWT W >CHMH LW> TKBH W HJW DR>WN L KWL H BFR">

In [187]:
unc_types = ['missing', 'unc']
for sign in verse.to_signs:
    if sign.type[0] in unc_types:
        print(True)
print(False)

True
True
True
True
True
True
True
True
True
True
True
True
True
False


In [183]:
set(verse.to_signs.type)

{'cons', 'empty', 'missing', 'punct', 'sep', 'unc'}

In [190]:
for verse in verses_dss:
    if is_sign_unc(verse):
        print(verse)

KJ JW>MR B KWX JDJ <FJTJ W B XKMTJ KJ NBWNWTJ W >SJR GBLWT <MJM W <TJDWTJHMH CWCJTJ W >WRJD JWCBJM
KJ JW>MR B KWX JDJ <FJTJ W B XKMTJ KJ NBWNWTJ W >SJR GBLWT <MJM W <TJDWTJHMH CWCJTJ W >WRJD JWCBJM
W HJJTH MSLH L C>R <MW >CR JC>R M >CWR K >CR HJJTH L JFR>L B JWM <LWT M >RY MYRJM
>L TFMXJ PLCT KWLK KJ NCBR CBV MKKH KJ M CWRC NXC JY> YP< W PRJW FRP M<WPP
>L TFMXJ PLCT KWLK KJ NCBR CBV MKKH KJ M CWRC NXC JY> YP< W PRJW FRP M<WPP
QRWBW >LJ W CM<W ZW>T LW> M RWC B STR DBRTJ B <T HJWTH CMH >NJ W <TH >DWNJ JHWH CLXNJ W RWXW
W PWL W PZWRJ JHWH JCWBW W B>W YJWN B RWNH W FMXT <WLM <L RW>CJHMH FFWN W FMXH JFJGW W NS JGWN W >NXH
W PWL W PZWRJ JHWH JCWBW W B>W YJWN B RWNH W FMXT <WLM <L RW>CJHMH FFWN W FMXH JFJGW W NS JGWN W >NXH
W PWL W PZWRJ JHWH JCWBW W B>W YJWN B RWNH W FMXT <WLM <L RW>CJHMH FFWN W FMXH JFJGW W NS JGWN W >NXH
K JM NWX ZW>T LJ >CR NCB<TJ M <BWR MJ NWX <WD <L H >RY KN NCB<TJ M QYWP <LJK <WD W M G<WR BK
W HBJ>W >T KWL KWL >XJKMH M KL H GW>JM MNXH L JHWH B SWSJM W B RKBM W B YWBJM 

In [170]:
ids = []

for verse in verses_dss:
    ids.extend(verse.ids)
    
set(TFOb(ids, DSS).to_signs.type)


{'cons', 'empty', 'missing', 'punct', 'sep', 'unc'}

In [148]:
unc_types = ["unc", "missing"]
verse_signs = verse.to_signs
uncertain_sign = False

for i in range(len(verse_signs)):
    sign = verse_signs[i]
    if sign.type in unc_types:
        uncertain_sign = True 

In [None]:
def is_lex_identical(verse_bhsa, verse_dss):
    """
    Checks if the verses (i.e. BHSA versus DSS) are identical on the lexeme level.
    """
    return [clean(lex) for lex in verse_bhsa.to_words.lex if lex] == [clean(lex) for lex in verse_dss.to_words.lex if lex]

In [96]:
# if the verb is not BHSA, it's DSS
verb_bhsa = find_bhsa_verb(verb)

# Check if verses are identical
if verb_bhsa and is_lex_identical(verb_bhsa.to_verses, find_dss_verse(verb)):
    complements = find_complements(verb_bhsa)

complements

<phrase_1 "<L <JT">

In [72]:
# Show only MT occurrences
#df[df.scroll == "MT"]

# Show only non-MT occurrences
#df[df.scroll != "MT"]

In [73]:
# Compare the lexemes and their order
# clean the DSS verses by Id to be able to retrieve the good complements (and not copy them from BHSA)

### 3.5 Save the dataset in a csv file

In [34]:
df.to_csv("data/DATASET.csv")

In [75]:
complement = TFOb(212418, BHSA).to_verses.to_phrases.filter(function="Cmpl")
complement

<phrase_1 ">LJHM">

In [76]:
complement.to_words.sp

['prep']

In [41]:
prepositions = complement.to_words.filter(sp="prep")

In [50]:
prepositions.to_words

<word_1 ">LJHM">

In [46]:
complement.to_words.pretty()

In [3]:
w = TFOb.section(["Isaiah", "31", "8"], BHSA).to_words
w
print(w)
print(" ".join(w.lex))

W NPL >CWR B XRB L> >JC W XRB L> >DM T>KLNW W NS LW M PNJ XRB W BXWRJW L MS JHJW
W NPL[ >CWR/ B XRB/ L> >JC/ W XRB/ L> >DM/ >KL[ W NWS[ L MN PNH/ XRB/ W BXWR/ L MS/ HJH[


In [32]:
for word in w.to_phrases:
    func = word.function
    print(word, func)

BW> ['Pred']
B YWR ['Cmpl']
W ['Conj']
HVMN ['Pred']
B <PR ['Cmpl']
M PNJ PXD JHWH W M HDR G>NW ['Adju']


In [4]:
scroll = "1Qisaa"
v = TFOb.section(["Isaiah", "31", "8"], DSS, scroll)
#v = TFOb.section(["Isaiah", "23", "10"], DSS, scroll)
v.lex
print(v)
#print(" ".join(v.lex))

W NPL >CWR B XRB LW> >JC W XRB LW> >DM T>KWLNW W NS W LW> M PNJ XRB W BXWRJW L MS JHJW
