In [1]:
import pandas as pd

from itertools import chain

from tfob import TFOb, get_bhsa, get_dss

BHSA = get_bhsa()
DSS = get_dss()

**Locating corpus resources ...**

Name,# of nodes,# slots/node,% coverage
book,39,10938.21,100
chapter,929,459.19,100
lex,9230,46.22,100
verse,23213,18.38,100
half_verse,45179,9.44,100
sentence,63717,6.7,100
sentence_atom,64514,6.61,100
clause,88131,4.84,100
clause_atom,90704,4.7,100
phrase,253203,1.68,100


**Locating corpus resources ...**

Name,# of nodes,# slots/node,% coverage
scroll,1001,1428.81,100
lex,10450,129.14,94
fragment,11182,127.91,100
line,52895,27.04,100
clause,125,12.85,0
cluster,101099,6.68,47
phrase,315,5.1,0
word,500995,2.81,99
sign,1430241,1.0,100


### 1.Create a list with the verses from the biblical Dead Sea Scrolls

#### 1.1 List of wanted motion verbs

In [2]:
motion_verbs = ['BW>[','HLK[','JY>[','JRD[','<BR[',
                '<LH[','CWB[','>TH[','BRX[','GJX[',
                'GLH[','GLL[','DXP[','DLG[','HWH[',
                'XWC[','XLP[','XSH[','VB<[','VWF[',
                'MHR[','MWC[','NGC[','NHR[','NWX[',
                'NWS[','NXT[','NVP[','NS<[','NPL[',
                'NTK[','SBB[','SWR[','SLQ[','<WZ[',
                '<WP[','PNH[','PF<[','YWP[','Y<D[',
                'QHL[','QPY[','QRB[','RWY[','FVH[',
                'CWX[','CWR=[','CVP[','CQQ[','T>R[',
                'T<H[']

#### 1.2 List of occurrences of the motion verbs

In [74]:
verbs = TFOb.all("scroll", DSS).filter(biblical=1).to_words.filter_in(lex=motion_verbs)
verbs

<word_5074 "J<WPP TLK JLK J<LHW B>J T<L T<L <LT JRD T<LH TGLH JY> >Y> TPNW GLH MBJ> TLKW [...] B>W JY>W JLK JLK JY> JY> JY> B>W J<LW LK TLKJ HLKTJ TLKJ >LK HLK >LK">

In [4]:
set(verbs.book)

{'11Q1',
 '1Q1',
 '1Q13',
 '1Q4',
 '1Q5',
 '1Q6',
 '1_Kings',
 '1_Samuel',
 '2_Kings',
 '2_Samuel',
 '4Q1',
 '4Q118',
 '4Q14',
 '4Q17',
 '4Q23',
 '4Q30',
 '4Q32',
 '4Q45',
 '4Q55',
 '4Q77',
 '4Q78',
 '5Q3',
 '6Q4',
 'Amos',
 'Daniel',
 'Deuteronomy',
 'Ecclesiastes',
 'Exodus',
 'Ezekiel',
 'Ezra',
 'Genesis',
 'Habakkuk',
 'Haggai',
 'Hosea',
 'Isaiah',
 'Jeremiah',
 'Job',
 'Joel',
 'Jonah',
 'Joshua',
 'Judges',
 'Lamentations',
 'Leviticus',
 'Malachi',
 'Micah',
 'Nahum',
 'Numbers',
 'Obadiah',
 'PAM43113',
 'Proverbs',
 'Psalms',
 'Ruth',
 'Song_of_songs',
 'X4',
 'Zechariah',
 'Zephaniah'}

In [70]:
verb = verbs[13]
verb
scroll = verb.to_scrolls.scroll[0]
verse = TFOb.section([verb.book[0], verb.chapter[0], verb.verse[0]], DSS, scroll)
#print(verse)

In [20]:
def is_sign_unc(verse):
    """If a verse contains a missing or uncertain sign, returns True. Else, returns False."""
    
    unc_types = ['missing', 'unc']
    verse_sign_types = []
    
    for word in verse.to_words:

        word_signs = []

        for sign in word.to_signs:
            if sign.type[0] in unc_types:
                sign_type = "1"
            elif sign.type[0] == "cons":
                sign_type = "0"
            else:
                continue

            word_signs.append(sign_type)

        verse_sign_types.append("".join(word_signs))    
    return " ".join(verse_sign_types).rstrip().lstrip()

In [55]:
def check_signs_status(verse):
    
    verse_sign_types = []
    verse_consonantal = []
    
    for word in verse.to_words:
        
        word_signs_types = []
        word_cons = []
        
        for sign in word.to_signs:
            print(sign)
    
            if sign.unc[0]: # four possible values, 1, 2, 3 and None
                sign_type = "1"
                
            else:
                sign_type = "0"
                
            word_signs_types.append(sign_type)
            word_cons.append(sign)
            
        verse_sign_types.append("".join(word_signs_types))  
       # verse_consonantal.append("".join(word_cons))
        
    return (" ".join(verse_sign_types).rstrip().lstrip())#, " ".join(verse_consonantal).rstrip().lstrip())

In [77]:
print(verb, verse.book[0], verse.chapter[0], verse.verse[0], scroll)
print("Verse length: ", len(str(verse.to_signs)))
print("Sign_info length: ", len(is_sign_unc(verse)))
print(is_sign_unc(verse))
#print(check_signs_status(verse))
print(verse.to_signs)

>SJRH Exodus 20 26 1Q2
Verse length:  49
Sign_info length:  49
0 00 0000 0 00000 00 00000 000 00 0000 00000 0000
W L> T<LH B M<LWT <L MZBXJ >CR L> TGLH <RWTK <LJW


In [110]:
def check_zero_amount(string):
    string = string.replace("1", "")
    return(string, len(string))

In [111]:
verse1 = "000 11"
check_zero_amount(verse1)

('000 ', 4)

In [109]:
#see 23: missing sign

correct_verses = 0
incorrect_verses = 0
zero_len_verses = 0
for verb in verbs:
    
    scroll = verb.to_scrolls.scroll[0]
    verse = TFOb.section([verb.book[0], verb.chapter[0], verb.verse[0]], DSS, scroll)
    
    if scroll and verse:
        signs = verse.to_signs
        
        if len(str(verse.to_signs)) != 0:

            if len(is_sign_unc(verse)) !=  len(str(verse.to_signs)):
                print("Verse length: ", len(str(verse.to_signs)))
                print("Sign_info length: ", len(is_sign_unc(verse)))
                print(is_sign_unc(verse))
                print(verse.to_signs)
                print("\n")
                incorrect_verses += 1

            elif len(is_sign_unc(verse)) ==  len(signs):
                correct_verses += 1
        else:
            zero_len_verses += 1

Verse length:  62
Sign_info length:  65
0 00 000 0000 0 000 00000 00 0000 00 0000 0 00 000 00 000 0000  1
W >M >MR J>MR H <BD >HBTJ >T >DNJ >T >CTJ W >T BNJ L> >Y> XPCJ


Verse length:  69
Sign_info length:  74
0 00 1111 0000 0 0000 0 000 000 000 0000 0 00000 00 00 00 000 000 0 000 00
W L> TLKW B XQWT H GWJ >CR >NJ MCLX M PNJKM KJ >T KL >LH <FW W >QY BM


Verse length:  66
Sign_info length:  68
0 000000 000 0 0000 0000 0000 0 000 0 00000 0000 000 00 00000 0000 1
W HQRBTM >CH L JHWH CB<T JMJM B JWM H CBJ<J MQR> QDC KL ML>KT <BDH


Verse length:  79
Sign_info length:  81
1 0 00 000 0000 0 000 00000 0 000 00 000 00 000 0 0000 000 000000 00000 000 00000
W L> TSB NXLH L BNJ JFR>L M MVH >L MVH KJ >JC B NXLT MVH >BWTJW JDBQW BNJ JFR>L


Verse length:  42
Sign_info length:  46
0000 0000 00000000 00000 0 0000 00000 1 1 0000
>XRJ JHWH >LWHJKMH TLKWN W >WTW T<BDW TCMR


Verse length:  178
Sign_info length:  179
0 0 000 0 0000 00 0 100 0 0000 0 000 0000 00 000 000 00 0000 000000 0 00000 0000 0 000

Verse length:  70
Sign_info length:  71
00 000 0000 0000 00000 0 00000 0 00000 000000 10 000 0 00 000 000 00000
KH >MR JHWH QDWC JFR>L W JWYRW H >TJWT C>LWNJ L BNJ W <L P<L JDJ TYWNJ


Verse length:  81
Sign_info length:  82
0 00000 0 0000 0000 000 0000 00000 0 0000 000 000000 0 00000 0 000 000  0 0000 000
W TBVXJ B R<TK >MRT >JN R>NJ XKMTK W D<TK HJ> CWBBTK W T>MRJ B LBK >NJ W >PSJ <WD


Verse length:  84
Sign_info length:  85
0 000 0000 000 00 0000 0000 0 000 0000 000 00 00000 0000  0 000 0000 0000 000 00 0000
W B>H <LJK R<H L> TD<J CXRH W TPL <LJK HWH L> TWKLJ KPRH W TB> <LJK PT>M C>H L> TD<J


Verse length:  84
Sign_info length:  85
0 000 0000 000 00 0000 0000 0 000 0000 000 00 00000 0000  0 000 0000 0000 000 00 0000
W B>H <LJK R<H L> TD<J CXRH W TPL <LJK HWH L> TWKLJ KPRH W TB> <LJK PT>M C>H L> TD<J


Verse length:  84
Sign_info length:  85
0 000 0000 000 00 0000 0000 0 000 0000 000 00 00000 0000  0 000 0000 0000 000 00 0000
W B>H <LJK R<H L> TD<J CXRH W TPL <LJK HWH L> TWKLJ KPRH

1 0 0000 00000 000 000 000 0000 00000 000 000 0 0000 000 00 000 0000 000 0 00 000 00000 0000 0 00 00 0 000 00000 000 0 00000 00 0 000
W J>MR >LJHM HW> >CR DBR JHWH CBTWN CBT QDC L JHWH MXR >T >CR T>PW >PW W >T >CR TBCLW BCLW W >T KL H <DP HNJXW LKM L MCMRT <D H BQR


Verse length:  42
Sign_info length:  53
1 111 1 0 000 0 00000 0000 00 0 00 0 000 0 00 0000  1
B JWM H CBJ<J JY>W MN H <M L LQV W L> MY>W


Verse length:  66
Sign_info length:  74
0 0000 000 00 00000 000 000 00000 0 00 0000 0 0000 000 0000 000 00 111 1 1
W J>MR MCH >L JHWC< BXR LNW >NCJM W Y> HLXM B <MLQ MXR >NKJ NYB <L


Verse length:  83
Sign_info length:  90
1 1111 0000 00 0 00 00 0 00 0 00000 0 00000 000 00 0 000 000 0000 00 0 00 0 0000 000 00000
>THM >T H XQ JM W >T H TWRWT W HWD<T LHM >T H DRK >CR JLKW BH W >T H M<FH >CR J<FWN


Verse length:  72
Sign_info length:  77
0000 00 00 0 0000 000 0 0000 000 0 0 00000 0 0 00 00 00000 0 000 00 0000 1 11
JHWH LK RD W <LJT >TH W >HRN <MK W H KHNJM W H <M >L JHRSW L <LT >L JHWH



Verse length:  55
Sign_info length:  61
0 00000 00 0 000 000 000000 0 00000 0000 0000 00 00 000 1 1 1
W J>WPW >T H BYQ >CR HWYJ>W M MYRJM <GWT MYWT KJ L> XMY


Verse length:  102
Sign_info length:  105
0 0000 0 000 00000 00 000 0 00 0 0000 000 0000 1 10 0 00 0 000000 00 000 00000 00 00000 00 00000 0 000000
W JWGD L MLK MYRJM KJ BRX H <M W JHPK LBB PR<H L H <M W J>WMRW MH Z>T <FJNW KJ CLXNW >T JFR>L M <BWDNW


Verse length:  93
Sign_info length:  95
1 0 000 00 0000 0000000 0 000000 0 0000 0 0000 00000 00000 0 000 00000 00 0000 0000 000 0 00000
W JSR >T >WPN MRKBTJW W JNHGHW B KBDT W J>MR MYRJM >NWSH M PNJ JFR>L KJ JHWH NLXM LHM B MYRJM


Verse length:  93
Sign_info length:  95
1 0 000 00 0000 0000000 0 000000 0 0000 0 0000 00000 00000 0 000 00000 00 0000 0000 000 0 00000
W JSR >T >WPN MRKBTJW W JNHGHW B KBDT W J>MR MYRJM >NWSH M PNJ JFR>L KJ JHWH NLXM LHM B MYRJM


Verse length:  66
Sign_info length:  68
1 0 0000 0000 0 00 0000 0 000 000 0 000 00 0000 00 00 00 000 000 000
W JB>W MRTH W 

Sign_info length:  195
0 00000 0000 00 0000 0000 000 0 0000 00 000 00000 000 0000 0 0000 00 00000 000 00000 0 00000 0 00 0000 000 00 0 000 00 0000 0 0000 00 00000  0 000 0 00 000 00 00000 0 000000 00 000 0 0000 000 000
W JW>MR JHWH >L MWCH QWMW S<W W <BRW >T NXL >RNWN R>H NTTJ B JDKH >T SJXWN MLK XCBWN H >MWRJ W >T >RYW HXL RC L RCT >T >RYW W HTGR BW MLXMH H JWM H ZH HXL TT PXDKH W JR>TKH <L PNJ H <MJM TXT KWL


Verse length:  194
Sign_info length:  195
0 00000 0000 00 0000 0000 000 0 0000 00 000 00000 000 0000 0 0000 00 00000 000 00000 0 00000 0 00 0000 000 00 0 000 00 0000 0 0000 00 00000  0 000 0 00 000 00 00000 0 000000 00 000 0 0000 000 000
W JW>MR JHWH >L MWCH QWMW S<W W <BRW >T NXL >RNWN R>H NTTJ B JDKH >T SJXWN MLK XCBWN H >MWRJ W >T >RYW HXL RC L RCT >T >RYW W HTGR BW MLXMH H JWM H ZH HXL TT PXDKH W JR>TKH <L PNJ H <MJM TXT KWL


Verse length:  139
Sign_info length:  141
1 0 0000 000000 00 0000 00 0000 00000 000 00 0 000 000 000 0000 0 0000 00 0 0000 000 00 000 0 00000 0 000 0

Verse length:  118
Sign_info length:  129
0 0000 00 000 0000 0 000 00000 0 0000 0000 00000 0 00 0 00 0000 0 0 000 00000 00 00 000 0000 00000 0 0000 00 000 0 000  1 1 1 1 1
W ZKRT KJ <BD HJJT B >RY MYRJM W JY>K JHWH >LHJK M CM B JD XZQH W B ZR< NVWJH <L KN YWK JHWH >LHJK L <FWT >T JWM H CBT


Verse length:  98
Sign_info length:  102
00 0 000 000 0000 00 000 0 0000 00 0 000 00000 000 1 1 0 00 000 0000 00 0000 0 00000 0 0000 0 00 0 000
KJ H >RY >CR >TMH B> CMH L RCTH L> K >RY MYRJM HW> M CM >CR TZR< >T ZR<K W HCQJT B RGLK K GN H JRQ


Verse length:  69
Sign_info length:  71
0 0 000 001 1000 00000 000 0 0000 000 0000 0 0000 0 000 0 0000 0000 000
W H >RY >C TMH <BRJM CMH L RCTH >RY HRJM W BQ<T L MVR H CMJM TCTH MJM


Verse length:  180
Sign_info length:  182
0 000 00 00000 0000 0000000 00 0 000 0 00000 0 000 0 0 00000 0 0 0000 0 000 0 0 00000 0 0 00000 0 000 0000 0 0000000 0 00 000 000 000 000 0 000 0 00000 00 0 00000 0 000 0 0000 0 00 1
W HJH KJ JBJ>K JHWH >LWHJKH >L H >RY H KN<NJ H XTJ W 

Verse length:  61
Sign_info length:  63
1 0 0000 00 0000 0000 0000 0 000 0 00000 0 0000000 0 00 0000000
M CJLH >T >RWN BRJT JHWH W JY> B QRBNW W JWCJ<NW M KP >WJBJNW


Verse length:  80
Sign_info length:  82
0 0000 000 000000 0 0 00000 0 0000 0000 0 000 0 000 0 0000 000000 0 0000 00 0000 1
W JRWY >JC BNJMJN M H M<RKH W JBW> CJLH B JWM H HW> W MDJW QRW<JM W >DMH <L R>CW


Verse length:  80
Sign_info length:  82
0 0000 000 000000 0 0 00000 0 0000 0000 0 000 0 000 0 0000 000000 0 0000 00 0000 1
W JRWY >JC BNJMJN M H M<RKH W JBW> CJLH B JWM H HW> W MDJW QRW<JM W >DMH <L R>CW


Verse length:  150
Sign_info length:  152
1 0 00000 0 00000 00 0000 0000000 00000 0 00000 00 0000 0 0000 00000 00000 0 00000 0 00000 0000 00 0000 00000 00000 00000 0 0000 00 0000 00000 00000 000
W JCLXW W J>SPW >T SRNJ PLCTJJM >LJHM W J>MRW MH N<FH L >RWN >LWHJ JFR>L W J>MRW H GTJJM JSBW >T >RWN >LWHJ JFR>L >LJNW W JSBW >T >RWN >LWHJ JFR>L GTH


Verse length:  150
Sign_info length:  152
1 0 00000 0 00000 00 0000 0000

Verse length:  89
Sign_info length:  91
1 0 0000 0000 00 000 00 000 0 000 0000 0 0000 00 000 00000 000 0 000 000 000 0000 0 000 000
W JCLX JHWH >T NTN >L DWD W JB> >LJW W J>MR LW CNJ >NCJM HJW B <JR >XT >XD <CJR W >XD R>C


Verse length:  49
Sign_info length:  51
1 0 00000 0000 00 0 00 0 000 0000 0 0000 00 0 00000
W J>SWP DWJD >T H <M W JLK RBTH W JLXM BH W JLKDH


Verse length:  96
Sign_info length:  98
1 0 000 0000 00 0 000 0 000 00 0 0000 00 000000 0 000 00 0 000 0 00000 00 00 0000 0000 0 000 0 000
W JB> JW>B >L H MLK W JGD LW W JQR> >L >BCLWM W JB> >L H MLK W JCTXW LW <L >PJW >RYH L PNJ H MLK


Verse length:  96
Sign_info length:  98
1 0 000 0000 00 0 000 0 000 00 0 0000 00 000000 0 000 00 0 000 0 00000 00 00 0000 0000 0 000 0 000
W JB> JW>B >L H MLK W JGD LW W JQR> >L >BCLWM W JB> >L H MLK W JCTXW LW <L >PJW >RYH L PNJ H MLK


Verse length:  9
Sign_info length:  11
0000 0000 1
HCJB <LJK


Verse length:  56
Sign_info length:  58
1 0 0000 000000 00 0000 00 0000 00 000 000 000 0000 0

Sign_info length:  151
00 00000 0 00 0 0000 1 1 1 0 00000 0 0 000 0 0 0000 0 0 00000 0 0 000000 00 00 0000 000000 000 0000 0 000 00000 000 00000 00 0 0000 0 000 0000 000 0000
KL >XJKM M KL H GWJM B SWSJM W B RKB W B YBJM W B PRDJM W B KRKRWT <L HR QDCJ JRWCLM >MR JHWH K >CR JBJ>W BNJ JFR>L >T H MNXH B KLJ VHWR BJT JHWH


Verse length:  57
Sign_info length:  61
1 0 000 000000 0000 00000 0 0000 00 0000 0 00000 00 00 0000 1
W B>W HTNGCW JXDW PLJVJ H GWJM L> JD<W H NF>JM >T <Y PSLM


Verse length:  57
Sign_info length:  61
1 0 000 000000 0000 00000 0 0000 00 0000 0 00000 00 00 0000 1
W B>W HTNGCW JXDW PLJVJ H GWJM L> JD<W H NF>JM >T <Y PSLM


Verse length:  62
Sign_info length:  64
000 0 0000 000 0 000 0000 000 0000 00 1 00 000000 00000 00 00000
QR> M MZRX <JV M >RY MRXQ >JC <YTW >P >P >BJ>NH JYRTJ >P ><FNH


Verse length:  76
Sign_info length:  83
0 00000 00 000 000 000 0 000 000 0000 0 0000 0 000 000 0000 111 0 0000 00000 000  1
W TB>NH LK CTJ >LH RG< B JWM >XD CKWL W >LMN K TMM B>W <LJK

Verse length:  71
Sign_info length:  76
1 10 1 000 0000 000000 000 00000 00 000 0 00000 000 000000 0 000 00000 0 000
C DBR JHWH <LJKMH BNJ JFR>L <L KWL H MCPXH >CR H<LJTJ M >RY MYRJM L >MR


Verse length:  70
Sign_info length:  71
0 00 00 000 0000 0000 00 0 0000 0 000 0 0001 000 0000 0 00000 000000000
L KN KH >MR >DNJ JHWH YR W SBJB H >RY W HWR MMK <WZK W NBWZW >RMWNWTJK


Verse length:  12
Sign_info length:  25
1 11 1 1 1 1 0 0000 0 000
W NPLW L >RY


Verse length:  121
Sign_info length:  134
111 11 1 0 000 000 00000 00 0000 0 000000 0 000 00000 1 1 0 00000 0 00000 0 000 000 0000 00000 0 0000 0 000000 0 0 0000000 00 000 0000
W XQJ >CR YWJTJ >T <BDJ H NBJ>JM H LW> HFJGW W JCWBW W J>MRW K >CR ZMM JHWH YB>WT L <FWT K DRKJNW W K M<LLJNW KN <FH >TNW


Verse length:  66
Sign_info length:  70
0 000 0 0000 0 0000 000 000 0000 0000 0 0000 000000 1 1 0000 000 0 000
W J<N H ML>K W JWMR >LJ >LH >RB< RXWT H CMJM JWY>WT >DWN KWL H >RY


Verse length:  50
Sign_info length:  53
0 00000 00 0000 0 0000

00000 0 00000 00 000000 00 000 000 000 000 0 00 000 000 000 000 0 00 000 000 000  00000 0 00 000 000 000 00000
Z><JN W DXLJN MN QDMWHJ DJ HWH YB> HW> QVL W DJ HWH YB> HWH MX> W DJ HWH YB> HWH MHRJM W DJ HWH YB> HWH MCPJL


Verse length:  109
Sign_info length:  110
00000 0 00000 00 000000 00 000 000 000 000 0 00 000 000 000 000 0 00 000 000 000  00000 0 00 000 000 000 00000
Z><JN W DXLJN MN QDMWHJ DJ HWH YB> HW> QVL W DJ HWH YB> HWH MX> W DJ HWH YB> HWH MHRJM W DJ HWH YB> HWH MCPJL


Verse length:  109
Sign_info length:  110
00000 0 00000 00 000000 00 000 000 000 000 0 00 000 000 000 000 0 00 000 000 000  00000 0 00 000 000 000 00000
Z><JN W DXLJN MN QDMWHJ DJ HWH YB> HW> QVL W DJ HWH YB> HWH MX> W DJ HWH YB> HWH MHRJM W DJ HWH YB> HWH MCPJL


Verse length:  109
Sign_info length:  110
00000 0 00000 00 000000 00 000 000 000 000 0 00 000 000 000 000 0 00 000 000 000  00000 0 00 000 000 000 00000
Z><JN W DXLJN MN QDMWHJ DJ HWH YB> HW> QVL W DJ HWH YB> HWH MX> W DJ HWH YB> HWH MHRJM W DJ HW

Verse length:  134
Sign_info length:  138
1 0 0000 0000 00 000 000 0 000 000 000 0 00 000000 0 00000 0 00 0 000 00 0 000 0 0 000 00000 0 00000 0 00000 0 000 0 000 0 00000 0 00000 1
W J>MR >LJW L> LBJ HLK K >CR HPK >JC M <L MRKBTW L QR>TK H <T L QXT >T H KSP W L QXT BGDJM W ZJTJM W KRMJM W Y>N W BQR W <BDJM W CPXWT


Verse length:  218
Sign_info length:  220
0 00000 000 0 0000 0 0 00000 00000 000 0 0000 000 0 0 0000 0 000 000 0 0000 0000 0 000 000 00 0 00000 0 00000 00 000 00 0 0000 0 00 0 0000 00 0000 000 0 00 0 0000 0000 0 000 0 00000 000 0 000 0 000 000 0000 00000 00000 1
W >LJC< JCB B BJTW W H ZQNJM JCBJM >TW W JCLX >JC M L PNJW B VRM JB> H ML>K >LJW W HW> >MR >L H ZQNJM H R>JTM KJ CLX BN H MRYX H ZH L HSJR >T R>CJ R>W K B> H ML>K SGRW H DLT W LXYTM >TW B DLT H LW> QWL RGLJ >DNJW >XRJW


Verse length:  218
Sign_info length:  220
0 00000 000 0 0000 0 0 00000 00000 000 0 0000 000 0 0 0000 0 000 000 0 0000 0000 0 000 000 00 0 00000 0 00000 00 000 00 0 0000 0 00 0 0000 00 0000 000 0 00 

Verse length:  54
Sign_info length:  56
1 0 0000 0 00000 00 000 0 00 0 000 0000 0000 00 000 0 00
W JY>W W JBRKW >T KWL H <M W JR> KBWD JHWH >L KWL H <M


Verse length:  25
Sign_info length:  27
1 0000 0000 00 0 0 0000 000
G>LH THJH LW W B JWBL JY>


Verse length:  42
Sign_info length:  44
1 0 000 0 000 000 00 0 00000 00 00000 0 0000
W MJM B KSP TTN LJ W CTJTJ RQ ><BRH B RGLJ


Verse length:  61
Sign_info length:  63
000 00000 0 00000 000 000000 0000000 00 00 0000 0 00 00000000 1
CLX >WRKH W >MTKH HMH JNXWNJ JBJ>WNJ >L HR QDCK W >L MCKNWTJK


Verse length:  79
Sign_info length:  82
00 000 0000 00 0000 0000 0000 0 00 00000 00 000000 00 0000 00000 000 0000 0 000  1
KH >MR JHWH <L CLCH PC<J MW>B W <L >RB<H L> >CJBNW <L FRPW <YMWT MLK >DWM L FJD


Verse length:  79
Sign_info length:  87
0 0000 00 000 000 0000 0 000 0000 000 1 111 1 00 0 00 00 000 00000 00 00000 000 0000 00
W J>MR MH >TH R>H <MWS W >MR KLWB QJY B> H QY >L <MJ JFR>L L> >WSJP <WD <BWR LW


Verse length:  79
Sign_info length:  

In [91]:
print(f"Correct verses : {correct_verses}.\nIncorrect verses: {incorrect_verses}.\nZero sign verses: {zero_len_verses}")

Correct verses : 11.
Incorrect verses: 497.
Zero sign verses: 86


### 2. Used functions

In [None]:
def clean(g_cons):
    """Use to harmonise the DSS content (as strings) with the BHSA content."""
    return g_cons.replace("_", " ").replace("׳", "").replace("'", "")


def find_verb_ref(verb):
    """Returns a list with book, chapter, verse number for a given verb (DSS or BHSA)."""
    if verb.source == "BHSA":
        book = verb.book[0]
        chapter = verb.chapter[0]
        verse_num = verb.verse[0]
    else:
        book = verb.book[0]    
        chapter = verb.chapter[0]
        verse_num = verb.verse[0]
    return [book, chapter, verse_num]


def is_lex_identical(verb_dss): # TODO: handle the "" inside the verse, not only at the end
    """
    Checks if the verses (i.e. BHSA versus DSS) are identical on the lexeme level.
    Remove the empty strings from the DSS verses, if present.
    """
    ref_verb = find_verb_ref(verb_dss)
    
    scroll = verb_dss.to_scrolls.scroll[0]
    
    dss_lex = TFOb.section(ref_verb, DSS, scroll=scroll).to_words.lex
    bhsa_lex = TFOb.section(ref_verb, BHSA).to_words.lex
    
    if dss_lex[-1] == "":
        dss_lex.pop()
    
    return [clean(lex) for lex in bhsa_lex] == [clean(lex) for lex in dss_lex]


def find_bhsa_verb(verb_dss):
    """
    Checks if a verb occurring in DSS also occurs in BHSA (same book, chapter, verse, lexeme).
    Else, returns None.
    """
    
    # Get book chapter verse info from a DSS verb
    ref_dss = find_verb_ref(verb_dss)
    
    if not ref_dss[1].isnumeric():
        # Handles the cases when the chapter in DSS is not a simple number (ex: f14)
        # print("Ref DSS not numeric", ref_dss)
        return 

    # Get the corresponding BHSA verse
    verse_bhsa = TFOb.section(ref_dss, BHSA).to_words
    verb_bhsa = verse_bhsa.filter(lex=verb_dss.lex[0])
    
    # If repetition of verb in same verse: TODO
    if len(verb_bhsa) > 1:
        return # TODO
        scroll = verb_dss.to_scrolls.scroll[0]
        verse_dss = TFOb.section(ref_dss, DSS, scroll)
        print("Verse BHSA:", verse_bhsa)
        print("Verse DSS:", verse_dss)
        
    if verb_bhsa:
        return verb_bhsa
    

def find_clause(verb):
    """Find the complement of a verb. If no match, returns None"""
    if verb.source.name == "BHSA":
        clause = verb.to_clauses.to_clauses
        return clause
    
    # if the verb is not BHSA, it's DSS
    verb_bhsa = find_bhsa_verb(verb)
    scroll = verb.to_scrolls.scroll[0]

    # Check if verses are identical  
    if verb_bhsa and is_lex_identical(verb):
        verse_dss = TFOb.section(find_verb_ref(verb), DSS, scroll=scroll).to_words
        clause_bhsa = find_clause(verb_bhsa)
        
    
        first_word_id = clause_bhsa.to_words.ids[0]
        last_word_id = clause_bhsa.to_words.ids[-1]

        verse_ids = clause_bhsa.to_verses.to_words.ids
        
        try: #TODO TODO TODO
            first_word_index = verse_ids.index(first_word_id)
        except:
            print("Case when clause has no verse (to_verses bugs)", verb_bhsa.ids[0])
            return ""

        #first_word_index = verse_ids.index(first_word_id)
        last_word_index = verse_ids.index(last_word_id)
        
        return verse_dss[first_word_index:last_word_index + 1]

    
def find_complements(verb):
    """Find the complement of a verb. If no match, returns None"""
    if verb.source.name == "BHSA":
        complements = verb.to_clauses.to_phrases.filter(function="Cmpl")
        return complements
    
    # if the verb is not BHSA, it's DSS
    verb_bhsa = find_bhsa_verb(verb)
    scroll = verb.to_scrolls.scroll[0]

    # Check if verses are identical  
    if verb_bhsa and is_lex_identical(verb): # TODO
        verse_dss = TFOb.section(find_verb_ref(verb), DSS, scroll=scroll).to_words
        complements_bhsa = find_complements(verb_bhsa)
        
        complements_dss = []
    
        for complement_bhsa in complements_bhsa:
            first_word_id = complement_bhsa.to_words.ids[0]
            last_word_id = complement_bhsa.to_words.ids[-1]
            
            verse_ids = complement_bhsa.to_verses.to_words.ids
            
            try: #TODO TODO TODO
                first_word_index = verse_ids.index(first_word_id)
            except:
                print("Case when phrase has no verse (to_verses bugs)", verb_bhsa.ids[0])
                return ""
            
            #first_word_index = verse_ids.index(first_word_id)
            last_word_index = verse_ids.index(last_word_id)
            
            complements_dss.append(verse_dss[first_word_index:last_word_index + 1])
        
        return complements_dss

    
def find_subject(verb):
    """Find the subject of a verb. If no match, returns None"""
    if verb.source.name == "BHSA":
        subjects = verb.to_clauses.to_phrases.filter(function="Subj")
        assert len(subjects) <= 1
        return subjects
    
    # if the verb is not BHSA, it's DSS
    verb_bhsa = find_bhsa_verb(verb)
    scroll = verb.to_scrolls.scroll[0]

    # Check if verses are identical  
    if verb_bhsa and is_lex_identical(verb): # TODO
        verse_dss = TFOb.section(find_verb_ref(verb), DSS, scroll=scroll).to_words
        subject_bhsa = find_subject(verb_bhsa)
        
        if not subject_bhsa: 
            return ""

        first_word_id = subject_bhsa.to_words.ids[0]
        last_word_id = subject_bhsa.to_words.ids[-1]

        verse_ids = subject_bhsa.to_verses.to_words.ids

        first_word_index = verse_ids.index(first_word_id)
        last_word_index = verse_ids.index(last_word_id)
            
        return verse_dss[first_word_index:last_word_index + 1]    
    

def find_prepositions(verb):
    """Find the complement of a verb. If no match, returns None"""
    if verb.source.name == "BHSA":
        complements = verb.to_clauses.to_phrases.filter(function="Cmpl")
        prepositions = complements.to_words.filter(sp="prep")
        return prepositions
    
    # if the verb is not BHSA, it's DSS
    verb_bhsa = find_bhsa_verb(verb)
    scroll = verb.to_scrolls.scroll[0]

    # Check if verses are identical  
    if verb_bhsa and is_lex_identical(verb): 
        verse_dss = TFOb.section(find_verb_ref(verb), DSS, scroll=scroll).to_words
        prepositions_bhsa = find_prepositions(verb_bhsa)
        
        prepositions_dss = []
    
        for preposition_bhsa in prepositions_bhsa:
            first_word_id = preposition_bhsa.to_words.ids[0]
            last_word_id = preposition_bhsa.to_words.ids[-1]
            
            verse_ids = preposition_bhsa.to_verses.to_words.ids
            
            try: #TODO TODO TODO
                first_word_index = verse_ids.index(first_word_id)
            except:
                print("Case when phrase has no verse (to_verses bugs)", verb_bhsa.ids[0])
                return ""
            
            #first_word_index = verse_ids.index(first_word_id)
            last_word_index = verse_ids.index(last_word_id)
            
            prepositions_dss.append(verse_dss[first_word_index:last_word_index + 1])
        
        return prepositions_dss

In [None]:
### find_prepositions testing
verb = TFOb(1889904, DSS)
find_prepositions(verb)

### 3. Generate the dataset with pandas

In [None]:
# Create a dataset with the occurrences


items = [] # create an empty list to store all the information for each occ.

       
for verb in verbs:
    
    # Add MT as "scroll" for the BHSA
    if verb.source.name == "BHSA":
        scroll = "MT"
        verse = verb.to_verses
        dir_he_dss_verse = ""
        sign_info = ""

    else:
        scroll = verb.to_scrolls.scroll[0]
        verse = TFOb.section([verb.book[0], verb.chapter[0], verb.verse[0]], DSS, scroll)
        dir_he_dss_verse = int("H" in verse.uvf_etcbc)
        sign_info = is_sign_unc(verse)
        
        
    subject = find_subject(verb)
    complements = find_complements(verb)
    
    # If complements is None ==> there was no match between DSS and BHSA verses ==> find complement manually
    if complements is None:
        complements = [""]
        dir_he = ""

    for complement in complements:        
        if complement == "":
            dir_he = ""
        else: 
            dir_he = int("H" in complement.to_words.uvf_etcbc)
               
        if verb.g_cons[0] is None:
            g_cons = "no_g_cons"
            #print("Absent G_CONS", verb.ids[0])
        else:
            g_cons = clean(verb.g_cons[0])    
            
        if verse.g_cons is None:
            g_cons_verse = f"LEX: {' '.join(verse.to_words.lex)}"
            #print("Absent G_CONS", verb.ids[0])
        else:
            g_cons_verse = clean(" ".join([g_cons for g_cons in verse.g_cons if g_cons]))
            #print(g_cons_verse)
            
            
        
        # Collect information about the following variables:    
        item = {
            "verb_id": verb.ids[0], 
            "lex": verb.lex[0], 
            "scroll": scroll,
            "book": verb.book[0], 
            "chapter": verb.chapter[0], 
            "verse_num": verb.verse[0],
            "gcons_verb": g_cons,
            "gcons_verse": g_cons_verse,
            "gcons_clause": clean(str(find_clause(verb))),
            "subject": clean(str(subject)),
            "complement": clean(str(complement)),
            "dir_he": dir_he,
            "dir_he_dss": dir_he_dss_verse,
            "sign_info": sign_info,
            "stem": verb.vs[0],
            "tense": verb.vt[0],
        }
        
        if complement != "": 
            prepositions = find_prepositions(verb)
            n = 0
            for preposition in prepositions:
                n += 1
                item[f"preposition_{n}"] = str(preposition)

        items.append(item)

In [None]:
df = pd.DataFrame(items).fillna("")
df.head(5)

In [None]:
#df[df.book == 'PAM43113']

### 4. Save to CSV

In [None]:
df.to_csv("data/bib_dss_all_verbs.csv", index=False)