## Description: prepare the notebook for manual annotation

In this notebook, the previously generated dataset is checked and extended as follows:
- check that the chapter and verse_num columns are integers
- sign_info column, based on a recent version of the function
- reconstructed_verse column which is a gcons version of the verse with brackets indicating the reconstructed sections
- the spatial_arg_type column for annotating goal, source, location, etc. 
- remove the rows which have "MN" in preposition_1 and no preposition_2

In [1]:
import pandas as pd

from itertools import chain

from tfob import TFOb,  get_dss

# Corpus handle returned by get_dss(); it is passed to TFOb when nodes and sections are built below
DSS = get_dss()

# Dataset CSV: read at the start of the notebook and overwritten by the final to_csv call
PATH = 'data/biblical_datasets/exodus/annotation_df_history/Exodus_dataset.csv' # Modify before executing the notebook

**Locating corpus resources ...**

Name,# of nodes,# slots/node,% coverage
scroll,1001,1428.81,100
lex,10450,129.14,94
fragment,11182,127.91,100
line,52895,27.04,100
clause,125,12.85,0
cluster,101099,6.68,47
phrase,315,5.1,0
word,500995,2.81,99
sign,1430241,1.0,100


In [2]:
# Load the dataset from PATH; missing values are replaced by "" so that columns can be compared with ""
df = pd.read_csv(PATH).fillna("")

In [3]:
# Cast the "chapter" and "verse_num" columns to integers in a single call
df = df.astype({'chapter': int, 'verse_num': int})

In [4]:
# Order the rows by chapter, then verse_num; within a verse the "MT" rows come first

# Temporary sort key derived from "scroll": 0 for "MT", 1 for every other scroll
scroll_rank = (df['scroll'] != "MT").astype(int)

df = (
    df.assign(scroll_order=scroll_rank)
    .sort_values(by=['chapter', 'verse_num', 'scroll_order'])
    .drop(columns='scroll_order')  # the helper column is not part of the dataset
    .reset_index(drop=True)
)

In [5]:
# Show every column when a dataframe is displayed
pd.options.display.max_columns = None

In [14]:
# Display the first 20 rows of the sorted dataset
df.head(20)

Unnamed: 0,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,sign_info,stem,tense,gcons_clause,subject,complement,cmpl_lex,cmpl_translation,dir_he,cmpl_constr,cmpl_nt,cmpl_anim,cmpl_det,cmpl_indiv,cmpl_complex,motion_type,preposition_1,preposition_2,preposition_3,preposition_4,comments,reconstructed_verse,spatial_arg_type
0,28771,BW>[,MT,Exodus,1,1,B>JM,W >LH CMWT BNJ JFR>L H B>JM MYRJMH >T J<QB >JC...,,qal,ptca,H B>JM MYRJMH >T J<QB,,MYRJMH,MYRJM/,to Egypt,1.0,dir_he,topo,inanim,det,nmpr,simple,factive,,,,,,,
1,28778,BW>[,MT,Exodus,1,1,B>W,W >LH CMWT BNJ JFR>L H B>JM MYRJMH >T J<QB >JC...,,qal,perf,>JC W BJTW B>W,>JC W BJTW,no complement,no complement,,,,,,,,,,,,,,,,
2,1945779,BW>[,4Q13,Exodus,1,1,B>JM,>LH CMWT BNJ JFR>L H B>JM MYRJMH >T J<QWB >BJH...,111 1111 111 11111 1 1111 111111 00 00000 0000...,qal,ptca,H B>JM MYRJMH >T J<QWB,,MYRJMH,MYRJM/,to Egypt,1.0,dir_he,topo,inanim,det,nmpr,simple,factive,,,,,,[>LH CMWT BNJ JFR>L H B>JM MYRJMH] >T J<QWB >B...,
3,1945787,BW>[,4Q13,Exodus,1,1,B>W,>LH CMWT BNJ JFR>L H B>JM MYRJMH >T J<QWB >BJH...,111 1111 111 11111 1 1111 111111 00 00000 0000...,qal,perf,>JC W BJTW B>W,>JC W BJTW,no complement,no complement,,,,,,,,,,,,,,,[>LH CMWT BNJ JFR>L H B>JM MYRJMH] >T J<QWB >B...,
4,1941985,BW>[,4Q11,Exodus,1,1,B>JM,>LH CMWT BNJ JFR>L H B>JM MYRJMH >T J<QB >JC W...,000 0000 000 00111 1 1111 111000 11 1111 111 0...,qal,ptca,H B>JM MYRJMH >T J<QB,,MYRJMH,MYRJM/,to Egypt,1.0,dir_he,topo,inanim,det,nmpr,simple,factive,,,,,,>LH CMWT BNJ JF[R>L H B>JM MYR]JMH [>T J<QB >J...,
5,1941992,BW>[,4Q11,Exodus,1,1,B>W,>LH CMWT BNJ JFR>L H B>JM MYRJMH >T J<QB >JC W...,000 0000 000 00111 1 1111 111000 11 1111 111 0...,qal,perf,>JC W BJTW B>W,>JC W BJTW,no complement,no complement,,,,,,,,,,,,,,,>LH CMWT BNJ JF[R>L H B>JM MYR]JMH [>T J<QB >J...,
6,28798,JY>[,MT,Exodus,1,5,JY>J,W JHJ KL NPC JY>J JRK J<QB CB<JM NPC W JWSP HJ...,,qal,ptca,W JHJ KL NPC JY>J JRK J<QB CB<JM NPC,KL NPC JY>J JRK J<QB,no complement,no complement,,,,,,,,,,,,,,JY>[ not predicate,,
7,1935630,JY>[,4Q1,Exodus,1,5,JY>J,W JHJ KL NPC JY>J JRK J<QB CB<JM W XMC NPC W J...,1 111 11 111 1111 111 1111 11111 0 000 000 0 0...,qal,ptca,W JHJ KL NPC JY>J JRK J<QB CB<JM W XMC NPC,KL NPC JY>J JRK J<QB,no complement,no complement,,,,,,,,,,,,,,JY>[ not predicate,[W JHJ KL NPC JY>J JRK J<QB CB<JM] W XMC NPC W...,
8,28881,<LH[,MT,Exodus,1,10,<LH,HBH NTXKMH LW PN JRBH W HJH KJ TQR>NH MLXMH W ...,,qal,perf,W <LH MN H >RY,,MN H >RY,MN H >RY/,from the land,0.0,prep,topo,inanim,det,subs,simple,factive,MN,,,,,,
9,1935721,<LH[,4Q1,Exodus,1,10,<LH,HBH NTXKMH LW PN JRBH W HJH KJ TQR>NH MLXMH W ...,000 000000 11 11 1111 1 111 11 111111 11111 1 ...,qal,perf,W <LH MN H >RY,,MN H >RY,MN H >RY/,from the land,0.0,prep,topo,inanim,det,subs,simple,factive,MN,,,,,HBH NTXKMH [LW PN JRBH W HJH KJ TQR>NH MLXMH W...,


In [7]:
# List the column names of the dataset
df.columns

Index(['verb_id', 'lex', 'scroll', 'book', 'chapter', 'verse_num',
       'gcons_verb', 'gcons_verse', 'sign_info', 'stem', 'tense',
       'gcons_clause', 'subject', 'complement', 'cmpl_lex', 'cmpl_translation',
       'dir_he', 'cmpl_constr', 'cmpl_nt', 'cmpl_anim', 'cmpl_det',
       'cmpl_indiv', 'cmpl_complex', 'motion_type', 'preposition_1',
       'preposition_2', 'preposition_3', 'preposition_4', 'comments',
       'reconstructed_verse', 'spatial_arg_type'],
      dtype='object')

In [8]:
# This is the most recent sign_info function which uses the .rec (1 October 2024)

def sign_info(verb):
    """Return a 0/1 string describing the consonants of the verse containing `verb`.

    The verse is looked up in the scroll of `verb` from its book, chapter and
    verse. Every word of that verse yields one group of characters, one per
    sign whose type is "cons": "1" when the sign's `rec` value is truthy,
    "0" otherwise. Words without any consonant sign yield no group. The
    groups are joined with single spaces.
    """
    scroll_name = verb.to_scrolls.scroll[0]
    section_ref = [verb.book[0], verb.chapter[0], verb.verse[0]]
    verse = TFOb.section(section_ref, DSS, scroll_name)

    per_word_flags = []
    for word in verse.to_words:
        flags = "".join(
            "1" if sign.rec[0] else "0"
            for sign in word.to_signs
            if sign.type[0] == "cons"  # only consonants are kept
        )
        if flags:
            per_word_flags.append(flags)

    return " ".join(per_word_flags).strip()

In [9]:
# Updates the dataframe (change the old sign_info column to one containing the return of the new function)

def update_sign_info(row):
    """Return the sign_info value to store for one dataframe row.

    Rows whose scroll is "MT" keep their current sign_info; for every other
    scroll the value is recomputed with sign_info() from the row's verb_id.
    """
    if row['scroll'] == 'MT':
        return row['sign_info']
    verb = TFOb(int(row['verb_id']), DSS)
    return sign_info(verb)

# Recompute the column row by row
df['sign_info'] = df.apply(update_sign_info, axis=1)

In [10]:
# Collected descriptions of the rows whose sign_info and gcons_verse differ in length
mismatches = []

def check_length(row):
    """Record a message in `mismatches` when a non-MT row has sign_info and gcons_verse of different lengths."""
    if row['scroll'] == 'MT':
        return
    n_signs = len(row['sign_info'])
    n_chars = len(row['gcons_verse'])
    if n_signs != n_chars:
        mismatches.append(f"Mismatch found in row {row.name}: sign_info='{row['sign_info']}' (len={n_signs}), gcons_verse='{row['gcons_verse']}' (len={n_chars})")

# Run the check on every row
df.apply(check_length, axis=1)

# Report the outcome
if not mismatches:
    print("No mismatches found.")
else:
    print("Mismatches found:")
    print("\n".join(mismatches))

No mismatches found.


In [11]:
# The reconstruct_verse function adds a gcons version of the verse with brackets indicating the reconstructed sections

# Add a column with the verse plus brackets indicating reconstructed letters

def reconstruct_verse(sign_info, verse):
    """Return `verse` with square brackets around the runs flagged "1" in `sign_info`.

    The two strings are read in parallel, character by character (reading
    stops at the end of the shorter one). A "[" is inserted before the first
    character of a run of "1" flags and a "]" when the first "0" after the
    run is met; spaces in `sign_info` neither open nor close a run. When the
    run is followed by a space, the "]" is placed before that space. A run
    still open at the end is closed.
    """
    pieces = []
    inside = False

    for letter, flag in zip(verse, sign_info):
        if flag == "1" and not inside:
            pieces.append("[")
            inside = True
        elif flag == "0" and inside:
            inside = False
            if pieces[-1] == " ":
                # move the closing bracket in front of the separating space
                pieces[-1:] = ["]", " "]
            else:
                pieces.append("]")
        pieces.append(letter)

    if inside:
        pieces.append("]")
    return "".join(pieces)


# Bracketed verse for the non-MT rows; "MT" rows get an empty string.
# The test is made on the scroll column: that is the column holding "MT",
# whereas the book column holds the book name (e.g. "Exodus").
df["reconstructed_verse"] = df.apply(
    lambda row: reconstruct_verse(row.sign_info, row.gcons_verse) if row.scroll != "MT" else "",
    axis=1,
)

In [12]:
# Delete the duplicated DSS rows

# Rows whose verb_id occurs more than once (vectorised instead of a list.count scan per row)
is_duplicated = df.verb_id.duplicated(keep=False)
duplicates = set(df.loc[is_duplicated, "verb_id"])

# Placeholder rows: both the clause and the subject hold the string "None".
# NOTE(review): depending on the pandas version, read_csv parses a literal "None" as a
# missing value, which fillna("") then turns into ""; confirm that "None" strings
# actually reach this cell, otherwise nothing is removed here.
is_placeholder = (df.gcons_clause == "None") & (df.subject == "None")

# Keep everything except the duplicated placeholders; fresh 0..n-1 index for AnnotEasy
df = df[~(is_placeholder & is_duplicated)].reset_index(drop=True)

In [13]:
# List the column names after the duplicate removal
df.columns

Index(['verb_id', 'lex', 'scroll', 'book', 'chapter', 'verse_num',
       'gcons_verb', 'gcons_verse', 'sign_info', 'stem', 'tense',
       'gcons_clause', 'subject', 'complement', 'cmpl_lex', 'cmpl_translation',
       'dir_he', 'cmpl_constr', 'cmpl_nt', 'cmpl_anim', 'cmpl_det',
       'cmpl_indiv', 'cmpl_complex', 'motion_type', 'preposition_1',
       'preposition_2', 'preposition_3', 'preposition_4', 'comments',
       'reconstructed_verse', 'spatial_arg_type'],
      dtype='object')

In [15]:
# Add the spatial_arg_type column, to be filled in during manual annotation.
# It is only created when missing: this notebook writes its result back to the CSV
# it was loaded from, so blanking the column unconditionally would erase existing
# annotations when the notebook is run again on that file.
if "spatial_arg_type" not in df.columns:
    df["spatial_arg_type"] = ""

In [16]:
# List the column names once spatial_arg_type is present
df.columns

Index(['verb_id', 'lex', 'scroll', 'book', 'chapter', 'verse_num',
       'gcons_verb', 'gcons_verse', 'gcons_clause', 'subject', 'complement',
       'cmpl_lex', 'dir_he', 'cmpl_constr', 'cmpl_nt', 'cmpl_anim', 'cmpl_det',
       'cmpl_indiv', 'cmpl_complex', 'dir_he_dss', 'sign_info', 'stem',
       'tense', 'preposition_1', 'preposition_2', 'preposition_3',
       'preposition_4', 'preposition_5', 'preposition_6', 'preposition_7',
       'cmpl_translation', 'motion_type', 'comments', 'reconstructed_verse',
       'spatial_arg_type'],
      dtype='object')

In [17]:
# Remove the rows that have MN as preposition_1 and no preposition_2

In [18]:
# Preview the rows with "MN" as preposition_1 and an empty preposition_2
only_mn = df.preposition_1.eq("MN") & df.preposition_2.eq("")
df_test = df.loc[only_mn]
df_test.head(5)

Unnamed: 0,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,gcons_clause,subject,complement,cmpl_lex,dir_he,cmpl_constr,cmpl_nt,cmpl_anim,cmpl_det,cmpl_indiv,cmpl_complex,dir_he_dss,sign_info,stem,tense,preposition_1,preposition_2,preposition_3,preposition_4,preposition_5,preposition_6,preposition_7,cmpl_translation,motion_type,comments,reconstructed_verse,spatial_arg_type
27,177089,<LH[,MT,1_Kings,1,45,J<LW,W JMCXW >TW YDWQ H KHN W NTN H NBJ> L MLK B GX...,W J<LW M CM FMXJM,,M CM,MN CM,0.0,prep,,,det,,simple,,,qal,wayq,MN,,,,,,,,,,,
42,177458,BRX[,MT,1_Kings,2,7,BRXJ,W L BNJ BRZLJ H GL<DJ T<FH XSD W HJW B >KLJ CL...,B BRXJ M PNJ >BCLWM >XJK,,M PNJ >BCLWM >XJK,MN PNH/ >BCLWM/ >X/,0.0,prep,pers,anim,det,subs,complex,,,qal,infc,MN,,,,,,,,,,,
66,178168,JY>[,MT,1_Kings,2,36,TY>,W JCLX H MLK W JQR> L CM<J W J>MR LW BNH LK BJ...,W L> TY> M CM >NH W >NH,,M CM,MN CM,0.0,prep,,,det,,simple,,,qal,impf,MN,,,,,,,,,,,
73,178266,BW>[,MT,1_Kings,2,40,JB>,W JQM CM<J W JXBC >T XMRW W JLK GTH >L >KJC L ...,W JB> >T <BDJW M GT,,M GT,MN GT=/,0.0,prep,topo,inanim,det,nmpr,simple,,,hif,wayq,MN,,,,,,,,,,,
74,178276,HLK[,MT,1_Kings,2,41,HLK,W JGD L CLMH KJ HLK CM<J M JRWCLM GT W JCB,KJ HLK CM<J M JRWCLM GT,CM<J,M JRWCLM,MN JRWCLM/,0.0,prep,topo,inanim,det,nmpr,simple,,,qal,perf,MN,,,,,,,,,,,


In [19]:
# Index labels of the rows with "MN" as preposition_1 and an empty preposition_2
mn_without_second = df.preposition_1.eq("MN") & df.preposition_2.eq("")
index_to_drop = df.loc[mn_without_second].index

# Remove those rows and renumber the remaining ones from 0
df = df.drop(index=index_to_drop).reset_index(drop=True)

In [20]:
# Verification: the same selection as the one used for the drop, displayed again
# (an empty table means that no such row is left)
df[(df.preposition_1 == "MN") & (df.preposition_2 == "")]

Unnamed: 0,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,gcons_clause,subject,complement,cmpl_lex,dir_he,cmpl_constr,cmpl_nt,cmpl_anim,cmpl_det,cmpl_indiv,cmpl_complex,dir_he_dss,sign_info,stem,tense,preposition_1,preposition_2,preposition_3,preposition_4,preposition_5,preposition_6,preposition_7,cmpl_translation,motion_type,comments,reconstructed_verse,spatial_arg_type


In [21]:
# Display the first 20 rows of the dataset after the row removal
df.head(20)

Unnamed: 0,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,gcons_clause,subject,complement,cmpl_lex,dir_he,cmpl_constr,cmpl_nt,cmpl_anim,cmpl_det,cmpl_indiv,cmpl_complex,dir_he_dss,sign_info,stem,tense,preposition_1,preposition_2,preposition_3,preposition_4,preposition_5,preposition_6,preposition_7,cmpl_translation,motion_type,comments,reconstructed_verse,spatial_arg_type
0,176086,BW>[,MT,1_Kings,1,1,B>,W H MLK DWD ZQN B> B JMJM W JKSHW B BGDJM W L>...,B> B JMJM,,B JMJM,B H JWM/,0.0,prep,,,det,subs,simple,,,qal,perf,B,,,,,,,,,,,
1,2072601,BW>[,5Q2,1_Kings,1,1,B>,W H MLK DWD ZQN B> B JMJM W JKSHW B BGDJM W L>...,,,,,,,,,,,,0.0,1 1 000 111 111 11 1 1111 1 11111 1 11111 1 11...,qal,perf,,,,,,,,,,,[W H] MLK [DWD ZQN B> B JMJM W JKSHW B BGDJM W...,
2,176145,BW>[,MT,1_Kings,1,3,JB>W,W JBQCW N<RH JPH B KL GBWL JFR>L W JMY>W >T >B...,W JB>W >TH L MLK,,L MLK,L H MLK/,0.0,prep,,,det,subs,simple,,,hif,wayq,L,,,,,,,,,,,
3,176187,RWY[,MT,1_Kings,1,5,RYJM,W >DNJH BN XGJT MTNF> L >MR >NJ >MLK W J<F LW ...,RYJM L PNJW,,L PNJW,L PNH/,0.0,prep,,,det,subs,simple,,,qal,ptca,L,,,,,,,,,,,
4,176326,HLK[,MT,1_Kings,1,12,LKJ,W <TH LKJ >J<YK N> <YH W MLVJ >T NPCK W >T NPC...,LKJ,,no complement,no complement,,,,,,,,,,qal,impv,,,,,,,,,,,,
5,176339,HLK[,MT,1_Kings,1,13,LKJ,LKJ W B>J >L H MLK DWD W >MRT >LJW H L> >TH >D...,LKJ,,no complement,no complement,,,,,,,,,,qal,impv,,,,,,,,,,,,
6,176341,BW>[,MT,1_Kings,1,13,B>J,LKJ W B>J >L H MLK DWD W >MRT >LJW H L> >TH >D...,W B>J >L H MLK DWD,,>L H MLK DWD,>L H MLK/ DWD==/,0.0,prep,pers,anim,det,subs,complex,,,qal,impv,>L,,,,,,,,,,,
7,176383,BW>[,MT,1_Kings,1,14,>BW>,HNH <WDK MDBRT CM <M H MLK W >NJ >BW> >XRJK W ...,W >NJ >BW> >XRJK,>NJ,>XRJK,>XR/,0.0,vc,,,det,subs,simple,,,qal,impf,,,,,,,,,,,,
8,176390,BW>[,MT,1_Kings,1,15,TB>,W TB> BT_CB< >L H MLK H XDRH W H MLK ZQN M>D W...,W TB> BT_CB< >L H MLK H XDRH,BT_CB<,>L H MLK,>L H MLK/,0.0,prep,,,det,subs,simple,,,qal,wayq,>L,,,,,,,,,,,
9,176390,BW>[,MT,1_Kings,1,15,TB>,W TB> BT_CB< >L H MLK H XDRH W H MLK ZQN M>D W...,W TB> BT_CB< >L H MLK H XDRH,BT_CB<,H XDRH,H XDR/,1.0,dir-he,,,det,subs,simple,,,qal,wayq,,,,,,,,,,,,


In [15]:
# Save the dataset without the index column.
# NOTE: PATH is the same file that was read at the top of the notebook, so the input CSV is overwritten.
df.to_csv(PATH, index=False)