## Description: prepare the notebook for manual annotation

In this notebook, three elements are added to the previously generated dataset:
- check that the chapter and verse_num columns are integers
- sign_info column, based on a recent version of the function
- reconstructed_verse column which is a gcons version of the verse with brackets indicating the reconstructed sections
- the spatial_arg_type column for annotating goal, source, location, etc. 
- remove the rows which have "MN" in preposition_1 and no preposition_2

In [1]:
import pandas as pd

from itertools import chain

from tfob import TFOb,  get_dss

#DSS = get_dss()

In [2]:
PATH = 'data/extra_biblical_datasets/extra_biblical/annotation_df_history/extrabiblical_all_verbs.csv' # Modify before executing the notebook

In [3]:
#df_1 = pd.read_csv('../2_datasets/generate_datasets/data/biblical_datasets/Jeremiah_dataset.csv').fillna("")
df = pd.read_csv(PATH).fillna("")

In [4]:
# Make the values in "chapter" and "verse_num" integers
df['chapter'] = df['chapter'].astype(int)
df['verse_num'] = df['verse_num'].astype(int)

In [5]:
# Reorganise the dataset by chapter, verse_num and scroll

# Custom sorting logic for "scroll" column
df['scroll_order'] = df['scroll'].apply(lambda x: 0 if x == "MT" else 1)

# Sort by "scroll_order", "chapter", and "verse_num"
df = df.sort_values(by=['chapter', 'verse_num', 'scroll_order'])

# Drop the temporary 'scroll_order' column
df = df.drop(columns='scroll_order')

df = df.reset_index(drop=True)

In [6]:
pd.set_option("display.max_columns", None)

In [7]:
df.head(2)

Unnamed: 0,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,gcons_clause,subject,complement,dir_he,sign_info,stem,tense,preposition_1,preposition_2,preposition_3,preposition_4
0,23732,CWB,Ketef_Hinnom,Ketef_Hinnom,1,1,JCJB,JHWH YB>T H >L H GDL CMR H BRJT W H XSD L >HB ...,KJ JHWH JCJB NW,JHWH,,,,hif,impf,,,,
1,24706,BW>,Mesad_Hashavyahu,Mesad_Hashavyahu,1,1,JB>,JCM< >DN J H FR >T DBR <BD H <BD K QYR HJH <BD...,W JB> HWC<JHW BN CBJ,HWC<JHW BN CBJ,,,,qal,impf,,,,


In [9]:
df[(df.scroll == "Ketef_Hinnom")]

Unnamed: 0,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,gcons_clause,subject,complement,dir_he,sign_info,stem,tense,preposition_1,preposition_2,preposition_3,preposition_4
0,23732,CWB,Ketef_Hinnom,Ketef_Hinnom,1,1,JCJB,JHWH YB>T H >L H GDL CMR H BRJT W H XSD L >HB ...,KJ JHWH JCJB NW,JHWH,,,,hif,impf,,,,


In [10]:
#df[(df.scroll != "MT") & (df.gcons_clause != "None")]

In [11]:
df.columns

Index(['verb_id', 'lex', 'scroll', 'book', 'chapter', 'verse_num',
       'gcons_verb', 'gcons_verse', 'gcons_clause', 'subject', 'complement',
       'dir_he', 'sign_info', 'stem', 'tense', 'preposition_1',
       'preposition_2', 'preposition_3', 'preposition_4'],
      dtype='object')

In [12]:
# This is the most recent sign_info function which uses the .rec (1 October 2024)

def sign_info(verb):
    sign_rec_words = []
    
    scroll = verb.to_scrolls.scroll[0]
    verse = TFOb.section([verb.book[0], verb.chapter[0], verb.verse[0]], DSS, scroll)
    
    for word in verse.to_words:
        signs = word.to_signs
        sign_rec = []
        for sign in signs:
            if sign.type[0] == "cons": # only keeps consonants
                if sign.rec[0]:
                    sign_rec.append("1")
                else:
                    sign_rec.append("0")
        if sign_rec:
            sign_rec_words.append("".join(sign_rec))
        
    return " ".join(sign_rec_words).strip()

In [13]:
# Updates the dataframe (change the old sign_info column to one containing the return of the new function)

def update_sign_info(row):
    if row['scroll'] != 'MT':
        return sign_info(TFOb(int(row['verb_id']), DSS))
    return row['sign_info']

# Apply the function to the dataframe
df['sign_info'] = df.apply(update_sign_info, axis=1)

NameError: name 'DSS' is not defined

In [289]:
# Initialize an empty list to collect mismatch information
mismatches = []

# Function to check the lengths of sign_info and gcons_verse
def check_length(row):
    if row['scroll'] != 'MT':
        if len(row['sign_info']) != len(row['gcons_verse']):
            mismatches.append(f"Mismatch found in row {row.name}: sign_info='{row['sign_info']}' (len={len(row['sign_info'])}), gcons_verse='{row['gcons_verse']}' (len={len(row['gcons_verse'])})")

# Apply the function to the dataframe
df.apply(check_length, axis=1)

# Check if there are any mismatches and print them
if mismatches:
    print("Mismatches found:")
    for mismatch in mismatches:
        print(mismatch)
else:
    print("No mismatches found.")

No mismatches found.


In [290]:
# The reconstruct_verse function adds a gcons version of the verse with brackets indicating the reconstructed sections

# Add a column with the verse plus brackets indicating reconstructed letters

def reconstruct_verse(sign_info, verse):
    rec_verse = []
    is_reconstructed = False

    for c_verse, c_sign in zip(verse, sign_info):
        if c_sign == "1" and not is_reconstructed:
            is_reconstructed = True
            rec_verse.append("[")
        if c_sign == "0" and is_reconstructed:
            is_reconstructed = False
            if rec_verse[-1] == " ":
                rec_verse.pop()
                rec_verse.extend(["]", " "])
            else:
                rec_verse.append("]")
        rec_verse.append(c_verse)

    if is_reconstructed:
        rec_verse.append("]")
    return "".join(rec_verse)


df["reconstructed_verse"] = ""

df["reconstructed_verse"] = df.apply(lambda row: reconstruct_verse(row.sign_info, row.gcons_verse) if row.book != "MT" else "", axis=1)

In [291]:
# Delete the DSS duplicated

verb_ids = df.verb_id.to_list()

duplicates = set([verb_id for verb_id in verb_ids if verb_ids.count(verb_id) > 1])


df = df[~((df.gcons_clause == "None") & (df.subject == "None") & (df.verb_id.isin(duplicates)))] # delete duplicates
df.reset_index(drop=True, inplace=True) # reset the index for AnnotEasy

In [292]:
df.columns

Index(['verb_id', 'lex', 'scroll', 'book', 'chapter', 'verse_num',
       'gcons_verb', 'gcons_verse', 'gcons_clause', 'subject', 'complement',
       'cmpl_lex', 'cmpl_translation', 'dir_he', 'cmpl_constr', 'cmpl_nt',
       'cmpl_anim', 'cmpl_det', 'cmpl_indiv', 'cmpl_complex', 'dir_he_dss',
       'sign_info', 'stem', 'tense', 'motion_type', 'comments',
       'preposition_1', 'preposition_2', 'preposition_3', 'preposition_4',
       'preposition_5', 'preposition_6', 'preposition_7',
       'reconstructed_verse'],
      dtype='object')

In [20]:
# Add the spatial_arg_type column

df["spatial_arg_type"] = ""
new_columns_list = ['cmpl_lex', 'cmpl_translation', 'cmpl_constr', 'cmpl_nt', 'cmpl_anim', 'cmpl_det', 'cmpl_indiv', 'cmpl_complex', 'motion_type', 'comments', 'reconstructed_verse']

for column_name in new_columns_list:
    df[column_name] = ""

In [21]:
df.columns

Index(['verb_id', 'lex', 'scroll', 'book', 'chapter', 'verse_num',
       'gcons_verb', 'gcons_verse', 'gcons_clause', 'subject', 'complement',
       'dir_he', 'sign_info', 'stem', 'tense', 'preposition_1',
       'preposition_2', 'preposition_3', 'preposition_4', 'spatial_arg_type',
       'cmpl_lex', 'cmpl_translation', 'cmpl_constr', 'cmpl_nt', 'cmpl_anim',
       'cmpl_det', 'cmpl_indiv', 'cmpl_complex', 'motion_type', 'comments',
       'reconstructed_verse'],
      dtype='object')

In [295]:
# Remove the columns that have MN as preposition 1 and not other prepositions

In [16]:
# have a look at the data
df_test = df[(df.preposition_1 == "MN") & (df.preposition_2 == "MN")]
df_test.head(5)
#len(df_test)

Unnamed: 0,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,gcons_clause,subject,complement,dir_he,sign_info,stem,tense,preposition_1,preposition_2,preposition_3,preposition_4,spatial_arg_type


In [297]:
len(df_test)

0

In [17]:
# Identify the rows to drop based on the condition
index_to_drop = df[(df.preposition_1 == "MN") & (df.preposition_2 == "")].index
index_to_drop_2 = df[(df.preposition_1 == "MN") & (df.preposition_2 == "MN") & (df.preposition_3 == "")].index
index_to_drop_3 = df[(df.preposition_1 == "MN") & (df.preposition_2 == "MN") & (df.preposition_3 == "MN") & (df.preposition_4 == "")].index

# Drop the rows by index
df = df.drop(index_to_drop)
df = df.drop(index_to_drop_2)
df = df.drop(index_to_drop_3)

# Reset the index
df = df.reset_index(drop=True)

In [299]:
# Verification
df[(df.preposition_1 == "MN") & (df.preposition_2 == "")]

Unnamed: 0,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,gcons_clause,subject,complement,cmpl_lex,cmpl_translation,dir_he,cmpl_constr,cmpl_nt,cmpl_anim,cmpl_det,cmpl_indiv,cmpl_complex,dir_he_dss,sign_info,stem,tense,motion_type,comments,preposition_1,preposition_2,preposition_3,preposition_4,preposition_5,preposition_6,preposition_7,reconstructed_verse,spatial_arg_type


In [300]:
len(df)

18

In [23]:
set(df.scroll)

{'1QH',
 '1QM',
 '1QS',
 'Arad',
 'Balaam',
 'Ketef_Hinnom',
 'Lachish',
 'Mesad_Hashavyahu',
 'Mesha_Stela',
 'Siloam'}

In [28]:
df_1 = df[(df.scroll == "1QS") | (df.scroll == "1QH") |(df.scroll == "1QM")].copy().reset_index()

In [29]:
set(df_1.scroll)

{'1QH', '1QM', '1QS'}

In [36]:
len(df_1[df_1.scroll == "1QM"])

82

In [30]:
df_1.head(5)

Unnamed: 0,index,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,gcons_clause,subject,complement,dir_he,sign_info,stem,tense,preposition_1,preposition_2,preposition_3,preposition_4,spatial_arg_type,cmpl_lex,cmpl_translation,cmpl_constr,cmpl_nt,cmpl_anim,cmpl_det,cmpl_indiv,cmpl_complex,motion_type,comments,reconstructed_verse
0,8,11088,JY>,1QM,1QM,1,6,JY>,W B QY W JY> B XMH GDWLH L HLXM B MLKJ H YPWN ...,W B QY W JY> B XMH GDWLH,,,,,qal,impf,,,,,,,,,,,,,,,,
1,9,17046,HLK,1QS,1QS,1,6,LKT,W LW> L LKT <WD B CRJRWT LB >CMH W <JNJ ZNWT L...,L LKT <WD B CRJRWT LB >CMH W <JNJ ZNWT,,,,,qal,infc,,,,,,,,,,,,,,,,
2,11,17061,BW>,1QS,1QS,1,7,HBJ,W L HBJ >T KWL H NDBJM L <FWT XWQJ >L B BRJT XSD,W L HBJ >T KWL,,,,,hif,infc,,,,,,,,,,,,,,,,
3,12,17080,HLK,1QS,1QS,1,8,HTHLK,L HWXD B <YT >L W L HTHLK L PNJ W TMJM KWL H N...,W L HTHLK L PNJ W TMJM KWL,,L PNJ W,0.0,,hit,infc,L,,,,,,,,,,,,,,,
4,13,11136,NPL,1QM,1QM,1,9,NPL,W NPL >CWR W >JN <WZR L W,W NPL >CWR,>CWR,,,,qal,perf,,,,,,,,,,,,,,,,


In [39]:
# Reorganise the dataset by chapter, verse_num and scroll

# Custom sorting logic for "scroll" column

# Sort by "scroll_order", "chapter", and "verse_num"
df_1 = df_1.sort_values(by=['scroll','chapter', 'verse_num'])



df_1 = df_1.reset_index(drop=True)

In [40]:
df_1.head(4)

Unnamed: 0,index,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,gcons_clause,subject,complement,dir_he,sign_info,stem,tense,preposition_1,preposition_2,preposition_3,preposition_4,spatial_arg_type,cmpl_lex,cmpl_translation,cmpl_constr,cmpl_nt,cmpl_anim,cmpl_det,cmpl_indiv,cmpl_complex,motion_type,comments,reconstructed_verse
0,55,206,GLL,1QH,1QH,3,9,HTGWLLTJ,KJ B NDH HTGWLLTJ W M SWD RMH Y>TJ W L> NLWJTJ...,KJ B NDH HTGWLLTJ,,B NDH,0.0,,hit,perf,B,,,,,,,,,,,,,,,
1,73,354,NPL,1QH,1QH,4,1,HTNPL,MZMWR L MFKJL L HTNPL L PNJ >L M<FJ >L W L HBJ...,L HTNPL L PNJ >L,,L PNJ >L,0.0,,hit,infc,L,,,,,,,,,,,,,,,
2,74,376,HLK,1QH,1QH,4,1,HTHLKW,MZMWR L MFKJL L HTNPL L PNJ >L M<FJ >L W L HBJ...,HTHLKW,,,,,hit,perf,,,,,,,,,,,,,,,,
3,81,447,GLH,1QH,1QH,4,4,GLJTH,>TH HW>L QDWCJM W B RZJ PL> K HWD< BWR KBWD K ...,>TH GLJTH DRKJ >MT W M<FJ R< XWKMH W >WLT,>TH,,,,qal,perf,,,,,,,,,,,,,,,,


In [41]:
df_1.to_csv(PATH, index=False)