In [3]:
import pandas as pd
import re

In [4]:
# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

In [5]:
# Load the Medill (2020) Hebrew goal annotations from the Excel sheet and
# replace every missing cell with an empty string.
df_1 = (
    pd.read_excel("data/Medill 2020 Hebrew Goal Data.xls")
    .fillna("")
)

In [6]:
# Keep only the Medill rows from the books that also occur in Perez;
# .copy() gives an independent frame that can be extended safely.
df_m = df_1.loc[df_1["book"].isin(['exodus', 'deuteronomy', 'isaiah'])].copy()

In [7]:
# Load the Perez complement annotations from CSV and replace every
# missing cell with an empty string.
df_2 = (
    pd.read_csv("data/sbl_grouped_cmpls_for_annot.csv")
    .fillna("")
)

In [8]:
# Restrict Perez to complements annotated as spatial goals;
# .copy() gives an independent frame.
df_p = df_2.loc[df_2["spatial_arg_type"].eq("goal")].copy()

In [10]:
# List every column of the full Medill frame (df_1).
df_1.columns

Index(['id', 'book', 'info_refer', 'info_reads', 'info_means', 'texttype',
       'syn_vbinit', 'syn_gcb4vb', 'syn_realis', 'syn_affneg', 'sub_explicit',
       'sub_anim', 'sub_def', 'sub_num', 'obj_reflexive', 'obj_anim',
       'obj_def', 'obj_num', 'info_verbroot', 'verbid', 'construction',
       'sub_affected', 'sub_affect2', 'vb_binyan', 'vb_passive', 'vb_particip',
       'vb_parse', 'gc_sgpl', 'gc_sameclseq', 'gc_samesame', 'gc_parclseq',
       'gc_parsame', 'info_gcorder', 'gc', 'gc_end', 'gc_def', 'gc_add',
       'gc_proper', 'gc_anim', 'gc_complex', 'gc_prep', 'source', 'era',
       'soc_oral', 'soc_north', 'gc2', 'book2', 'vb_aspect', 'obj_def2',
       'gc_samesame2', 'gc_parsame2', 'verse', 'gc3binary', 'unique'],
      dtype='object')

In [11]:
# Distinct labels used in Medill's gc_proper column.
set(df_1["gc_proper"])

{'common noun', 'pronoun', 'proper noun'}

In [22]:
# Distinct info_gcorder values among the rows glossed as "thither".
set(df_1.loc[df_1["info_means"] == "thither", "info_gcorder"])

{'', 'a', 'b', 'c', 'd'}

In [26]:
# Preview the first two rows of the Medill subset (df_m).
df_m.head(2)

Unnamed: 0,id,book,info_refer,info_reads,info_means,texttype,syn_vbinit,syn_gcb4vb,syn_realis,syn_affneg,sub_explicit,sub_anim,sub_def,sub_num,obj_reflexive,obj_anim,obj_def,obj_num,info_verbroot,verbid,construction,sub_affected,sub_affect2,vb_binyan,vb_passive,vb_particip,vb_parse,gc_sgpl,gc_sameclseq,gc_samesame,gc_parclseq,gc_parsame,info_gcorder,gc,gc_end,gc_def,gc_add,gc_proper,gc_anim,gc_complex,gc_prep,source,era,soc_oral,soc_north,gc2,book2,vb_aspect,obj_def2,gc_samesame2,gc_parsame2,verse,gc3binary,unique
316,315.0,exodus,Ex 01:01,mişrayma:h,to Egypt,narrative,verb not first,GC not before vb,realis,affirm,explicit,animate/extended animate,definite NP,plural/all,not applicable,,,,"b.w.? ""come""",0.0,Intrans MC,affected,affected,G,not passive,one participant,participle,sg,no,,no,,,hey,consonant,definite,no adjuncts,proper noun,inanimate,simple,no prep,Priestly,transitional,more oral like,not yet,hey,exodus,imperfective,,,,prose,0,unique or representative
317,316.0,exodus,Ex 01:19,?alehen,to them,dialogue,verb first,GC not before vb,irrealis,affirm,explicit,animate/extended animate,definite NP,sg/distributive,not applicable,,,,"b.w.? ""come""",0.0,Intrans MC,incompletely aff (irreal),incompletely aff (irreal),G,not passive,one participant,imperfect,pl,no,,no,,,prep,blocked - poss pro suffix,definite,no adjuncts,pronoun,animate,simple,?el,other non-P,CBH,more oral like,not yet,prep or prep plus prep,exodus,imperfective,,,,prose,1,unique or representative


In [27]:
# Preview the first two rows of the Perez goal subset (df_p).
df_p.head(2)

Unnamed: 0,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,sign_info,stem,tense,gcons_clause,subject,complement,cmpl_lex,cmpl_translation,dir_he,cmpl_constr,cmpl_nt,cmpl_anim,cmpl_det,cmpl_indiv,cmpl_complex,motion_type,spatial_arg_type,preposition_1,preposition_2,preposition_3,preposition_4,comments,reconstructed_verse
0,212418,BW>[,MT,Isaiah,1,23,JBW>,FRJK SWRRJM W XBRJ GNBJM KLW >HB CXD W RDP CLM...,,qal,impf,W RJB >LMNH L> JBW> >LJHM,RJB >LMNH,>LJHM,>L,to them,0.0,prep,prps,anim,det,prsf,simple,fictive,goal,>L,,,,"the cause of the widow does not come to them, ...",
2,212564,NHR[,MT,Isaiah,2,2,NHRW,W HJH B >XRJT H JMJM NKWN JHJH HR BJT JHWH B R...,,qal,perf,W NHRW >LJW KL H GWJM,KL H GWJM,>LJW,>L,to it (the mountain),0.0,prep,prps,inanim,det,prsf,simple,factive,goal,>L,,,,,


In [28]:
# Report the number of rows in each filtered dataset.
print("Dataset Medill (Exodus, Deuteronomy, Isaiah) length: ", df_m.shape[0])
print("Dataset Perez (Exodus, Deuteronomy, Isaiah) length:", df_p.shape[0])

Dataset Medill (Exodus, Deuteronomy, Isaiah) length:  364
Dataset Perez (Exodus, Deuteronomy, Isaiah) length: 412


In [29]:
# Inspect the definiteness labels in df_m.gc_def; the comparison column
# gc_def2 is derived from them.
set(df_m["gc_def"])

{'definite', 'not definite'}

In [30]:
# Translate Medill's definiteness labels into the codes found in Perez's
# `cmpl_det` column ('det' / 'und') so the two annotations can be compared.

mapping = {
    'definite': 'det',
    'not definite': 'und',
}

# Create the new column 'gc_def2' using the map function
df_m['gc_def2'] = df_m['gc_def'].map(mapping)

# .map() silently yields NaN for any label missing from `mapping`; fail
# loudly so such rows are not later counted as definiteness mismatches.
assert df_m['gc_def2'].notna().all(), "gc_def contains labels missing from `mapping`"

# Check gc_def against the derived gc_def2: the counts should line up

print(df_m.gc_def.value_counts())
print(df_m.gc_def2.value_counts())

definite        296
not definite     68
Name: gc_def, dtype: int64
det    296
und     68
Name: gc_def2, dtype: int64


In [31]:
# Frequency of each determination code in the Perez goals.
print(df_p["cmpl_det"].value_counts())

det    391
und     21
Name: cmpl_det, dtype: int64


In [32]:
# Create the key columns (book, chapter, verse_num) needed to align the two datasets verse by verse

In [33]:
# Distinct book names currently present in df_m.
set(df_m["book"])

{'deuteronomy', 'exodus', 'isaiah'}

In [34]:
# Dictionary for mapping Medill's lower-case book names onto the
# capitalised spelling used in the Perez `book` column.

mapping_book = {
    'deuteronomy': 'Deuteronomy',
    'exodus': 'Exodus',
    'isaiah': 'Isaiah',
}

# Normalise the existing 'book' column in place.
# `.replace` (unlike `.map`) leaves values that are not dictionary keys
# untouched, so re-running this cell is a no-op instead of turning the
# already-capitalised names into NaN.
df_m['book'] = df_m['book'].replace(mapping_book)

In [36]:
# Create the columns chapter and verse_num from info_refer

def extract_chapter_verse(info_refer):
    """Extract the chapter and verse numbers from a reference string.

    The first ``<digits>:<digits>`` pair found anywhere in the string is
    used, so both ``"Ex 01:01"`` and references whose book label contains
    a space or a leading number (e.g. ``"1 Sam 02:03"``) are handled.

    Parameters
    ----------
    info_refer : str
        Reference such as ``"Ex 01:01"``.

    Returns
    -------
    tuple
        ``(chapter, verse)`` as ints, or ``(None, None)`` when the value
        is not a string or contains no ``chapter:verse`` pair.
    """
    # Missing cells (None / NaN) cannot be searched; treat them as unparsable.
    if not isinstance(info_refer, str):
        return None, None
    match = re.search(r'(\d+):(\d+)', info_refer)
    if match is None:
        return None, None
    chapter, verse = match.groups()
    # int() drops the zero padding ("01" -> 1).
    return int(chapter), int(verse)

# Apply the function to create new columns.
# The columns are built from the (chapter, verse) pairs one at a time:
# unlike unpacking `zip(*pairs)`, this also works when df_m has no rows.
chapter_verse_pairs = df_m['info_refer'].apply(extract_chapter_verse)
df_m['chapter'] = [pair[0] for pair in chapter_verse_pairs]
df_m['verse_num'] = [pair[1] for pair in chapter_verse_pairs]

# Create a list of columns in the desired order
desired_columns = ['id', 'book', 'chapter', 'verse_num'] + [col for col in df_m.columns if col not in ['id','book', 'chapter', 'verse_num']]

# Reorder the dataframe columns; .copy() keeps df_m an independent frame
# (as in the earlier filtering cells) so later column assignments are safe.
df_m = df_m[desired_columns].copy()

# Display the reordered dataframe
df_m.head(5)

Unnamed: 0,id,book,chapter,verse_num,info_refer,info_reads,info_means,texttype,syn_vbinit,syn_gcb4vb,syn_realis,syn_affneg,sub_explicit,sub_anim,sub_def,sub_num,obj_reflexive,obj_anim,obj_def,obj_num,info_verbroot,verbid,construction,sub_affected,sub_affect2,vb_binyan,vb_passive,vb_particip,vb_parse,gc_sgpl,gc_sameclseq,gc_samesame,gc_parclseq,gc_parsame,info_gcorder,gc,gc_end,gc_def,gc_add,gc_proper,gc_anim,gc_complex,gc_prep,source,era,soc_oral,soc_north,gc2,book2,vb_aspect,obj_def2,gc_samesame2,gc_parsame2,verse,gc3binary,unique,gc_def2
316,315.0,Exodus,1,1,Ex 01:01,mişrayma:h,to Egypt,narrative,verb not first,GC not before vb,realis,affirm,explicit,animate/extended animate,definite NP,plural/all,not applicable,,,,"b.w.? ""come""",0.0,Intrans MC,affected,affected,G,not passive,one participant,participle,sg,no,,no,,,hey,consonant,definite,no adjuncts,proper noun,inanimate,simple,no prep,Priestly,transitional,more oral like,not yet,hey,exodus,imperfective,,,,prose,0,unique or representative,det
317,316.0,Exodus,1,19,Ex 01:19,?alehen,to them,dialogue,verb first,GC not before vb,irrealis,affirm,explicit,animate/extended animate,definite NP,sg/distributive,not applicable,,,,"b.w.? ""come""",0.0,Intrans MC,incompletely aff (irreal),incompletely aff (irreal),G,not passive,one participant,imperfect,pl,no,,no,,,prep,blocked - poss pro suffix,definite,no adjuncts,pronoun,animate,simple,?el,other non-P,CBH,more oral like,not yet,prep or prep plus prep,exodus,imperfective,,,,prose,1,unique or representative,det
318,317.0,Exodus,1,22,Ex 01:22,hay?ora:h,to the Nile,dialogue,verb not first,GC before vb,irrealis,affirm,not explicit,animate/extended animate,not explicit,plural/all,not reflexive,animate/extended animate,definite NP,sg/distributive,"sh.l.k ""throw""",67.0,Caused Motion,incompletely aff (irreal),incompletely aff (irreal),C,not passive,more than one,imperfect,sg,no,,no,,,hey,consonant,definite,no adjuncts,common noun,inanimate,simple,no prep,other non-P,CBH,more oral like,not yet,hey,exodus,imperfective,def NP,,,prose,0,unique or representative,det
319,318.0,Exodus,2,10,Ex 02:10,le-bat par'o,to P's daughter,narrative,verb first,GC not before vb,realis,affirm,not explicit,animate/extended animate,not explicit,sg/distributive,not reflexive,animate/extended animate,pronoun,sg/distributive,"b.w.? ""come""",0.0,Leading (non-coercive),affected,affected,C,not passive,more than one,pret wayyiqtol,sg,no,,no,,,prep,consonant,definite,no adjuncts,common noun,animate,complex,l,other non-P,CBH,more oral like,not yet,prep or prep plus prep,exodus,perfective,pronoun,,,prose,1,unique or representative,det
320,319.0,Exodus,2,11,Ex 02:11,?el ?ehayv,to his brothers,narrative,verb first,GC not before vb,realis,affirm,not explicit,animate/extended animate,not explicit,sg/distributive,not applicable,,,,"y.ts.? ""went forth""",77.0,Intrans MC,affected,affected,G,not passive,one participant,pret wayyiqtol,pl,no,,no,,,prep,blocked - poss pro suffix,definite,no adjuncts,common noun,animate,complex,?el,other non-P,CBH,more oral like,not yet,prep or prep plus prep,exodus,perfective,,,,prose,1,unique or representative,det


In [53]:
# Comparing definiteness annotations

# Merge the DataFrames on 'book', 'chapter', and 'verse'
merged_df = pd.merge(df_m, df_p, on=['book', 'chapter', 'verse_num'], suffixes=('_m', '_p'))

# Compare the values in 'gc_det2' and 'cmpl_det'
merged_df['det_match'] = merged_df['gc_def2'] == merged_df['cmpl_det']

# Display the rows where the values differ
differences = merged_df[merged_df['det_match'] == False].reset_index()

# Adjust display options
# Display all rows
pd.set_option('display.max_rows', None)

# Display the full content of each cell
pd.set_option('display.max_colwidth', None)

# Print the differences
differences[['book', 'chapter', 'verse_num', 'gc_def2', 'cmpl_det', "info_reads", "info_means", "cmpl_translation", "cmpl_lex"]]

Unnamed: 0,book,chapter,verse_num,gc_def2,cmpl_det,info_reads,info_means,cmpl_translation,cmpl_lex
0,Exodus,3,8,und,det,?el ?erets tobah varaxabah,to a good and broad land,"to the good and broad land, to the land flowing with milk and honey, to the place of the Canaanites and the Hittites and the Amorites and the Perizzites and the Hivites and the Jebusites",>L >RY/ VWB/ W RXB/ >L >RY/ ZWB[ XLB/ W DBC/ >L MQWM/ H KN<NJ/ W H XTJ/ W H >MRJ/ W H PRZJ/ W H XWJ/ W H JBWSJ/
1,Exodus,3,8,und,det,?el ?erets zbat xalab devash,to a land flowing with milk and honey,"to the good and broad land, to the land flowing with milk and honey, to the place of the Canaanites and the Hittites and the Amorites and the Perizzites and the Hivites and the Jebusites",>L >RY/ VWB/ W RXB/ >L >RY/ ZWB[ XLB/ W DBC/ >L MQWM/ H KN<NJ/ W H XTJ/ W H >MRJ/ W H PRZJ/ W H XWJ/ W H JBWSJ/
2,Exodus,3,17,und,det,?el ?erets zbat xalab devash,to a land flowing with milk and honey,"to the land of the Canaanites and the Hittites and the Amorites and the Perizzites and the Jebusites, a land flowing with milk and honey",>L >RY/ H KN<NJ/ W H XTJ/ W H >MRJ/ W H PRZJ/ W H XWJ/ W H JBWSJ/ >L >RY/ ZWB[ XLB/ W DBC/
3,Exodus,9,23,und,det,?arşa:h,to ground,toward the earth,>RY/
4,Exodus,9,33,und,det,?arşa:h,to ground,toward/on the earth,>RY/
5,Exodus,10,26,und,det,ša:mma:h,thither,there,CM
6,Exodus,14,20,und,det,?el zeh,to that,to this (place),>L ZH
7,Exodus,16,35,und,det,?el ?erets noshabet,to a liveable land,to the border of the land of Canaan,>L QYH=/ >RY/ KN<N/
8,Exodus,16,35,det,und,?el qetsey ?erets kenaaan,to the border of the land of Canaan,to an inhabited land,>L >RY/ JCB[
9,Exodus,21,13,und,det,ša:mma:h,thither,there,CM
