In [2]:
# This code has been generated using ChatGPT (GPT 5) - September 2025

In [3]:
import pandas as pd

In [25]:
df = pd.read_csv("data/datasets_to_check/combined_datasets/combined_datasets.csv").fillna("")

In [36]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [8]:
def classify_complement(row):
    """Check whether the complement in a verse is present, fully reconstructed, or partially reconstructed."""
    gv  = row.get('gcons_verse')
    si  = row.get('sign_info')
    comp = row.get('complement')

    # Coerce NaNs to empty strings
    gv  = "" if pd.isna(gv)  else str(gv)
    si  = "" if pd.isna(si)  else str(si)
    comp = "" if pd.isna(comp) else str(comp)

    gv_ns   = ''.join(gv.split())
    si_ns   = ''.join(si.split())
    comp_ns = ''.join(comp.split()).lower()

    # Skip if sign_info is empty
    if not si_ns:
        return pd.Series([pd.NA, pd.NA, pd.NA, 'skipped_empty_sign_info'],
                         index=['comp_start','comp_end','comp_sign_info','comp_status'])

    # Skip if complement is empty or "no complement"
    if not comp_ns or comp_ns == 'nocomplement':
        return pd.Series([pd.NA, pd.NA, pd.NA, 'skipped_no_complement'],
                         index=['comp_start','comp_end','comp_sign_info','comp_status'])

    # Length mismatch safeguard
    if len(gv_ns) != len(si_ns):
        return pd.Series([pd.NA, pd.NA, pd.NA, 'length_mismatch'],
                         index=['comp_start','comp_end','comp_sign_info','comp_status'])

    # Find complement in the verse (first occurrence)
    start = gv_ns.find(comp_ns)
    if start == -1:
        return pd.Series([pd.NA, pd.NA, pd.NA, 'not_found'],
                         index=['comp_start','comp_end','comp_sign_info','comp_status'])

    end = start + len(comp_ns)
    comp_si = si_ns[start:end]

    # Classify
    if set(comp_si) == {'0'}:
        status = 'present'
    elif set(comp_si) == {'1'}:
        status = 'fully_reconstructed'
    else:
        status = 'partially_reconstructed'

    return pd.Series([start, end - 1, comp_si, status],
                     index=['comp_start','comp_end','comp_sign_info','comp_status'])


In [11]:
def check_comment_agreement(row):
    """Verify whether the manual comment about reconstruction agrees with the automatic classification."""
    status = row.get('comp_status')
    comment = row.get('comments')
    comment = "" if pd.isna(comment) else str(comment).lower()

    if status in ['skipped_empty_sign_info', 'skipped_no_complement']:
        return 'not_applicable'

    if status in ['fully_reconstructed', 'partially_reconstructed']:
        return 'agree' if 'reconstructed' in comment else 'disagree'
    elif status == 'present':
        return 'agree' if 'reconstructed' not in comment else 'disagree'
    else:
        return 'not_applicable'


In [12]:
df[['comp_start','comp_end','comp_sign_info','comp_status']] = df.apply(classify_complement, axis=1)
df['comment_agreement'] = df.apply(check_comment_agreement, axis=1)

# Optional: disagreements
disagreements = df[df['comment_agreement'] == 'disagree']
disagreements[['book','chapter','verse_num','complement','comp_status','comments']]

Unnamed: 0,book,chapter,verse_num,complement,comp_status,comments


In [17]:
set(df.comp_status)

{'not_found', 'skipped_empty_sign_info', 'skipped_no_complement'}

In [18]:
import pandas as pd

def classify_complement_row(gcons_verse, sign_info, complement):
    gv_ns = ''.join((gcons_verse or '').split())  # keep < and >
    si_ns = ''.join((sign_info   or '').split())
    comp_ns = ''.join((complement or '').split())

    # basic guards
    if not si_ns or not comp_ns:
        return {'comp_start': pd.NA, 'comp_end': pd.NA, 'comp_sign_info': pd.NA, 'comp_status': 'skipped'}
    if len(gv_ns) != len(si_ns):
        return {'comp_start': pd.NA, 'comp_end': pd.NA, 'comp_sign_info': pd.NA, 'comp_status': 'length_mismatch'}

    # find first occurrence
    start = gv_ns.find(comp_ns)
    if start == -1:
        return {'comp_start': pd.NA, 'comp_end': pd.NA, 'comp_sign_info': pd.NA, 'comp_status': 'not_found'}

    end = start + len(comp_ns)
    comp_si = si_ns[start:end]

    if set(comp_si) == {'0'}:
        status = 'present'
    elif set(comp_si) == {'1'}:
        status = 'fully_reconstructed'
    else:
        status = 'partially_reconstructed'

    return {'comp_start': start, 'comp_end': end-1, 'comp_sign_info': comp_si, 'comp_status': status}

In [19]:
gcons_verse = "B <T H HJ> >BJ> >TKM W B <T QBYJ >TKM KJ >TN >TKM"
sign_info   = "1 11 1 111 1111 1110 0 0 00 0000 0000 00 000 0111"

In [20]:
# Build a small dataframe with the three complements
df_test = pd.DataFrame({
    "gcons_verse": [gcons_verse]*3,
    "sign_info": [sign_info]*3,
    "complement": ["H HJ>", "QBYJ", ">TKM"]
})

In [21]:
# Apply the function row by row
df_results = df_test.apply(lambda r: pd.Series(classify_complement_row(r['gcons_verse'], r['sign_info'], r['complement'])), axis=1)

In [22]:
# Combine back into one dataframe
df_out = pd.concat([df_test, df_results], axis=1)

In [23]:
df_out

Unnamed: 0,gcons_verse,sign_info,complement,comp_start,comp_end,comp_sign_info,comp_status
0,B <T H HJ> >BJ> >TKM W B <T QBYJ >TKM KJ >TN >TKM,1 11 1 111 1111 1110 0 0 00 0000 0000 00 000 0111,H HJ>,3,6,1111,fully_reconstructed
1,B <T H HJ> >BJ> >TKM W B <T QBYJ >TKM KJ >TN >TKM,1 11 1 111 1111 1110 0 0 00 0000 0000 00 000 0111,QBYJ,19,22,0,present
2,B <T H HJ> >BJ> >TKM W B <T QBYJ >TKM KJ >TN >TKM,1 11 1 111 1111 1110 0 0 00 0000 0000 00 000 0111,>TKM,11,14,1110,partially_reconstructed


In [26]:
# Apply the function row by row
df_results = df.apply(lambda r: pd.Series(classify_complement_row(r['gcons_verse'], r['sign_info'], r['complement'])), axis=1)

In [29]:
df_out = pd.concat([df, df_results], axis=1)

In [32]:
df_out[(df_out.scroll != "MT") & (df_out.book == "Deuteronomy")]

Unnamed: 0,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,sign_info,stem,tense,gcons_clause,subject,complement,cmpl_lex,cmpl_translation,dir_he,cmpl_constr,cmpl_nt,cmpl_anim,cmpl_det,cmpl_indiv,cmpl_complex,motion_type,spatial_arg_type,preposition_1,preposition_2,preposition_3,preposition_4,preposition_5,preposition_6,comments,reconstructed_verse,Study_Edition,comp_start,comp_end,comp_sign_info,comp_status
4005,1985801,PNH[,4Q35,Deuteronomy,1,7,PNW,PNW W S<W LKM W B>W HR H >MRJ W >L KL CKNJW B ...,000 0 000 000 0 000 00 1 1111 1 11 11 11111 1 ...,qal,impv,PNW,,no complement,no complement,,,,,,,,,,,,,,,,,,PNW W S<W LKM W B>W HR [H >MRJ W >L KL CKNJW B...,,,,,not_found
4006,1985803,NS<[,4Q35,Deuteronomy,1,7,S<W,PNW W S<W LKM W B>W HR H >MRJ W >L KL CKNJW B ...,000 0 000 000 0 000 00 1 1111 1 11 11 11111 1 ...,qal,impv,W S<W LKM,,LKM,L,for you / to you,0.0,prep,person,anim,det,prsf,simple,factive,other,L,,,,,,,PNW W S<W LKM W B>W HR [H >MRJ W >L KL CKNJW B...,,7,9,000,present
4007,1985806,BW>[,4Q35,Deuteronomy,1,7,B>W,PNW W S<W LKM W B>W HR H >MRJ W >L KL CKNJW B ...,000 0 000 000 0 000 00 1 1111 1 11 11 11111 1 ...,qal,impv,W B>W HR [H >MRJ],,HR [H >MRJ],HR/ H >MRJ/,to the mountain / the hill country of the Amor...,0.0,vc,place,inanim,det,subs,complex,factive,goal,,,,,,,reconstructed,PNW W S<W LKM W B>W HR [H >MRJ W >L KL CKNJW B...,,,,,not_found
4009,1992973,BW>[,4Q45,Deuteronomy,1,8,B>W,R>H NTTJ L PNJKM >T H >RY B>W W RCW >T H >RY >...,111 1111 1 11111 00 1 111 111 1 111 11 1 111 1...,qal,impv,B>W,,no complement,no complement,,,,,,,,,,,,,,,,,reconstructed,[R>H NTTJ L PNJKM] >T [H >RY B>W W RCW >T H >R...,,,,,not_found
4010,1985849,BW>[,4Q35,Deuteronomy,1,8,B>W,R>H NTTJ L PNJKM >T H >RY B>W W RCW >T H >RY >...,111 1111 1 11111 11 1 111 111 1 111 10 0 000 0...,qal,impv,B>W,,no complement,no complement,,,,,,,,,,,,,,,,,reconstructed,[R>H NTTJ L PNJKM >T H >RY B>W W RCW >]T H >RY...,,,,,not_found
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4873,1893598,BW>[,1Q5,Deuteronomy,33,16,TBW>TH,W M MGD >RY W ML>H W RYWN CKNJ SNH TBW>TH L R>...,0 0 000 000 0 0111 1 1111 1111 111 111111 1 11...,qal,impf,TBW>TH L R>C JWSP W L QDQD NZJR >XJW,,L R>C JWSP W L QDQD NZJR >XJW,L R>C/ JWSP/ W L QDQD/ NZJR/ >X/,to the head of Joseph and on the crown of the ...,0.0,prep + prep,body part,inanim,det,subs,complex,fictive,goal,L,L,,,,,reconstructed,W M MGD >RY W M[L>H W RYWN CKNJ SNH TBW>TH L R...,,31,52,1111111111111111111111,fully_reconstructed
4875,1893629,JY>[,1Q5,Deuteronomy,33,18,Y>TK,W L ZBLWN >MR FMX ZBWLN B Y>TK W JFFKR B >HLJK,1 0 00000 001 111 11111 1 1111 1 11111 1 11111,qal,infc,B Y>TK,,no complement,no complement,,,,,,,,,,,,,,,,,reconstructed,[W] L ZBLWN >M[R FMX ZBWLN B Y>TK W JFFKR B >H...,,,,,not_found
4878,1994539,<LH[,4Q45,Deuteronomy,34,1,J<L,W J<L MCH M <RBT MW>B >L HR NBW R>C H PSGH >CR...,1 111 111 1 1111 1111 11 11 111 111 1 1111 111...,qal,wayq,W J<L MCH M <RBT MW>B >L HR NBW R>C H PSGH,MCH,>L HR NBW R>C H PSGH,>L HR/ NBW=/ R>C/ H PSGH/,to mount Nebo to the top of Pisgah,0.0,prep,place,inanim,det,subs,complex,factive,goal,>L,,,,,,reconstructed,[W J<L MCH M <RBT MW>B >L HR NBW R>C H PSGH >C...,,16,30,111111111111111,fully_reconstructed
4879,2105208,<BR[,Mas1c,Deuteronomy,34,4,T<BR,W J>MR JHWH >LJW Z>T H >RY >CR NCB<TJ L >BRHM ...,1 1111 1110 0000 000 1 111 111 111111 1 11111 ...,qal,impf,W CMH L> T<BR,,CMH,CM,there,1.0,dir-he,place,inanim,det,adv,simple,factive,goal,,,,,,,reconstructed,[W J>MR JHW]H >LJW Z>T [H >RY >CR NCB<TJ L >BR...,,74,76,001,partially_reconstructed


In [37]:
def check_comment_agreement_row(status, comments):
    c = "" if pd.isna(comments) else str(comments).lower()
    if status in ('fully_reconstructed', 'partially_reconstructed'):
        return 'agree' if 'reconstructed' in c else 'disagree'
    elif status == 'present':
        return 'agree' if 'reconstructed' not in c else 'disagree'
    else:
        return 'not_applicable'  # for any other status, if it ever appears

# Add agreement column
df_out['comment_agreement'] = df_out.apply(
    lambda r: check_comment_agreement_row(r['comp_status'], r['comments']),
    axis=1
)

# Show only disagreements
disagreements = df_out[df_out['comment_agreement'] == 'disagree']
disagreements[['complement', 'comp_status', 'comments', 'comp_sign_info', 'comp_start', 'comp_end']]

Unnamed: 0,complement,comp_status,comments,comp_sign_info,comp_start,comp_end
1057,>RYH,present,reconstructed,0,38,41
1109,LW,present,reconstructed,0,32,33
1286,L MQWMW,partially_reconstructed,diff from MT,1,72,77
1538,B<D H XLN,present,reconstructed,0,14,20
3015,HNH,present,reconstructed?,0,59,61
3894,<L XD MN Q>MJ >,present,reconstructed?,0,4,14
4063,H HRH,present,reconstructed,0,41,44
4082,DRK MDBR MW>B,partially_reconstructed,,1111,10,20
4107,BW,fully_reconstructed,,11,15,16
4174,M >RY MYRJM M BJT <BDJM,partially_reconstructed,min excluded,11110000000000,23,40


In [34]:
df_out['comment_agreement'].value_counts(dropna=False)

not_applicable    12261
agree              1303
disagree             73
Name: comment_agreement, dtype: int64

In [39]:
len(df) == (df_out['comment_agreement'] == 'agree').sum() \
          + (df_out['comment_agreement'] == 'disagree').sum() \
          + (df_out['comment_agreement'] == 'not_applicable').sum()

True

In [43]:
# Get the list of indices
disagreement_indices = disagreements.index.tolist()

str(disagreement_indices)

'[1057, 1109, 1286, 1538, 3015, 3894, 4063, 4082, 4107, 4174, 4176, 4179, 4305, 4362, 4366, 4373, 4812, 5185, 5186, 5263, 5294, 5307, 5318, 5345, 5351, 5390, 5392, 5397, 5423, 5430, 5475, 5521, 5524, 5533, 5545, 5546, 5563, 5568, 5585, 5610, 5671, 5748, 6027, 6335, 6594, 6737, 6805, 7207, 7510, 7584, 7672, 7673, 7789, 8434, 8444, 8445, 8446, 8624, 8775, 8935, 8945, 9091, 9100, 9126, 9978, 9983, 10038, 11093, 11577, 11611, 12894, 13183, 13242]'