In [50]:
import pandas as pd

In [51]:
# Load the annotated datasets for three books: Isaiah, Deuteronomy and Exodus.
# NaN cells are replaced with empty strings right after each file is read.
isaiah_csv = "../../1_annotation_tools/data/biblical_datasets/isaiah/annotation_df_history/isaiah_dataset.csv"
deuteronomy_csv = "../../1_annotation_tools/data/biblical_datasets/deuteronomy/annotation_df_history/Deuteronomy_dataset.csv"
exodus_csv = "../../1_annotation_tools/data/biblical_datasets/exodus/annotation_df_history/Exodus_dataset.csv"

df1 = pd.read_csv(isaiah_csv).fillna("")
df2 = pd.read_csv(deuteronomy_csv).fillna("")
df3 = pd.read_csv(exodus_csv).fillna("")

In [52]:
# Show every column when a dataframe is displayed (None removes the column limit)
pd.options.display.max_columns = None

In [53]:
# Stack the three per-book frames row-wise into one working dataframe.
# Row labels are not renumbered, so index values can repeat across books.
frames = [df1, df2, df3]
df = pd.concat(objs=frames, axis=0, ignore_index=False)

In [54]:
# Number of rows in df
df.shape[0]

3444

In [55]:
# Restrict the dataframe to the part under study: Masoretic Text rows whose verb
# has a complement, leaving out rows annotated "anim inanim" and rows whose
# first preposition is MN.
is_masoretic = df.scroll == "MT"
has_complement = df.complement != "no complement"
not_anim_inanim = df.cmpl_anim != "anim inanim"
first_prep_not_mn = df.preposition_1 != "MN"

df = df[is_masoretic & has_complement & not_anim_inanim & first_prep_not_mn]

# Number of rows left after filtering
df.shape[0]

659

In [56]:
# Add a new variable: construction (prepositional vs non-prepositional, 'prep' / 'non-prep')

# Mapping from each annotated complement construction (cmpl_constr) to its class.
# Both spellings of the directional-he label ('dir-he' and 'dir_he') are listed.
mapping = {
    'dir-he': 'non-prep',
    'dir_he': 'non-prep',
    'vc': 'non-prep',
    'prep': 'prep',
    'prep + dir-he': 'prep',
    'prep + prep': 'prep',
    'prep + prep + prep': 'prep',
}

# Create the new column 'construction'.
# Series.map leaves any cmpl_constr value that is absent from the dictionary as NaN.
df['construction'] = df['cmpl_constr'].map(mapping)

# Verification: print which cmpl_constr values ended up in each class
for construction_label in ("prep", "non-prep"):
    members = set(df.loc[df.construction == construction_label, "cmpl_constr"])
    print(f"Set of cmpl_constr for {construction_label} construction: ", members)

Set of cmpl_constr for prep construction:  {'prep + dir-he', 'prep', 'prep + prep + prep', 'prep + prep'}
Set of cmpl_constr for non-prep construction:  {'vc', 'dir-he', 'dir_he'}


In [57]:
# Print the distinct values taken by each variable of interest
variables_of_interest = (
    ("animacy", df.cmpl_anim),       # Animacy
    ("definiteness", df.cmpl_det),   # Definiteness
)

for variable_name, column in variables_of_interest:
    print(f"Values for complement {variable_name} variables: ", set(column))

Values for complement animacy variables:  {'anim', 'inanim'}
Values for complement definiteness variables:  {'det', 'und'}


In [58]:
# Sanity check: show rows whose construction is neither "prep" nor "non-prep"
# (NaN rows are included); an empty result means every row was classified.
df[~df.construction.isin(["prep", "non-prep"])]

Unnamed: 0,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,sign_info,stem,tense,gcons_clause,subject,complement,cmpl_lex,cmpl_translation,dir_he,cmpl_constr,cmpl_nt,cmpl_anim,cmpl_det,cmpl_indiv,cmpl_complex,motion_type,preposition_1,preposition_2,preposition_3,preposition_4,comments,reconstructed_verse,construction


In [59]:
# Write df to CSV, leaving out the row index
output_csv = "data/sbl_grouped_cmpls.csv"
df.to_csv(output_csv, index=False)