In [22]:
import pandas as pd

In [23]:
# Lift pandas' display truncation: show every column and every row of a frame
for option_name in ("display.max_columns", "display.max_rows"):
    pd.set_option(option_name, None)

### 1. Datasets

In [24]:
# Load the combined dataset, then replace every missing cell with an empty string
df = pd.read_csv("data/datasets_to_check/combined_datasets/combined_datasets.csv")
df = df.fillna("")

In [25]:
# Show the column labels of the combined dataset
df.columns

Index(['verb_id', 'lex', 'scroll', 'book', 'chapter', 'verse_num',
       'gcons_verb', 'gcons_verse', 'sign_info', 'stem', 'tense',
       'gcons_clause', 'subject', 'complement', 'cmpl_lex', 'cmpl_translation',
       'dir_he', 'cmpl_constr', 'cmpl_nt', 'cmpl_anim', 'cmpl_det',
       'cmpl_indiv', 'cmpl_complex', 'motion_type', 'spatial_arg_type',
       'preposition_1', 'preposition_2', 'preposition_3', 'preposition_4',
       'preposition_5', 'preposition_6', 'comments', 'reconstructed_verse',
       'Study_Edition'],
      dtype='object')

In [26]:
# Load the per-verse genre/language table, then replace every missing cell with an empty string
df1 = pd.read_csv("data/verses_genre/verses_genre_structured_data.csv")
df1 = df1.fillna("")

In [27]:
# Show the column labels of the genre/language table
df1.columns

Index(['book_nr', 'book_name', 'chapter_nr', 'chapter_genre', 'verse_nr',
       'verse_genre', 'verse_language'],
      dtype='object')

### 2. Harmonise the book names from df1

In [28]:
# Distinct book names as they are spelled in df1
set(df1['book_name'])

{'Amos',
 'Canticum',
 'Chronica_I',
 'Chronica_II',
 'Daniel',
 'Deuteronomium',
 'Ecclesiastes',
 'Esra',
 'Esther',
 'Exodus',
 'Ezechiel',
 'Genesis',
 'Habakuk',
 'Haggai',
 'Hosea',
 'Iob',
 'Jeremia',
 'Jesaia',
 'Joel',
 'Jona',
 'Josua',
 'Judices',
 'Leviticus',
 'Maleachi',
 'Micha',
 'Nahum',
 'Nehemia',
 'Numeri',
 'Obadia',
 'Proverbia',
 'Psalmi',
 'Reges_I',
 'Reges_II',
 'Ruth',
 'Sacharia',
 'Samuel_I',
 'Samuel_II',
 'Threni',
 'Zephania'}

In [29]:
# Peek at the first row of df
df.iloc[:1]

Unnamed: 0,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,sign_info,stem,tense,gcons_clause,subject,complement,cmpl_lex,cmpl_translation,dir_he,cmpl_constr,cmpl_nt,cmpl_anim,cmpl_det,cmpl_indiv,cmpl_complex,motion_type,spatial_arg_type,preposition_1,preposition_2,preposition_3,preposition_4,preposition_5,preposition_6,comments,reconstructed_verse,Study_Edition
0,206,GLL[,1QH,1QH,3,9,HTGWLLTJ,KJ B NDH HTGWLLTJ W M SWD RMH Y>TJ W L> NLWJTJ...,,hit,perf,KJ B NDH HTGWLLTJ,,B NDH,B NDH,in impurity,0.0,prep,other,inanim,und,subs,simple,fictive,location,B,,,,,,,,p148 col 4 19


In [30]:
# Peek at the first row of df1
df1.iloc[:1]

Unnamed: 0,book_nr,book_name,chapter_nr,chapter_genre,verse_nr,verse_genre,verse_language
0,1,Genesis,1,prose,1,prose,Hebrew


In [31]:
#set(df1.book_name)

In [32]:
# Book-name correspondence between the two datasets:
# key   = book name in the spelling used by df
# value = book name in the spelling used by df1
book_dict = {
    "1_Chronicles": "Chronica_I",
    "1_Kings": "Reges_I",
    "1_Samuel": "Samuel_I",
    "2_Chronicles": "Chronica_II",
    "2_Kings": "Reges_II",
    "2_Samuel": "Samuel_II",
    "Amos": "Amos",
    "Daniel": "Daniel",
    "Deuteronomy": "Deuteronomium",
    "Ecclesiastes": "Ecclesiastes",
    "Esther": "Esther",
    "Exodus": "Exodus",
    "Ezekiel": "Ezechiel",
    "Ezra": "Esra",
    "Genesis": "Genesis",
    "Habakkuk": "Habakuk",
    "Haggai": "Haggai",
    "Hosea": "Hosea",
    "Isaiah": "Jesaia",
    "Jeremiah": "Jeremia",
    "Job": "Iob",
    "Joel": "Joel",
    "Jonah": "Jona",
    "Joshua": "Josua",
    "Judges": "Judices",
    "Lamentations": "Threni",
    "Leviticus": "Leviticus",
    "Malachi": "Maleachi",
    "Micah": "Micha",
    "Nahum": "Nahum",
    "Nehemiah": "Nehemia",
    "Numbers": "Numeri",
    "Obadiah": "Obadia",
    "Proverbs": "Proverbia",
    "Psalms": "Psalmi",
    "Ruth": "Ruth",
    "Song_of_songs": "Canticum",
    "Zechariah": "Sacharia",
    "Zephaniah": "Zephania",
}

In [33]:
# Invert book_dict so that df1's spelling becomes the lookup key
reverse_map = {df1_name: df_name for df_name, df1_name in book_dict.items()}

In [34]:
# Prepare the book-name column of df and df1 for the modifications below:
# remove leading/trailing whitespace from every name
for frame, name_col in ((df, 'book'), (df1, 'book_name')):
    frame[name_col] = frame[name_col].str.strip()

In [35]:
# Translate df1's book names through reverse_map into a new harmonized_name column
df1['harmonized_name'] = df1['book_name'].map(reverse_map)

# Names absent from reverse_map come out as NaN; list them so gaps are visible
unmapped_rows = df1['harmonized_name'].isna()
missing = df1.loc[unmapped_rows, 'book_name'].unique()
print(f"Books not found in mapping: {missing}")

Books not found in mapping: []


In [36]:
#df1[df1.book_name == "Samuel_II"]

In [37]:
# Cast all chapter and verse numbers to nullable integers to facilitate the merging

def _to_nullable_int(values):
    """Parse values as numbers (unparseable entries become missing) and cast to Int64."""
    return pd.to_numeric(values, errors='coerce').astype('Int64')

# df
df['chapter'] = _to_nullable_int(df['chapter'])
df['verse_num'] = _to_nullable_int(df['verse_num'])

# df1
df1['chapter_nr'] = _to_nullable_int(df1['chapter_nr'])
df1['verse_nr'] = _to_nullable_int(df1['verse_nr'])

### 3. Add the genre and language information to df

In [38]:
# Attach verse_genre and verse_language to every row of df by matching on
# (book, chapter, verse). how='left' keeps all rows of df; rows without a
# matching verse in df1 get missing values in the two new columns.
n_rows_before_merge = len(df)

df_with_genre = df.merge(
    df1[['harmonized_name', 'chapter_nr', 'verse_nr', 'verse_genre', 'verse_language']],
    how='left',
    left_on=['book', 'chapter', 'verse_num'],
    right_on=['harmonized_name', 'chapter_nr', 'verse_nr']
)

# A left merge repeats a row of df once per matching row of df1, so duplicate
# (book, chapter, verse) keys in df1 would silently inflate the dataset.
# Check before overwriting df so a failure leaves the original frame intact.
assert len(df_with_genre) == n_rows_before_merge, (
    f"merge changed the row count from {n_rows_before_merge} to {len(df_with_genre)}: "
    "df1 contains duplicate (book, chapter, verse) keys"
)

df = df_with_genre

In [39]:
# The right-hand join keys duplicate book / chapter / verse_num, so remove them
join_key_cols = ['harmonized_name', 'chapter_nr', 'verse_nr']
df = df.drop(columns=join_key_cols)

In [40]:
# Temporary genre/language for Qumran scrolls:
# rows whose scroll is one of the three listed scrolls get fixed placeholder values

mask_qumran = df['scroll'].isin(['1QS', '1QH', '1QM'])

for target_col, placeholder in (('verse_genre', 'Qumran'), ('verse_language', 'Hebrew')):
    df.loc[mask_qumran, target_col] = placeholder

In [41]:
# Create a new book_2 column in which the two parts of Samuel, Kings and
# Chronicles share one name; every other book keeps the name it has in 'book'

# Collapse split books into unified names
book2_map = {
    **dict.fromkeys(('1_Samuel', '2_Samuel'), 'Samuel'),
    **dict.fromkeys(('1_Kings', '2_Kings'), 'Kings'),
    **dict.fromkeys(('1_Chronicles', '2_Chronicles'), 'Chronicles'),
}

# Values not listed in book2_map pass through replace() unchanged
df['book_2'] = df['book'].replace(book2_map)

In [43]:
# Add divide between 1st and 2nd Isaiah: chapters 1-39 vs. chapters 40-66

is_isaiah = df['book'] == 'Isaiah'

# Both masks are computed before 'book' is overwritten
mask_isaiah_1 = is_isaiah & (df['chapter'].between(1, 39))
mask_isaiah_2 = is_isaiah & (df['chapter'].between(40, 66))

df.loc[mask_isaiah_1, 'book'] = '1_Isaiah'
df.loc[mask_isaiah_2, 'book'] = '2_Isaiah'

In [45]:
#set(df.book)

### 4. Add era / style information

In [49]:
# Era/style label for each value of the 'book' column.
# Keys cover the three Qumran scrolls, the biblical books, and the two halves
# of Isaiah ('1_Isaiah' / '2_Isaiah') instead of a single 'Isaiah' entry.
# NOTE(review): QH / CBH / TBH / LBH presumably stand for Qumran, Classical,
# Transitional and Late (Biblical) Hebrew — confirm and spell out here.

classical_era_dict = {
    "1QH": "QH",
    "1QS": "QH",
    "1QM": "QH",
    "1_Chronicles": "LBH",
    "1_Isaiah": "CBH",
    "1_Kings": "CBH",
    "1_Samuel": "CBH",
    "2_Chronicles": "LBH",
    "2_Isaiah": "TBH",
    "2_Kings": "CBH",
    "2_Samuel": "CBH",
    "Amos": "CBH",
    "Daniel": "LBH",
    "Deuteronomy": "CBH",
    "Ecclesiastes": "debated",
    "Esther": "LBH",
    "Exodus": "CBH",
    "Ezekiel": "TBH",
    "Ezra": "LBH",
    "Genesis": "CBH",
    "Habakkuk": "CBH",
    "Haggai": "TBH",
    "Hosea": "CBH",
    "Jeremiah": "TBH",
    "Job": "debated",
    "Joel": "debated",
    "Jonah": "debated",
    "Joshua": "CBH",
    "Judges": "CBH",
    "Lamentations": "TBH",
    "Leviticus": "CBH",
    "Malachi": "TBH",
    "Micah": "CBH",
    "Nahum": "CBH",
    "Nehemiah": "LBH",
    "Numbers": "CBH",
    "Obadiah": "CBH",
    "Proverbs": "debated",
    "Psalms": "debated",
    "Ruth": "debated",
    "Song_of_songs": "debated",
    "Zechariah": "TBH",
    "Zephaniah": "CBH",
}

In [50]:
# Add the era_style info: look up each row's label from its 'book' value
# (books that are not keys of classical_era_dict get NaN)
era_labels = df['book'].map(classical_era_dict)
df['era_style'] = era_labels

In [53]:
# Check whether some books were missed by the previous mapping:
# list the distinct 'book' values of rows that have no era_style

no_era_mask = df['era_style'].isna()
df.loc[no_era_mask, 'book'].unique()

array([], dtype=object)

Unnamed: 0,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,sign_info,stem,tense,gcons_clause,subject,complement,cmpl_lex,cmpl_translation,dir_he,cmpl_constr,cmpl_nt,cmpl_anim,cmpl_det,cmpl_indiv,cmpl_complex,motion_type,spatial_arg_type,preposition_1,preposition_2,preposition_3,preposition_4,preposition_5,preposition_6,comments,reconstructed_verse,Study_Edition,verse_genre,verse_language,book_2,era_style
7549,212256,BW>[,MT,1_Isaiah,1,12,TB>W,KJ TB>W L R>WT PNJ MJ BQC Z>T M JDKM RMS XYRJ,,qal,impf,KJ TB>W,,no complement,no complement,,,,,,,,,,,,,,,,,,,,prophetic,Hebrew,Isaiah,CBH
7550,1895059,BW>[,1Qisaa,1_Isaiah,1,12,TB>W,KJ> TB>W L R>WT PNJ MJ BQC ZW>T M JDKM L RMWS ...,000 0000 0 0000 000 00 000 0000 0 0000 0 0000 ...,qal,impf,KJ> TB>W,,no complement,no complement,,0.0,,,,,,,,,,,,,,,,KJ> TB>W L R>WT PNJ MJ BQC ZW>T M JDKM L RMWS ...,,prophetic,Hebrew,Isaiah,CBH
7551,212269,BW>[,MT,1_Isaiah,1,13,HBJ>,L> TWSJPW HBJ> MNXT CW> QVRT TW<BH HJ> LJ XDC ...,,hif,infc,HBJ> MNXT CW>,,no complement,no complement,,,,,,,,,,,,,,,,,,,,prophetic,Hebrew,Isaiah,CBH
7552,1895075,BW>[,1Qisaa,1_Isaiah,1,13,HBJ>,LW> TWSJPW L HBJ> MNXT CW> QVRT TW<BH HJ> LJ X...,000 000000 0 0000 0000 000 0000 00000 000 00 0...,hifil,infc,L HBJ> MNXT CW>,,no complement,no complement,,0.0,,,,,,,,,,,,,,,,LW> TWSJPW L HBJ> MNXT CW> QVRT TW<BH HJ> LJ X...,,prophetic,Hebrew,Isaiah,CBH


In [54]:
# Rename and reorganise the book columns

# Rename: 'book' becomes 'text_unit', 'book_2' becomes 'book_canonical'
df = df.rename(columns={'book': 'text_unit', 'book_2': 'book_canonical'})

# Reorganise: move 'book_canonical' so that it sits directly before 'text_unit'
cols = df.columns.tolist()
canonical_col = cols.pop(cols.index('book_canonical'))
cols.insert(cols.index('text_unit'), canonical_col)

df = df[cols]


In [55]:
# Preview the first five rows of the final frame (5 is head()'s default)
df.head()

Unnamed: 0,verb_id,lex,scroll,book_canonical,text_unit,chapter,verse_num,gcons_verb,gcons_verse,sign_info,stem,tense,gcons_clause,subject,complement,cmpl_lex,cmpl_translation,dir_he,cmpl_constr,cmpl_nt,cmpl_anim,cmpl_det,cmpl_indiv,cmpl_complex,motion_type,spatial_arg_type,preposition_1,preposition_2,preposition_3,preposition_4,preposition_5,preposition_6,comments,reconstructed_verse,Study_Edition,verse_genre,verse_language,era_style
0,206,GLL[,1QH,1QH,1QH,3,9,HTGWLLTJ,KJ B NDH HTGWLLTJ W M SWD RMH Y>TJ W L> NLWJTJ...,,hit,perf,KJ B NDH HTGWLLTJ,,B NDH,B NDH,in impurity,0.0,prep,other,inanim,und,subs,simple,fictive,location,B,,,,,,,,p148 col 4 19,Qumran,Hebrew,QH
1,354,NPL[,1QH,1QH,1QH,4,1,HTNPL,MZMWR L MFKJL L HTNPL L PNJ >L M<FJ >L W L HBJ...,,hit,infc,L HTNPL L PNJ >L,,L PNJ >L,L PNH >L==,,0.0,prep,,,und,subs,complex,,,L,,,,,,reconstructed,,p150 col 5 1,Qumran,Hebrew,QH
2,376,HLK[,1QH,1QH,1QH,4,1,HTHLKW,MZMWR L MFKJL L HTNPL L PNJ >L M<FJ >L W L HBJ...,,hit,perf,HTHLKW,,no complement,no complement,,,,,,,,,,,,,,,,,,,,Qumran,Hebrew,QH
3,447,GLH[,1QH,1QH,1QH,4,4,GLJTH,>TH HW>L QDWCJM W B RZJ PL> K HWD< BWR KBWD K ...,,qal,perf,>TH GLJTH DRKJ >MT W M<FJ R< XWKMH W >WLT,>TH,no complement,no complement,,,,,,,,,,,,,,,,,,,,Qumran,Hebrew,QH
4,709,CWB[,1QH,1QH,1QH,4,19,JCWB,KJ W DBR K L> JCWB >XWR,,qal,impf,W DBR K L> JCWB >XWR,DBR K,>XWR,>XWR,backwards,0.0,vc,direction,inanim,und,adv,simple,fictive,trajectory,,,,,,,,,p150 col 5 24,Qumran,Hebrew,QH


### 5. Save the new dataset

In [56]:
# Write the enriched dataset to disk without the positional index column
df.to_csv(
    path_or_buf="data/datasets_to_check/dataset_with_genre_era.csv",
    index=False,
)