In [33]:
import glob
import math
import os

import numpy as np
import pandas as pd

In [2]:
# Never truncate wide frames: show every column when a DataFrame is displayed
pd.options.display.max_columns = None

In [7]:
# Load the full verb dataset; missing cells become empty strings
df = pd.read_csv("data/all_verbs_bhsa_bib_dss.csv").fillna("")

# Normalise the two flag columns to text, rewriting the float-looking
# values "1.0" / "0.0" as "1" / "0" (any other value is left as it is)
for flag_column in ("dir_he", "dir_he_dss"):
    df[flag_column] = df[flag_column].astype(str).replace({"1.0": "1", "0.0": "0"})

# Order rows by book, chapter and verse (ascending), then by scroll
# (descending), and renumber the index from 0
df = df.sort_values(
    by=["book", "chapter", "verse_num", "scroll"],
    ascending=[True, True, True, False],
    ignore_index=True,
)

In [16]:
# Display the column labels of the loaded dataset
df.columns

Index(['verb_id', 'lex', 'scroll', 'book', 'chapter', 'verse_num',
       'gcons_verb', 'gcons_verse', 'gcons_clause', 'subject', 'complement',
       'dir_he', 'cmpl_constr', 'cmpl_nt', 'cmpl_anim', 'cmpl_det',
       'cmpl_indiv', 'dir_he_dss', 'sign_info', 'stem', 'tense',
       'preposition_1', 'preposition_2', 'preposition_3', 'preposition_4',
       'preposition_5', 'preposition_6', 'preposition_7'],
      dtype='object')

In [15]:
# Count the distinct values of the `book` column in the dataset
df["book"].nunique()

60

In [19]:
# Create one dataset per book (containing both BHSA and DSS rows)

unique_books = df['book'].unique()

# Annotation columns that every per-book dataset must contain.
# Some of these names may already be present in `df`; such columns are kept
# as they are so that existing values are never blanked.
new_columns = [
    "cmpl_lex",
    "cmpl_constr",
    "cmpl_nt",
    "cmpl_anim",
    "cmpl_det",
    "cmpl_indiv",
    "motion_type",
]

# Build one dataset per unique book in the original dataset
for book in unique_books:
    # .copy() so the column assignments below never write into `df`
    book_df = df[df['book'] == book].copy()

    # Add only the columns that are missing, initialised to empty strings
    for column in new_columns:
        if column not in book_df.columns:
            book_df[column] = ''

    # Save the new dataset as a csv file
    #book_df.to_csv(f"data/biblical_datasets/{book}_dataset.csv", index=False)

In [13]:
# Number of entries in `unique_books` (the distinct values of df['book'])
len(unique_books)

60

In [36]:
# Summarise every per-book dataset: its size, how many rows carry a
# complement, and how many days it takes to go through it at 80 and at
# 100 rows per day. A final TOTAL row sums each numeric column.

# Glob pattern matching the per-book CSV files
path = 'data/biblical_datasets/*.csv'
target_column = "complement"
# Values of `target_column` that mean "this row has no complement"
specific_values = ["", "no complement"]
summary_columns = [
    'dataset_name',
    'total_rows',
    'rows_with_complement',
    'number_of_days_80',
    'number_of_days_100',
]


def count_rows_with_complement(frame, column, empty_markers):
    """Return how many rows of `frame` hold a real value in `column`.

    A cell counts as "no value" when it is missing (NaN/None) or equal to one
    of `empty_markers`. The missing-value check matters because `pd.read_csv`
    parses empty cells as NaN, which `isin([""])` alone would not match.

    Parameters:
        frame: DataFrame to inspect.
        column: name of the column to check.
        empty_markers: list of values that stand for "no value".

    Returns:
        int: number of rows with a real value.

    Raises:
        KeyError: if `column` is not a column of `frame`.
    """
    cells = frame[column]
    has_value = cells.notna() & ~cells.isin(empty_markers)
    return int(has_value.sum())


data = []

# sorted(): glob returns files in a filesystem-dependent order
for file in sorted(glob.glob(path)):
    # Deliberately not named `df`: rebinding `df` here would replace the main
    # dataset that the earlier cells of the notebook work on
    dataset_df = pd.read_csv(file)

    # Extract the dataset name, removing path, extension, and optional suffix
    dataset_name = os.path.splitext(os.path.basename(file))[0]
    suffix = '_dataset'
    if dataset_name.endswith(suffix):
        dataset_name = dataset_name[:-len(suffix)]  # Remove the suffix

    total_rows = len(dataset_df)
    rows_with_complement = count_rows_with_complement(
        dataset_df, target_column, specific_values
    )
    # Days needed at 80 / 100 rows per day, rounded up to whole days
    number_of_days_80 = math.ceil(total_rows / 80)
    number_of_days_100 = math.ceil(total_rows / 100)

    data.append({
        'dataset_name': dataset_name,
        'total_rows': total_rows,
        'rows_with_complement': rows_with_complement,
        'number_of_days_80': number_of_days_80,
        'number_of_days_100': number_of_days_100,
    })

# Explicit columns and integer dtypes keep the steps below working even when
# no file matched the pattern
results_df = pd.DataFrame(data, columns=summary_columns).astype(
    {column: 'int64' for column in summary_columns[1:]}
)

# Largest datasets first; the stable sort keeps ties in file-name order
results_df = results_df.sort_values(by=['total_rows'], ascending=False, kind='stable')

# Calculate the totals
total_rows_sum = results_df['total_rows'].sum()
rows_with_complement_sum = results_df['rows_with_complement'].sum()
number_of_days_80_sum = results_df['number_of_days_80'].sum()
number_of_days_100_sum = results_df['number_of_days_100'].sum()

# Create a DataFrame with the summary row
summary_df = pd.DataFrame([{
    'dataset_name': 'TOTAL',
    'total_rows': total_rows_sum,
    'rows_with_complement': rows_with_complement_sum,
    'number_of_days_80': number_of_days_80_sum,
    'number_of_days_100': number_of_days_100_sum,
}])

# Append the TOTAL row below the per-dataset rows
results_df = pd.concat([results_df, summary_df], ignore_index=True)


results_df

# NOTE: a summary.csv saved into data/biblical_datasets/ matches the glob
# pattern above; it has no `complement` column, so the next run of this cell
# would fail on it. Save it elsewhere or exclude it before enabling this line.
#results_df.to_csv('data/biblical_datasets/summary.csv', index=False)

Unnamed: 0,dataset_name,total_rows,rows_with_complement,number_of_days_80,number_of_days_100
0,Isaiah,1552,1209,20,16
1,Exodus,1047,838,14,11
2,Deuteronomy,970,832,13,10
3,1_Samuel,954,689,12,10
4,Jeremiah,946,668,12,10
...,...,...,...,...,...
56,4Q78,2,2,1,1
57,5Q3,2,2,1,1
58,4Q118,1,1,1,1
59,11Q1,1,1,1,1
