In [1]:
import pandas as pd
import re
from collections import Counter

In [2]:
df = pd.read_excel("data/newest_dataset_lxx_mt_dss.xlsx").fillna("")

In [10]:
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", False)
pd.set_option("display.max_colwidth", False)

In [11]:
set(df.lex)

{'BW>[',
 'δυσμή',
 'ἐπάγω',
 'εἰμί',
 'εἰσάγω',
 'εἰσπορεύω',
 'εἰσφέρω',
 'εἰσφέρω',
 'εἰσάγω',
 'εἰσέρχομαι',
 'εἴσειμι',
 'παραγίγνομαι',
 'ποιέω',
 'πορεύω',
 'προσάγω ',
 'προσέρχομαι',
 'προσφέρω',
 'συνάπτω',
 'τίθημι',
 'φέρω',
 'ἀναφέρω',
 'ἄγω',
 'ἐξέρχομαι',
 'ἐπάγω',
 'ἔρχομαι',
 'ἥκω'}

In [12]:
# 1. Filter only MT rows
mt_only = df[df['scroll'].str.upper() == 'MT']

# 2. Count how many times each Hebrew verb lexeme occurs per stem
stem_verb_counts = (
    mt_only
    .groupby(['stem', 'lex'])
    .size()
    .reset_index(name='n')
    .sort_values(['stem', 'n'], ascending=[True, False])
)

print(stem_verb_counts)

  stem   lex   n
0  hif  BW>[  44
1  hof  BW>[  1 
2  qal  BW>[  80


In [15]:
# 1. Normalize and extract shared numeric ID
df2 = df.copy()
df2['scroll'] = df2['scroll'].astype(str).str.upper()
df2['base_id'] = df2['verb_id'].astype(str).str.extract(r'(\d+)')

# 2. Separate MT (for stem) and LXX (for Greek lex)
mt = df2[df2['scroll'] == 'MT'][['base_id', 'stem']].rename(columns={'stem': 'mt_stem'})
lxx = df2[df2['scroll'] == 'LXX'][['base_id', 'lex']].rename(columns={'lex': 'greek_lex'})

# 3. Merge them on base_id (link Hebrew stem to Greek lex)
merged = mt.merge(lxx, on='base_id', how='inner')

# 4. Count how many times each Greek lex is used for each Hebrew stem
counts = (
    merged.groupby(['mt_stem', 'greek_lex'])
    .size()
    .reset_index(name='n')
    .sort_values(['mt_stem', 'n'], ascending=[True, False])
)

counts

Unnamed: 0,mt_stem,greek_lex,n
10,hif,φέρω,14
4,hif,εἰσάγω,7
2,hif,εἰσφέρω,6
11,hif,ἀναφέρω,3
0,hif,ἐπάγω,2
3,hif,εἰσφέρω,2
1,hif,εἰμί,1
5,hif,εἰσέρχομαι,1
6,hif,προσάγω,1
7,hif,προσφέρω,1


In [16]:
counts.to_csv("data/lexeme_counts.csv", index=False)

In [17]:
import pandas as pd

# Drop duplicate verb_id so each verb is counted only once
df_unique_verbs = df.drop_duplicates(subset='verb_id')

# Count occurrences of each lexeme
lex_counts = df_unique_verbs['lex'].value_counts()

print(lex_counts)

BW>[            161
εἰσέρχομαι      31 
φέρω            16 
παραγίγνομαι    12 
ἔρχομαι         12 
εἰσπορεύω       9  
εἰσάγω          7  
εἰσφέρω         6  
ἥκω             3  
ἀναφέρω         3  
εἴσειμι         2  
εἰμί            2  
εἰσάγω          2  
δυσμή           2  
ἐπάγω         2  
εἰσφέρω         2  
ἐξέρχομαι       2  
προσάγω         1  
προσέρχομαι     1  
ἄγω             1  
πορεύω          1  
συνάπτω         1  
ἐπάγω           1  
τίθημι          1  
προσφέρω        1  
ποιέω           1  
Name: lex, dtype: int64
