In [2]:
import pandas as pd

# Read the workbook into a DataFrame; header=0 (the pandas default) means the
# first spreadsheet row supplies the column names.
df = pd.read_excel(io='gospel-mark-abenaki.xlsx', header=0, engine='openpyxl')

# Sanity check: print the first five rows
print(df.head(n=5))


   chapter  verse                                            abenaki  \
0        1      1  U waji m8jatak Sazos ol8jmow8gan kchiniwaskw w...   
1        1      2  Akw8bawikh8zik nik8niwaw8jmowinnowikoktali, L8...   
2        1      3  Pazgo pizwakamigok pmi li k8g8lwa, Kataiwi kis...   
3        1      4  Az8 pmi sogneb8lwassa pizwakamigok ta kinohoma...   
4        1      5  Ni mziwi Judea wjiaak wdellosan8ssa agma ait t...   

                                             english  \
0  The beginning of the gospel of Jesus Christ, t...   
1  As it is written in the prophets, Behold, I se...   
2  The voice of one crying in the wilderness, Pre...   
3  John did baptize in the wilderness, and preach...   
4  And there went out unto him all the land of Ju...   

                                              french  
0  Commencement de la Bonne Nouvelle de Jésus Chr...  
1  Il était écrit dans le livre du prophète Isaïe...  
2  A travers le désert, une voix crie : Préparez ...  
3  Et Jean

In [3]:
# From the column with header 'abenaki', break every entry into a list of words
# and make sure that everything is lowercase. For every new word, add it to a dict
# whose value is a list of all the verses (chapter, verse) that contain that exact word.


In [10]:
import re

# Characters trimmed from both ends of every whitespace-separated token.
# Only leading/trailing punctuation is removed; characters inside a word
# are left untouched.
PUNCTUATION = '.,;:!?()[]{}"\''

# Concordance: lower-cased word -> list of (chapter, verse) tuples.
# One entry is appended per occurrence, so a verse that uses a word several
# times appears several times in that word's list.
word_dict = {}

# Walk the three relevant columns in parallel instead of materialising a
# Series per row with iterrows().
for chapter, verse, abenaki_text in zip(df['chapter'], df['verse'], df['abenaki']):
    # An empty spreadsheet cell is read as NaN; str(NaN) would otherwise
    # inject the bogus word "nan" into the dictionary.
    if pd.isna(abenaki_text):
        continue

    location = (chapter, verse)
    for token in str(abenaki_text).lower().split():
        word = token.strip(PUNCTUATION)
        # Tokens made purely of punctuation strip down to '' and are dropped.
        if word:
            word_dict.setdefault(word, []).append(location)

# Re-key the dictionary in sorted (code-point) order of the words
word_dict = dict(sorted(word_dict.items()))

# Display some statistics
print(f"Total unique words found: {len(word_dict)}")
print("Sample words and their verse locations:")
# The slice already limits the preview to the first 10 words.
for word, locations in list(word_dict.items())[:10]:
    print(f"'{word}': {locations}")

Total unique words found: 3578
Sample words and their verse locations:
'8abit': [(2, 14), (5, 15), (10, 46)]
'8aikw': [(6, 10)]
'8akwi': [(16, 19)]
'8b8dazwenn8t': [(8, 37)]
'8b8jikich': [(13, 16)]
'8b8jit': [(14, 40)]
'8badahon': [(6, 8), (15, 19)]
'8badahonal': [(14, 43)]
'8bagawasozin8': [(4, 32)]
'8bank8wadw8gan': [(2, 14)]


In [None]:
# Dump the whole concordance, one line per word: the word, a tab, then its
# list of (chapter, verse) tuples.
for word, verse_refs in word_dict.items():
    print(word, verse_refs, sep="\t")

8abit: [(2, 14), (5, 15), (10, 46)]
8aikw: [(6, 10)]
8akwi: [(16, 19)]
8b8dazwenn8t: [(8, 37)]
8b8jikich: [(13, 16)]
8b8jit: [(14, 40)]
8badahon: [(6, 8), (15, 19)]
8badahonal: [(14, 43)]
8bagawasozin8: [(4, 32)]
8bank8wadw8gan: [(2, 14)]
8bankaw8n: [(12, 14)]
8bijiba: [(6, 14), (6, 16), (10, 34), (16, 6)]
8bijiba8na: [(14, 28)]
8bijibada: [(9, 9)]
8bijibahadit: [(12, 26)]
8bijibaji: [(9, 31)]
8bijibamga: [(12, 23)]
8bijiban: [(9, 10)]
8bijibat: [(8, 31), (16, 9), (16, 14)]
8bijibaw8gan: [(12, 18)]
8bn8akw: [(11, 5)]
8bnokw: [(11, 2)]
8da: [(1, 7), (1, 34), (1, 45), (2, 2), (2, 17), (2, 17), (2, 17), (2, 18), (2, 24), (2, 26), (3, 4), (3, 12), (3, 20), (4, 5), (4, 5), (4, 6), (4, 7), (4, 12), (4, 12), (4, 13), (4, 17), (4, 17), (4, 19), (4, 22), (4, 22), (4, 25), (4, 27), (4, 34), (4, 34), (4, 40), (5, 3), (5, 3), (5, 4), (5, 10), (5, 19), (5, 26), (5, 37), (5, 39), (5, 43), (6, 3), (6, 4), (6, 5), (6, 8), (6, 8), (6, 8), (6, 8), (6, 9), (6, 9), (6, 11), (6, 11), (6, 18), (6, 19), (6, 