In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
from pathlib import Path

import pandas as pd
import nltk

import sys
sys.path.append('..')
from src.table import Table, Key
from src.data.loader import ExcelLoader
from src.code.wals import language_to_wals_code

# Building table data structure

## Dataset path

In [None]:
multilingual = Path('../data/aligned_verses_spreadsheets/')
greek = Path('../data/Greek')

### ExcelLoader

In [None]:
excelloader = ExcelLoader(multilingual, greek)

## Table

In [None]:
table = Table(excelloader, language_to_wals_code)

In [7]:
table.build()
table = util.apply_fixes(table)

deleting titles from non greek
incrementing greek verses
ensuring versicles are strings
removing spaces from start and end
ensuring versicles end with punctuation
removing versicle number in non greek
fixing versicle order


In [8]:
print(util.nchapter_nverse(table, 'GERMANY'))
print(util.nchapter_nverse(table, 'ANCIENT_GREEK'))

(260, 7957)
(260, 7940)


In [9]:
for i, (k, v) in enumerate(table.map(lambda x: x + ('' if x[-1] == '.' else '!'))):
    print(i, k, v[0:20], '...', v[-20:])
    if i > 5: break

0 Key(book='GAL', chap='GAL.5', lang='ANCIENT_GREEK', vers=1) τῇ ἐλευθερίᾳ ἡμᾶς Χρ ... ῷ δουλείας ἐνέχεσθε.
1 Key(book='GAL', chap='GAL.5', lang='ANCIENT_GREEK', vers=2) Ἴδε ἐγὼ Παῦλος λέγω  ... ὑμᾶς οὐδὲν ὠφελήσει.
2 Key(book='GAL', chap='GAL.5', lang='ANCIENT_GREEK', vers=3) μαρτύρομαι δὲ πάλιν  ... ν τὸν νόμον ποιῆσαι.
3 Key(book='GAL', chap='GAL.5', lang='ANCIENT_GREEK', vers=4) κατηργήθητε ἀπὸ Χρισ ... ς χάριτος ἐξεπέσατε.
4 Key(book='GAL', chap='GAL.5', lang='ANCIENT_GREEK', vers=5) ἡμεῖς γὰρ πνεύματι ἐ ... οσύνης ἀπεκδεχόμεθα.
5 Key(book='GAL', chap='GAL.5', lang='ANCIENT_GREEK', vers=6) ἐν γὰρ Χριστῷ Ἰησοῦ  ...  ἀγάπης ἐνεργουμένη.
6 Key(book='GAL', chap='GAL.5', lang='ANCIENT_GREEK', vers=7) Ἐτρέχετε καλῶς· τίς  ... ηθείᾳ μὴ πείθεσθαι;!


# Constituition analysis

## Unique characters for each language

In [10]:
from collections import Counter, defaultdict

_s = defaultdict(str)
for i, (k, v) in enumerate(table()):
    _s[k.lang] += v

In [11]:
from unicodedata import category as cat

for k, v in _s.items():
    c = Counter(v)
    puncts = [ch for ch in c.keys() if cat(ch).startswith('P')]
        
    print(f'{k:50s}{len(Counter(v)):3d}')

ANCIENT_GREEK                                     183
APALAÍ                                            73
APINAYÉ                                           75
APURINÃ                                            60
ASHENINKA                                          84
BAKAIRI                                            89
BORÔRO                                             88
CANELA                                             66
CULINA                                             78
DESANO                                             86
ENGLISH                                            73
FRENCH                                             85
GERMANY                                            81
GUAJAJARA                                          56
GUARANI_EASTERN_BOLIVIAN                           77
GUARANI_MBYA                                       89
GUARANI_PARAGUAY                                   96
GUARANI_WESTERN_BOLIVIAN                           92
HIXKARYÁNA                  

## MMC of versicles

In [12]:
from collections import defaultdict

def language_chapters(table):
    lang_ch = defaultdict(list)
    for e, v in table._data.items():
        lang_ch[e.lang].append((e.chap, e.vers))
    lang_ch = {
        k : set(v) for k, v in lang_ch.items()
    }
    
    return lang_ch

In [13]:
def sort_entry(chap):
    num = chap.find('.')
    return (chap[:num], int(chap[num+1:]))  

In [14]:
lc = language_chapters(table)

In [15]:
new_testament_versicules = 7957

In [16]:
l1 = 'GERMANY'
l2 = 'ENGLISH'
for chap, vers in (lc[l2] - lc[l1]):
    print(chap, vers)
    print(table._data[Key(lang=l2, book=re.sub('.\d+', '', chap), chap=chap, vers=vers)])
    #print(table._data[Key(lang=l1, book=re.sub('.\d+', '', chap), chap=chap, vers=vers)])

EPH.1 23
Now the church is his body, the fullness of him who fills all in all.


In [17]:
def compute_mmc(lc):
    all_verses = set()
    for lang, vs in lc.items():
        all_verses |= vs
    
    mmc = all_verses.copy()
    for l, v in lc.items():
        mmc &= v
    
    ordered = sorted([ l for l in lc.keys()], key=lambda x: len(lc[x]))
    rmving = set()
    
    cols = dict(num_removed=[0], removed=[''], num_versicles=[len(mmc)], percent=[len(mmc)/len(all_verses)]) 
    for item in ordered:
        rmving.add(item)
        s = all_verses.copy()#all_verses.copy()
        for lang, cv in lc.items():
            if lang in rmving: continue
            s &= cv
        cols['num_removed'].append(len(rmving))
        cols['removed'].append(item)
        cols['num_versicles'].append(len(s))
        cols['percent'].append(len(s)/len(all_verses))

    return all_verses, mmc, pd.DataFrame(cols)

    

In [18]:
all_verses, mmc, df4 = compute_mmc(lc)
df4.to_csv('mmc_data.csv', index=False)

In [19]:
chap, ver = zip(*sorted(mmc, key=lambda p: tuple(list(sort_entry(p[0])) + [p[1]])))

In [20]:
df5 = pd.DataFrame(dict(chap=chap, ver=ver))
df5.to_csv('mmc_list_std.csv', index=False)

In [21]:
def find_mmc_cut(df, cut):
    all_languages = set(df['removed'])
    toremove = set(df.loc[df['percent'] < cut, 'removed'])
    return all_languages - toremove

In [22]:
#set(df4['removed'])

In [23]:
df4.loc[df4['percent'] < 0.9, :]

Unnamed: 0,num_removed,removed,num_versicles,percent
0,0,,2585,0.32479
1,1,CANELA,3698,0.464631
2,2,APINAYÉ,3935,0.494409
3,3,XAVANTE,4080,0.512627
4,4,BORÔRO,4279,0.53763
5,5,KASHINAWA,4525,0.568539
6,6,KAYABÍ,4636,0.582485
7,7,NAMBIKUÁRA,4799,0.602965
8,8,KARAJÁ,5022,0.630984
9,9,YAMINÁWA,5161,0.648448


In [24]:
for lang in find_mmc_cut(df4, 0.9):
    print(lang)

PALIKÚR
ENGLISH
GUARANI_PARAGUAY
JAMAMADI
SATERÉ-MAWÉ
ANCIENT_GREEK
NADEB
GERMANY
MAXAKALI
APURINÃ
MACUSHI
SPANISH
ASHENINKA
GUARANI_WESTERN_BOLIVIAN
MATSÉS
YANOMAMI
PORTUGUESE
GUARANI_EASTERN_BOLIVIAN
GUARANI_MBYA
FRENCH
APALAÍ
TUYÚCA
NHEENGATU
KAIGANG
PARECÍS
WAPISHANA
TERÊNA


## Versicles in each language

In [25]:
df1 = pd.DataFrame({ 'lang' : [lang for lang in language_to_wals_code.keys() ], 
                    'num_versicles' : [len(lc[lang]) for lang in language_to_wals_code.keys()]})

df1['percent_complete'] = df1['num_versicles'] / new_testament_versicules
df1.to_csv('versicles_in_each_language.csv', index=False)

## Versicles per chapter in each language

In [26]:
books = list(table._books)
book2chapters = table.b2c
booklen = { b : len(c) for b, c in book2chapters.items() }

In [27]:
bclist = [(b, c) for b in books for c in sorted(book2chapters[b], key=sort_entry)]

In [28]:
b, c = zip(*bclist)

In [29]:
def create_df(bclist, langs):
    cols = dict(lang=sorted(langs))
    for b, c in bclist:
        cols[c] = []
        for l in cols['lang']:
            cols[c].append(len(list(table(book=b, chap=c, lang=l))))
    return pd.DataFrame(cols)

In [30]:
df = create_df(bclist, language_to_wals_code.keys())
#bcl_num_vers

df.to_csv('versicles_per_chapter_in_each_lang.csv', index=False)

## Versicles per book in each language

In [31]:
def create_df2(books, book2chaps, langs):
    cols = dict(lang=sorted(langs))
    for b in books:
        cols[b] = []
        for l in cols['lang']:
            length_chaps = []
            for c in sorted(book2chaps[b]):
                length = len(list(table(book=b, chap=c, lang=l)))
                length_chaps.append(length)
            cols[b].append(sum(length_chaps))
    df = pd.DataFrame(cols)
    return df

In [32]:
df2 = create_df2(books, book2chapters, language_to_wals_code.keys())

df2.to_csv('versicles_per_book_in_each_lang.csv', index=False)

In [33]:
langs = dict(language = language_to_wals_code.keys())

## Chapter Versicle and Language

In [34]:
def lang_has_chapter_versicle(lc):
    def sort_entry(e):
        chap, vers = e
        num = chap.find('.')
        return (chap[:num], int(chap[num+1:]), vers)
    
    all_versicles = set()
    for key, val in lc.items():
        all_versicles |= val

    all_versicles = sorted(all_versicles)
    all_chaps, all_verses = zip(*all_versicles)
    cols = dict(chapter=all_chaps, versicle=all_verses)
    
    for lang in language_to_wals_code.keys():
        cols[lang] = []
        for chap, vers in all_versicles:
            if Key(book=chap[0:3], chap=chap, lang=lang, vers=vers) in table._data:
                cols[lang].append(1)
            else:
                cols[lang].append(0)
        #print(lang, sorted(cv, key=sort_entry))
        #for cv in sorted(
    return pd.DataFrame(cols)

In [35]:
df3 = lang_has_chapter_versicle(lc)

df3.to_csv('chap_vers_language.csv', index=False)

## Average number of spaces and symbols

In [36]:
from nltk.tokenize import word_tokenize

In [37]:
table._books

{'1CO',
 '1JN',
 '1PE',
 '1TH',
 '1TI',
 '2CO',
 '2JN',
 '2PE',
 '2TH',
 '2TI',
 '3JN',
 'ACT',
 'COL',
 'EPH',
 'GAL',
 'HEB',
 'JAS',
 'JHN',
 'JUD',
 'LUK',
 'MAT',
 'MRK',
 'PHM',
 'PHP',
 'REV',
 'ROM',
 'TIT'}

In [39]:
books_in_order = [
    'MAT',
    'MRK',
    'LUK',
    'JHN',
    'ACT',
    'ROM',
    '1CO',
    '2CO',
    'GAL',
    'EPH',
    'PHP',
    'COL',
    '1TH',
    '2TH',
    '1TI',
    '2TI',
    'TIT',
    'PHM',
    'HEB',
    'JAS',
    '1PE',
    '2PE',
    '1JN',
    '2JN',
    '3JN',
    'JUD',
    'REV',
]

book_pos = { book : i for i, book in enumerate(books_in_order) }

In [40]:
table._books == set(books_in_order)

True

In [None]:
english_verses = table.fetch_language_books('ENGLISH', books_in_order)

In [46]:
def sort_chap_vers(entry):
    chap, vers = entry
    num = chap.find('.')
    return (book_pos[chap[:num]], int(chap[num+1:]), vers)  

In [47]:
def concat_ordered(d, key):
    return '\n'.join(d[k] for k in sorted(d.keys(), key=key))

In [48]:
from src.text.util import is_punct, tokens, types

In [55]:
def compute_df6(table):
    from collections import Counter
    cols = dict(language=[], num_tokens=[], num_types=[], num_not_punct=[], average_not_punct_size=[], num_spaces=[])
    for language in table._langs:

        language_verses_dict = table.fetch_language_books(language, books_in_order)
        language_bible_str = concat_ordered(language_verses_dict, sort_chap_vers)
        
        language_token_lst = tokens(language_bible_str)
        language_not_punct_lst = [token for token in language_token_lst if not is_punct(token)]
        language_type_set = types(language_token_lst)
        language_num_spaces = Counter(language_bible_str)[' ']
        average_not_punct_size = sum(map(len, language_not_punct_lst)) / len(language_not_punct_lst)
        

        cols['language'].append(language)
        cols['num_tokens'].append(len(language_token_lst))
        cols['num_types'].append(len(language_type_set))
        cols['num_not_punct'].append(len(language_not_punct_lst))
        cols['average_not_punct_size'].append(average_not_punct_size)
        cols['num_spaces'].append(language_num_spaces)
    return pd.DataFrame(cols)

In [56]:
df6 = compute_df6(table)
df6

Unnamed: 0,language,num_tokens,num_types,num_not_punct,average_not_punct_size,num_spaces
0,PALIKÚR,272428,12793,234478,5.699443,228496
1,ENGLISH,208158,7682,179620,4.096426,172321
2,HIXKARYÁNA,393805,13681,303306,5.380583,295622
3,TICUNA,256034,16144,225338,5.514831,220602
4,XAVANTE,472893,7340,365708,3.97696,376144
5,GUARANI_PARAGUAY,194032,14074,152772,5.030379,133887
6,BAKAIRI,250913,18134,196964,6.395585,193910
7,JAMAMADI,220239,13044,173587,5.622333,165701
8,SATERÉ-MAWÉ,363132,10704,338193,5.028596,335075
9,NAMBIKUÁRA,175175,43696,136214,15.466391,131495


In [57]:
df6.to_csv('tokens_per_language.csv', index=False)