In [1]:
from pathlib import Path

import pandas as pd
import nltk

In [2]:
wals_lang_codes = {
    'APALAÍ': 'apl',
    'APINAYÉ': 'api',
    'APURINÃ': 'apu',
    'ASHENINKA': 'cax', # Peru
    'BAKAIRI': 'bki',
    'BORÔRO': 'brr', 
    'CANELA': 'cnl',
    'CULINA': 'cul',
    'DESANO': 'des',
    'ENGLISH': 'eng', 
    'FRENCH': 'fre',
    'GERMANY': 'ger',
    'GUAJAJARA': 'gjj',
    'GUARANI_EASTERN_BOLIVIAN': 'crg', # Chiriguano
    'GUARANI_MBYA': '[gun]', # Missing
    'GUARANI_PARAGUAY': 'gua', # Guaraní
    'GUARANI_WESTERN_BOLIVIAN': '[gnw]', # Missing
    'HIXKARYÁNA': 'hix',
    'JAMAMADI': 'jmm',
#    'JARAWARA': 'jar', # identifier made up 
    'KAAPOR': 'urk',
    'KADIWÉU': 'kdw',
    'KAIGANG': 'kng',
    'KAIWÁ': 'kaw',
    'KARAJÁ': 'jva',
    'KASHINAWA': 'csh',
    'KAYABÍ': 'kyz',
    'KAYAPÓ': 'kyp',
    'KUBEO': 'cub',
    'MACUSHI': 'mac',
#    'MACHINÉRI': ,
    'MAKUNA': 'mcn',
    'MATSÉS': 'myr',
    'MAXAKALI': 'max',
    'MUNDURUKÚ': 'muu',
    'NADEB': 'nad',
    'NAMBIKUÁRA': 'nmb',
    'NHEENGATU': '[yrl]',
    'PALIKÚR': 'plk',
    'PARECÍS': 'pex',
    'PAUMARÍ': 'pau',
    'PIRATAPÚYA': 'prt',
    'PORTUGUESE': 'por',
    'RIKBAKTSA': 'rik',
    'SANUMÁ': 'snm',
    'SATERÉ-MAWÉ': '[mav]',
    'SIRIANO': 'sri',
    'SPANISH': 'spa',
    'TENHARIM': '[pah]',
    'TERÊNA': 'trn',
    'TICUNA': 'tic',
    'TUCANO': 'tuc',
    'TUYÚCA': 'tuy', 
    'WANANA': 'gno',
    #'WAIÃPY': ,
    'WAPISHANA': 'wps',
    'XAVANTE': 'xav',
    'YAMINÁWA': 'yam',
    'YANOMAMI': '[guu]'
    
}

In [3]:
p = Path('../data/aligned_verses_spreadsheets/')

In [4]:
booksp = [ e for e in p.iterdir() if e.is_file ]
books = [e.name.replace('.xlsx', '') for e in booksp]

In [5]:
chapters = [ pd.read_excel(e, None) for e in booksp]

In [6]:
book2chapters = { b : c for b, c in zip(books, chapters) }
b2c = { b : list(c.keys()) for b, c in book2chapters.items() }

In [7]:
chapter2book = { c : b for b, c2v in book2chapters.items() for c in c2v.keys() }

In [8]:
languages = { c : v for b, chs in book2chapters.items() for c, v in chs.items()}
chapter2languages = { c : v.to_dict() for c, v in languages.items() }

In [9]:
langs = list(languages['MRK.1'].columns)

chaps = [c for b in books for c in b2c[b]]
verss = list(range(0, 98))

In [35]:
#TODO: Put this in a .py file encapsulated by a class
from typing import NamedTuple
from collections import defaultdict

class Entry(NamedTuple):
    book: str
    chap: str
    lang: str
    vers: int

# Dicionario data[(book, chap, lang, nvers)] -> versiculo_texto
data = defaultdict(str)
for b in books:
    for c in book2chapters[b]:
        for l, vs in chapter2languages[c].items():
            for i, val in vs.items():
                key = Entry(b, c, l, i)
                if val != ' ':
                    data[key] = val

In [11]:
len(data)

472409

In [12]:
from itertools import product

In [22]:
for t in product(*[[1, 2, 3]]):
    print(t)

(1,)
(2,)
(3,)


In [23]:
set(['book', 'chap', 'lang', 'vers'])

{'book', 'chap', 'lang', 'vers'}

In [19]:
def get_verses(data, book, chapter, language): 
    for i in verss:
        r =  data[(book, chapter, language, i)]
        if r:
            yield i, r
    

In [53]:
def iter_fixed(data, **kwargs):
    # TODO: Make it faster by iterating over only the tuples that exist
    args = set(['book', 'chap', 'lang', 'vers']) # Take from data
    
    default = dict(book=books, chap=chaps, lang=langs,vers=verss) # Take from data
    
    fix = kwargs.keys()
    var = args - fix
    
    for p in product(*[default[i] for i in var]):
        d = kwargs
        for v, t in zip(var, p):
            d[v] = t
        key = Entry(**d)
        val = data[key]
        if val:
            yield key, val

## Queries

In [37]:
for p in iter_fixed(data, book='MRK', chap='MRK.1', lang='ENGLISH', vers=0):
    print(p)

(Entry(book='MRK', chap='MRK.1', lang='ENGLISH', vers=0), 'Mark 1')


### Language chapters

In [52]:
def language_chapters(data):
    lang_ch = defaultdict(list)
    for e, v in iter_fixed(data):
        lang_ch[e.lang].append((e.chap, e.vers))
    return lang_ch

In [49]:
%%timeit
lc = language_chapters(data)

1min 22s ± 3.05 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [64]:
print(len(lc['GERMANY'])) # Germany is complete

8217


In [71]:
set(lc['GERMANY']) - set(lc['ENGLISH'])

{('2CO.13', 14),
 ('ACT.15', 34),
 ('ACT.24', 7),
 ('ACT.28', 29),
 ('ACT.8', 37),
 ('JHN.5', 4),
 ('LUK.17', 36),
 ('LUK.23', 17),
 ('MAT.17', 21),
 ('MAT.18', 11),
 ('MAT.23', 14),
 ('MRK.11', 26),
 ('MRK.15', 28),
 ('MRK.7', 16),
 ('MRK.9', 44),
 ('MRK.9', 46),
 ('ROM.16', 24)}

In [75]:
set(lc['GERMANY']) - set(lc['ENGLISH'])

{('2CO.13', 14),
 ('ACT.15', 34),
 ('ACT.24', 7),
 ('ACT.28', 29),
 ('ACT.8', 37),
 ('JHN.5', 4),
 ('LUK.17', 36),
 ('LUK.23', 17),
 ('MAT.17', 21),
 ('MAT.18', 11),
 ('MAT.23', 14),
 ('MRK.11', 26),
 ('MRK.15', 28),
 ('MRK.7', 16),
 ('MRK.9', 44),
 ('MRK.9', 46),
 ('ROM.16', 24)}

In [58]:
len(books)

27

In [60]:
len(chaps)

260

In [63]:
8217 - 260


7957

In [82]:
[ f for f, s in iter_fixed(data, book='MRK', lang='GERMANY', chap='MRK.16')]

[Entry(book='MRK', chap='MRK.16', lang='GERMANY', vers=0),
 Entry(book='MRK', chap='MRK.16', lang='GERMANY', vers=1),
 Entry(book='MRK', chap='MRK.16', lang='GERMANY', vers=2),
 Entry(book='MRK', chap='MRK.16', lang='GERMANY', vers=3),
 Entry(book='MRK', chap='MRK.16', lang='GERMANY', vers=4),
 Entry(book='MRK', chap='MRK.16', lang='GERMANY', vers=5),
 Entry(book='MRK', chap='MRK.16', lang='GERMANY', vers=6),
 Entry(book='MRK', chap='MRK.16', lang='GERMANY', vers=7),
 Entry(book='MRK', chap='MRK.16', lang='GERMANY', vers=8),
 Entry(book='MRK', chap='MRK.16', lang='GERMANY', vers=9),
 Entry(book='MRK', chap='MRK.16', lang='GERMANY', vers=10),
 Entry(book='MRK', chap='MRK.16', lang='GERMANY', vers=11),
 Entry(book='MRK', chap='MRK.16', lang='GERMANY', vers=12),
 Entry(book='MRK', chap='MRK.16', lang='GERMANY', vers=13),
 Entry(book='MRK', chap='MRK.16', lang='GERMANY', vers=14),
 Entry(book='MRK', chap='MRK.16', lang='GERMANY', vers=15),
 Entry(book='MRK', chap='MRK.16', lang='GERMANY', 

In [125]:
#del lc['GUARANI']
#del lc['CANELA']
del[o[0]]

In [133]:
s = set(lc['GERMANY'])
for l, v in lc.items():
    v = set(v)
    s = s & v
    if not s or not v:
        print(l)
    

In [134]:
len(s) / len(lc['GERMANY'])

0.47121820615796517

In [131]:
o = sorted([ l for l in lc.keys()], key=lambda x: len(lc[x]))

In [132]:
o[0]

'APINAYÉ'

In [130]:
del lc[o[0]]

In [124]:
from math import isnan
for e, v in iter_fixed(data, lang=o[0]):
    if e.vers == 0 and not(isnan(v)):
        print(e, v)

In [139]:
r = set()
for l in o:
    r.add(l)
    s = set(lc['GERMANY'])
    for l1, v in lc.items():
        if l1 in r: continue 
        v = set(v)
        s = s & v
    print(len(r), len(lc.keys()) - len(r), len(s), len(s)/len(lc['GERMANY']))

1 57 4096 0.4984787635390045
2 56 4238 0.5157600097359134
3 55 4429 0.5390045028599245
4 54 4664 0.5676037483266398
5 53 4760 0.5792868443470853
6 52 4915 0.5981501764634295
7 51 5126 0.6238286479250335
8 50 5258 0.6398929049531459
9 49 5381 0.6548618717293416
10 48 5528 0.6727516125106486
11 47 5652 0.687842278203724
12 46 5787 0.7042716319824753
13 45 5903 0.7183887063405135
14 44 5986 0.7284897164415237
15 43 6003 0.7305585980284776
16 42 6135 0.74662285505659
17 41 6151 0.7485700377266642
18 40 6228 0.7579408543263965
19 39 6357 0.77364001460387
20 38 6444 0.7842278203723987
21 37 6520 0.7934769380552513
22 36 6603 0.8035779481562614
23 35 6729 0.818912011683096
24 34 6815 0.8293781185347451
25 33 6917 0.8417914080564683
26 32 7030 0.8555433856638676
27 31 7087 0.8624802239260071
28 30 7166 0.8720944383594986
29 29 7267 0.8843860289643423
30 28 7337 0.892904953145917
31 27 7415 0.9023974686625289
32 26 7472 0.9093343069246683
33 25 7560 0.9200438116100766
34 24 7641 0.9299014238773

In [141]:
len(o[31:])

27