In [1]:
from pathlib import Path

import pandas as pd
import nltk

In [2]:
# Lookup table: upper-case language name -> short language code.
# Judging by the variable name these are WALS codes -- TODO confirm the source.
# Codes written in square brackets (e.g. '[gun]') coincide with entries tagged
# "Missing"; presumably they are stand-in codes for languages without a WALS
# entry -- verify before using them as real identifiers.
# NOTE(review): the key for German is spelled 'GERMANY'; presumably it mirrors
# the spreadsheet column header -- confirm before renaming.
wals_lang_codes = {
    'APALAÍ': 'apl',
    'APINAYÉ': 'api',
    'APURINÃ': 'apu',
    'ASHENINKA': 'cax', # Peru
    'BAKAIRI': 'bki',
    'BORÔRO': 'brr', 
    'CANELA': 'cnl',
    'CULINA': 'cul',
    'DESANO': 'des',
    'ENGLISH': 'eng', 
    'FRENCH': 'fre',
    'GERMANY': 'ger',
    'GUAJAJARA': 'gjj',
    'GUARANI_EASTERN_BOLIVIAN': 'crg', # Chiriguano
    'GUARANI_MBYA': '[gun]', # Missing
    'GUARANI_PARAGUAY': 'gua', # Guaraní
    'GUARANI_WESTERN_BOLIVIAN': '[gnw]', # Missing
    'HIXKARYÁNA': 'hix',
    'JAMAMADI': 'jmm',
#    'JARAWARA': 'jar', # identifier made up 
    'KAAPOR': 'urk',
    'KADIWÉU': 'kdw',
    'KAIGANG': 'kng',
    'KAIWÁ': 'kaw',
    'KARAJÁ': 'jva',
    'KASHINAWA': 'csh',
    'KAYABÍ': 'kyz',
    'KAYAPÓ': 'kyp',
    'KUBEO': 'cub',
    'MACUSHI': 'mac',
#    'MACHINÉRI': ,
    'MAKUNA': 'mcn',
    'MATSÉS': 'myr',
    'MAXAKALI': 'max',
    'MUNDURUKÚ': 'muu',
    'NADEB': 'nad',
    'NAMBIKUÁRA': 'nmb',
    'NHEENGATU': '[yrl]',
    'PALIKÚR': 'plk',
    'PARECÍS': 'pex',
    'PAUMARÍ': 'pau',
    'PIRATAPÚYA': 'prt',
    'PORTUGUESE': 'por',
    'RIKBAKTSA': 'rik',
    'SANUMÁ': 'snm',
    'SATERÉ-MAWÉ': '[mav]',
    'SIRIANO': 'sri',
    'SPANISH': 'spa',
    'TENHARIM': '[pah]',
    'TERÊNA': 'trn',
    'TICUNA': 'tic',
    'TUCANO': 'tuc',
    'TUYÚCA': 'tuy', 
    'WANANA': 'gno',
    #'WAIÃPY': ,
    'WAPISHANA': 'wps',
    'XAVANTE': 'xav',
    'YAMINÁWA': 'yam',
    'YANOMAMI': '[guu]'
    
}

In [3]:
# Directory containing the aligned-verse spreadsheets; the path is relative,
# so it is resolved against the notebook's current working directory.
p = Path('../data/aligned_verses_spreadsheets/')

In [4]:
# Paths of the spreadsheet files found directly inside `p`.
# `is_file` has to be *called*: the bare bound method is always truthy, so the
# un-called form filtered nothing and let sub-directories through to
# `pd.read_excel` further down.
booksp = [e for e in p.iterdir() if e.is_file()]
# Book identifiers: the file names with the '.xlsx' extension removed.
books = [e.name.replace('.xlsx', '') for e in booksp]

In [5]:
# Read every workbook completely: with `sheet_name=None` pandas returns a dict
# mapping each sheet name to its DataFrame, so `chapters` is a list of such
# dicts, in the same order as `booksp`.
chapters = [pd.read_excel(workbook, sheet_name=None) for workbook in booksp]

In [6]:
# book name -> {sheet name -> DataFrame}; pairs each book with its workbook.
book2chapters = dict(zip(books, chapters))
# book name -> list of the sheet (chapter) names found in that workbook.
b2c = {book: list(sheets) for book, sheets in book2chapters.items()}

In [7]:
# Reverse lookup: sheet (chapter) name -> name of the book it belongs to.
chapter2book = {
    chapter: book
    for book, sheets in book2chapters.items()
    for chapter in sheets
}

In [8]:
# Flatten the per-book workbooks: sheet (chapter) name -> DataFrame.
# The DataFrame columns are later read as the language names (see `langs`).
languages = {
    chapter: frame
    for sheets in book2chapters.values()
    for chapter, frame in sheets.items()
}
# Same data as nested plain dicts: chapter -> {column -> {row index -> cell}}.
chapter2languages = {
    chapter: frame.to_dict() for chapter, frame in languages.items()
}

In [9]:
# Language names are taken from the column headers of the 'MRK.1' sheet.
# NOTE(review): assumes every sheet has the same columns -- confirm.
langs = [*languages['MRK.1'].columns]

# All chapter names, grouped book by book in the order of `books`.
chaps = [chapter for book in books for chapter in b2c[book]]
# Candidate verse/row indices probed by the lookup helpers (0..97).
# NOTE(review): 98 is a hard-coded upper bound -- confirm no sheet has more rows.
verss = list(range(98))

In [16]:
from typing import NamedTuple
from collections import defaultdict

class Entry(NamedTuple):
    """Key of one verse cell in the `data` dictionary.

    Being a NamedTuple, an Entry hashes and compares like the plain tuple
    ``(book, chap, lang, vers)``.
    """
    # Book name, as listed in `books` (file name without '.xlsx').
    book: str
    # Chapter name, i.e. the spreadsheet sheet name (e.g. 'MRK.1').
    chap: str
    # Language name, i.e. the DataFrame column header (e.g. 'ENGLISH').
    lang: str
    # Row index of the cell within the sheet; presumably the verse number
    # (row 45 of MRK.1 holds the text starting with '45.') -- confirm.
    vers: int

# Dictionary data[(book, chap, lang, verse_number)] -> verse_text
# Cells holding exactly one space are treated as empty and skipped.
# NOTE(review): only the literal ' ' is filtered; any other blank-like value
# (e.g. NaN from an empty cell) would be stored -- confirm this is intended.
data = defaultdict(str)
data.update(
    (Entry(book, chapter, language, row), text)
    for book in books
    for chapter in book2chapters[book]
    for language, column in chapter2languages[chapter].items()
    for row, text in column.items()
    if text != ' '
)

In [11]:
# Number of entries currently stored in `data`.
len(data)

472409

In [12]:
from itertools import product

In [22]:
# Scratch check of the `product(*list_of_iterables)` call shape that
# `iter_fixed` relies on: with a single iterable it yields 1-tuples.
for t in product(*[[1, 2, 3]]):
    print(t)

(1,)
(2,)
(3,)


In [23]:
# Scratch check: the set of `Entry` field names, built the same way as
# inside `iter_fixed`.
set(['book', 'chap', 'lang', 'vers'])

{'book', 'chap', 'lang', 'vers'}

In [19]:
def get_verses(data, book, chapter, language):
    """Yield ``(verse_index, text)`` for the stored verses of one chapter.

    Parameters
    ----------
    data : mapping
        Verse store keyed by ``(book, chap, lang, vers)``; ``Entry`` keys and
        plain 4-tuples are interchangeable because ``Entry`` is a tuple.
    book, chapter, language : str
        The three key fields that are held fixed.

    Yields
    ------
    tuple
        ``(i, text)`` for each ``i`` in the module-level ``verss`` whose entry
        exists and is truthy, in the order of ``verss``.
    """
    for i in verss:
        # Use `.get` rather than `data[...]`: indexing a defaultdict inserts an
        # empty entry for every missing key, so each call silently grew `data`
        # (and a plain dict would have raised KeyError).
        r = data.get((book, chapter, language, i))
        if r:
            yield i, r
    

In [32]:
def iter_fixed(data, **kwargs):
    """Yield ``(key, text)`` for every stored verse matching the fixed fields.

    Any subset of the ``Entry`` fields (``book``, ``chap``, ``lang``, ``vers``)
    may be pinned through keyword arguments; each remaining field ranges over
    the corresponding module-level list (``books``, ``chaps``, ``langs``,
    ``verss``).

    Parameters
    ----------
    data : mapping
        Verse store keyed by ``Entry``.
    **kwargs
        Field name -> fixed value.

    Yields
    ------
    tuple
        ``(Entry, text)`` for each combination whose entry exists and is
        truthy. Free fields vary in ``Entry`` field order, the last one
        fastest.

    Raises
    ------
    TypeError
        From ``Entry(...)`` on first iteration if a keyword is not an
        ``Entry`` field.
    """
    # TODO (carried over from the original): derive these value lists from
    # `data` itself instead of the module-level lists.
    default = dict(book=books, chap=chaps, lang=langs, vers=verss)

    # Free fields in Entry's declaration order. A set difference was used
    # before, but iteration order of a set of strings is not stable between
    # interpreter runs, which made the yield order unpredictable.
    var = [name for name in Entry._fields if name not in kwargs]

    for p in product(*(default[name] for name in var)):
        # Build a fresh key per combination instead of mutating `kwargs`.
        key = Entry(**kwargs, **dict(zip(var, p)))
        # `.get` avoids the defaultdict side effect of inserting an empty
        # entry for every probed combination of the cartesian product.
        val = data.get(key)
        if val:
            yield key, val

In [None]:
# Print every (key, text) pair stored for English.
# NOTE(review): `iter_fixed` walks the full book x chapter x verse product and
# this prints one line per hit, so the output can be very large.
for e in iter_fixed(data, lang='ENGLISH'):
    print(e)

In [23]:
# Last English verse of MRK.1: the (index, text) tuples compare by index
# first, so max() returns the pair with the highest verse index.
max(get_verses(data, 'MRK', 'MRK.1', 'ENGLISH'))

(45,
 '45. But as the man went out he began to announce it publicly and spread the story widely, so that Jesus was no longer able to enter any town openly but stayed outside in remote places. Still they kept coming to him from everywhere.')

In [34]:
# Same query through `iter_fixed`: with book, chapter and language pinned the
# Entry keys differ only in `vers`, so max() again returns the last verse.
max(iter_fixed(data, book='MRK', chap='MRK.1', lang='ENGLISH'))

(Entry(book='MRK', chap='MRK.1', lang='ENGLISH', vers=45),
 '45. But as the man went out he began to announce it publicly and spread the story widely, so that Jesus was no longer able to enter any town openly but stayed outside in remote places. Still they kept coming to him from everywhere.')

In [26]:
# Highest English verse index of every chapter, book by book.
[
    max(index for index, _ in get_verses(data, book, chapter, 'ENGLISH'))
    for book in books
    for chapter in book2chapters[book]
]

[45,
 28,
 35,
 41,
 43,
 56,
 37,
 38,
 50,
 52,
 33,
 44,
 37,
 72,
 47,
 20,
 27,
 26,
 18,
 17,
 20,
 51,
 25,
 36,
 54,
 47,
 71,
 53,
 59,
 41,
 42,
 57,
 50,
 38,
 31,
 27,
 33,
 26,
 40,
 42,
 31,
 25,
 21,
 22,
 18,
 25,
 12,
 17,
 18,
 18,
 26,
 17,
 22,
 13,
 31,
 16,
 23,
 21,
 13,
 20,
 40,
 13,
 27,
 33,
 34,
 31,
 13,
 40,
 58,
 24,
 29,
 23,
 25,
 18,
 30,
 30,
 21,
 23,
 25,
 23,
 17,
 25,
 48,
 34,
 29,
 34,
 38,
 42,
 30,
 50,
 58,
 36,
 39,
 28,
 27,
 35,
 30,
 34,
 46,
 46,
 39,
 51,
 46,
 75,
 66,
 20,
 23,
 22,
 21,
 32,
 33,
 24,
 26,
 47,
 26,
 37,
 42,
 15,
 60,
 40,
 43,
 48,
 30,
 25,
 52,
 28,
 41,
 40,
 34,
 28,
 41,
 38,
 40,
 30,
 35,
 27,
 27,
 32,
 44,
 31,
 14,
 18,
 19,
 16,
 14,
 20,
 28,
 13,
 28,
 39,
 40,
 29,
 25,
 32,
 29,
 31,
 25,
 21,
 23,
 25,
 39,
 33,
 21,
 36,
 21,
 14,
 23,
 33,
 27,
 20,
 29,
 22,
 11,
 14,
 17,
 17,
 13,
 21,
 11,
 19,
 18,
 18,
 20,
 8,
 21,
 18,
 24,
 21,
 15,
 27,
 21,
 10,
 29,
 24,
 21,
 21,
 20,
 15,
 16,
 16,
 