In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
from math import isnan
from pathlib import Path
import re
import pandas as pd

import sys
sys.path.append('..')
from src.table import Table, Key
from src.data.loader import ExcelLoader
from src.code.wals import language_to_wals_code
import src.util as util

In [4]:
# Input locations, given as relative paths (resolved against the kernel's working directory).
multilingual = Path('../dataset/aligned_verses_spreadsheets/')
greek = Path('../dataset/Ancient_Greek')
# The loader receives both paths; Table also receives the language -> WALS code mapping.
excelloader = ExcelLoader(multilingual, greek)
table = Table(excelloader, language_to_wals_code)
# NOTE(review): build() presumably reads the input files and populates the table — confirm in src/table.
table.build()

In [5]:
# Replace the freshly built table with the result of util.apply_fixes; the
# steps it performs are listed in the output printed below this cell.
# NOTE(review): `table` is rebound from itself, so re-running only this cell
# passes the already-fixed table through apply_fixes again — confirm that is
# harmless, or re-run the build cell first.
table = util.apply_fixes(table)

deleting titles from non greek
incrementing greek verses
ensuring verses are strings
removing spaces from start and end
ensuring verses end with punctuation
removing verse number in non greek
fixing verse order


In [6]:
# Preview: print the index and key of the first seven entries (indices 0 through 6).
for i, (k, v) in enumerate(table.iterfix()):
    print(i, k)
    if i >= 6:
        break

0 Key(book='ROM', chap='ROM.1', lang='ANCIENT_GREEK', vers=1)
1 Key(book='ROM', chap='ROM.1', lang='ANCIENT_GREEK', vers=2)
2 Key(book='ROM', chap='ROM.1', lang='ANCIENT_GREEK', vers=3)
3 Key(book='ROM', chap='ROM.1', lang='ANCIENT_GREEK', vers=4)
4 Key(book='ROM', chap='ROM.1', lang='ANCIENT_GREEK', vers=5)
5 Key(book='ROM', chap='ROM.1', lang='ANCIENT_GREEK', vers=6)
6 Key(book='ROM', chap='ROM.1', lang='ANCIENT_GREEK', vers=7)


In [7]:
import pandas as pd

In [8]:
from collections import Counter
from unicodedata import category as cat

def table_chars(table):
    """Tally the elements of every value yielded by ``table.iterfix()``.

    Each ``(key, value)`` pair is unpacked and the value is handed to
    ``Counter.update``; for string values this counts individual characters.
    The keys are ignored.

    Returns:
        collections.Counter with the accumulated counts over all values.
    """
    tally = Counter()
    for _key, verse in table.iterfix():
        tally.update(verse)
    return tally

In [9]:
# Counts accumulated by table_chars over every value yielded by table.iterfix().
char_freq = table_chars(table) 

In [10]:
#for k, v in char_freq.items():
#    print('%r %r' % (k, v), cat(k))

In [11]:
def table_to_df(table, wals_codes=None):
    """Flatten ``table`` into a long-format DataFrame with one row per verse.

    Args:
        table: object whose ``iterfix()`` yields ``(key, text)`` pairs. Each
            key exposes ``book``, ``chap`` (e.g. ``'ROM.1'``), ``lang`` and
            ``vers``.
        wals_codes: optional mapping from language name to WALS code. When
            omitted, the notebook-level ``language_to_wals_code`` is used,
            which is the mapping this function always read before the
            parameter existed.

    Returns:
        pandas.DataFrame with the columns ``book``, ``chapter`` (int),
        ``language``, ``language_wals_code``, ``verse_number`` and ``text``,
        in that order.

    Raises:
        KeyError: if a language is missing from a plain-dict mapping.
        ValueError: if the part of ``chap`` after the first '.' is not an
            integer.
    """
    if wals_codes is None:
        # Backward-compatible default: fall back to the notebook global.
        wals_codes = language_to_wals_code

    cols = dict(book=[], chapter=[], language=[], language_wals_code=[], verse_number=[], text=[])
    for k, v in table.iterfix():
        # The chapter number is whatever follows the first '.' in the chapter
        # id; without a '.', the whole id is parsed.
        chapter = int(k.chap.split('.', 1)[-1])
        # Drop every character whose Unicode general category starts with 'C'
        # (control, format, surrogate, private-use, unassigned).
        text = ''.join(c for c in v if not cat(c).startswith('C'))

        cols['book'].append(k.book)
        cols['chapter'].append(chapter)
        cols['language'].append(k.lang)
        cols['language_wals_code'].append(wals_codes[k.lang])
        cols['verse_number'].append(k.vers)
        cols['text'].append(text)
    return pd.DataFrame(data=cols)

In [12]:
# Build the per-verse frame and keep only the rows whose text is non-empty.
df = table_to_df(table).loc[lambda frame: frame.text.str.len() > 0]

In [13]:
# Write every remaining verse row to CSV; index=False leaves the row index out of the file.
df.to_csv('../dataset/bibles.csv', index=False)

In [14]:
# Load the file that was just written (presumably to check that the export reads back cleanly).
df2 = pd.read_csv('../dataset/bibles.csv')

In [15]:
df2.head()

Unnamed: 0,book,chapter,language,language_wals_code,verse_number,text
0,ROM,1,ANCIENT_GREEK,[grc],1,"Παῦλος δοῦλος Χριστοῦ Ἰησοῦ, κλητὸς ἀπόστολος,..."
1,ROM,1,ANCIENT_GREEK,[grc],2,ὃ προεπηγγείλατο διὰ τῶν προφητῶν αὐτοῦ ἐν γρα...
2,ROM,1,ANCIENT_GREEK,[grc],3,περὶ τοῦ υἱοῦ αὐτοῦ τοῦ γενομένου ἐκ σπέρματος...
3,ROM,1,ANCIENT_GREEK,[grc],4,τοῦ ὁρισθέντος υἱοῦ θεοῦ ἐν δυνάμει κατὰ πνεῦμ...
4,ROM,1,ANCIENT_GREEK,[grc],5,δι' οὗ ἐλάβομεν χάριν καὶ ἀποστολὴν εἰς ὑπακοὴ...


In [16]:
# Create the "lcm" dataset: du.lcm(df) yields a frame of (book, chapter,
# verse_number) keys, as shown in the output of the next cell.
# NOTE(review): presumably these are the verses available in every language — confirm in src/data/util.
import src.data.util as du
lcm_df = du.lcm(df)

In [17]:
lcm_df

Unnamed: 0,book,chapter,verse_number
0,MAT,1,18
1,MAT,1,19
2,MAT,1,20
3,MAT,1,21
4,MAT,2,3
...,...,...,...
2580,REV,22,15
2581,REV,22,16
2582,REV,22,17
2583,REV,22,20


In [18]:
# Inner join on the verse key: keeps only the rows of df whose
# (book, chapter, verse_number) appears in lcm_df, for every language.
# Assumes each key occurs once in lcm_df; duplicate keys would duplicate rows.
lcm_df_full = df.merge(lcm_df, how='inner', on=['book', 'chapter', 'verse_number'])

In [19]:
lcm_df_full[lcm_df_full.language == 'ENGLISH']

Unnamed: 0,book,chapter,language,language_wals_code,verse_number,text
10,ROM,1,ENGLISH,eng,8,"First of all, I thank my God through Jesus Chr..."
67,ROM,1,ENGLISH,eng,12,"that is, that we may be mutually comforted by ..."
124,ROM,1,ENGLISH,eng,13,"I do not want you to be unaware, brothers and ..."
181,ROM,1,ENGLISH,eng,16,"For I am not ashamed of the gospel, for it is ..."
238,ROM,1,ENGLISH,eng,17,For the righteousness of God is revealed in th...
...,...,...,...,...,...,...
147070,LUK,12,ENGLISH,eng,51,Do you think I have come to bring peace on ear...
147127,LUK,12,ENGLISH,eng,52,For from now on there will be five in one hous...
147184,LUK,12,ENGLISH,eng,53,"They will be divided, father against son and s..."
147241,LUK,12,ENGLISH,eng,54,"Jesus also said to the crowds, “When you see a..."


In [20]:
# Save the verses restricted to the lcm keys, without the row index.
lcm_df_full.to_csv('../dataset/bibles_lcm.csv', index=False)

In [21]:
# Create the "lcm90" dataset: du.lcm_with_cut returns the language names to
# keep (the list is displayed in the next cell).
# NOTE(review): 0.9 is presumably a coverage threshold used to drop sparse languages — confirm in src/data/util.
lcm_cut90_langs = du.lcm_with_cut(df, 0.9)

In [22]:
lcm_cut90_langs

['TUYÚCA',
 'MATSÉS',
 'MACUSHI',
 'NADEB',
 'WAPISHANA',
 'MAXAKALI',
 'GUARANI_EASTERN_BOLIVIAN',
 'GUARANI_WESTERN_BOLIVIAN',
 'APALAÍ',
 'KAIGANG',
 'GUARANI_PARAGUAY',
 'JAMAMADI',
 'YANOMAMI',
 'PALIKÚR',
 'ANCIENT_GREEK',
 'SPANISH',
 'ENGLISH',
 'APURINÃ',
 'ASHENINKA',
 'FRENCH',
 'GUARANI_MBYA',
 'GERMANY',
 'PARECÍS',
 'PORTUGUESE',
 'TERÊNA',
 'NHEENGATU',
 'SATERÉ-MAWÉ']

In [23]:
# Keep only the rows for the selected languages, then pass them to du.intersection.
# NOTE(review): presumably this keeps the verses shared by all of those languages — confirm in src/data/util.
lcm_cut_90_df = du.intersection(df[df.language.isin(lcm_cut90_langs)])

In [24]:
# Save the lcm90 dataset, without the row index.
lcm_cut_90_df.to_csv('../dataset/bibles_90_lcm.csv', index=False)