In [1]:
# VER TODO-LIST em README.md
import json, time, urllib.request
from toolset import (get_examples, get_frequency,
                    get_text, get_tokens)
from requests.exceptions import MissingSchema

In [2]:
from cltk import NLP
from cltk.lemmatize.grc import GreekBackoffLemmatizer
lemmatizer = GreekBackoffLemmatizer()

from cltk.alphabet.text_normalization import cltk_normalize

In [3]:
from words import Word as WikWord
from scraper import Word as BibleWord
from scraper import get_link, get_entry_soup, get_word_data, fetch_group_as_string
from scraper import BASE_URL

In [4]:
import re

# ways to perfect this function:
# check whether the first word in the concordance example is it
# if not, cut off this word (so as to remove incomplete words)
# do the same to the last word of the sentence
def get_context_example(word, concordance):
    """
    for the purpose of getting an example of the word in question
    being used in context, and cleaning up this example 
    (because NLTK adds a lot of trailing spaces and separates words
    from punctuation signals in the string itself)
    """
    trailing_spaces_start = r"^\s+"
    trailing_spaces_end = r"\s+$"

    context = concordance.concordance_list(word.split()[0])[0][6]
    cleaned_up_pre = re.sub(r'\s([?.!,:;"](?:\s|$))', r'\1', context)
    cleaned_up_start = re.sub(trailing_spaces_start, "", cleaned_up_pre)
    cleaned_up = re.sub(trailing_spaces_end, "", cleaned_up_start)

    return cleaned_up

def clean_up(string):
    trailing_spaces_start = r"^\s+"
    trailing_spaces_end = r"\s+$"

    cleaned_up_pre = re.sub(r'\s([?.!,:;"](?:\s|$))', r'\1', context)
    cleaned_up_start = re.sub(trailing_spaces_start, "", cleaned_up_pre)
    cleaned_up = re.sub(trailing_spaces_end, "", cleaned_up_start)

    return cleaned_up

In [5]:
creed, wf = get_text("orthros.txt")
tokens, token_set, phrases = get_tokens(creed)
frequency = get_frequency(tokens)
cltk_nlp = NLP(language="grc")
cltk_doc = cltk_nlp.analyze(creed)

wordlist = []
for pair in frequency:
    token = pair[0]
    wordlist.append(token)

‎𐤀 CLTK version '1.0.23'.
Pipeline for language 'Ancient Greek' (ISO: 'grc'): `GreekNormalizeProcess`, `GreekStanzaProcess`, `GreekEmbeddingsProcess`, `StopsProcess`, `GreekNERProcess`.


In [6]:
def request(action, **params):
    return {'action': action,
            'params': params,
            'version': 6}

def invoke(action, **params):
    requestJson = json.dumps(request(action, **params)).encode('utf-8')
    response = json.load(urllib.request.urlopen(urllib.request.Request('http://localhost:8765', requestJson)))

    if len(response) != 2:
        raise Exception('response has an unexpected number of fields')

    if 'error' not in response:
        raise Exception('response is missing required error field')
    
    if 'result' not in response:
        raise Exception('response is missing required result field')

    if response['error'] is not None:
        raise Exception(response['error'])

    return response['result']

noteIDs1 = invoke('findNotes', query='deck:Liturgia')
noteIDs2 = invoke('findNotes', query='deck:Credo')
noteIDs = noteIDs1 + noteIDs2
notesInfo = invoke('notesInfo', notes=noteIDs)

In [7]:
notes = []
for card in notesInfo:
    notes.append(cltk_normalize(card['fields']['Word']['value']))
fwl = [cltk_normalize(word) for word in wordlist if word not in notes]
notes_group = lemmatizer.lemmatize(notes)
fwl_group = lemmatizer.lemmatize(fwl)

In [8]:
notes_lemmas = set([tuple[1] for tuple in notes_group])
fwl_lemmas = set([tuple[1] for tuple in fwl_group])

In [9]:
filtered_fwl_lemmas = [word for word in fwl_lemmas if word not in notes_lemmas]
reworked_fwl = [tuple for tuple in fwl_group if tuple[1] in filtered_fwl_lemmas]

In [10]:
# antes é necessário fazer com que as palavras idênticas aos seus 
# lemas não provoquem redundância, desfazendo as tuplas
# em uma string única

for i in range(len(reworked_fwl)):
#    for j in reworked_fwl[i]:
    if reworked_fwl[i][0] == reworked_fwl[i][1]:
        reworked_fwl[i] = reworked_fwl[i][0]

In [11]:
def fetch_bible_word(word):
    link = get_link(BASE_URL, word)
    soup = get_entry_soup(link)
    word = BibleWord(get_word_data(soup))

    return word

In [12]:
# isso pode ser aperfeiçoado para tornar mais legível
# em vez de usar uma n-tupla no return value, usar
# um dicionário

def searcher(word):
    if type(word) == tuple:
        try:
            bible_word = fetch_bible_word(word[0])
            return 1, "Bible", word[0], bible_word.data

        except MissingSchema:
            try:
                bible_word = fetch_bible_word(word[1])
                return 2, "Bible", word[0], bible_word.data

            except MissingSchema:

                return 2, False, word[0]
                

    elif type(word) == str:
        try: 
            bible_word = fetch_bible_word(word)
            return 1, "Bible", word, bible_word.data

        except:
                return 1, False, word

    else:
        return 0, False, word

In [13]:
bible_searches = 0
wiki_searches = 0
data = []
blanks = []

# É necessário criar alguma forma de verificar se
# as palavras que o script adquiriu correspondem às palavras
# do input, uma vez que as palavras obtidas do BibleHub podem ser,
# devido ao meu algoritmo meio porco, não muito confiáveis

# uma ideia interessante seria adicionar um marcador ao lado, algo como 

for index in range(len(reworked_fwl)):
    print("word no. {}: {}".format(index, reworked_fwl[index]))
    a = searcher(reworked_fwl[index])
    if a[1] != False:
        data.append((a[1], a[2], a[3]))
    else:
        blanks.append(a[2])
    bible_searches += a[0]
    if bible_searches >= 10:
        for n in range(60):
            time.sleep(1)
            print("Wait... {}/60".format(n))
            bible_searches = 0

word no. 0: ﻿Ἅγιος
word no. 1: Ἐπακούσαι
word no. 2: ('ὑπερασπίσαι', 'ὑπερασπίζω')
word no. 3:  ̓Ιακώβ
word no. 4: ('ἐξαποστείλαι', 'ἐξαποστέλλω')
word no. 5: ('βοήθειαν', 'βοήθεια')
word no. 6: ἀντιλάβοιτό
word no. 7: ('μνησθείη', 'μιμνήσκω')
Wait... 0/60
Wait... 1/60
Wait... 2/60
Wait... 3/60
Wait... 4/60
Wait... 5/60
Wait... 6/60
Wait... 7/60
Wait... 8/60
Wait... 9/60
Wait... 10/60
Wait... 11/60
Wait... 12/60
Wait... 13/60
Wait... 14/60
Wait... 15/60
Wait... 16/60
Wait... 17/60
Wait... 18/60
Wait... 19/60
Wait... 20/60
Wait... 21/60
Wait... 22/60
Wait... 23/60
Wait... 24/60
Wait... 25/60
Wait... 26/60
Wait... 27/60
Wait... 28/60
Wait... 29/60
Wait... 30/60
Wait... 31/60
Wait... 32/60
Wait... 33/60
Wait... 34/60
Wait... 35/60
Wait... 36/60
Wait... 37/60
Wait... 38/60
Wait... 39/60
Wait... 40/60
Wait... 41/60
Wait... 42/60
Wait... 43/60


KeyboardInterrupt: 

In [None]:
import pandas as pd
for item in data[]:
    if item[0] == "Bible":

        print(item[1])

        curr = item[2]
        concordances = curr["concordances"]

        dc = {
            "word": concordances["Original Word"],
            "phonetics": concordances["Phonetic Spelling"],
            "category": concordances["Part of Speech"],
            "meaning": concordances['Definition'],
            "greek": fetch_group_as_string([tuple[0] for tuple in curr['examples']], single_list=True),
            "english": fetch_group_as_string([tuple[1] for tuple in curr['examples']], single_list=True),
        }

        dc["context"] = get_context_example(item[1], wf)
        
        # to display which word is being worked upon at the time

        words_dataframe = pd.DataFrame.from_dict(dc, orient="index")
        words_dataframe = words_dataframe.transpose()
        words_dataframe.to_csv("orthros_output.csv", encoding="utf-8", mode="a", header=False, index=False)

γεννηθέντα
δι ̓


IndexError: list index out of range

In [None]:
curr_d = data[0][1]
word_name = curr_d['concordances']['Original Word']
concordance = wf.concordance_list(word_name)[0][6]
cleaned_up = re.sub(r'\s([?.!,:;"](?:\s|$))', r'\1', concordance)
pattern = r"^\s+"
cleaned_up = re.sub(pattern,"",cleaned_up)
cleaned_up

'Πιστεύω εἰς ἕνα Θεόν, Πατέρα, παντοκράτορ'

In [None]:
reworked_fwl

['Ἐπακούσαι',
 ('ὑπερασπίσαι', 'ὑπερασπίζω'),
 ' ̓Ιακώβ',
 ('ἐξαποστείλαι', 'ἐξαποστέλλω'),
 ('βοήθειαν', 'βοήθεια'),
 'ἀντιλάβοιτό',
 ('μνησθείη', 'μιμνήσκω'),
 ('θυσίας', 'θυσία'),
 'ὁλοκαύτωμά',
 ('πιανάτω', 'πιαίνω'),
 'δῴη',
 ('βουλήν', 'βουλή'),
 ('ἀγαλλιασόμεθα', 'ἀγαλλιάω'),
 'αἰτήματά',
 ('ἔσωσε', 'σώζω'),
 'χριστὸν',
 ('ἐπακούσεται', 'ἐπακούω'),
 ('δυναστείαις', 'δυναστεία'),
 ('δεξιᾶς', 'δεξία'),
 ('ἵπποις', 'ἵππος'),
 ('συνεποδίσθησαν', 'συμποδίζω'),
 ('ἔπεσαν', 'ἔπειμι'),
 ('ἀνωρθώθημεν', 'ἀνορθόω'),
 ('βασιλέα', 'βασιλεύς'),
 'ἐπικαλεσώμεθά',
 ('θέλησιν', 'ἐθέλω'),
 ('ἐστέρησας', 'στερέω'),
 ('προέφθασας', 'προφθάνω'),
 ('εὐλογίαις', 'εὐλογία'),
 ('χρηστότητος', 'χρηστότης'),
 ('ἔθηκας', 'τίθημι'),
 ('κεφαλὴν', 'κεφαλή'),
 ('στέφανον', 'στέφανος'),
 ('λίθου', 'λίθος'),
 'ᾐτήσατό',
 ('μακρότητα', 'μακρότης'),
 ('μεγαλοπρέπειαν', 'μεγαλοπρέπεια'),
 ('ἐπιθήσεις', 'ἐπιτίθημι'),
 ('εὐλογίαν', 'εὐλογία'),
 ('εὐφρανεῖς', 'εὐφραίνω'),
 ('χαρᾷ', 'χαρά'),
 ' ̔Υψίστου',
 ('σαλευθ