In [7]:
%pip


Usage:   
  /usr/bin/python3 -m pip <command> [options]

Commands:
  install                     Install packages.
  download                    Download packages.
  uninstall                   Uninstall packages.
  freeze                      Output installed packages in requirements format.
  list                        List installed packages.
  show                        Show information about installed packages.
  check                       Verify installed packages have compatible dependencies.
  config                      Manage local and global configuration.
  search                      Search PyPI for packages.
  cache                       Inspect and manage pip's wheel cache.
  wheel                       Build wheels from your requirements.
  hash                        Compute hashes of package archives.
  completion                  A helper command used for command completion.
  debug                       Show information useful for debugging.
  help              

In [79]:
# VER TODO-LIST em README.md
import pandas as pd
import json, time, urllib.request
from tools.tokenization import (get_examples, get_frequency,
                    get_text, get_tokens)
from requests.exceptions import MissingSchema

In [80]:
from cltk import NLP
from cltk.lemmatize.grc import GreekBackoffLemmatizer
lemmatizer = GreekBackoffLemmatizer()

from cltk.alphabet.text_normalization import cltk_normalize

In [91]:
from tools.wordstruct import Word as WikWord
from tools.scraper import Word as BibleWord
from tools.scraper import get_link, get_entry_soup, get_word_data, fetch_group_as_string
from tools.scraper import BASE_URL

In [148]:
import re

# ways to perfect this function:
# check whether the first word in the concordance example is it
# if not, cut off this word (so as to remove incomplete words)
# do the same to the last word of the sentence
def get_context_example(word, concordance):
    """
    for the purpose of getting an example of the word in question
    being used in context, and cleaning up this example 
    (because NLTK adds a lot of trailing spaces and separates words
    from punctuation signals in the string itself)
    """
    trailing_spaces_start = r"^\s+"
    trailing_spaces_end = r"\s+$"

    context = concordance.concordance_list(word.split()[0])[0][6]
    cleaned_up_pre = re.sub(r'\s([?.!,:;"](?:\s|$))', r'\1', context)
    cleaned_up_start = re.sub(trailing_spaces_start, "", cleaned_up_pre)
    cleaned_up = re.sub(trailing_spaces_end, "", cleaned_up_start)

    return cleaned_up

def clean_up(string):
    trailing_spaces_start = r"^\s+"
    trailing_spaces_end = r"\s+$"

    cleaned_up_pre = re.sub(r'([?.!,:;"](?:\s|$))', "", string.split()[0])
    cleaned_up_start = re.sub(trailing_spaces_start, "", cleaned_up_pre)
    cleaned_up = re.sub(trailing_spaces_end, "", cleaned_up_start)

    return cleaned_up

In [89]:
orthros, wf = get_text("textos/orthros.txt")
tokens, token_set, phrases = get_tokens(orthros)
frequency = get_frequency(tokens)
cltk_nlp = NLP(language="grc")
cltk_doc = cltk_nlp.analyze(orthros)

wordlist = []
for pair in frequency:
    token = pair[0]
    wordlist.append(token)

‎𐤀 CLTK version '1.0.23'.
Pipeline for language 'Ancient Greek' (ISO: 'grc'): `GreekNormalizeProcess`, `GreekStanzaProcess`, `GreekEmbeddingsProcess`, `StopsProcess`, `GreekNERProcess`.


In [95]:
def request(action, **params):
    return {'action': action,
            'params': params,
            'version': 6}

def invoke(action, **params):
    requestJson = json.dumps(request(action, **params)).encode('utf-8')
    response = json.load(urllib.request.urlopen(urllib.request.Request('http://localhost:8765', requestJson)))

    if len(response) != 2:
        raise Exception('response has an unexpected number of fields')

    if 'error' not in response:
        raise Exception('response is missing required error field')
    
    if 'result' not in response:
        raise Exception('response is missing required result field')

    if response['error'] is not None:
        raise Exception(response['error'])

    return response['result']

noteIDs1 = invoke('findNotes', query='deck:Ελληνικά::Liturgia')
noteIDs2 = invoke('findNotes', query='deck:Ελληνικά::Credo')
noteIDs = noteIDs1 + noteIDs2
notesInfo = invoke('notesInfo', notes=noteIDs)

In [150]:
notes = []
for card in notesInfo:
    notes.append(cltk_normalize(clean_up(card['fields']['Word']['value'])))
fwl = [cltk_normalize(word) for word in wordlist if word not in notes]
notes_group = lemmatizer.lemmatize(notes)
fwl_group = lemmatizer.lemmatize(fwl)

In [153]:
notes_lemmas = set([tuple[1] for tuple in notes_group])
fwl_lemmas = set([tuple[1] for tuple in fwl_group])

In [155]:
filtered_fwl_lemmas = [word for word in fwl_lemmas if word not in notes_lemmas]
reworked_fwl = [tuple for tuple in fwl_group if tuple[1] in filtered_fwl_lemmas]

In [156]:
# antes é necessário fazer com que as palavras idênticas aos seus 
# lemas não provoquem redundância, desfazendo as tuplas
# em uma string única

for i in range(len(reworked_fwl)):
#    for j in reworked_fwl[i]:
    if reworked_fwl[i][0] == reworked_fwl[i][1]:
        reworked_fwl[i] = reworked_fwl[i][0]

In [100]:
def fetch_bible_word(word):
    link = get_link(BASE_URL, word)
    soup = get_entry_soup(link)
    word = BibleWord(get_word_data(soup))

    return word

In [101]:
# isso pode ser aperfeiçoado para tornar mais legível
# em vez de usar uma n-tupla no return value, usar
# um dicionário

def searcher(word):

    if type(word) == tuple:
        try:
            bible_word = fetch_bible_word(word[0])
            return {"search_num": 1, "source": "BibleHub", "input": word[0], "output": bible_word.data}

        except MissingSchema:
            try:
                bible_word = fetch_bible_word(word[1])
                return {"search_num": 2, "source": "BibleHub", "input": word[0], "output": bible_word.data}

            except MissingSchema:

                return {"search_num": 2, "source": None, "input": word[0], "output": None}
                

    elif type(word) == str:
        try: 
            bible_word = fetch_bible_word(word)
            return {"search_num": 1, "source": "BibleHub", "input": word, "output": bible_word.data}

        except:
                return {"search_num": 1, "source": None, "input": word, "output": None}

    else:

        return {"search_num": 0, "source": None, "input": word, "output": None}

In [166]:
# É necessário criar alguma forma de verificar se
# as palavras que o script adquiriu correspondem às palavras
# do input, uma vez que as palavras obtidas do BibleHub podem ser,
# devido ao meu algoritmo meio porco, não muito confiáveis

# uma ideia interessante seria adicionar um marcador ao lado, algo como 

# Mais uma coisa a aperfeiçoar:
# Se a mesma palavra já estiver presente, deixar de lado
# não o mesmo radical, mas *a mesma palavra*

def acquire_data(list):

    bible_searches = 0
    wiki_searches = 0
    data = []
    blanks = []
    already_present = []

    for index in range(len(list)):
        print("word no. {}: {}".format(index+1, list[index]))
        word_dict = searcher(list[index])

        if a[1] != False:
            result = (word_dict["source"], word_dict["input"], word_dict["output"])
            if result[2] not in already_present:
                data.append(result)
            already_present.append(result[2])
        else:
            blanks.append(word_dict["input"])

        bible_searches += word_dict["search_num"]

        if bible_searches >= 10:
            for n in range(60):
                time.sleep(1)
                print("Wait... {}/60".format(n))
                bible_searches = 0
    
    return data, blanks

data, blanks = acquire_data(reworked_fwl)

word no. 1: Χριστὸς
word no. 2: ('θανάτῳ', 'θάνατος')
word no. 3: ('θάνατον', 'θάνατος')
word no. 4: ('πατήσας', 'πατέω')
word no. 5: ('μνήμασι', 'μνῆμα')
word no. 6: ('χαρισάμενος', 'χαρίζομαι')
word no. 7: ('!', 'punc')


In [161]:

for item in data:
    if item[0] == "BibleHub":

        print(item[1])

        curr = item[2]
        concordances = curr["concordances"]

        dc = {
            "word": concordances["Original Word"],
            "phonetics": concordances["Phonetic Spelling"],
            "category": concordances["Part of Speech"],
            "meaning": concordances['Definition'],
            "greek": fetch_group_as_string([tuple[0] for tuple in curr['examples']], single_list=True),
            "english": fetch_group_as_string([tuple[1] for tuple in curr['examples']], single_list=True),
        }

        dc["context"] = get_context_example(item[1], wf)
        
        # to display which word is being worked upon at the time

        words_dataframe = pd.DataFrame.from_dict(dc, orient="index")
        words_dataframe = words_dataframe.transpose()
        words_dataframe.to_csv("anesti_output.csv", encoding="utf-8", mode="a", header=False, index=False)

Χριστὸς
θανάτῳ
θάνατον
πατήσας
μνήμασι
χαρισάμενος


In [159]:
curr_d = data[0][1]
word_name = curr_d['concordances']['Original Word']
concordance = wf.concordance_list(word_name)[0][6]
cleaned_up = re.sub(r'\s([?.!,:;"](?:\s|$))', r'\1', concordance)
pattern = r"^\s+"
cleaned_up = re.sub(pattern,"",cleaned_up)
cleaned_up

TypeError: string indices must be integers

In [103]:
anesti, wf = get_text("textos/anesti.txt")
tokens, token_set, phrases = get_tokens(anesti)
frequency = get_frequency(tokens)
cltk_nlp = NLP(language="grc")
cltk_doc = cltk_nlp.analyze(anesti)

wordlist = []
for pair in frequency:
    token = pair[0]
    wordlist.append(token)

‎𐤀 CLTK version '1.0.23'.
Pipeline for language 'Ancient Greek' (ISO: 'grc'): `GreekNormalizeProcess`, `GreekStanzaProcess`, `GreekEmbeddingsProcess`, `StopsProcess`, `GreekNERProcess`.


In [167]:
data

[('BibleHub',
  'Χριστὸς',
  {'concordances': {'Original Word': 'Χριστός, οῦ, ὁ',
    'Part of Speech': 'Noun, Masculine',
    'Transliteration': 'Christos',
    'Phonetic Spelling': "(khris-tos')",
    'Definition': 'the Anointed One, Messiah, Christ',
    'Usage': 'Anointed One; the Messiah, the Christ.'},
   'examples': [('ὁ λεγόμενος Χριστός ', ' who is called Christ'),
    ('ἕως τοῦ χριστοῦ γενεαὶ δεκατέσσαρες',
     ' to the Christ generations fourteen'),
    ('ΔΕ ΙΗΣΟΥ ΧΡΙΣΤΟΥ ἡ γένεσις', ' now of Jesus Christ the birth')]}),
 ('BibleHub',
  'θανάτῳ',
  {'concordances': {'Original Word': 'θάνατος, ου, ὁ',
    'Part of Speech': 'Noun, Masculine',
    'Transliteration': 'thanatos',
    'Phonetic Spelling': "(than'-at-os)",
    'Definition': 'death',
    'Usage': 'death, physical or spiritual.'},
   'examples': [('ἀδελφὸν εἰς θάνατον καὶ πατὴρ',
     ' brother to death and father'),
    ('ἢ μητέρα θανάτῳ τελευτάτω ', ' or mother in death must die'),
    ('μὴ γεύσωνται θανάτου ἕως ἂ

In [122]:
len(noteIDs)

224

In [128]:
fwl_lemmas

{'punc',
 'Χριστὸς',
 'ζωή',
 'θάνατος',
 'μνῆμα',
 'νεκρός',
 'πατέω',
 'χαρίζομαι',
 'ἀνίστημι',
 'ἐκ'}

In [154]:
notes_lemmas

{'Δόξα',
 'Εὐλογητὸς',
 'Θεός',
 'Θεὸν',
 'Κύριε',
 'Μαρία',
 'Πατρός',
 'Πιλᾶτος',
 'Προσδοκῶ',
 'Πόντιος',
 'Σταυρωθέντα',
 'Χριστός',
 'αἰνέω',
 'αἰών',
 'αἴνεσις',
 'βάπτισμα',
 'βασίλειος',
 'γίγνομαι',
 'γενεά',
 'γεννάω',
 'γιγνώσκω',
 'γραφή',
 'γῆ',
 'δέησις',
 'δίδαξόν',
 'δίδωμι',
 'δείκνυμι',
 'δεξιός',
 'διά',
 'δικαιώματά',
 'δι’',
 'δοξάζω',
 'δόξα',
 'δύναμις',
 'εἰμί',
 'εἰρήνη',
 'εἰς',
 'εἰσφέρω',
 'εἶμι',
 'εἷς',
 'εὐδοκία',
 'εὐλογέω',
 'εὐλογία',
 'εὐχαριστέω',
 'ζάω',
 'ζωή',
 'ζωοποιός',
 'θάπτω',
 'θέλημά',
 'θεός',
 'κάθημαι',
 'καί',
 'καθά',
 'καθολικὴν',
 'κατά',
 'κατάνυξις',
 'κατάἕζομαι',
 'κατέρχομαι',
 'καταφεύγω',
 'κρίνω',
 'κόσμος',
 'κύριος',
 'λέγω',
 'λαλέω',
 'μέγας',
 'μέλλω',
 'μή',
 'μήτηρ',
 'μακαρίζω',
 'μεγαλύνω',
 'μετά',
 'μονογενής',
 'μόνος',
 'νεκρός',
 'νῦν',
 'οὐ',
 'οὐρανός',
 'οὗτος',
 'πάλιν',
 'πάσχω',
 'παναμώμητον',
 'παντοκράτωρ',
 'παρά',
 'παρατείνω',
 'παρθένος',
 'πατήρ',
 'πειρασμός',
 'πηγή',
 'πιστεύω',
 'πνεῦμα',
 'πο

In [149]:
clean_up(notes[188])

'νεκρός'

In [151]:
notes

['καὶ',
 'Ἅγιος',
 'ὁ',
 'Κύριε',
 'ἡμᾶς',
 'σου',
 'τοῦ',
 'ἐν',
 'τὸ',
 'εἰς',
 'εἶ',
 'ἐλέησον',
 'Θεός',
 'σε',
 'ἀθάνατος',
 'με',
 'ἡμῶν',
 'ἡ',
 'Ἰσχυρός',
 'τὰ',
 'δίδαξόν',
 'Εὐλογητὸς',
 'τὸν',
 'τὴν',
 'τοῖς',
 'ἡμῖν',
 'δικαιώματά',
 'τοὺς',
 'ὄνομά',
 'σὺ',
 'δὸς',
 'τῆς',
 'νῦν',
 'φωτί',
 'σοὶ',
 'παρὰ',
 'ποιεῖν',
 'σὲ',
 'πρὸς',
 'ψυχήν',
 'σέ',
 'ἠλπίσαμεν',
 'καθάπερ',
 'ταύτῃ',
 'ἡμέρᾳ',
 'τῇ',
 'αἰῶνος',
 'ἡμέραν',
 'ἑκάστην',
 'Ἰησοῦς',
 'δεξιᾷ',
 'τὰς',
 'Ἰησοῦ',
 'μεγάλην',
 'διὰ',
 'ἀνθρώποις',
 'εἰρήνη',
 'γῆς',
 'ὑψίστοις',
 'Ὅτι',
 'Πατρός',
 'ἐπὶ',
 'Δόξα',
 'ὡς',
 'βασιλεία',
 'θέλημά',
 'ὅτι',
 'μου',
 'ἔλεός',
 'τῶν',
 'μόνος',
 'κόσμου',
 'δόξαν',
 'σοι',
 'φῶς',
 'τῷ',
 'ἀμήν',
 'δόξα',
 'δύναμις',
 'ἐστιν',
 'σοῦ',
 'ἀπὸ',
 'ἀλλὰ',
 'ἡμεῖς',
 'γενεᾷ',
 'πονηροῦ',
 'ῥῦσαι',
 'πειρασμόν',
 'εἰσενέγκῃς',
 'μὴ',
 'σωτήρ',
 'ὀφειλέτης',
 'ἀφίημι',
 'ὀφείλημα',
 'σήμερον',
 'εὐχαριστέω',
 'δοξάζω',
 'προσκυνέω',
 'εὐλογία',
 'εὐδοκία',
 'σύ',
 'δείκνυμι',
 

In [162]:
d = [(1,2,3), 4, 5, (6,7,8)]

True