In [1]:
# VER TODO-LIST em README.md
import re, os, string
import pandas as pd
import json, time, urllib.request
from tools.tokenization import (get_examples, get_frequency,
                    get_text, get_tokens)
from requests.exceptions import MissingSchema

In [2]:
from cltk import NLP
from cltk.lemmatize.grc import GreekBackoffLemmatizer
lemmatizer = GreekBackoffLemmatizer()

from cltk.alphabet.text_normalization import cltk_normalize

In [3]:
from tools.wordstruct import Word as WikWord
from tools.scraper import Word as BibleWord
from tools.scraper import get_link, get_entry_soup, get_word_data, fetch_group_as_string, BASE_URL

In [4]:
# ways to perfect this function:
# check whether the first word in the concordance example is it
# if not, cut off this word (so as to remove incomplete words)
# do the same to the last word of the sentence

def boldify_selected_word(word, string):
    string_l = string.split()
    for index in range(len(string_l)):
        if string_l[index] == word:
            bold = "<b>"+word+"</b>"
            string_l[index] = bold 
    return string_l, bold

def get_context_and_clean_up(word, concordance):

    # get word context from concordance
    c = concordance.concordance_list(cltk_normalize(word))

    # split string and make chosen word bold
    line, bold_word = boldify_selected_word(cltk_normalize(word), c[0][6])
    word_list = line

    # remove first and last words of string, as they are sometimes incomplete
    # and assert that the first and last 
    if word_list[0] not in (word, bold_word):
        del word_list[0]
        if word_list[0] in string.punctuation:
            del word_list[0] 
    if word_list[len(word_list) - 1] not in (word, bold_word):
        del word_list[len(word_list) - 1]

    # get string with extra spaces
    unfinished_string = " ".join(word_list)
    
    # remove unnecessary spaces from string
    trailing_spaces_start = r"^\s+"
    trailing_spaces_end = r"\s+$"

    cleaned_up_pre = re.sub(r'\s([?.!,:;"](?:\s|$))', r'\1', unfinished_string)
    cleaned_up_start = re.sub(trailing_spaces_start, "", cleaned_up_pre)
    final_string = re.sub(trailing_spaces_end, "", cleaned_up_start)
    
    return final_string

def clean_up(string):
    trailing_spaces_start = r"^\s+"
    trailing_spaces_end = r"\s+$"

    cleaned_up_pre = re.sub(r'([?.!,:;"](?:\s|$))', "", string.split()[0])
    cleaned_up_start = re.sub(trailing_spaces_start, "", cleaned_up_pre)
    cleaned_up = re.sub(trailing_spaces_end, "", cleaned_up_start)

    return cleaned_up

In [37]:
songs, wf = get_text("textos/didache.txt")
tokens, token_set, phrases = get_tokens(songs)
frequency = get_frequency(tokens)
cltk_nlp = NLP(language="grc")
cltk_doc = cltk_nlp.analyze(songs)

wordlist = []
for pair in frequency:
    token = pair[0]
    wordlist.append(token)

‎𐤀 CLTK version '1.0.23'.
Pipeline for language 'Ancient Greek' (ISO: 'grc'): `GreekNormalizeProcess`, `GreekStanzaProcess`, `GreekEmbeddingsProcess`, `StopsProcess`, `GreekNERProcess`.


In [9]:
def request(action, **params):
    return {'action': action,
            'params': params,
            'version': 6}

def invoke(action, **params):
    requestJson = json.dumps(request(action, **params)).encode('utf-8')
    response = json.load(urllib.request.urlopen(urllib.request.Request('http://localhost:8765', requestJson)))

    if len(response) != 2:
        raise Exception('response has an unexpected number of fields')

    if 'error' not in response:
        raise Exception('response is missing required error field')
    
    if 'result' not in response:
        raise Exception('response is missing required result field')

    if response['error'] is not None:
        raise Exception(response['error'])

    return response['result']

noteIDs = invoke('findNotes', query='deck:Grego') 
notesInfo = invoke('notesInfo', notes=noteIDs)

In [39]:
def get_input_for_material(wordlist, anki_cards):

    notes = []
    for card in anki_cards:
        notes.append(cltk_normalize(clean_up(card['fields']['Word']['value'])))
    fwl = [cltk_normalize(word) for word in wordlist if word not in notes]
    notes_group = lemmatizer.lemmatize(notes)
    fwl_group = lemmatizer.lemmatize(fwl)
    notes_lemmas = set([tuple[1] for tuple in notes_group])
    fwl_lemmas = set([tuple[1] for tuple in fwl_group])

    filtered_fwl_lemmas = [word for word in fwl_lemmas if word not in notes_lemmas]
    reworked_fwl = [tuple for tuple in fwl_group if tuple[1] in filtered_fwl_lemmas]

    for i in range(len(reworked_fwl)):
        if reworked_fwl[i][0] == reworked_fwl[i][1]:
            reworked_fwl[i] = reworked_fwl[i][0]

    return reworked_fwl

reworked_fwl = get_input_for_material(wordlist, notesInfo)

In [11]:
def fetch_bible_word(word):
    link = get_link(BASE_URL, word)
    soup = get_entry_soup(link)
    word = BibleWord(get_word_data(soup))

    return word

In [14]:
def searcher(word):

    if type(word) == tuple:
        try:
            bible_word = fetch_bible_word(word[0])
            return {"search_num": 1, "source": "BibleHub", "input": word[0], "output": bible_word.data}

        except MissingSchema:
            try:
                bible_word = fetch_bible_word(word[1])
                return {"search_num": 2, "source": "BibleHub", "input": word[0], "output": bible_word.data}

            except MissingSchema:

                return {"search_num": 2, "source": None, "input": word[0], "output": None}
                

    elif type(word) == str:
        try: 
            bible_word = fetch_bible_word(word)
            return {"search_num": 1, "source": "BibleHub", "input": word, "output": bible_word.data}

        except:
                return {"search_num": 1, "source": None, "input": word, "output": None}

    else:

        return {"search_num": 0, "source": None, "input": word, "output": None}

In [None]:
# É necessário criar alguma forma de verificar se
# as palavras que o script adquiriu correspondem às palavras
# do input, uma vez que as palavras obtidas do BibleHub podem ser,
# devido ao meu algoritmo meio porco, não muito confiáveis

# uma ideia interessante seria adicionar um marcador ao lado, algo como 

# Mais uma coisa a aperfeiçoar:
# Se a mesma palavra já estiver presente, deixar de lado
# não o mesmo radical, mas *a mesma palavra*

def acquire_data(list, amount=None):

    bible_searches = 0
    data = []
    blanks = []
    already_present = []

    if (amount != None) and (amount <= len(list)):
        number = range(amount)
    else:
        number = range(len(list))

    for index in number:
        print("word no. {}: {}".format(index+1, list[index]))
        word_dict = searcher(list[index])

        if word_dict["source"] != None:
            result = (word_dict["source"], word_dict["input"], word_dict["output"])
            if result[2] not in already_present:
                data.append(result)
            already_present.append(result[2])
        else:
            blanks.append(word_dict["input"])

        bible_searches += word_dict["search_num"]

        if bible_searches >= 10:
            for n in range(60):
                time.sleep(1)
                print("Wait... {}/60".format(n))
                bible_searches = 0
    
    return data, blanks

data, blanks = acquire_data(reworked_fwl)

In [20]:

def produce_material(output_file_name, data=None, blanks=None):
    if type(output_file_name) is not str:
        raise TypeError("variable 'output_file_name' must be of type 'str'.")

    if (data is not None) and (data != []): 
        for item in data:
            if item[0] is not None:

                print(item[1])

                curr = item[2]
                concordances = curr["concordances"]

                dc = {
                    "word": clean_up(concordances["Original Word"]),
                    "phonetics": concordances["Phonetic Spelling"],
                    "category": concordances["Part of Speech"],
                    "meaning": concordances['Definition'],
                    "greek": fetch_group_as_string([tuple[0] for tuple in curr['examples']], single_list=True),
                    "english": fetch_group_as_string([tuple[1] for tuple in curr['examples']], single_list=True)
                }
                dc["context"] = get_context_and_clean_up(item[1], wf)
                dc["original"] = item[1]        
                dc["source"] = item[0]        
                # to display which word is being worked upon at the time

                words_dataframe = pd.DataFrame.from_dict(dc, orient="index")
                words_dataframe = words_dataframe.transpose()
                words_dataframe.to_csv(os.path.join("output", output_file_name+".csv"), 
                                                    encoding="utf-8", mode="a", 
                                                    header=False, index=False)
            
    if (blanks is not None) and (blanks != []):
        blanks_dataframe = pd.DataFrame(blanks)
        blanks_dataframe= blanks_dataframe.transpose()
        blanks_dataframe.to_csv(os.path.join("blanks", output_file_name+"_blanks.csv"), 
                                            encoding="utf-8", mode="a", 
                                            header=False, index=False)

produce_material("test", data=data, blanks=blanks)

NameError: name 'data' is not defined

In [40]:
d = acquire_data(reworked_fwl[0:6])

word no. 1: μετ ́
word no. 2: δύο
word no. 3: ἱμάτιόν
word no. 4: ('χιτῶνα', 'χιτών')
word no. 5: ('λάβῃ', 'λαμβάνω')
word no. 6: ('οὐδὲ', 'οὐδέ')


In [41]:
d

([('BibleHub',
   'μετ ́',
   {'concordances': {'Original Word': 'μετά',
     'Part of Speech': 'Preposition',
     'Transliteration': 'meta',
     'Phonetic Spelling': "(met-ah')",
     'Definition': 'with, among, after',
     'Usage': '(a) gen: with, in company with, (b) acc: (1) behind, beyond, after, of place, (2) after, of time, with nouns, neut. of adjectives.'},
    'examples': [("ἐστιν μεθερμηνευόμενον Μεθ' ἡμῶν ὁ",
      ' is translated With us'),
     ("πᾶσα Ἰεροσόλυμα μετ' αὐτοῦ ", ' all Jerusalem with him'),
     ('τὸ παιδίον μετὰ Μαρίας τῆς', ' the child with Mary the')]}),
  ('BibleHub',
   'δύο',
   {'concordances': {'Original Word': 'δύο',
     'Part of Speech': 'Adjective; Indeclinable Numeral (Adjective)',
     'Transliteration': 'duo',
     'Phonetic Spelling': "(doo'-o)",
     'Definition': 'two',
     'Usage': 'two.'},
    'examples': [('εἶδεν ἄλλους δύο ἀδελφούς Ἰάκωβον',
      ' he saw others two brothers James'),
     ("μετ' αὐτοῦ δύο ", ' with him two'),
     (

In [42]:
produce_material("didache", d)

('BibleHub', 'δύο', {'concordances': {'Original Word': 'δύο', 'Part of Speech': 'Adjective; Indeclinable Numeral (Adjective)', 'Transliteration': 'duo', 'Phonetic Spelling': "(doo'-o)", 'Definition': 'two', 'Usage': 'two.'}, 'examples': [('εἶδεν ἄλλους δύο ἀδελφούς Ἰάκωβον', ' he saw others two brothers James'), ("μετ' αὐτοῦ δύο ", ' with him two'), ('Οὐδεὶς δύναται δυσὶ κυρίοις δουλεύειν', ' No one is able two masters to serve')]})


TypeError: tuple indices must be integers or slices, not str

In [47]:
for item in d:
    print(item)
    print(item)

[('BibleHub', 'μετ ́', {'concordances': {'Original Word': 'μετά', 'Part of Speech': 'Preposition', 'Transliteration': 'meta', 'Phonetic Spelling': "(met-ah')", 'Definition': 'with, among, after', 'Usage': '(a) gen: with, in company with, (b) acc: (1) behind, beyond, after, of place, (2) after, of time, with nouns, neut. of adjectives.'}, 'examples': [("ἐστιν μεθερμηνευόμενον Μεθ' ἡμῶν ὁ", ' is translated With us'), ("πᾶσα Ἰεροσόλυμα μετ' αὐτοῦ ", ' all Jerusalem with him'), ('τὸ παιδίον μετὰ Μαρίας τῆς', ' the child with Mary the')]}), ('BibleHub', 'δύο', {'concordances': {'Original Word': 'δύο', 'Part of Speech': 'Adjective; Indeclinable Numeral (Adjective)', 'Transliteration': 'duo', 'Phonetic Spelling': "(doo'-o)", 'Definition': 'two', 'Usage': 'two.'}, 'examples': [('εἶδεν ἄλλους δύο ἀδελφούς Ἰάκωβον', ' he saw others two brothers James'), ("μετ' αὐτοῦ δύο ", ' with him two'), ('Οὐδεὶς δύναται δυσὶ κυρίοις δουλεύειν', ' No one is able two masters to serve')]}), ('BibleHub', 'ἱμάτιό

In [54]:
data = d[0]
blanks = d[1]

In [53]:
for item in data:
    print(item[1])

μετ ́
δύο
ἱμάτιόν
χιτῶνα
λάβῃ
οὐδὲ


In [55]:
produce_material("didache", data, blanks)

μετ ́


IndexError: list index out of range