# Suche in PTA


In [1]:
import json,csv,re
import unicodedata
import os,sys,glob
# MyCapytain == 2.0.9
from MyCapytain.resources.texts.local.capitains.cts import CapitainsCtsText
from MyCapytain.common.constants import Mimetypes, XPATH_NAMESPACES
from nltk.tokenize import RegexpTokenizer
import colored
from colored import stylize
import more_itertools as mit

## Functions

In [2]:
def convert_grcpta():
    '''Load every Greek PTA edition as punctuation-free plain text.

    Globs ``~/Dokumente/projekte/pta_data/data/*/*/*.xml``, drops every path
    that contains ``pta9999`` (the Bible) and keeps only files whose *file
    name* contains ``grc``.  Each remaining file is parsed with
    ``CapitainsCtsText``; all passages at the deepest citation level are
    exported as plain text (``tei:note`` and ``tei:rdg`` excluded) and
    concatenated.  Line breaks and the characters ``. , ; ·`` are removed.

    Returns:
        list of dict: one ``{"id": <file name>, "text": <plain text>}`` entry
        per file, ordered by path.
    '''
    xml_glob = os.path.expanduser('~/Dokumente/projekte/pta_data/data/*/*/*.xml')
    xml_paths = [
        path for path in sorted(glob.glob(xml_glob))
        # Test the file name only, so a "grc" somewhere in a parent directory
        # cannot let non-Greek files through.
        if 'pta9999' not in path and 'grc' in os.path.split(path)[1]
    ]
    pta_dict = []
    for xml_path in xml_paths:
        # The id is the file name (e.g. "pta0001.pta010.pta-grc1.xml").  Taking
        # it from the path itself, not from fixed slice offsets, keeps it
        # correct wherever the data directory is located.
        _, xml_name = os.path.split(xml_path)
        # Explicit encoding so the Greek text does not depend on the locale.
        # NOTE(review): assumes the TEI files are UTF-8 - confirm.
        with open(xml_path, "r", encoding="utf-8") as file_open:
            text = CapitainsCtsText(resource=file_open)
            passages = []
            for ref in text.getReffs(level=len(text.citation)):
                psg = text.getTextualNode(subreference=ref, simple=True)
                psg.plaintext_string_join = ""
                passages.append(psg.export(Mimetypes.PLAINTEXT, exclude=["tei:note", "tei:rdg"]))
        # NOTE(review): passages are concatenated without a separator, so the
        # last word of one passage can fuse with the first word of the next -
        # confirm whether that is intended (changing it shifts token positions).
        plain_text = "".join(passages)
        plain_text = plain_text.replace("\n", "")  # remove linebreaks
        plain_text = re.sub(r"[.,;·]", "", plain_text)  # remove punctuation
        pta_dict.append({"id": xml_name, "text": plain_text})
    return pta_dict

In [3]:
def tokenize_text(text):
    r'''Split text into word tokens.

    A token is a maximal run of word characters (regex ``\w+``).  Whitespace,
    punctuation and every other non-word character separate tokens and are
    dropped, so this is not a plain whitespace split.

    Uses ``re.findall`` directly, which returns the same tokens as
    ``RegexpTokenizer(r'\w+')`` without building a tokenizer on every call.

    NOTE(review): combining diacritics are not word characters for ``\w``, so
    decomposed (NFD) input would be split at accents - this assumes the text
    uses precomposed characters; confirm for the corpus.

    Args:
        text: the string to tokenize.

    Returns:
        list of str: the tokens in order of appearance (empty for empty input).
    '''
    return re.findall(r'\w+', text)

### Search single word (regex)

In [4]:
def get_broader_context(urn, position, context_width, texts=None):
    '''Return the token at position with context_width tokens on each side.

    The text is tokenized with ``tokenize_text`` and position is an index
    into that token list (for example the "pos" value reported by
    ``search_single``).  The token itself is coloured blue.  Context that
    would reach past the start or end of the text is cut off at the boundary
    instead of wrapping around or raising.

    Args:
        urn: value of the "id" key of the wanted text.
        position: index of the token to show.
        context_width: number of tokens to show before and after it.
        texts: optional list of ``{"id", "text"}`` dicts as returned by
            ``convert_grcpta``; pass it to avoid re-reading the whole corpus.
            When omitted the corpus is loaded with ``convert_grcpta()``.

    Returns:
        str: context before, highlighted token and context after, joined by
        single spaces.

    Raises:
        ValueError: if no text has the id urn.
        IndexError: if position is not a valid token index of that text.
    '''
    if texts is None:
        texts = convert_grcpta()
    entry = next((item for item in texts if item["id"] == urn), None)
    if entry is None:
        raise ValueError("no text with id {!r}".format(urn))
    mytext = tokenize_text(entry["text"])
    if not 0 <= position < len(mytext):
        raise IndexError("position {} out of range for {!r} ({} tokens)".format(
            position, urn, len(mytext)))
    width = max(context_width, 0)
    # Slices clamp at the text boundaries; negative indices would silently
    # pull tokens from the other end of the text.
    context_before = mytext[max(0, position - width):position]
    context_after = mytext[position + 1:position + 1 + width]
    hit = '{:^10}'.format(stylize(mytext[position], colored.fg("blue")))
    return " ".join((" ".join(context_before), hit, " ".join(context_after)))

In [5]:
def search_single(tokens, search, context_width):
    '''Find every token that matches a regex and return it with context.

    The pattern is NFKC-normalised and applied with ``re.search`` to each
    token separately.  Context that would reach past the start or end of
    tokens is cut off at the boundary, so hits close to either end are
    reported instead of wrapping around or raising ``IndexError``.

    NOTE(review): only the pattern is normalised; tokens are matched as they
    are stored, which assumes the text is already in a matching normal form -
    confirm.

    Args:
        tokens: list of word tokens of one text.
        search: regular expression to look for.
        context_width: number of tokens to show before and after each hit.

    Returns:
        list of dict: one dict per hit with the string values "count"
        (1-based running number), "pos" (index of the hit in tokens) and
        "result" (context before, hit coloured blue, context after, joined by
        single spaces).
    '''
    # Normalise and compile once instead of once per token.
    pattern = re.compile(unicodedata.normalize("NFKC", search))
    width = max(context_width, 0)
    hits = (i for i, token in enumerate(tokens) if pattern.search(token))
    results = []
    for count, pos in enumerate(hits, start=1):
        context_before = tokens[max(0, pos - width):pos]
        context_after = tokens[pos + 1:pos + 1 + width]
        hit = '{:^10}'.format(stylize(tokens[pos], colored.fg("blue")))
        results.append({
            "count": str(count),
            "pos": str(pos),
            "result": " ".join((" ".join(context_before), hit, " ".join(context_after))),
        })
    return results

In [6]:
def search_word(word, context_width, texts=None):
    '''Search one regex in all texts and print every hit with its context.

    Each text is tokenized with ``tokenize_text`` and searched with
    ``search_single``.  One line is printed per hit: the text id in bold, the
    running number of the hit within that text, its token position and the
    hit with context_width tokens on each side.

    Args:
        word: regular expression, matched against single tokens.
        context_width: number of tokens to show on each side of a hit.
        texts: optional list of ``{"id", "text"}`` dicts as returned by
            ``convert_grcpta``; pass it to run several searches without
            re-reading the whole corpus each time.  When omitted the corpus
            is loaded with ``convert_grcpta()``.

    Returns:
        None; the results are printed.
    '''
    if texts is None:
        texts = convert_grcpta()
    for text in texts:
        text_id = text["id"]
        tokens = tokenize_text(text["text"])
        for result in search_single(tokens, word, context_width):
            print(stylize(text_id, colored.attr("bold"))+" (Ergebnis nr. "+result["count"]+" an Position "+result["pos"]+"): "+result["result"])

### Search list of words (regexes)  

This also works for a single word, but the output format differs from the single-word search above.

In [7]:
def generate_ngrams(words_list, n):
    '''Build one space-joined window of words per start position.

    The window starting at index i covers words_list[i:i + n], so the result
    has exactly as many entries as words_list and the last windows hold fewer
    than n words.

    Args:
        words_list: list of word tokens.
        n: maximum number of words per window.

    Returns:
        list of str: the windows, in order of their start index.
    '''
    return [
        ' '.join(words_list[start:start + n])
        for start in range(len(words_list))
    ]

In [8]:
def search_words(list_of_words, distance, texts=None):
    '''Find windows of distance tokens in which every regex matches and print them.

    Each text is tokenized and cut into overlapping windows with
    ``generate_ngrams``.  A window is a hit when every pattern of
    list_of_words is found in it with ``re.search``.  The patterns are applied
    to the whole window string, so ``^`` and ``$`` refer to the start and end
    of the window, not of a single word.  Of each run of consecutive hit
    windows only the first is reported, to avoid overlapping output.

    A result line shows the window position, the number of windows, up to
    distance tokens before the window, the window with matching words in
    bold, and up to distance tokens after it.  Context is cut off at the
    start and end of the text, so hits near either end are reported instead of
    wrapping around or raising ``IndexError``.

    NOTE(review): only the patterns are NFKC-normalised; the text is matched
    as stored, which assumes it is already in a matching normal form - confirm.

    Args:
        list_of_words: list of regular expressions that must all match.
        distance: window size in tokens.
        texts: optional list of ``{"id", "text"}`` dicts as returned by
            ``convert_grcpta``; pass it to run several searches without
            re-reading the whole corpus each time.  When omitted the corpus
            is loaded with ``convert_grcpta()``.

    Returns:
        None; the results are printed.
    '''
    # Normalise and compile once; the same patterns drive search and highlighting.
    patterns = [re.compile(unicodedata.normalize("NFKC", word)) for word in list_of_words]
    if texts is None:
        texts = convert_grcpta()
    results = []
    for text in texts:
        tokens = tokenize_text(text["text"])
        ngrams = generate_ngrams(tokens, distance)
        numbers = len(ngrams)
        positions = [
            i for i, ngram in enumerate(ngrams)
            if all(pattern.search(ngram) for pattern in patterns)
        ]
        entry_results = []
        for group in mit.consecutive_groups(positions):
            # only the first entry to avoid overlap, alternative merge ngrams back to string
            index = list(group)[0]
            emph_result = []
            for word in tokenize_text(ngrams[index]):
                # Embolden words matched by any pattern; re.search mirrors the
                # window search, so a match inside a word is highlighted too.
                if any(pattern.search(word) for pattern in patterns):
                    emph_result.append(stylize(word, colored.attr("bold")))
                else:
                    emph_result.append(word)
            result_text = " ".join(emph_result)
            # Window i holds tokens[i:i + distance], so these slices equal
            # ngrams[index - distance] and ngrams[index + distance] away from
            # the text boundaries and are simply shorter at them.
            result_context_before = " ".join(tokens[max(0, index - distance):index])
            result_context_after = " ".join(tokens[index + distance:index + 2 * distance])
            result = "Position "+str(index)+"/"+str(numbers)+": "+result_context_before+" "+result_text+" "+result_context_after
            entry_results.append(result)
        results.append({"id": text["id"], "results": entry_results})
    print("Searched for "+" and ".join(list_of_words)+" within "+str(distance)+"-grams:")
    for entry in results:
        if entry["results"]:
            print("===")
            print(stylize(entry["id"], colored.attr("bold")))
            for x in entry["results"]:
                print("---")
                print(x)

# Search

In [9]:
# Report, for every text, the 10-token windows in which all three regexes match.
# Each regex is searched in the window string as a whole, so the trailing "$"
# of the first pattern ties that word to the end of the window.
search_words(["[ἡἥ]λ[ίι].{1,2}$","οὐραν.*","καὶ"],10)

Searched for [ἡἥ]λ[ίι].{1,2}$ and οὐραν.* and καὶ within 10-grams:
===
[1mpta0001.pta010.pta-grc1.xml[0m
---
Position 756/32207: τοῖς ἐξελθοῦσιν ἐκ τῆς Αἰγύπτου τοῖς τὴν Αἰγυπτιακὴν πλάνην μαθοῦσιν τοῖς τὰ περὶ τὸν [1mοὐρανὸν[0m [1mκαὶ[0m τὴν γῆν προσκυνοῦσιν [1mἥλιον[0m καὶ σελήνην καὶ ἄστρα ποταμοὺς καὶ πηγὰς καὶ ὕδατα Κατέλιπεν
---
Position 5997/32207: παρακαλῶ καὶ ἑτέραν χρείαν Τὰ ὕδατα ἐπάνω τῶν οὐρανῶν οὐ μόνον συντηρεῖ τὸν [1mοὐρανόν[0m ἀλλὰ [1mκαὶ[0m τὴν φλόγα τοῦ [1mἡλίου[0m καὶ τῆς σελήνης καταπέμπει Εἰ γὰρ ἦν διαφανὴς ὁ οὐρανός
---
Position 10280/32207: ἀσεβὴς ὥστε πεισθῆναι τοῖς ματαιολόγοις Οἱ προφῆται λέγουσιν ὅτι ἀρχὴν ἔχει [1mκαὶ[0m τέλος ὁ [1mοὐρανός[0m Διὰ τοῦτο [1mκαὶ[0m ὁ [1mἥλιος[0m οὐκ ἀναβαίνει ἀλλ ἐξέρχεται Λέγει ἡ γραφή Ὁ ἥλιος ἐξῆλθεν
---
Position 12978/32207: ἐσχηκότος Ἔδει γὰρ τὰ πρεσβεῖα φυλαχθῆναι τῇ ἀκολουθίᾳ Ὅτε τὴν γῆν ἐκόσμησε φυτοῖς [1mκαὶ[0m καρποῖς ὅτε τὸν [1mοὐρανὸν[0m ἐκαλλώπισεν [1mἡλίῳ[0m καὶ σελήνῃ καὶ τῷ λοιπῷ τῶν ἄ

In [10]:
# List every single token, in all texts, that matches the regex, with 10 tokens
# of context on each side.
search_word("[ἡἥ]λ[ίι].{1,2}$", 10)

[1mpta0001.pta003.pta-grc1.xml[0m (Ergebnis nr. 1 an Position 4881): ἐπὶ τῶν ἀποστόλων Αἷμα καὶ πῦρ καὶ ἀτμίδα καπνοῦ Ὁ [38;5;4mἥλιος[0m μεταστραφήσεται εἰς σκότος καὶ ἡ σελήνη εἰς αἷμα πρὶν ἐλθεῖν
[1mpta0001.pta004.pta-grc1.xml[0m (Ergebnis nr. 1 an Position 6357): θεοῦ Ποῦ ἤκουσας ἐν τῷ εὐαγγελίῳ Ἰησοῦ Χριστοῦ ὅτι ὁ [38;5;4mἥλιος[0m καὶ ἡ σελήνη δημιουργοί εἰσι ποῦ εἶπεν ὁ Χριστὸς ὅτι
[1mpta0001.pta006.pta-grc1.xml[0m (Ergebnis nr. 1 an Position 5190): καὶ ἔστι ταπεινὴ καὶ εὐμεταχείριστος καὶ εὐάλωτος ὀλίγον δὲ θαλφθεῖσα [38;5;4mἡλίῳ[0m ἀνίπταται ἡ ἐπὶ τόπῳ μὴ δυναμένη πρότερον ἐπαίρεσθαι εἰς τὸν
[1mpta0001.pta006.pta-grc1.xml[0m (Ergebnis nr. 2 an Position 5251): ἐξήλατο ὡς ἀττέλεβος ὁ ἐπίμικτός σου ὡς ἀκρίς Ἀνέτειλεν ὁ [38;5;4mἥλιος[0m καὶ ἐξεπετάσθη καὶ ἐπελάθετο τοῦ τόπου αὐτῆς Τὸν λαὸν λέγει
[1mpta0001.pta006.pta-grc1.xml[0m (Ergebnis nr. 3 an Position 5277): καὶ ὡς ἀκρὶς ἐπιβεβηκυῖα φραγμῷ ἐν ἡμέρᾳ παγετοῦ Ἀνέτειλεν ὁ [38;5;4mἥλιος[0m καὶ ἐξεπετάσθη καὶ οὐκ 

In [11]:
# Show 100 tokens of context on each side of token 4881 of the given text
# (the position reported for the first hit of the search_word call above).
context = get_broader_context("pta0001.pta003.pta-grc1.xml",4881,100)
print(context)

σάρκα Αὕτη ἡ ἐπαγγελία τοῦ ΠατρόςἈλλὰ πόθεν δῆλον ὅτι ἐκ προσώπου τοῦ Πατρὸς ἦν ἡ ἐπαγγελία Ἐπειδὴ οἶδα καὶ τὸν Υἱὸν λαλοῦντα ἐν τοῖς προφήταις πόθεν τοῦτο εἰ ὁ Πατὴρ ἦν ὁ λέγων Ἐκχεῶ ἀπὸ τοῦ Πνεύματός μου ἐπὶ πᾶσαν σάρκα καὶ προφητεύσουσιν οἱ υἱοὶ ὑμῶν καὶ αἱ θυγατέρες ὑμῶν Καὶ δώσω σημεῖα ἐν τῷ οὐρανῷ ἄνω καὶ ἐν τῇ γῇ κάτω αἷμα καὶ πῦρ καὶ ἀτμίδα καπνοῦ Εἶπον πολλάκις αἷμα τὸ ἐκ τῆς πλευρᾶς σημεῖον γὰρ ἦν ἀπὸ νεκροῦ σώματος αἷμα προερχόμενον ἰδεῖν αἷμα τὸ ἀπὸ πλευρᾶς πῦρ τὸ ἐπὶ τῶν ἀποστόλων Αἷμα καὶ πῦρ καὶ ἀτμίδα καπνοῦ Ὁ [38;5;4mἥλιος[0m μεταστραφήσεται εἰς σκότος καὶ ἡ σελήνη εἰς αἷμα πρὶν ἐλθεῖν τὴν ἡμέραν Κυρίου τὴν μεγάλην καὶ ἐπιφανῆ Ὅρα τὸν λαλοῦντα Θεὸν Πατέρα πῶς τὸν Υἱὸν ἑρμηνεύει λέγων Πρὶν ἐλθεῖν τὴν ἡμέραν Κυρίου καὶ οὐ λέγει Τὴν ἡμέραν τὴν ἐμήν Εἰ γὰρ σὺ δίδως σημεῖα πῶς εἰς ἄλλον μεταφέρεις τὴν ἔννοιαν λέγων Πρὶν ἐλθεῖν τὴν ἡμέραν Κυρίου τὴν μεγάλην καὶ ἐπιφανῆ Πρόσεχε Ἴσως ἐρεῖ τις τὸν προφήτην λέγειν ἐκ προσώπου ἰδίου Πρὶν ἐλθεῖν τὴν ἡμέραν Ἀλλ οὐκ ἦν εἰπεῖν τὸν 