# Search in PTA


In [1]:
import json,csv,re
import unicodedata
import os,sys,glob
# MyCapytain == 2.0.9
from MyCapytain.resources.texts.local.capitains.cts import CapitainsCtsText
from MyCapytain.common.constants import Mimetypes, XPATH_NAMESPACES
from nltk.tokenize import RegexpTokenizer
import colored
from colored import stylize
import more_itertools as mit

## Functions

In [149]:
def convert_grcpta(files_path):
    '''Read the Greek XML files matched by files_path and convert them to a list of passage dictionaries.

    files_path is a glob pattern; a leading "~" is expanded. Paths containing
    'pta9999' (= Bible) are skipped and only paths containing 'grc' are kept.

    Returns a list with one dictionary per reference at the deepest citation
    level of every file:
        "id"   -- file name without the ".xml" extension + ":" + reference
        "text" -- plain text of the passage (tei:note and tei:rdg excluded,
                  line breaks removed)
    An empty list is returned when no file matches.
    '''
    xml_paths = [
        path
        for path in sorted(glob.glob(os.path.expanduser(files_path)))
        if 'pta9999' not in path and 'grc' in path
    ]
    pta_dict = []
    for xml_path in xml_paths:
        # Take the identifier from the file name itself, so that it does not
        # depend on how many directories precede the file or on the path separator.
        urn = os.path.splitext(os.path.basename(xml_path))[0]
        with open(xml_path, "r") as file_open:
            text = CapitainsCtsText(resource=file_open)
            for ref in text.getReffs(level=len(text.citation)):
                psg = text.getTextualNode(subreference=ref, simple=True)
                psg.plaintext_string_join = ""
                text_line = psg.export(Mimetypes.PLAINTEXT, exclude=["tei:note","tei:rdg"])
                text_line = text_line.replace("\n", "") # remove linebreaks
                pta_dict.append({"id": urn+":"+ref, "text": text_line})
    return pta_dict

In [150]:
def tokenize_text(text):
    r'''Split text into word tokens.

    A token is a maximal run of word characters (regex \w+); whitespace and
    punctuation separate tokens and are dropped.

    Returns the list of tokens in order of appearance (empty for empty text).
    '''
    # Same matches as RegexpTokenizer(r'\w+').tokenize(text), without building
    # a tokenizer object on every call.
    return re.findall(r'\w+', text)

### Search single word (regex)

In [151]:
def get_broader_context(files_path,urn,position,context_width):
    '''Get broader context in a passage at position with context of context_width.

    Works with tokenized text, ngram = 1. The corpus is read again from
    files_path on every call.

    files_path    -- glob pattern handed to convert_grcpta
    urn           -- passage id as stored under "id" by convert_grcpta
    position      -- zero-based token index inside the passage
    context_width -- maximum number of tokens shown before and after

    Returns one string: context before, the highlighted token, context after.
    Near the start or end of the passage the context is shortened.
    Raises ValueError if no passage has the given id and IndexError if
    position lies outside the passage.
    '''
    texts = convert_grcpta(files_path)
    entry = next((item for item in texts if item["id"] == urn), None)
    if entry is None:
        raise ValueError("no passage with id "+repr(urn)+" found in "+repr(files_path))
    mytext = tokenize_text(entry["text"])
    if not 0 <= position < len(mytext):
        raise IndexError("position "+str(position)+" is outside the passage ("+str(len(mytext))+" tokens)")
    # Slices clamp at the passage boundaries; indexing with position-x would
    # wrap around to the end of the passage for negative values.
    context_before = mytext[max(0, position-context_width):position]
    context_after = mytext[position+1:position+1+context_width]
    result = " ".join(context_before),'{:^10}'.format(stylize(mytext[position], colored.fg("blue")))," ".join(context_after)
    result = " ".join(result)
    return result

In [152]:
def search_single(tokens, search, context_width):
    '''Search a single word (regex) in a list of tokens.

    tokens        -- list of word tokens
    search        -- regex, NFKC-normalised before use and applied to each token
    context_width -- maximum number of tokens shown before and after a hit

    Returns a list of dictionaries, one per matching token:
        "count"  -- running number of the hit, starting at "1" (string)
        "pos"    -- index of the token in tokens (string)
        "result" -- context before, the highlighted token, context after
    Near the start or end of tokens the context is shortened.
    '''
    # Normalise and compile once instead of once per token.
    pattern = re.compile(unicodedata.normalize("NFKC", search))
    indices = [i for i, x in enumerate(tokens) if pattern.search(x)]
    results = []
    for count, entry in enumerate(indices, start=1):
        # Slices clamp at the list boundaries; tokens[entry-x] would silently
        # wrap around to the end of the list for hits near the start.
        context_before = tokens[max(0, entry-context_width):entry]
        context_after = tokens[entry+1:entry+1+context_width]
        result = " ".join(context_before),'{:^10}'.format(stylize(tokens[entry], colored.fg("blue")))," ".join(context_after)
        results.append({"count": str(count), "pos": str(entry), "result": " ".join(result)})
    return results

In [153]:
def search_word(word,context_width):
    '''Print every hit of word in the loaded corpus; word may be a regex.

    Iterates over the module-level list texts. Each hit is printed on its own
    line with the passage id, its running number and token position within
    the passage, and context_width tokens of context on both sides.'''
    for passage in texts:
        passage_id, passage_text = passage["id"], passage["text"]
        hits = search_single(tokenize_text(passage_text), word, context_width)
        for hit in hits:
            heading = stylize(passage_id, colored.attr("bold"))
            print(heading+" (Ergebnis nr. "+hit["count"]+" an Position "+hit["pos"]+"): "+hit["result"])

### Search list of words (regexes)  

This also works for a single word, but the output format differs from the single-word search above.

In [154]:
def generate_ngrams(words_list, n):
    '''Return one space-joined window of n words for every start index in words_list.

    The result has as many entries as words_list; windows that start fewer
    than n words before the end are correspondingly shorter.'''
    return [' '.join(words_list[start:start + n]) for start in range(len(words_list))]

In [155]:
def search_words(list_of_words,distance):
    '''Search the loaded corpus for windows of words that match all given patterns.

    Iterates over the module-level list texts. Every passage is cut into
    overlapping windows of up to distance words (see generate_ngrams); a
    window is a hit when every regex in list_of_words matches somewhere in it.
    Matching ignores case. Each pattern is applied to the whole window string,
    so ^ and $ anchor at the window boundaries. Of a run of consecutive
    matching windows only the first is reported, with up to distance words of
    context before and after it.

    list_of_words -- list of regex strings (NFKC-normalised before use)
    distance      -- window size in words (ngram length)

    Prints a summary line followed by the hits grouped by passage id;
    returns None.
    '''
    # Normalise once and use the same form for searching and highlighting.
    normalized_words = [unicodedata.normalize("NFKC", k) for k in list_of_words]
    # Case-insensitivity is a regex flag here: lower-casing the pattern text
    # would also turn escapes such as \S into \s and leave the corpus untouched.
    search_patterns = [re.compile(k, re.IGNORECASE) for k in normalized_words]
    # For highlighting, a pattern with in-between placeholders such as
    # "a \S+ \S+ b" is reduced to the alternatives "a|b".
    precompiled_list = [re.sub(r" (\\S\+ )+", "|", k) for k in normalized_words]
    compiled_list = re.compile('(?:%s)' % '|'.join(precompiled_list), re.IGNORECASE)
    results = []
    counted = 0
    for text in texts:
        text_id = text["id"]
        tokens = tokenize_text(text["text"])
        ngrams = generate_ngrams(tokens, distance)
        numbers = len(ngrams)
        positions = [i for i, s in enumerate(ngrams) if all(p.search(s) for p in search_patterns)]
        grouped_positions = [list(group) for group in mit.consecutive_groups(positions)]
        entry_results = []
        for entry in grouped_positions:
            # only the first entry to avoid overlap, alternative merge ngrams back to string
            index = entry[0]
            # colorize search terms
            emph_result = [
                stylize(word, colored.attr("bold")) if compiled_list.search(word) else word
                for word in tokens[index:index+distance]
            ]
            result_text = " ".join(emph_result)
            # The words directly before and after the window. Slices clamp at
            # the passage boundaries, whereas ngrams[index-distance] would
            # wrap around to the end of the passage for hits near the start.
            result_context_before = " ".join(tokens[max(0, index-distance):index])
            result_context_after = " ".join(tokens[index+distance:index+2*distance])
            result = "Position "+str(index)+"/"+str(numbers)+": "+result_context_before+" "+result_text+" "+result_context_after
            entry_results.append(result)
            counted = counted+1
        results.append({"id": text_id, "results": entry_results})
    print("Searched for "+" and ".join(list_of_words)+" within "+str(distance)+"-grams and found "+str(counted)+" results:")
    for entry in results:
        if entry["results"]:
            print("===")
            print(stylize(entry["id"], colored.attr("bold")))
            for x in entry["results"]:
                print("---")
                print(x)

# Search

## Examples for path
- `~/Dokumente/projekte/First1KGreek/data/*/*/*.xml`
- `~/Dokumente/projekte/pta_data/data/*/*/*.xml`

## Examples for search expressions
- Search for words: `["[ἡἥ]λ[ίι].{1,2}$","οὐραν.*","καὶ"]`
- Search for these words in this order with exactly two arbitrary words in between: `["καὶ \S+ \S+ οὐραν.*"]`

## Load corpus to be searched

This needs to be run only once, before the first query.

In [156]:
# Build the module-level list `texts` that search_word and search_words iterate over.
texts = convert_grcpta('~/Dokumente/projekte/pta_data/data/*/*/*.xml')

## Query

In [158]:
# Both patterns must match inside the same window of up to 10 words. Each
# pattern is searched in the whole window string, so `$` anchors at the end
# of the window, not at the end of an arbitrary word.
search_words(["[ἡἥ]λ[ίι].{1,2}$","οὐραν.*"],10)

Searched for [ἡἥ]λ[ίι].{1,2}$ and οὐραν.* within 10-grams and found 21 results:
===
[1mpta0001.pta010.pta-grc1:1.2[0m
---
Position 403/625: τοῖς ἐξελθοῦσιν ἐκ τῆς Αἰγύπτου τοῖς τὴν Αἰγυπτιακὴν πλάνην μαθοῦσιν τοῖς τὰ περὶ τὸν [1mοὐρανὸν[0m καὶ τὴν γῆν προσκυνοῦσιν [1mἥλιον[0m καὶ σελήνην καὶ ἄστρα ποταμοὺς καὶ πηγὰς καὶ ὕδατα Κατέλιπεν
===
[1mpta0001.pta010.pta-grc1:2.3[0m
---
Position 454/618: τῶν οὐρανῶν Πρόσεχε λοιπὸν τὴν σοφίαν τοῦ δημιουργοῦ Κρυσταλλώδης ἦν ὁ [1mοὐρανὸς[0m ἀπὸ ὑδάτων παγείς Ἐπειδὴ γὰρ ἔμελλε δέχεσθαι [1mἡλίου[0m φλόγα καὶ σελήνης καὶ ἄστρων ἄπειρα πλήθη καὶ εἶναι ὅλος
===
[1mpta0001.pta010.pta-grc1:2.4[0m
---
Position 73/642: παρακαλῶ καὶ ἑτέραν χρείαν Τὰ ὕδατα ἐπάνω τῶν οὐρανῶν οὐ μόνον συντηρεῖ τὸν [1mοὐρανόν[0m ἀλλὰ καὶ τὴν φλόγα τοῦ [1mἡλίου[0m καὶ τῆς σελήνης καταπέμπει Εἰ γὰρ ἦν διαφανὴς ὁ οὐρανός
===
[1mpta0001.pta010.pta-grc1:3.2[0m
---
Position 156/691: οὐρανόν Διὰ τί δὲ προγενεστέραν ποιεῖ τὴν διακόσμησιν τῆς γῆς τοῦ [1mοὐρανοῦ[0m Δ

In [130]:
# search_word takes only the pattern and the context width; the corpus comes
# from the module-level `texts` loaded beforehand.
search_word("[ἡἥ]λ[ίι].{1,2}$", 10)

[1mpta0001.pta003.pta-grc1:19[0m (Ergebnis nr. 1 an Position 95): ἐπὶ τῶν ἀποστόλων Αἷμα καὶ πῦρ καὶ ἀτμίδα καπνοῦ Ὁ [38;5;4mἥλιος[0m μεταστραφήσεται εἰς σκότος καὶ ἡ σελήνη εἰς αἷμα πρὶν ἐλθεῖν
[1mpta0001.pta004.pta-grc1:10[0m (Ergebnis nr. 1 an Position 412): θεοῦ Ποῦ ἤκουσας ἐν τῷ εὐαγγελίῳ Ἰησοῦ Χριστοῦ ὅτι ὁ [38;5;4mἥλιος[0m καὶ ἡ σελήνη δημιουργοί εἰσι ποῦ εἶπεν ὁ Χριστὸς ὅτι
[1mpta0001.pta006.pta-grc1:6[0m (Ergebnis nr. 1 an Position 563): καὶ ἔστι ταπεινὴ καὶ εὐμεταχείριστος καὶ εὐάλωτος ὀλίγον δὲ θαλφθεῖσα [38;5;4mἡλίῳ[0m ἀνίπταται ἡ ἐπὶ τόπῳ μὴ δυναμένη πρότερον ἐπαίρεσθαι εἰς τὸν
[1mpta0001.pta006.pta-grc1:6[0m (Ergebnis nr. 2 an Position 624): ἐξήλατο ὡς ἀττέλεβος ὁ ἐπίμικτός σου ὡς ἀκρίς Ἀνέτειλεν ὁ [38;5;4mἥλιος[0m καὶ ἐξεπετάσθη καὶ ἐπελάθετο τοῦ τόπου αὐτῆς Τὸν λαὸν λέγει
[1mpta0001.pta006.pta-grc1:6[0m (Ergebnis nr. 3 an Position 650): καὶ ὡς ἀκρὶς ἐπιβεβηκυῖα φραγμῷ ἐν ἡμέρᾳ παγετοῦ Ἀνέτειλεν ὁ [38;5;4mἥλιος[0m καὶ ἐξεπετάσθη καὶ οὐκ ἐπέγνω τὸν τόπ

In [11]:
# NOTE(review): this call passes three arguments, but get_broader_context is
# defined in this file as (files_path, urn, position, context_width). It looks
# like a leftover from an earlier version of the function -- confirm and
# update the arguments before re-running this cell.
context = get_broader_context("pta0001.pta003.pta-grc1.xml",4881,100)
print(context)

σάρκα Αὕτη ἡ ἐπαγγελία τοῦ ΠατρόςἈλλὰ πόθεν δῆλον ὅτι ἐκ προσώπου τοῦ Πατρὸς ἦν ἡ ἐπαγγελία Ἐπειδὴ οἶδα καὶ τὸν Υἱὸν λαλοῦντα ἐν τοῖς προφήταις πόθεν τοῦτο εἰ ὁ Πατὴρ ἦν ὁ λέγων Ἐκχεῶ ἀπὸ τοῦ Πνεύματός μου ἐπὶ πᾶσαν σάρκα καὶ προφητεύσουσιν οἱ υἱοὶ ὑμῶν καὶ αἱ θυγατέρες ὑμῶν Καὶ δώσω σημεῖα ἐν τῷ οὐρανῷ ἄνω καὶ ἐν τῇ γῇ κάτω αἷμα καὶ πῦρ καὶ ἀτμίδα καπνοῦ Εἶπον πολλάκις αἷμα τὸ ἐκ τῆς πλευρᾶς σημεῖον γὰρ ἦν ἀπὸ νεκροῦ σώματος αἷμα προερχόμενον ἰδεῖν αἷμα τὸ ἀπὸ πλευρᾶς πῦρ τὸ ἐπὶ τῶν ἀποστόλων Αἷμα καὶ πῦρ καὶ ἀτμίδα καπνοῦ Ὁ [38;5;4mἥλιος[0m μεταστραφήσεται εἰς σκότος καὶ ἡ σελήνη εἰς αἷμα πρὶν ἐλθεῖν τὴν ἡμέραν Κυρίου τὴν μεγάλην καὶ ἐπιφανῆ Ὅρα τὸν λαλοῦντα Θεὸν Πατέρα πῶς τὸν Υἱὸν ἑρμηνεύει λέγων Πρὶν ἐλθεῖν τὴν ἡμέραν Κυρίου καὶ οὐ λέγει Τὴν ἡμέραν τὴν ἐμήν Εἰ γὰρ σὺ δίδως σημεῖα πῶς εἰς ἄλλον μεταφέρεις τὴν ἔννοιαν λέγων Πρὶν ἐλθεῖν τὴν ἡμέραν Κυρίου τὴν μεγάλην καὶ ἐπιφανῆ Πρόσεχε Ἴσως ἐρεῖ τις τὸν προφήτην λέγειν ἐκ προσώπου ἰδίου Πρὶν ἐλθεῖν τὴν ἡμέραν Ἀλλ οὐκ ἦν εἰπεῖν τὸν 