# Dateien von TLG-E laden

> Voraussetzung: CLTK läuft nur mit Python bis einschließlich Version 3.9.

Briefe von Basilius von Caesarea, Gregor von Nazianz, Gregor von Nyssa, Libanius, Synesius, Julian und Theodoret von Kyrrhos

In [13]:
import re, os, unicodedata
from cltk.data.fetch import FetchCorpus

In [1]:
# Import the local TLG copy as a CLTK corpus for Ancient Greek ("grc").
# NOTE(review): the source path is machine-specific (a Windows drive mounted
# under /mnt/c) and has to be adapted on every other machine.
# Presumably the import makes the files available under
# ~/cltk_data/originals/tlg, which the conversion step reads from — confirm.
corpus_downloader = FetchCorpus(language="grc")
corpus_downloader.import_corpus('tlg', '/mnt/c/Users/a_v_s/tlg/')

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Conversion to Unicode.
# The folder "grc/works" must already exist.
from cltk.corpora.grc.tlg.tlgu import TLGU
# Files to be converted (TLG author number, then the work numbers of the letters):
#   Basil of Caesarea    = 2040, letters: 004
#   Gregory of Nazianzus = 2022, letters: 001
#   Gregory of Nyssa     = 2017, letters: 033
#   Theodoret            = 4089, letters: 005, 006, 007
#   Libanius             = 2200, letters: 001
#   Synesius             = 2006, letters: 001
#   Julian               = 2003, letters: 013
filesA = ["2040","2022","2017","2200"]
filesB = ["4089","2006","2003"]
t = TLGU()
# Theodoret, Synesius and Julian use a different numbering scheme, which is
# why the second group is converted with a different extra argument.
for group, flag in ((filesA, 'X'), (filesB, 'Y')):
    for file in group:
        t.convert(
            f"~/cltk_data/originals/tlg/tlg{file}.txt",
            f"~/cltk_data/grc/works/tlg{file}",
            divide_works=True,
            extra_args=[flag],
        )

In [89]:
# Bereinigung der Dateien
# Hyphenation und Sonderzeichen (—,(),<>,〈〉,[],†) löschen, Apostroph korrigieren
# Normalisierung nach Unicode NFC
# Ordner "works-clean" muss bereits vorhanden sein

def remove_sonderzeichen(inputText):
    """Remove editorial special characters and collapse whitespace runs.

    Brackets of all kinds and the dagger are deleted, every em dash becomes a
    space, and finally each run of two or more whitespace characters
    (newlines included) is replaced by a single space.
    """
    without_marks = re.sub(r'[()<>〈〉\[\]†]+', '', inputText)
    dashes_as_spaces = without_marks.replace('—', ' ')
    # Collapse the gaps left behind by the removals above.
    return re.sub(r'\s{2,}', ' ', dashes_as_spaces)
def correct_apostrophe(inputText):
    """Return *inputText* with every ASCII apostrophe (U+0027) replaced by U+2019."""
    # U+2019 is the correct apostrophe for the texts, U+0027 is not.
    return inputText.replace("'", "’")
def reflow(infile, outfile):
    """Clean a converted text file and rejoin words hyphenated across lines.

    Every line of *infile* is normalised to Unicode NFC, passed through
    ``remove_sonderzeichen`` and ``correct_apostrophe`` and stripped of its
    trailing newline and trailing spaces.  If the cleaned line ends in "-",
    its last space-separated token is held back (without the hyphen) and
    prepended to the following line.  Each output line is terminated by a
    space followed by a newline.

    Both files are read and written as UTF-8 so that the result does not
    depend on the platform's default encoding.

    Adapted from https://stackoverflow.com/a/71025588

    Args:
        infile: Path of the text file to clean.
        outfile: Path of the file to write; its folder must already exist.
    """
    with open(infile, encoding="utf-8") as source, \
            open(outfile, "w", encoding="utf-8") as dest:
        holdover = ""
        for line in source:
            line = correct_apostrophe(remove_sonderzeichen(unicodedata.normalize("NFC", line)))
            line = line.rstrip("\n").rstrip(" ")
            if line.endswith("-"):
                # Hold back the hyphenated fragment for the next line.
                lin, _, fragment = line.rpartition(" ")
            else:
                lin, fragment = line, ""
            dest.write(f"{holdover}{lin} \n")
            # Drop the trailing hyphen; an empty fragment stays empty.
            holdover = fragment[:-1]
        if holdover:
            # The last line ended in a hyphen: keep the fragment instead of
            # silently losing it.
            dest.write(f"{holdover} \n")

# File stems of the converted works that are cleaned with reflow().
files = [
    "tlg2040-004",
    "tlg2022-001",
    "tlg2017-033",
    "tlg4089-005",
    "tlg4089-006",
    "tlg4089-007",
    "tlg2200-001",
    "tlg2006-001",
    "tlg2003-013",
]

for file in files:
    source_path = os.path.expanduser(f"~/cltk_data/grc/works/{file}.txt")
    target_path = os.path.expanduser(f"~/cltk_data/grc/works-clean/{file}.txt")
    reflow(source_path, target_path)

## Auszug der einzelnen Briefe

In [92]:
def split_into_letters(infile, outfolder):
    """Split a cleaned work file into one text file per letter.

    The file content is split at every newline that is directly followed by a
    digit; the chunk before the first such newline is discarded.  Each
    remaining chunk is written to ``<outfolder>/<stem>_ep<NNNN>.txt``, where
    ``<stem>`` is the file name of *infile* up to ``.txt`` and ``NNNN`` is the
    first number found in the chunk, zero-padded to four digits.  Chunks that
    yield the same number overwrite each other.

    *outfolder* is created if it does not exist yet.  The working directory
    of the process is left untouched, so relative *infile* paths keep working
    across repeated calls.

    Finally *infile* and the number of the last letter written are printed
    (``None`` if the file contained no letter).

    Args:
        infile: Path of the cleaned work file ("/"-separated).
        outfolder: Folder that receives the individual letter files.
    """
    os.makedirs(outfolder, exist_ok=True)
    with open(infile, encoding="utf-8") as source:
        contents = source.read()
    letters = re.split(r"\n(?=\d+)", contents)[1:]
    fileprefix = infile.split("/")[-1].split(".txt")[0]
    counter = None  # stays None when no letter is found
    for letter in letters:
        counter = int(re.search(r"\d+", letter).group())
        target = os.path.join(outfolder, f"{fileprefix}_ep{counter:04d}.txt")
        with open(target, "w", encoding="utf-8") as dest:
            dest.write(letter)
    print(infile, counter)

In [93]:
# File stems of the cleaned works that are split into individual letters.
files = [
    "tlg2040-004",
    "tlg2022-001",
    "tlg2017-033",
    "tlg4089-005",
    "tlg4089-006",
    "tlg4089-007",
    "tlg2200-001",
    "tlg2006-001",
    "tlg2003-013",
]

for file in files:
    cleaned_path = os.path.expanduser(f"~/cltk_data/grc/works-clean/{file}.txt")
    letters_dir = os.path.expanduser("~/cltk_data/grc/letters")
    split_into_letters(cleaned_path, letters_dir)

/home/stockhausen/cltk_data/grc/works-clean/tlg2040-004.txt 366
/home/stockhausen/cltk_data/grc/works-clean/tlg2022-001.txt 249
/home/stockhausen/cltk_data/grc/works-clean/tlg2017-033.txt 5
/home/stockhausen/cltk_data/grc/works-clean/tlg4089-005.txt 52
/home/stockhausen/cltk_data/grc/works-clean/tlg4089-006.txt 95
/home/stockhausen/cltk_data/grc/works-clean/tlg4089-007.txt 147
/home/stockhausen/cltk_data/grc/works-clean/tlg2200-001.txt 1544
/home/stockhausen/cltk_data/grc/works-clean/tlg2006-001.txt 159
/home/stockhausen/cltk_data/grc/works-clean/tlg2003-013.txt 157


## Statistik

In [102]:
def letters_statistics(infile):
    """Collect per-letter statistics for one cleaned work file.

    The file content is split at every newline that is directly followed by a
    digit; the chunk before the first such newline is discarded.  For each
    remaining chunk (one letter) a dict is produced with the keys

    * ``author``   – file name of *infile* up to ``.txt``,
    * ``no``       – first number found in the letter, zero-padded to four
      digits,
    * ``adressee`` – text between the first pair of curly braces, or ``""``
      if the letter has none (the key spelling is kept because the CSV export
      uses it as a column name),
    * ``words``    – number of whitespace-separated tokens in the letter,
      including the leading letter number and the addressee text.

    Args:
        infile: Path of the cleaned work file ("/"-separated), read as UTF-8.

    Returns:
        A list with one statistics dict per letter, in file order.
    """
    with open(infile, encoding="utf-8") as source:
        contents = source.read()
    letters = re.split(r"\n(?=\d+)", contents)[1:]
    author = infile.split("/")[-1].split(".txt")[0]
    letter_stat = []
    for letter in letters:
        counter = int(re.search(r"\d+", letter).group())
        # The addressee may span several lines, hence [\s\S] instead of ".".
        addressee_match = re.search(r"\{([\s\S]+?)\}", letter)
        letter_stat.append({
            "author": author,
            "no": f"{counter:04d}",
            "adressee": addressee_match.group(1) if addressee_match else "",
            # Count runs of non-whitespace, not single whitespace characters.
            "words": len(re.findall(r"\S+", letter)),
        })
    return letter_stat

In [117]:
import csv
files = ["tlg2040-004","tlg2022-001","tlg2017-033","tlg4089-005","tlg4089-006","tlg4089-007", "tlg2200-001", "tlg2006-001", "tlg2003-013"]

# Gather the statistics of all works into one flat list of row dicts.
statistics = []
for file in files:
    stat = letters_statistics(os.path.expanduser("~/cltk_data/grc/works-clean/"+file+".txt"))
    statistics.extend(stat)

# newline="" lets the csv module control line endings itself (required for
# correct quoting of fields with embedded newlines); UTF-8 is set explicitly
# so the Greek addressee names do not depend on the platform default.
# The target folder must already exist.
with open(os.path.expanduser("~/Workshop-PAG/statistics/letters_statistics.csv"), "w", newline="", encoding="utf-8") as statfile:
    writer = csv.DictWriter(statfile, fieldnames=['author', 'no', 'adressee', 'words'])
    writer.writeheader()
    writer.writerows(statistics)