In [1]:
from cltk import NLP
from cltk.data.fetch import FetchCorpus

### Download Tesserae Corpus

In [2]:
corpus_downloader = FetchCorpus(language="grc")
corpus_downloader.import_corpus("grc_text_tesserae")
cltk_nlp = NLP(language='grc')

‎𐤀 CLTK version '1.0.6'.
Pipeline for language 'Ancient Greek' (ISO: 'grc'): `GreekNormalizeProcess`, `GreekStanzaProcess`, `GreekEmbeddingsProcess`, `StopsProcess`, `GreekNERProcess`.


### Paul's Epistles

In [3]:
paul = [
    # undisputed
    'new_testament.i_thessalonians',
    'new_testament.galatians',
    'new_testament.i_corinthians',
    'new_testament.philippians',
    'new_testament.philemon',
    'new_testament.ii_corinthians',
    'new_testament.romans',
    
    # undecided
    'new_testament.colossians',
    'new_testament.ii_thessalonians',
    
    # disputed
    'new_testament.ephesians',
    'new_testament.i_timothy',
    'new_testament.ii_timothy',
    'new_testament.titus',
    
    # refuted
    'new_testament.hebrews',
]

### Get Corpus Directory From Local .env File

In [4]:
import os
import re
from dotenv import load_dotenv
from tqdm.notebook import tqdm
from collections import defaultdict as dd

In [5]:
load_dotenv()

DATA_DIR = os.getenv('DATA_DIR')
files = os.listdir(DATA_DIR)

### Read Texts

In [6]:
keys = []
texts = dd(list)
authors = dd(list)

In [7]:
def read_text(file):
    filepath = '{}/{}'.format(DATA_DIR, file)
    text = re.sub('<[^<]+>', "", open(filepath, encoding="utf8").read())
    return re.sub('\n', "", text)

In [8]:
for file in tqdm(files):
    key = os.path.splitext(file)[0]
    author = key.split('.')[0]
    authors[author].append(key)
    text = read_text(file)
    texts[key] = text

  0%|          | 0/821 [00:00<?, ?it/s]

### Annotate Docs

In [9]:
docs = dd(list)
for key in tqdm(paul[:1]):
    docs[key] = cltk_nlp.analyze(text=texts[key])

  0%|          | 0/1 [00:00<?, ?it/s]

  return torch._C._cuda_getDeviceCount() > 0


### Normalize Doc Features

In [10]:
def get_features(doc):
    features = {
        'word_freq': dd(int),
        'lemma_freq': dd(int),
        'governor_freq': dd(int),
        'upos_freq': dd(int),
        'pos_freq': dd(int),
        'xpos_freq': dd(int),
        'dependancy_freq': dd(int),
    }
    
    for word in doc:
        features['word_freq'][word.string] += 1
        features['lemma_freq'][word.lemma] += 1
        features['governor_freq'][word.governor] += 1
        features['upos_freq'][word.upos] += 1
        features['pos_freq'][str(word.pos)] += 1
        features['xpos_freq'][word.xpos] += 1
        features['dependancy_freq'][word.dependency_relation] += 1
    
    return features

### Extract Features From Docs

In [11]:
data = {}

for doc in docs:
    features = get_features(docs[doc])
    for key in features:
        features[key] = sorted(features[key].items(), key=lambda item: item[1])
    data[doc] = features

### Suffix Tree

In [12]:
from pystlm.stlm import STLM
from pystlm.suffixtree import SuffixTree
from pystlm.sequence import Sequence

trie = SuffixTree()

doc = docs[paul[0]]

tags = []

for word in doc:
    tags.append(word.upos)
    trie.add(word.upos)

trie.update_all_counts()
stlm = STLM(trie)
seq = Sequence()

for t in list(vocabulary):
    seq.push_back(t)

NameError: name 'vocabulary' is not defined

In [None]:
from collections import Counter

sentences = doc.sentences
sentence_tags = []

start_counter = Counter()
end_counter = Counter()

for sent in sentences:
    if len(sent) < 2: continue
    pos_tags = []
    for word in sent:
        pos_tags.append(word.upos)
    sentence_tags.append(pos_tags)
    
    start_counter[pos_tags[0]] += 1
    end_counter[pos_tags[len(pos_tags)-1]] +=1