In [1]:
from cltk import NLP
from cltk.data.fetch import FetchCorpus

### Download Tesserae Corpus

In [2]:
# Fetch the "grc_text_tesserae" corpus through CLTK's corpus fetcher for
# Ancient Greek (language code "grc").
corpus_downloader = FetchCorpus(language="grc")
corpus_downloader.import_corpus("grc_text_tesserae")
# Build the CLTK NLP pipeline for Ancient Greek. Instantiating it prints the
# version banner and the list of pipeline processes shown in this cell's output.
cltk_nlp = NLP(language='grc')

‚Äéê§Ä CLTK version '1.0.6'.
Pipeline for language 'Ancient Greek' (ISO: 'grc'): `GreekNormalizeProcess`, `GreekStanzaProcess`, `GreekEmbeddingsProcess`, `StopsProcess`, `GreekNERProcess`.


### Paul's Epistles

In [3]:
# Text keys of the New Testament letters associated with Paul. Each entry is
# used below as a key into `texts`, i.e. a corpus file name without extension.
# The entries are grouped by the authorship labels given in the comments.
# Order matters: later cells select the first entry via paul[:1] and paul[0].
paul = [
    # undisputed
    'new_testament.i_thessalonians',
    'new_testament.galatians',
    'new_testament.i_corinthians',
    'new_testament.philippians',
    'new_testament.philemon',
    'new_testament.ii_corinthians',
    'new_testament.romans',
    
    # undecided
    'new_testament.colossians',
    'new_testament.ii_thessalonians',
    
    # disputed
    'new_testament.ephesians',
    'new_testament.i_timothy',
    'new_testament.ii_timothy',
    'new_testament.titus',
    
    # refuted
    'new_testament.hebrews',
]

### Get Corpus Directory From Local .env File

In [4]:
import os
import re
from dotenv import load_dotenv
from tqdm.notebook import tqdm
from collections import defaultdict as dd

In [5]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# Directory holding the corpus text files, taken from the DATA_DIR variable.
DATA_DIR = os.getenv('DATA_DIR')
if not DATA_DIR:
    # os.getenv returns None when the variable is missing, and os.listdir(None)
    # silently lists the current working directory. Every later cell would
    # then read the wrong files, so fail loudly here instead.
    raise RuntimeError(
        "DATA_DIR is not set; define it in your .env file or environment "
        "so that it points at the directory containing the corpus text files."
    )

# os.listdir returns entries in arbitrary order; sort them so the processing
# order is reproducible across runs and platforms.
files = sorted(os.listdir(DATA_DIR))

### Read Texts

In [6]:
# Containers filled in by the cells below.
authors = dd(list)  # author prefix -> list of text keys
texts = dd(list)    # text key -> cleaned text (assigned in the read loop)
keys = []           # not referenced by any other cell shown in this notebook

In [7]:
def read_text(file):
    """Return the contents of one corpus file with markup and newlines removed.

    Args:
        file: File name relative to ``DATA_DIR``.

    Returns:
        The file's UTF-8 text with every ``<...>`` tag and every newline
        character deleted. All other characters (including tabs and spaces)
        are left untouched.
    """
    filepath = os.path.join(DATA_DIR, file)
    # The context manager closes the handle as soon as the file is read; this
    # function is called once per corpus file, so unclosed handles would
    # otherwise pile up until garbage collection.
    with open(filepath, encoding="utf8") as handle:
        raw = handle.read()
    # Excluding '>' as well as '<' inside the tag keeps the match from running
    # past the tag's own closing bracket and swallowing real text up to a
    # later stray '>'.
    text = re.sub('<[^<>]+>', "", raw)
    return text.replace('\n', "")

In [8]:
# Index every corpus file: `texts` maps a text key to its cleaned contents and
# `authors` maps an author prefix to the keys of that author's texts.
for file in tqdm(files):
    # The key is the file name without its extension.
    key = os.path.splitext(file)[0]
    # Everything before the first '.' of the key is used as the author prefix.
    author = key.split('.')[0]
    # Skip keys already recorded so that re-running this cell does not append
    # the same key to an author's list a second time.
    if key not in authors[author]:
        authors[author].append(key)
    text = read_text(file)
    texts[key] = text

  0%|          | 0/821 [00:00<?, ?it/s]

### Annotate Docs

In [9]:
# Run the CLTK pipeline over the selected texts; `docs` maps a text key to the
# object returned by cltk_nlp.analyze().
docs = dd(list)
# Only the first entry of `paul` is analyzed here (paul[:1]).
for key in tqdm(paul[:1]):
    # `texts` is a defaultdict(list): indexing a missing key would silently
    # create an empty entry and pass [] to the pipeline. Report the missing
    # text explicitly instead.
    if key not in texts:
        raise KeyError(
            "no corpus text loaded for {!r}; check the contents of DATA_DIR".format(key)
        )
    docs[key] = cltk_nlp.analyze(text=texts[key])

  0%|          | 0/1 [00:00<?, ?it/s]

  return torch._C._cuda_getDeviceCount() > 0


In [10]:
# Bare last expression: the notebook renders the repr of `docs` as the cell
# output. The repr includes every word with its embedding array, so it is long.
docs

defaultdict(list,
            {'new_testament.i_thessalonians': Doc(language='grc', words=[Word(index_char_start=None, index_char_stop=None, index_token=0, index_sentence=0, string='Œ†ŒëŒ•ŒõŒüœÇ', pos=proper_noun, lemma='Œ†ŒëŒ•ŒõŒüœÇ', stem=None, scansion=None, xpos='Ne', upos='PROPN', dependency_relation='nsubj', governor=11, features={Case: [nominative], Gender: [masculine], Number: [singular]}, category={F: [neg], N: [pos], V: [neg]}, embedding=array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
                    0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
                    0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
                    0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
                    0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
                    0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), stop=False, named_entity=False, syllabl

### Normalize Doc Features

In [11]:
def get_features(doc):
    """Tally how often each value of several word attributes occurs in ``doc``.

    Args:
        doc: Iterable of word objects exposing ``string``, ``lemma``,
            ``governor``, ``upos``, ``pos``, ``xpos`` and
            ``dependency_relation`` attributes.

    Returns:
        A dict with one ``defaultdict(int)`` per feature name, each mapping an
        observed attribute value to its number of occurrences. ``pos`` values
        are counted by their ``str()`` form.
    """
    # (feature name, how to read the counted value off a word), in output order.
    extractors = (
        ('word_freq', lambda word: word.string),
        ('lemma_freq', lambda word: word.lemma),
        ('governor_freq', lambda word: word.governor),
        ('upos_freq', lambda word: word.upos),
        ('pos_freq', lambda word: str(word.pos)),
        ('xpos_freq', lambda word: word.xpos),
        ('dependancy_freq', lambda word: word.dependency_relation),
    )

    counts = {name: dd(int) for name, _ in extractors}

    for token in doc:
        for name, extract in extractors:
            counts[name][extract(token)] += 1

    return counts

### Extract Features From Docs

In [12]:
# For every annotated document, convert each frequency table returned by
# get_features() into a list of (value, count) pairs sorted by ascending count.
data = {
    text_key: {
        feature_name: sorted(counts.items(), key=lambda pair: pair[1])
        for feature_name, counts in get_features(document).items()
    }
    for text_key, document in docs.items()
}

In [13]:
# Bare last expression: the notebook renders `data` (per-text feature lists,
# each sorted by ascending count) as the cell output.
data

{'new_testament.i_thessalonians': {'word_freq': [('Œ†ŒëŒ•ŒõŒüœÇ', 1),
   ('ŒöŒëŒô Œ£ŒôŒõŒüŒ•ŒëŒùŒüœÇ', 1),
   ('ŒöŒëŒô Œ§ŒôŒúŒüŒòŒïŒüœÇ', 1),
   ('·ºêŒ∫Œ∫ŒªŒ∑œÉ·Ω∑·æ≥', 1),
   ('ŒòŒµœÉœÉŒ±ŒªŒøŒΩŒπŒ∫·Ω≥œâŒΩ', 1),
   ('œÄŒ±œÑœÅ·Ω∂', 1),
   ('ŒßœÅŒπœÉœÑ·ø∑:', 1),
   ('Œµ·º∞œÅ·ΩµŒΩŒ∑.', 1),
   ('Œï·ΩêœáŒ±œÅŒπœÉœÑŒø·ø¶ŒºŒµŒΩ', 1),
   ('œÄŒøŒπŒø·ΩªŒºŒµŒΩŒøŒπ', 1),
   ('œÄœÅŒøœÉŒµœÖœá·ø∂ŒΩ', 1),
   ('ŒºŒΩŒ∑ŒºŒøŒΩŒµ·ΩªŒøŒΩœÑŒµœÇ', 1),
   ('·ºîœÅŒ≥ŒøœÖ', 1),
   ('Œ∫·ΩπœÄŒøœÖ', 1),
   ('·ΩëœÄŒøŒºŒøŒΩ·øÜœÇ', 1),
   ('·ºêŒªœÄ·Ω∑Œ¥ŒøœÇ', 1),
   ('Œµ·º∞Œ¥·ΩπœÑŒµœÇ,', 1),
   ('·ºÄŒ¥ŒµŒªœÜŒø·Ω∂', 1),
   ('·º†Œ≥Œ±œÄŒ∑Œº·Ω≥ŒΩŒøŒπ', 1),
   ('[œÑŒø·ø¶]', 1),
   ('·ºêŒ∫ŒªŒøŒ≥·Ω¥ŒΩ', 1),
   ('·ºêŒ≥ŒµŒΩ·ΩµŒ∏Œ∑', 1),
   ('Œ¥œÖŒΩ·Ω±ŒºŒµŒπ', 1),
   ('œÄŒΩŒµ·ΩªŒºŒ±œÑŒπ', 1),
   ('·ºÅŒ≥·Ω∑·ø≥', 1),
   ('œÄŒªŒ∑œÅŒøœÜŒøœÅ·Ω∑·æ≥', 1),
   ('œÄŒøŒªŒª·øá,', 1),
   ('Œø·º∑ŒøŒπ', 1),
   ('·ºêŒ≥ŒµŒΩ·ΩµŒ∏Œ∑œÑŒµ', 1),
   ('Œ¥ŒµŒæ·Ω±ŒºŒµŒΩŒøŒπ', 1),
   ('œáŒ±œÅ·æ∂œÇ', 1),
   ('œÄŒΩŒµ·ΩªŒºŒ±œÑŒøœÇ', 1),
   ('·ºÅŒ≥·Ω∑ŒøœÖ,', 1

### Suffix Tree

In [14]:
from pystlm.stlm import STLM
from pystlm.suffixtree import SuffixTree
from pystlm.sequence import Sequence

# Start from an empty suffix tree.
trie = SuffixTree()

# Annotated document for the first entry of `paul`.
doc = docs[paul[0]]

# Add the document's tokens to the tree one at a time, in document order.
for w in doc.tokens:
    trie.add(w)

# NOTE(review): presumably recomputes the tree's counts after all insertions;
# confirm against the pystlm documentation.
trie.update_all_counts()
# Model object constructed on top of the populated tree.
stlm = STLM(trie)
seq = Sequence()

# Query sequence: the first three tokens of the same document.
for t in doc.tokens[:3]:
    seq.push_back(t)

# Bare last expression: the value of stlm.prob(seq) is shown as the cell output
# (presumably the probability the model assigns to `seq`; confirm with pystlm).
stlm.prob(seq)

5.106335513765347e-07