In [1]:
from cltk import NLP
from cltk.data.fetch import FetchCorpus

### Download Tesserae Corpus

In [2]:
corpus_downloader = FetchCorpus(language="grc")
corpus_downloader.import_corpus("grc_text_tesserae")
cltk_nlp = NLP(language='grc')

‎𐤀 CLTK version '1.0.11'.
Pipeline for language 'Ancient Greek' (ISO: 'grc'): `GreekNormalizeProcess`, `GreekStanzaProcess`, `GreekEmbeddingsProcess`, `StopsProcess`, `GreekNERProcess`.


### Paul's Epistles

In [3]:
paul = [
    # undisputed
    'new_testament.i_thessalonians',
    'new_testament.galatians',
    'new_testament.i_corinthinians',
    'new_testament.philippians',
    'new_testament.philemon',
    'new_testament.ii_corinthinians',
    'new_testament.romans',
    
    # undecided
    'new_testament.colossians',
    'new_testament.ii_thessalonians',
    
    # disputed
    'new_testament.ephesians',
    'new_testament.i_timothy',
    'new_testament.ii_timothy',
    'new_testament.titus',
    
    # refuted
    'new_testament.hebrews',
]

In [4]:
not_paul = [
   'new_testament.mathew', 
   'new_testament.mark', 
   'new_testament.luke',
   'new_testament.john', 
   'new_testament.revelation', 
   'clement.exhortation', 
   'clement.protrepticus', 
   'demosthenes.letters', 
   'euripides.electra', 
   'plutarch.romulus', 
   'aeschylus.seven_against_thebes', 
   'appian.civil_wars.part.1', 
   'aristophanes.lysistrata', 
   'aristotle.nicomachean_ethics', 
   'basil_of_caesarea.de_legendis',
   'flavius_josephus.antiquitates_judaicae.part.1', 
   'gregory_of_nazianzus.christus_patiens', 
   'herodotus.histories.part.7', 
   'homer.odyssey.part.16', 
   'new_testament.acts', 
   'plato.meno'
]

### Get Corpus Directory From Local .env File

In [5]:
import os
import re
from dotenv import load_dotenv
from tqdm.notebook import tqdm
from collections import defaultdict as dd

In [6]:
load_dotenv()

DATA_DIR = os.getenv('DATA_DIR')
files = os.listdir(DATA_DIR)

### Read Texts

In [7]:
texts = dd(list)
authors = dd(list)

In [8]:
def read_text(file):
    filepath = '{}/{}'.format(DATA_DIR, file)
    text = re.sub('<[^<]+>', "", open(filepath, encoding="utf8").read())
    return re.sub('\n', "", text)

In [9]:
for file in tqdm(files):
    key = os.path.splitext(file)[0]
    author = key.split('.')[0]
    authors[author].append(key)
    text = read_text(file)
    texts[key] = text

  0%|          | 0/821 [00:00<?, ?it/s]

### Annotate Docs

In [10]:
docs = dd(list)

for p in tqdm(paul):
    docs[p] = cltk_nlp.analyze(text=texts[p])

  0%|          | 0/14 [00:00<?, ?it/s]

In [11]:
for np in tqdm(not_paul):
    docs[np] = cltk_nlp.analyze(text=texts[np])

  0%|          | 0/21 [00:00<?, ?it/s]

[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]


In [69]:
save_obj(docs, "docs1")

In [70]:
docs = load_obj("docs1")

### Normalize Doc Features

In [12]:
def get_word(doc, i, n):
    if i > n - 1:
        return None
    return doc[i]

def get_bigram(W):
    if W[1] is None:
        return []
    
    w_gram = "{} {}".format(W[0].string, W[1].string)
    p_gram = "{} {}".format(W[0].upos, W[1].upos)
    
    return [w_gram, p_gram]

def get_trigram(W):
    if W[2] is None:
        return []
    
    w_gram = "{} {} {}".format(W[0].string, W[1].string, W[2].string)
    p_gram = "{} {} {}".format(W[0].upos, W[1].upos, W[2].upos)
    
    return [w_gram, p_gram]

def increment_features(F, V):
    for i,k in enumerate(F):
        if i > len(V) - 1:
            break
        F[k][V[i]] += 1
    return F

def get_percents(F, f, n):
    for k, v in F[f].items():
        F[f][k] = F[f][k] / n
    return F[f]

In [13]:
def get_features(doc):
    n = len(doc.words)
    
    features = {
        'word_bigram': dd(int),
        'word_trigram': dd(int),
        'pos_bigram': dd(int),
        'pos_trigram': dd(int)
    }

    for i, w in enumerate(doc):
        w2 = get_word(doc, i+1, n)
        w3 = get_word(doc, i+2, n)
        
        bigrams = get_bigram([w, w2])
        trigrams = get_trigram([w, w2, w3])
        n_grams = bigrams + trigrams
        features = increment_features(features, n_grams)
        
    for f in features:
        features[f] = get_percents(features, f, n)
    
    return features

### Extract Features From Docs

In [14]:
CATS = [0, 1]

data = {}

for doc in docs:
    features = get_features(docs[doc])
    
    for key in features:
        features[key] = sorted(features[key].items(), key=lambda item: item[1])
        
    data[doc] = features

### Split Data

In [15]:
import pandas as pd
import numpy as np 

def get_df(d, c):
    df = pd.DataFrame({'text': d, 'cat': c}, index=[0])
    for f in features:
        for x in data[d][f]:
            df[x[0]] = x[1]
    return df

def get_pos_df(D):
    df = pd.DataFrame()
    for d in tqdm(D):
        df = df.append(get_df(d, CATS[1]), ignore_index=True)
    df = df.fillna(0)
    return df

def get_neg_df(D, E):
    df = pd.DataFrame()
    for d in tqdm(D):
        if d not in E:
            df = df.append(get_df(d, CATS[0]), ignore_index=True)
    df = df.fillna(0)
    return df

In [28]:
pos_df = get_pos_df(paul[:6])
neg_df = get_neg_df(docs, paul[:6])

df = pos_df.append(neg_df, ignore_index=True)
df = df.fillna(0)
df

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

Unnamed: 0,text,cat,ΠΑΥΛΟς ΚΑΙ ΣΙΛΟΥΑΝΟς,ΚΑΙ ΣΙΛΟΥΑΝΟς ΚΑΙ ΤΙΜΟΘΕΟς,ΚΑΙ ΤΙΜΟΘΕΟς τῇ,τῇ ἐκκλησίᾳ,ἐκκλησίᾳ Θεσσαλονικέων,Θεσσαλονικέων ἐν,ἐν θεῷ,θεῷ πατρὶ,...,AUX X ADV,AUX INTJ ADJ,INTJ PROPN X,AUX ADJ INTJ,X ADP PRON,AUX NOUN INTJ,NUM PRON DET,SCONJ SCONJ AUX,X ADV INTJ,AUX ADV INTJ
0,new_testament.i_thessalonians,1,0.000679,0.000679,0.000679,0.000679,0.000679,0.000679,0.000679,0.000679,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,new_testament.galatians,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,new_testament.i_corinthinians,1,0.0,0.0,0.0,0.00044,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,new_testament.philippians,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,new_testament.philemon,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,new_testament.ii_corinthinians,1,0.0,0.0,0.0,0.000224,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,new_testament.romans,0,0.0,0.0,0.0,0.0,0.0,0.0,0.000141,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,new_testament.colossians,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001265,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,new_testament.ii_thessalonians,0,0.00122,0.00122,0.00122,0.00122,0.00122,0.00122,0.00122,0.00122,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,new_testament.ephesians,0,0.0,0.0,0.0,0.000413,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [63]:
# NOTE: Make sure to run the previous cell each time before you run this one, because this permanently drops data 
#       from the tables 

from sklearn import preprocessing

data_test = df.sample(frac=0.3, random_state=300)
data_train = df.drop(data_test.index)

train_y = data_train.cat.values
test_y = data_test.cat.values

data_train.drop(data_test.columns[[0, 1]], axis=1, inplace=True)
data_test.drop(data_test.columns[[0, 1]], axis=1, inplace=True)

train_X = np.array(data_train.values) 
test_X = np.array(data_test.values) 

print(train_X.shape, train_y.shape)
print(test_X.shape, test_y.shape)

print(train_y)
print(test_y)

(25, 559439) (25,)
(10, 559439) (10,)
[1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 1 0]


In [64]:
from sklearn.linear_model import LogisticRegression

logisticRegr = LogisticRegression(solver='liblinear')
logisticRegr.fit(train_X, train_y) 

LogisticRegression(solver='liblinear')

In [65]:
result = logisticRegr.predict(test_X)
result

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [66]:
logisticRegr.predict_proba(test_X)

array([[0.75444266, 0.24555734],
       [0.75691738, 0.24308262],
       [0.75526826, 0.24473174],
       [0.7548589 , 0.2451411 ],
       [0.75474748, 0.24525252],
       [0.75497662, 0.24502338],
       [0.75573902, 0.24426098],
       [0.75508164, 0.24491836],
       [0.7538342 , 0.2461658 ],
       [0.75602899, 0.24397101]])

In [67]:
accuracy_score = 0 

for i, label in enumerate(test_y):
    if result[i] == label:
        accuracy_score = accuracy_score + 1 
        
accuracy_score = accuracy_score / len(test_y)
accuracy_score

0.9

In [50]:
unknown_df = get_pos_df(paul[7:])
train_X = np.array(data_train.values)

  0%|          | 0/7 [00:00<?, ?it/s]

In [59]:
import tensorflow as tf
print("TensorFlow version: {}".format(tf.__version__))
print("Eager execution: {}".format(tf.executing_eagerly()))

TensorFlow version: 2.5.0-rc1
Eager execution: True


In [72]:
train_y_tf = train_y.reshape((-1,1))

model = tf.keras.Sequential([
    tf.keras.layers.Dense(units=300, activation='relu'),
    tf.keras.layers.Dense(units=600, activation='relu'),
    tf.keras.layers.Dense(units=300, activation='relu'),
    #tf.keras.layers.Dropout(.1),
    tf.keras.layers.Dense(units=len(CATS), activation='softmax')
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(train_X, train_y_tf, epochs=20) 

Epoch 1/20


ValueError: in user code:

    C:\Users\othm\Anaconda3\envs\greek\lib\site-packages\tensorflow\python\keras\engine\training.py:855 train_function  *
        return step_function(self, iterator)
    C:\Users\othm\Anaconda3\envs\greek\lib\site-packages\tensorflow\python\keras\engine\training.py:845 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    C:\Users\othm\Anaconda3\envs\greek\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:1285 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\othm\Anaconda3\envs\greek\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2833 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\othm\Anaconda3\envs\greek\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:3608 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\othm\Anaconda3\envs\greek\lib\site-packages\tensorflow\python\keras\engine\training.py:838 run_step  **
        outputs = model.train_step(data)
    C:\Users\othm\Anaconda3\envs\greek\lib\site-packages\tensorflow\python\keras\engine\training.py:796 train_step
        loss = self.compiled_loss(
    C:\Users\othm\Anaconda3\envs\greek\lib\site-packages\tensorflow\python\keras\engine\compile_utils.py:204 __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    C:\Users\othm\Anaconda3\envs\greek\lib\site-packages\tensorflow\python\keras\losses.py:155 __call__
        losses = call_fn(y_true, y_pred)
    C:\Users\othm\Anaconda3\envs\greek\lib\site-packages\tensorflow\python\keras\losses.py:259 call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    C:\Users\othm\Anaconda3\envs\greek\lib\site-packages\tensorflow\python\util\dispatch.py:206 wrapper
        return target(*args, **kwargs)
    C:\Users\othm\Anaconda3\envs\greek\lib\site-packages\tensorflow\python\keras\losses.py:1754 binary_crossentropy
        backend.binary_crossentropy(
    C:\Users\othm\Anaconda3\envs\greek\lib\site-packages\tensorflow\python\util\dispatch.py:206 wrapper
        return target(*args, **kwargs)
    C:\Users\othm\Anaconda3\envs\greek\lib\site-packages\tensorflow\python\keras\backend.py:5023 binary_crossentropy
        return nn.sigmoid_cross_entropy_with_logits(labels=target, logits=output)
    C:\Users\othm\Anaconda3\envs\greek\lib\site-packages\tensorflow\python\util\dispatch.py:206 wrapper
        return target(*args, **kwargs)
    C:\Users\othm\Anaconda3\envs\greek\lib\site-packages\tensorflow\python\ops\nn_impl.py:132 sigmoid_cross_entropy_with_logits
        raise ValueError("logits and labels must have the same shape (%s vs %s)" %

    ValueError: logits and labels must have the same shape ((None, 2) vs (None, 1))


In [68]:
import pickle 

def save_obj(obj, name ):
    with open('obj/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open('obj/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)