In [7]:
from cltk import NLP
from cltk.data.fetch import FetchCorpus

### Download Tesserae Corpus

In [8]:
# Import the "grc_text_tesserae" corpus through CLTK's FetchCorpus helper.
corpus_downloader = FetchCorpus(language="grc")
corpus_downloader.import_corpus("grc_text_tesserae")
# Build the Ancient Greek ('grc') pipeline once; get_annotations reuses this object.
cltk_nlp = NLP(language='grc')

‎𐤀 CLTK version '1.0.11'.
Pipeline for language 'Ancient Greek' (ISO: 'grc'): `GreekNormalizeProcess`, `GreekStanzaProcess`, `GreekEmbeddingsProcess`, `StopsProcess`, `GreekNERProcess`.


### Separate Pauline from Non-Pauline

In [9]:
# Keys of the texts treated as Pauline, grouped by the attribution status noted inline.
# The order is significant: the (commented-out) dataset build slices paul[:6].
# NOTE(review): 'corinthinians' is the spelling used in these keys — confirm it
# matches the corpus file names before changing or relying on it.
paul = [
    # undisputed
    'new_testament.i_thessalonians',
    'new_testament.galatians',
    'new_testament.i_corinthinians',
    'new_testament.philippians',
    'new_testament.philemon',
    'new_testament.ii_corinthinians',
    'new_testament.romans',
    
    # undecided
    'new_testament.colossians',
    'new_testament.ii_thessalonians',
    
    # disputed
    'new_testament.ephesians',
    'new_testament.i_timothy',
    'new_testament.ii_timothy',
    'new_testament.titus',
    
    # refuted
    'new_testament.hebrews',
]

In [10]:
# Keys of the comparison texts that are not attributed to Paul.
# NOTE(review): 'new_testament.mathew' is spelled with a single 't' — confirm it
# matches the corpus file name.
not_paul = [
   'new_testament.mathew', 
   'new_testament.mark', 
   'new_testament.luke',
   'new_testament.john', 
   'new_testament.revelation', 
   'clement.exhortation', 
   'clement.protrepticus', 
   'demosthenes.letters', 
   'euripides.electra', 
   'plutarch.romulus', 
   'aeschylus.seven_against_thebes', 
   'appian.civil_wars.part.1', 
   'aristophanes.lysistrata', 
   'aristotle.nicomachean_ethics', 
   'basil_of_caesarea.de_legendis',
   'flavius_josephus.antiquitates_judaicae.part.1', 
   'gregory_of_nazianzus.christus_patiens', 
   'herodotus.histories.part.7', 
   'homer.odyssey.part.16', 
   'new_testament.acts', 
   'plato.meno'
]

### Get Corpus Directory From Local .env File

In [11]:
import os
import re
from dotenv import load_dotenv
from tqdm.notebook import tqdm
from collections import defaultdict as dd

In [12]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# Directory that holds the corpus text files; it must be supplied via DATA_DIR.
DATA_DIR = os.getenv('DATA_DIR')
if not DATA_DIR:
    # os.listdir(None) silently lists the current working directory, which would
    # make the notebook read unrelated files; fail loudly instead.
    raise RuntimeError(
        "DATA_DIR is not set; define it in the environment or in a local .env file"
    )
files = os.listdir(DATA_DIR)

### Read Texts

In [13]:
# texts: file name without extension -> cleaned text (filled by the loading loop).
# NOTE(review): the default factory is `list` although the stored values are strings,
# so looking up a missing key silently yields [] — confirm this is intended.
texts = dd(list)
# authors: first dot-separated component of the key -> list of that author's keys.
authors = dd(list)

In [14]:
def read_text(file, data_dir=None):
    """Return the contents of a corpus file with markup tags and newlines removed.

    Parameters
    ----------
    file : str
        File name, relative to the corpus directory.
    data_dir : str, optional
        Directory to read from. Defaults to the module-level ``DATA_DIR``.

    Returns
    -------
    str
        The UTF-8 decoded text with every ``<...>`` span deleted and all
        newline characters removed.
    """
    if data_dir is None:
        data_dir = DATA_DIR
    filepath = os.path.join(data_dir, file)

    # A context manager closes the handle deterministically; the loading loop
    # calls this once per corpus file, so leaked handles add up quickly.
    with open(filepath, encoding="utf8") as handle:
        raw = handle.read()

    text = re.sub('<[^<]+>', "", raw)
    return text.replace('\n', "")

In [15]:
# Register every corpus file under its extension-less name and group the keys
# by author (the part of the key before the first dot).
for file in tqdm(files):
    key = os.path.splitext(file)[0]
    author = key.partition('.')[0]
    authors[author].append(key)
    text = read_text(file)
    texts[key] = text

  0%|          | 0/821 [00:00<?, ?it/s]

### Annotate Docs

In [16]:
import pickle 

def save_obj(obj, name):
    """Pickle `obj` into ``./<name>.pkl`` using the highest pickle protocol."""
    target = "".join(("./", name, ".pkl"))
    with open(target, "wb") as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Read back the object pickled in ``./<name>.pkl``.

    Unpickling can execute arbitrary code, so only load files you created yourself.
    """
    source_path = "".join(("./", name, ".pkl"))
    with open(source_path, "rb") as source:
        restored = pickle.load(source)
    return restored

def get_annotations(T):
    """Run the CLTK pipeline over every entry of `T`.

    `T` is iterated for its keys and indexed with each key (``T[key]``) to get
    the raw text, so it presumably has to be a mapping key -> text — TODO confirm
    against the callers.

    Returns a defaultdict mapping each key to the result of ``cltk_nlp.analyze``.
    """
    annotated = dd(list)

    for key in tqdm(T):
        raw_text = T[key]
        annotated[key] = cltk_nlp.analyze(text=raw_text)

    return annotated

In [18]:
# One-off annotation run, left disabled; the annotations are read from ./docs.pkl below.
# NOTE(review): get_annotations indexes its argument by key (T[t]), so it presumably
# expects a mapping rather than the list `paul + not_paul` — confirm before re-enabling.
## docs = get_annotations(paul + not_paul)
## save_obj(docs, "docs")

# Load the cached annotations (pickle: only load files you produced yourself).
docs = load_obj("docs")

### Normalize Features

In [19]:
def get_word(doc, i, n):
    """Return ``doc[i]`` when `i` is below the length `n`, otherwise ``None``."""
    return doc[i] if i <= n - 1 else None

def get_bigram(W):
    """Build the word-form and POS bigram strings for the pair in `W`.

    Returns ``[word_bigram, pos_bigram]``, or an empty list when the second
    element of `W` is ``None`` (no following word).
    """
    second = W[1]
    if second is None:
        return []

    first = W[0]
    return [
        f"{first.string} {second.string}",
        f"{first.upos} {second.upos}",
    ]

def get_trigram(W):
    """Build the word-form and POS trigram strings for the triple in `W`.

    Returns ``[word_trigram, pos_trigram]``, or an empty list when the third
    element of `W` is ``None`` (fewer than two following words).
    """
    third = W[2]
    if third is None:
        return []

    first, second = W[0], W[1]
    return [
        f"{first.string} {second.string} {third.string}",
        f"{first.upos} {second.upos} {third.upos}",
    ]

def increment_features(F, V):
    """Add one to a counter in each bucket of `F`, pairing buckets with `V` by position.

    The i-th key of `F` (in iteration order) is paired with ``V[i]``; buckets
    without a matching entry in `V` are left untouched. `F` is modified in
    place and returned.
    """
    for bucket_name, token in zip(F, V):
        F[bucket_name][token] += 1
    return F

def get_percents(F, f, n):
    """Divide every count stored in ``F[f]`` by `n`, in place, and return that bucket."""
    bucket = F[f]
    for token in bucket:
        # Only existing keys are reassigned, so the dict keeps its size during iteration.
        bucket[token] = bucket[token] / n
    return bucket

In [20]:
def get_features(doc):
    """Compute relative frequencies of word and POS bigrams/trigrams for `doc`.

    Parameters
    ----------
    doc : annotated document exposing ``.words``, iteration and integer indexing.

    Returns
    -------
    dict
        Keys 'word_bigram', 'word_trigram', 'pos_bigram', 'pos_trigram'; each
        maps an n-gram string to its count divided by ``len(doc.words)``.
    """
    n = len(doc.words)

    features = {
        'word_bigram': dd(int),
        'word_trigram': dd(int),
        'pos_bigram': dd(int),
        'pos_trigram': dd(int)
    }

    for i, w in enumerate(doc):
        w2 = get_word(doc, i+1, n)
        w3 = get_word(doc, i+2, n)

        bigrams = get_bigram([w, w2])
        trigrams = get_trigram([w, w2, w3])

        # Route each n-gram to its bucket by name. Pairing the concatenated list
        # [word_bi, pos_bi, word_tri, pos_tri] positionally with the bucket order
        # above put POS bigrams under 'word_trigram' and word trigrams under
        # 'pos_bigram'.
        if bigrams:
            word_gram, pos_gram = bigrams
            features['word_bigram'][word_gram] += 1
            features['pos_bigram'][pos_gram] += 1
        if trigrams:
            word_gram, pos_gram = trigrams
            features['word_trigram'][word_gram] += 1
            features['pos_trigram'][pos_gram] += 1

    # Turn raw counts into frequencies relative to the number of words.
    for f in features:
        features[f] = get_percents(features, f, n)

    return features

### Extract Features From Docs

In [21]:
# Class labels: get_pos_df tags rows with CATS[1], get_neg_df with CATS[0].
CATS = [0, 1]

# data: document key -> feature name -> list of (n-gram, frequency) pairs.
data = {}

# NOTE: `doc`, `features` and `key` stay bound at module level after this loop;
# keep the name `features` stable, since later cells may read it.
for doc in docs:
    features = get_features(docs[doc])
    
    for key in features:
        # Replace each frequency dict by its items sorted by ascending frequency.
        features[key] = sorted(features[key].items(), key=lambda item: item[1])
        
    data[doc] = features

### Categorize Data

In [22]:
import pandas as pd
import numpy as np 
from more_itertools import take

def topN(row, n):
    """Return the `n` (label, value) pairs of `row` with the largest values.

    `row` must provide ``to_dict()``. Pairs are ordered by descending value, and
    ties keep their original relative order.
    """
    ranked = sorted(row.to_dict().items(), key=lambda pair: -pair[1])
    return take(n, ranked)

def get_df(d, c, n):
    """Build a one-row frame with the `n` highest-valued features of document `d`.

    Parameters
    ----------
    d : key into the module-level ``data`` dict.
    c : category label written to the 'cat' column.
    n : number of top features to keep.

    Returns
    -------
    pandas.DataFrame
        A single row with columns 'text' and 'cat' followed by the selected
        feature columns in descending order of value.
    """
    doc_features = data[d]

    # Gather all (n-gram, value) pairs in a plain dict and build the frame once.
    # Iterating data[d] itself avoids depending on a loop variable left over from
    # another cell, and a single constructor call avoids growing the frame one
    # column at a time. A repeated name keeps its first position and last value,
    # exactly as repeated column assignment did.
    values = {}
    for f in doc_features:
        for name, value in doc_features[f]:
            values[name] = value
    df = pd.DataFrame(values, index=[0])

    topColsTuples = topN(df.iloc[0], n)
    topColsNames = [entry[0] for entry in topColsTuples]

    # Work on an explicit copy so the inserts below never touch `df`.
    finalDf = df[topColsNames].copy()
    finalDf.insert(0, 'cat', c)
    finalDf.insert(0, 'text', d)
    return finalDf

def get_pos_df(D, n):
    """Stack the top-`n` feature rows of every document in `D`, labelled ``CATS[1]``.

    Columns missing for a document are filled with 0. Returns an empty frame
    when `D` is empty.
    """
    # Collect the rows first and concatenate once: DataFrame.append was removed
    # in pandas 2.0, and appending inside a loop re-copies the frame each time.
    frames = [get_df(d, CATS[1], n) for d in tqdm(D)]
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True, sort=False).fillna(0)

def get_neg_df(D, E, n):
    """Stack the top-`n` feature rows of the documents in `D` that are not in `E`.

    Rows are labelled ``CATS[0]`` and columns missing for a document are filled
    with 0. Returns an empty frame when no document qualifies.
    """
    # Collect the rows first and concatenate once: DataFrame.append was removed
    # in pandas 2.0, and appending inside a loop re-copies the frame each time.
    frames = [get_df(d, CATS[0], n) for d in tqdm(D) if d not in E]
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True, sort=False).fillna(0)

In [23]:
# One-off dataset build, left disabled; the resulting frame is loaded from ./df.pkl
# in the following cell.
# NOTE(review): the last step uses DataFrame.append — confirm it exists in the
# installed pandas before re-enabling this cell.
#n = 10 # number of highest entries to include 

#pos_df = get_pos_df(paul[:6], n)
#neg_df = get_neg_df(docs, paul[:6], n)

#df = pos_df.append(neg_df, ignore_index=True)
#df = df.fillna(0)

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

In [25]:
# Saving is disabled; the previously stored feature frame is read back from ./df.pkl.
#save_obj(df, "df")
df = load_obj("df")

### Split Data

In [95]:
def split_data(df, test_frac=0.3, random_state=300):
    """Split `df` into train/test feature matrices and label vectors.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'cat' column; its first two columns are dropped from the
        feature matrices.
    test_frac : float, optional
        Fraction of rows sampled into the test set (default 0.3).
    random_state : int, optional
        Seed passed to ``DataFrame.sample`` (default 300).

    Returns
    -------
    tuple
        ``(train_X, train_y, test_X, test_y)`` as NumPy arrays. `df` itself is
        left unmodified.
    """
    # sample() and drop() both return new frames, so no defensive copy is needed.
    test = df.sample(frac=test_frac, random_state=random_state)
    train = df.drop(test.index)

    test_y = test['cat'].values
    train_y = train['cat'].values

    # The first two columns are removed by position, as before.
    meta_cols = df.columns[[0, 1]]
    train_X = np.array(train.drop(meta_cols, axis=1).values)
    test_X = np.array(test.drop(meta_cols, axis=1).values)

    return (train_X, train_y, test_X, test_y)

In [96]:
# Hold out a test set; split_data returns (train_X, train_y, test_X, test_y).
train_X, train_y, test_X, test_y = split_data(df)

### Logistic Regression Classifier

In [97]:
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression

In [98]:
# Fit a liblinear logistic regression on the training split (fit returns the estimator).
logisticRegr = LogisticRegression(solver='liblinear').fit(train_X, train_y)

# Hard class predictions and per-class probabilities for the held-out rows.
result = logisticRegr.predict(test_X)
prob = logisticRegr.predict_proba(test_X)

print(f'PREDICTION:\n{result}\n')
print(f'PROBABILITIES:\n{prob}')

PREDICTION:
[0 0 0 0 0 0 0 0 0 0]

PROBABILITIES:
[[0.75503342 0.24496658]
 [0.75647066 0.24352934]
 [0.75466475 0.24533525]
 [0.75424633 0.24575367]
 [0.75414557 0.24585443]
 [0.75435813 0.24564187]
 [0.75467746 0.24532254]
 [0.75470133 0.24529867]
 [0.75379878 0.24620122]
 [0.75578804 0.24421196]]


### Accuracy

In [99]:
# Count the test rows whose predicted label equals the true label.
accuracy_score = sum(
    1 for i, label in enumerate(test_y) if result[i] == label
)

# Report the share of correct predictions.
print('ACCURACY: {}'.format(accuracy_score / len(test_y)))

ACCURACY: 0.9


### Neural Net

In [100]:
import tensorflow as tf

In [101]:
# Training inputs: the feature matrix and the integer class ids as a column vector.
X = train_X
y = train_y.reshape((-1,1))

model = tf.keras.Sequential([
    tf.keras.layers.Dense(20, activation='sigmoid'),
    tf.keras.layers.Dropout(.1),
    tf.keras.layers.Dense(5, activation='sigmoid'),
    # Output head: one raw score (logit) per category, no activation.
    tf.keras.layers.Dense(len(CATS))
])

model.compile(
    # The head emits len(CATS) unnormalised logits and the labels are integer
    # class ids, so the matching loss is sparse categorical cross-entropy on
    # logits. BinaryCrossentropy() with its defaults expects probabilities and
    # was comparing a 2-wide output against a 1-wide label.
    loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.optimizers.Adam(),
    metrics=['accuracy']
)

model.fit(X, y, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x21d14b1fc70>