In [1]:
from cltk import NLP
from cltk.data.fetch import FetchCorpus

### Download Tesserae Corpus

In [28]:
corpus_downloader = FetchCorpus(language="grc")
corpus_downloader.import_corpus("grc_text_tesserae")
cltk_nlp = NLP(language='grc')

‎𐤀 CLTK version '1.0.6'.
Pipeline for language 'Ancient Greek' (ISO: 'grc'): `GreekNormalizeProcess`, `GreekStanzaProcess`, `GreekEmbeddingsProcess`, `StopsProcess`, `GreekNERProcess`.


### Paul's Epistles

In [23]:
paul = [
    # undisputed
    'new_testament.i_thessalonians',
    'new_testament.galatians',
    'new_testament.i_corinthinians',
    'new_testament.philippians',
    'new_testament.philemon',
    'new_testament.ii_corinthinians',
    'new_testament.romans',
    
    # undecided
    'new_testament.colossians',
    'new_testament.ii_thessalonians',
    
    # disputed
    'new_testament.ephesians',
    'new_testament.i_timothy',
    'new_testament.ii_timothy',
    'new_testament.titus',
    
    # refuted
    'new_testament.hebrews',
]

### Get Corpus Directory From Local .env File

In [4]:
import os
import re
from dotenv import load_dotenv
from tqdm.notebook import tqdm
from collections import defaultdict as dd

In [5]:
load_dotenv()

DATA_DIR = os.getenv('DATA_DIR')
files = os.listdir(DATA_DIR)

### Read Texts

In [10]:
texts = dd(list)
authors = dd(list)

In [11]:
def read_text(file):
    filepath = '{}/{}'.format(DATA_DIR, file)
    text = re.sub('<[^<]+>', "", open(filepath, encoding="utf8").read())
    return re.sub('\n', "", text)

In [12]:
for file in tqdm(files):
    key = os.path.splitext(file)[0]
    author = key.split('.')[0]
    authors[author].append(key)
    text = read_text(file)
    texts[key] = text

  0%|          | 0/821 [00:00<?, ?it/s]

### Annotate Docs

In [75]:
docs = dd(list)

for p in paul:
    docs[p] = cltk_nlp.analyze(text=texts[p])

### Normalize Doc Features

In [77]:
def get_word(doc, i, n):
    if i > n - 1:
        return None
    return doc[i]

def get_bigram(W):
    if W[1] is None:
        return []
    
    w_gram = "{} {}".format(W[0].string, W[1].string)
    p_gram = "{} {}".format(W[0].upos, W[1].upos)
    
    return [w_gram, p_gram]

def get_trigram(W):
    if W[2] is None:
        return []
    
    w_gram = "{} {} {}".format(W[0].string, W[1].string, W[2].string)
    p_gram = "{} {} {}".format(W[0].upos, W[1].upos, W[2].upos)
    
    return [w_gram, p_gram]

def increment_features(F, V):
    for i,k in enumerate(F):
        if i > len(V) - 1:
            break
        F[k][V[i]] += 1
    return F

def get_percents(F, f, n):
    for k, v in F[f].items():
        F[f][k] = F[f][k] / n
    return F[f]

In [78]:
def get_features(doc):
    n = len(doc.words)
    
    features = {
        'word_bigram': dd(int),
        'word_trigram': dd(int),
        'pos_bigram': dd(int),
        'pos_trigram': dd(int)
    }

    for i, w in enumerate(doc):
        w2 = get_word(doc, i+1, n)
        w3 = get_word(doc, i+2, n)
        
        bigrams = get_bigram([w, w2])
        trigrams = get_trigram([w, w2, w3])
        n_grams = bigrams + trigrams
        features = increment_features(features, n_grams)
        
    for f in features:
        features[f] = get_percents(features, f, n)
    
    return features

### Extract Features From Docs

In [81]:
CATS = [0, 1]

data = {}

for doc in docs:
    features = get_features(docs[doc])
    
    for key in features:
        features[key] = sorted(features[key].items(), key=lambda item: item[1])
        
    data[doc] = features

### Split Data

In [85]:
import pandas as pd
import numpy as np 

def get_df(d, c):
    df = pd.DataFrame({'text': d, 'cat': c}, index=[0])
    for f in features:
        for x in data[d][f]:
            df[x[0]] = x[1]
    return df

def get_pos_df(D):
    df = pd.DataFrame()
    for d in tqdm(D):
        df = df.append(get_df(d, CATS[1]), ignore_index=True)
    df = df.fillna(0)
    return df

def get_neg_df(D, E):
    df = pd.DataFrame()
    for d in tqdm(D):
        if d not in E:
            df = df.append(get_df(d, CATS[0]), ignore_index=True)
    df = df.fillna(0)
    return df

In [87]:
pos_df = get_pos_df(paul[:6])
neg_df = get_neg_df(docs, paul[:6])

df = pos_df.append(neg_df, ignore_index=True)
df = df.fillna(0)
df

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

Unnamed: 0,text,cat,ΠΑΥΛΟς ΚΑΙ ΣΙΛΟΥΑΝΟς,ΚΑΙ ΣΙΛΟΥΑΝΟς ΚΑΙ ΤΙΜΟΘΕΟς,ΚΑΙ ΤΙΜΟΘΕΟς τῇ,τῇ ἐκκλησίᾳ,ἐκκλησίᾳ Θεσσαλονικέων,Θεσσαλονικέων ἐν,ἐν θεῷ,θεῷ πατρὶ,...,CCONJ DET PROPN,PROPN X VERB,X VERB CCONJ,PROPN PROPN ADJ,NUM NOUN PROPN,PROPN CCONJ CCONJ,CCONJ CCONJ PROPN,PROPN PRON DET,ADV NUM NOUN,PROPN PROPN VERB
0,new_testament.i_thessalonians,1,0.000679,0.000679,0.000679,0.000679,0.000679,0.000679,0.000679,0.000679,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,new_testament.galatians,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,new_testament.i_corinthinians,1,0.0,0.0,0.0,0.00044,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,new_testament.philippians,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,new_testament.philemon,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,new_testament.ii_corinthinians,1,0.0,0.0,0.0,0.000224,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,new_testament.romans,0,0.0,0.0,0.0,0.0,0.0,0.0,0.000141,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,new_testament.colossians,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001265,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,new_testament.ii_thessalonians,0,0.00122,0.00122,0.00122,0.00122,0.00122,0.00122,0.00122,0.00122,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,new_testament.ephesians,0,0.0,0.0,0.0,0.000413,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
from sklearn import preprocessing

test = df.sample(frac=0.8, random_state=200)
train = df.drop(test.index)

train_y = train.cat.values
test_y = test.cat.values

data_train.drop(data_test.columns[[0]], axis=1, inplace=True)
data_test.drop(data_test.columns[[0]], axis=1, inplace=True)

train_X = np.array(data_train.values) 
test_X = np.array(data_test.values) 

print(train_X.shape, train_y.shape)
print(test_X.shape, test_y.shape)

print(train_X)
print(train_y)

In [None]:
from sklearn.linear_model import LogisticRegression

logisticRegr = LogisticRegression(solver='liblinear')
logisticRegr.fit(train_X, train_y) 

In [None]:
logisticRegr.predict(test_X)

In [None]:
import tensorflow as tf
print("TensorFlow version: {}".format(tf.__version__))
print("Eager execution: {}".format(tf.executing_eagerly()))

In [None]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(units=300, activation='relu'),
    tf.keras.layers.Dense(units=600, activation='relu'),
    tf.keras.layers.Dense(units=300, activation='relu'),
    tf.keras.layers.Dropout(.1),
    tf.keras.layers.Dense(units=len(cats), activation='softmax')
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(train_X, train_y, epochs=20) 