In [2]:
from cltk import NLP
from cltk.data.fetch import FetchCorpus

AttributeError: type object 'gensim._matutils.array' has no attribute '__reduce_cython__'

### Download Tesserae Corpus and Initialize the CLTK Pipeline

In [None]:
# One-time corpus download: uncomment the next two lines on a fresh machine,
# run the cell once, then comment them out again.
#corpus_downloader = FetchCorpus(language="grc")
#corpus_downloader.import_corpus("grc_text_tesserae")
# CLTK pipeline for language code 'grc'; used further down to annotate texts.
cltk_nlp = NLP(language='grc')

### Paul's Epistles

In [None]:
# Corpus keys of the letters attributed to Paul, grouped by how their
# authorship is classified in this notebook.
_PAUL_BY_STATUS = {
    'undisputed': [
        'new_testament.i_thessalonians',
        'new_testament.galatians',
        'new_testament.i_corinthians',
        'new_testament.philippians',
        'new_testament.philemon',
        'new_testament.ii_corinthians',
        'new_testament.romans',
    ],
    'undecided': [
        'new_testament.colossians',
        'new_testament.ii_thessalonians',
    ],
    'disputed': [
        'new_testament.ephesians',
        'new_testament.i_timothy',
        'new_testament.ii_timothy',
        'new_testament.titus',
    ],
    'refuted': [
        'new_testament.hebrews',
    ],
}

# Flat list in group order; later cells index into it by position.
paul = [key for group in _PAUL_BY_STATUS.values() for key in group]

### Get Corpus Directory From Local .env File

In [None]:
import os
import re
from dotenv import load_dotenv
from tqdm.notebook import tqdm
from collections import defaultdict as dd

In [None]:
# Pull settings from a local .env file into the process environment.
load_dotenv()

# Directory that holds the corpus text files. Fail fast when it is not
# configured: os.listdir(None) would silently list the current working
# directory instead of the corpus.
DATA_DIR = os.getenv('DATA_DIR')
if not DATA_DIR:
    raise RuntimeError(
        "DATA_DIR is not set; define it in the environment or in a local .env file"
    )

# os.listdir() returns entries in arbitrary order. Sort them so positional
# picks made later (e.g. authors['plato'][:1]) are reproducible across runs.
files = sorted(os.listdir(DATA_DIR))

### Read Texts

In [None]:
# Containers filled by the corpus-reading loop:
#   texts   maps a text key to its content,
#   authors maps an author name to the list of that author's text keys.
keys = list()
texts, authors = dd(list), dd(list)

In [None]:
def read_text(file, data_dir=None):
    """Return the contents of one corpus file with markup and newlines removed.

    Args:
        file: File name relative to the corpus directory.
        data_dir: Directory containing ``file``. When omitted, the
            notebook-level ``DATA_DIR`` is used.

    Returns:
        The file's text (decoded as UTF-8) with every ``<...>`` span and
        every newline character deleted.
    """
    base_dir = DATA_DIR if data_dir is None else data_dir
    filepath = os.path.join(base_dir, file)
    # The context manager closes the handle as soon as the file is read;
    # a bare open().read() leaves it open until garbage collection.
    with open(filepath, encoding="utf8") as handle:
        raw = handle.read()
    text = re.sub('<[^<]+>', "", raw)
    return re.sub('\n', "", text)

In [None]:
# Register every corpus file: the file name minus its extension is the text
# key, and the part of the key before the first '.' is the author.
for file in tqdm(files):
    key, _extension = os.path.splitext(file)
    author = key.partition('.')[0]
    authors[author].append(key)
    texts[key] = text = read_text(file)

### Annotate Docs

In [None]:
# Texts to annotate, in processing order. The Plato key is named explicitly
# because later cells look up 'plato.alcibiades_1' by name; picking
# authors['plato'][0] instead depends on directory listing order.
annotate_keys = [
    paul[0],               # train: new_testament.i_thessalonians
    paul[3],               # train: new_testament.philippians
    paul[1],               # test:  new_testament.galatians
    'plato.alcibiades_1',  # non-Pauline text (originally tagged "predict")
]

docs = dd(list)
for key in tqdm(annotate_keys):
    # texts is a defaultdict, so a missing key would otherwise hand an empty
    # list to the pipeline instead of reporting the missing file.
    if key not in texts:
        raise KeyError("no corpus text loaded for {!r}".format(key))
    docs[key] = cltk_nlp.analyze(text=texts[key])

### Normalize Doc Features

In [None]:
def get_features(doc):
    """Count word and POS n-grams in ``doc`` and scale them by its word count.

    ``doc`` must expose a ``words`` sequence and integer indexing
    (``doc[i]``); each indexed item must have ``string`` and ``upos``
    attributes.

    Returns:
        A dict with the keys ``'word_bigram'``, ``'word_trigram'``,
        ``'pos_bigram'`` and ``'pos_trigram'``. Each value maps a
        space-joined n-gram to its occurrence count divided by
        ``len(doc.words)``.
    """
    bigram_template = "{word_one} {word_two}"
    trigram_template = "{word_one} {word_two} {word_three}"

    features = {
        name: dd(int)
        for name in ('word_bigram', 'word_trigram', 'pos_bigram', 'pos_trigram')
    }
    total_words = len(doc.words)

    # Every position that still has a right-hand neighbour starts a bigram.
    for start in range(total_words - 1):
        first, second = doc[start], doc[start + 1]
        features['word_bigram'][
            bigram_template.format(word_one=first.string, word_two=second.string)
        ] += 1
        features['pos_bigram'][
            bigram_template.format(word_one=first.upos, word_two=second.upos)
        ] += 1

        # A trigram additionally needs a second neighbour.
        if start + 2 < total_words:
            third = doc[start + 2]
            features['word_trigram'][
                trigram_template.format(
                    word_one=first.string, word_two=second.string, word_three=third.string
                )
            ] += 1
            features['pos_trigram'][
                trigram_template.format(
                    word_one=first.upos, word_two=second.upos, word_three=third.upos
                )
            ] += 1

    # Turn raw counts into per-word rates.
    for counts in features.values():
        for ngram in list(counts):
            counts[ngram] = counts[ngram] / total_words

    return features

### Extract Features From Docs

In [None]:
# For every annotated text, keep each feature group as a list of
# (n-gram, value) pairs ordered by ascending value.
data = {
    text_key: {
        group: sorted(counts.items(), key=lambda item: item[1])
        for group, counts in get_features(annotated).items()
    }
    for text_key, annotated in docs.items()
}

### Suffix Tree

In [None]:
from pystlm.stlm import STLM
from pystlm.suffixtree import SuffixTree
from pystlm.sequence import Sequence

# POS tags of the first text in `paul`, in document order.
doc = docs[paul[0]]
tags = [word.upos for word in doc]

# Feed the tags one by one into a suffix tree, then refresh its counts.
# NOTE(review): pystlm semantics are not visible in this notebook - confirm
# that add() is meant to receive one tag per call.
trie = SuffixTree()
for tag in tags:
    trie.add(tag)
trie.update_all_counts()

stlm = STLM(trie)
seq = Sequence()

In [None]:
from collections import Counter

# Per-sentence POS tag lists, plus tallies of which tag opens and which tag
# closes a sentence. Sentences shorter than two items are ignored.
sentences = doc.sentences
sentence_tags = []
start_counter, end_counter = Counter(), Counter()

for sent in sentences:
    if len(sent) >= 2:
        pos_tags = [word.upos for word in sent]
        sentence_tags.append(pos_tags)

        start_counter[pos_tags[0]] += 1
        end_counter[pos_tags[-1]] += 1

### Neural Net

In [None]:
# List the texts for which features were extracted.
for key in data.keys():
    print(key)

In [None]:
# Inspect the extracted features of one text: each feature group is a list of
# (n-gram, value) pairs sorted by ascending value.
data['new_testament.i_thessalonians']

In [None]:
import pandas as pd
import numpy as np 

# Class labels: 1 = text attributed to Paul, 0 = other author.
cats = [0, 1]

# Feature groups copied into each row, in the original column order.
_FEATURE_GROUPS = ('pos_bigram', 'word_bigram', 'pos_trigram', 'word_trigram')


def _feature_row(text_key, cat):
    """Return a one-row DataFrame holding the label, key and features of a text.

    Args:
        text_key: Key into ``data`` identifying the text.
        cat: Class label stored in the ``cat`` column.

    The row is assembled as a plain dict and converted once; inserting
    thousands of columns one at a time into a DataFrame fragments it and
    is very slow.
    """
    row = {'cat': cat, 'text': text_key}
    for group in _FEATURE_GROUPS:
        for ngram, frequency in data[text_key][group]:
            row[ngram] = frequency
    return pd.DataFrame(row, index=[0])


# training data
data_train_1 = _feature_row('new_testament.i_thessalonians', cat=1)
data_train_2 = _feature_row('new_testament.philippians', cat=1)
data_train_3 = _feature_row('plato.alcibiades_1', cat=0)

# test data
data_test = _feature_row('new_testament.galatians', cat=1)

# DataFrame.append was removed in pandas 2.0; concat builds the table in one
# step. The frame is no longer seeded with a feature-less {'cat': 1} row, so
# it holds exactly one row per text. N-grams absent from a text become 0.
df = pd.concat(
    [data_train_1, data_train_2, data_train_3, data_test],
    ignore_index=True,
    sort=False,
).fillna(0)

In [None]:
# Rows used for training. Python has no `||` operator (it is a SyntaxError);
# isin() expresses the same "any of these texts" filter. copy() detaches the
# selection from df so later edits do not trigger SettingWithCopyWarning.
train_texts = [
    'new_testament.i_thessalonians',
    'new_testament.philippians',
    'plato.alcibiades_1',
]
data_train = df.loc[df['text'].isin(train_texts)].copy()

In [None]:
# Held-out row. The original filter repeated the three training texts (and
# used the invalid `||` operator); the test text prepared for this notebook
# is Galatians. copy() detaches the selection from df.
data_test = df.loc[df['text'] == 'new_testament.galatians'].copy()

In [None]:
from sklearn import preprocessing

test = data_test
train = data_train

# Labels.
train_y = train.cat.values
test_y = test.cat.values

# 'cat' is the label and 'text' is a string identifier; neither is a feature.
# Dropping only the first column left the 'text' strings inside X. The drop
# is no longer done in place, so re-running this cell does not strip further
# columns from data_train / data_test.
non_feature_columns = ['cat', 'text']
train_X = data_train.drop(columns=non_feature_columns).to_numpy(dtype=float)
test_X = data_test.drop(columns=non_feature_columns).to_numpy(dtype=float)

print(train_X.shape, train_y.shape)
print(test_X.shape, test_y.shape)

print(train_X)
print(train_y)

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier: logistic regression with the liblinear solver, fitted
# on train_X / train_y.
logisticRegr = LogisticRegression(solver='liblinear')
logisticRegr.fit(train_X, train_y) 

In [None]:
# Predicted label for each row of test_X.
logisticRegr.predict(test_X)

In [None]:
import tensorflow as tf
# Report the TensorFlow version in use and whether eager execution is enabled.
print("TensorFlow version: {}".format(tf.__version__))
print("Eager execution: {}".format(tf.executing_eagerly()))

In [None]:
# Feed-forward classifier: three ReLU hidden layers, light dropout, and a
# softmax output with one unit per class in `cats`.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(units=300, activation='relu'),
    tf.keras.layers.Dense(units=600, activation='relu'),
    tf.keras.layers.Dense(units=300, activation='relu'),
    tf.keras.layers.Dropout(.1),
    tf.keras.layers.Dense(units=len(cats), activation='softmax')
])

# train_y holds integer class ids (0/1) while the network emits one
# probability per class, so the matching loss is sparse categorical
# cross-entropy. binary_crossentropy expects targets shaped like the
# (n, len(cats)) output and does not fit this label layout.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.fit(train_X, train_y, epochs=20)