In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Tokenization: fit a tokenizer on a four-sentence toy corpus
sentences = [
    'I love my dog',
    'I hate my cat',
    'You love my dog',
    'I love only my dog',
]

# fit_on_texts builds the tokenizer's word index from the corpus
token = Tokenizer(num_words=100)
token.fit_on_texts(sentences)

In [3]:
# Word -> integer id mapping built by the tokenizer
word_index = token.word_index
# Number of distinct words in the mapping
vocab_size = len(word_index)
print(word_index)

{'my': 1, 'i': 2, 'love': 3, 'dog': 4, 'hate': 5, 'cat': 6, 'you': 7, 'only': 8}


In [None]:
# Encode each sentence as a list of integer word ids
sequences = token.texts_to_sequences(sentences)

# Zero-pad the encoded sentences so every row has the same length
padded = pad_sequences(sequences)

print(sequences)
print(padded)

In [None]:
# One binary target per toy sentence: 1 is True, 0 is False
labels = np.asarray((1, 0, 1, 0))

In [None]:
# Size of the vector learned for each word id
embedding_dim = 6

# Toy classifier: embed each word id, average the embeddings over the
# sentence, then pass the result through two dense layers to a sigmoid.
model = tf.keras.Sequential([
    # vocab_size + 1 rows: word ids start at 1 and id 0 is the padding value.
    # input_length is taken from the padded data instead of a hard-coded 5,
    # so the model stays in sync if the toy sentences change.
    tf.keras.layers.Embedding(vocab_size + 1, embedding_dim, input_length=padded.shape[1]),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train the toy model on the four padded sentences
num_epochs = 500
history = model.fit(
    padded,
    labels,
    epochs=num_epochs,
    verbose=2,
)

In [None]:
# Sigmoid outputs of the toy model on its own training sentences
model_output = model.predict(x=padded)
print(model_output)

## Now we begin by reading in and formatting the English translations of the four Gospels. Our goal is to build a discriminator that can predict whether or not a given verse came from the Gospel of John.

In [4]:
# Load the verse table; the 'book' column is used here to locate each Gospel
bible = pd.read_csv("kjv_data.csv")

# Row positions of the verses belonging to each of the four Gospels
matthew_verses = np.flatnonzero(bible['book'] == 'Matthew')
mark_verses = np.flatnonzero(bible['book'] == 'Mark')
luke_verses = np.flatnonzero(bible['book'] == 'Luke')
john_verses = np.flatnonzero(bible['book'] == 'John')

In [5]:
# Collect the verse text of each Gospel, in the order given by the row lists
verse_text = bible['text']
Matthew = [verse_text[i] for i in matthew_verses]
Mark = [verse_text[i] for i in mark_verses]
Luke = [verse_text[i] for i in luke_verses]
John = [verse_text[i] for i in john_verses]

# One tokenizer is fitted on all four books so they share a single word index
Gospels_token = Tokenizer()
for book_verses in (Matthew, Mark, Luke, John):
    Gospels_token.fit_on_texts(book_verses)
Gospels_token.word_index

{'and': 1,
 'the': 2,
 'of': 3,
 'he': 4,
 'him': 5,
 'that': 6,
 'unto': 7,
 'to': 8,
 'they': 9,
 'in': 10,
 'them': 11,
 'i': 12,
 'not': 13,
 'is': 14,
 'shall': 15,
 'a': 16,
 'said': 17,
 'for': 18,
 'his': 19,
 'ye': 20,
 'was': 21,
 'but': 22,
 'be': 23,
 'it': 24,
 'which': 25,
 'jesus': 26,
 'when': 27,
 'with': 28,
 'you': 29,
 'thou': 30,
 'me': 31,
 'man': 32,
 'have': 33,
 'into': 34,
 'all': 35,
 'this': 36,
 'then': 37,
 'were': 38,
 'came': 39,
 'my': 40,
 'as': 41,
 'had': 42,
 'come': 43,
 'son': 44,
 'from': 45,
 'saying': 46,
 'say': 47,
 'god': 48,
 'on': 49,
 'out': 50,
 'thee': 51,
 'there': 52,
 'will': 53,
 'one': 54,
 'things': 55,
 'are': 56,
 'thy': 57,
 'went': 58,
 'father': 59,
 'if': 60,
 'lord': 61,
 'what': 62,
 'by': 63,
 'hath': 64,
 'saith': 65,
 'do': 66,
 'disciples': 67,
 'also': 68,
 'up': 69,
 'no': 70,
 'these': 71,
 'their': 72,
 'at': 73,
 'her': 74,
 'answered': 75,
 'go': 76,
 'therefore': 77,
 'your': 78,
 'we': 79,
 'now': 80,
 'day': 8

In [6]:
# Encode every verse of every Gospel as a list of integer word ids
Matthew_sequences, Mark_sequences, Luke_sequences, John_sequences = (
    Gospels_token.texts_to_sequences(book_verses)
    for book_verses in (Matthew, Mark, Luke, John)
)

# FYI, the longest verse in the Gospels is 62 words long
input_len = 62

# Pad each book to the common length, then stack the books in a fixed order
Matthew_padded, Mark_padded, Luke_padded, John_padded = (
    pad_sequences(book_sequences, maxlen=input_len)
    for book_sequences in (Matthew_sequences, Mark_sequences, Luke_sequences, John_sequences)
)
Gospels_padded = np.concatenate((Matthew_padded, Mark_padded, Luke_padded, John_padded))

# Labels follow the same book order as Gospels_padded: 0 for Matthew, Mark
# and Luke, 1 for John
non_john_count = len(Matthew) + len(Mark) + len(Luke)
Gospels_labels = np.concatenate((np.zeros(non_john_count), np.ones(len(John))))

In [7]:
# Shuffle the verses and split them into a training set and a held-out test set.
# `features` is the number of verses (rows of Gospels_padded);
# `feature_len` is the padded length of each verse.
features, feature_len = Gospels_padded.shape

# Shuffle with a fixed seed so the train/test split is identical on every run
# (an unseeded shuffle changes the test set each time the notebook is run).
split_rng = np.random.RandomState(0)
features_idx = split_rng.permutation(features)
print('Total number of verses in Gospels: ', len(Gospels_labels))

# Take the first `training_split` shuffled verses for training, the rest for testing
training_split = 2800
train_idx = features_idx[:training_split]
test_idx = features_idx[training_split:]

features_training = Gospels_padded[train_idx, :]
features_testing  = Gospels_padded[test_idx, :]
labels_training   = Gospels_labels[train_idx]
labels_testing    = Gospels_labels[test_idx]

Total number of verses in Gospels:  3779


In [8]:
# Size of the vector learned for each word id
embedding_dim = 63

# Same architecture as the toy model, sized for the Gospel vocabulary:
# embedding -> average over the verse -> dense(24) -> sigmoid
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(len(Gospels_token.word_index) + 1, embedding_dim, input_length=input_len))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(24, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [9]:
# Train on the training split only; the test split is held out for evaluation
num_epochs = 50
history = model.fit(
    features_training,
    labels_training,
    epochs=num_epochs,
    verbose=2,
)

Train on 2800 samples
Epoch 1/50
2800/2800 - 2s - loss: 0.5910 - accuracy: 0.7714
Epoch 2/50
2800/2800 - 1s - loss: 0.5191 - accuracy: 0.7718
Epoch 3/50
2800/2800 - 1s - loss: 0.4859 - accuracy: 0.7718
Epoch 4/50
2800/2800 - 1s - loss: 0.4203 - accuracy: 0.7893
Epoch 5/50
2800/2800 - 1s - loss: 0.3559 - accuracy: 0.8446
Epoch 6/50
2800/2800 - 1s - loss: 0.2988 - accuracy: 0.8850
Epoch 7/50
2800/2800 - 1s - loss: 0.2508 - accuracy: 0.9046
Epoch 8/50
2800/2800 - 2s - loss: 0.2110 - accuracy: 0.9271
Epoch 9/50
2800/2800 - 3s - loss: 0.1787 - accuracy: 0.9371
Epoch 10/50
2800/2800 - 2s - loss: 0.1511 - accuracy: 0.9500
Epoch 11/50
2800/2800 - 2s - loss: 0.1295 - accuracy: 0.9604
Epoch 12/50
2800/2800 - 2s - loss: 0.1129 - accuracy: 0.9654
Epoch 13/50
2800/2800 - 1s - loss: 0.0987 - accuracy: 0.9689
Epoch 14/50
2800/2800 - 1s - loss: 0.0848 - accuracy: 0.9754
Epoch 15/50
2800/2800 - 1s - loss: 0.0756 - accuracy: 0.9796
Epoch 16/50
2800/2800 - 1s - loss: 0.0661 - accuracy: 0.9804
Epoch 17/50

In [12]:
# Predict on the held-out verses and round column 0 of the output to a 0/1 label
model_output = model.predict(features_testing)
predictions = np.asarray(model_output)[:, 0].round()
# predictions is one-dimensional, so transposing it leaves it unchanged
print(predictions.T)

[0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 1. 1. 1.
 0. 1. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0.
 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0.
 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1.
 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0.
 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.

In [11]:
# Test accuracy: correct predictions divided by the number of held-out verses
(np.round(predictions) == labels_testing).sum() / (features - training_split)

0.8539325842696629

In [44]:
# Size of the vector learned for each word id
embedding_dim = 40

# Deeper variant: a dense layer is applied to the embedded sequence before
# pooling, followed by two more dense layers after pooling.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(len(Gospels_token.word_index) + 1, embedding_dim, input_length=input_len),
    tf.keras.layers.Dense(48, activation='relu'),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(12, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() would add a stray "None" line after the table.
model.summary()

Model: "sequential_21"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_21 (Embedding)     (None, 62, 40)            140560    
_________________________________________________________________
dense_69 (Dense)             (None, 62, 48)            1968      
_________________________________________________________________
global_average_pooling1d_24  (None, 48)                0         
_________________________________________________________________
dense_70 (Dense)             (None, 24)                1176      
_________________________________________________________________
dense_71 (Dense)             (None, 12)                300       
_________________________________________________________________
dense_72 (Dense)             (None, 1)                 13        
Total params: 144,017
Trainable params: 144,017
Non-trainable params: 0
_______________________________________________

In [45]:
# Train the deeper model on the same training split
num_epochs = 50
history = model.fit(
    features_training,
    labels_training,
    epochs=num_epochs,
    verbose=2,
)

Train on 2800 samples
Epoch 1/50
2800/2800 - 2s - loss: 0.5939 - accuracy: 0.7589
Epoch 2/50
2800/2800 - 1s - loss: 0.5189 - accuracy: 0.7718
Epoch 3/50
2800/2800 - 1s - loss: 0.4413 - accuracy: 0.7854
Epoch 4/50
2800/2800 - 1s - loss: 0.2935 - accuracy: 0.8775
Epoch 5/50
2800/2800 - 1s - loss: 0.1986 - accuracy: 0.9204
Epoch 6/50
2800/2800 - 1s - loss: 0.1437 - accuracy: 0.9393
Epoch 7/50
2800/2800 - 1s - loss: 0.1045 - accuracy: 0.9643
Epoch 8/50
2800/2800 - 1s - loss: 0.0766 - accuracy: 0.9743
Epoch 9/50
2800/2800 - 1s - loss: 0.0609 - accuracy: 0.9793
Epoch 10/50
2800/2800 - 1s - loss: 0.0467 - accuracy: 0.9864
Epoch 11/50
2800/2800 - 1s - loss: 0.0347 - accuracy: 0.9882
Epoch 12/50
2800/2800 - 1s - loss: 0.0277 - accuracy: 0.9911
Epoch 13/50
2800/2800 - 1s - loss: 0.0209 - accuracy: 0.9950
Epoch 14/50
2800/2800 - 1s - loss: 0.0167 - accuracy: 0.9961
Epoch 15/50
2800/2800 - 1s - loss: 0.0129 - accuracy: 0.9975
Epoch 16/50
2800/2800 - 1s - loss: 0.0101 - accuracy: 0.9975
Epoch 17/50

In [46]:
# Predict on the held-out verses and round column 0 of the output to a 0/1 label
model_output = model.predict(features_testing)
predictions = np.asarray(model_output)[:, 0].round()
# Test accuracy: correct predictions divided by the number of held-out verses
(np.round(predictions) == labels_testing).sum() / (features - training_split)

0.8518896833503575

## A few simple tests...

In [91]:
# Spot-check the verse at position 100 of each Gospel: print its text and
# the model's output for it. Order: John, Mark, Matthew, Luke.
for verse_rows, book_padded in (
    (john_verses, John_padded),
    (mark_verses, Mark_padded),
    (matthew_verses, Matthew_padded),
    (luke_verses, Luke_padded),
):
    print(bible['text'][verse_rows[100]])
    print('Probability this is from John: ', model.predict(book_padded)[100])

Then there arose a question between some of John's disciples and the Jews about purifying.
Probability this is from John:  [0.9999965]
Verily I say unto you, All sins shall be forgiven unto the sons of men, and blasphemies wherewith soever they shall blaspheme:
Probability this is from John:  [0.]
Blessed are ye, when men shall revile you, and persecute you, and shall say all manner of evil against you falsely, for my sake.
Probability this is from John:  [0.]
And when eight days were accomplished for the circumcising of the child, his name was called JESUS, which was so named of the angel before he was conceived in the womb.
Probability this is from John:  [0.]
