CSS 709 Bible Translation Analysis
--------------------------------------------------

This notebook generates word2vec similarity scores for selected concepts within Bible translations, as part of the CSS 709 (Natural Language Processing) course. The input is an existing Bible text, typically in .txt format. The output is an array containing, for each concept, the difference between its similarity to the word "holy" and its similarity to the word "evil".

In [9]:
# Importing modules
import re
import numpy as np
import pandas as pd
import os
import gensim
from gensim.models.callbacks import CallbackAny2Vec
from gensim.utils import simple_preprocess
import gensim.corpora as corpora
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from gensim.utils import tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Max\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


The following block of code defines the utility functions necessary to run the code.

In [10]:
# This function is used to import and preprocess non-UTF encoded text
def data_import(fname):
    # Import text file and clean it
    with open(fname, newline = '') as f:
            data = f.readlines()

    data = [re.sub(r'<.*?>', '', word) for word in data] # remove HTML tags - just in case some are missed.
    data = [re.sub(r'’s|\'s', '', word) for word in data] # remove possessive "s"
    data = [re.sub(r'\n|\\\\t', '', word) for word in data] # remove line breaks, tab breaks
    data = [re.sub(r'[^\w\s]|_', '', word) for word in data] # remove punctuation and underscore
    data = [re.sub(r'\w*\d\w*', '', word) for word in data] # remove character strings that contain a digit
    data = [re.sub(r'\d', '', word) for word in data] # remove digits  
    data = [word.lower() for word in data] # convert the text to lowercase
    data = [word.split() for word in data] # split the sentences into words        
    data = [sent for sent in data if sent != []] # remove empty tokens

    return data


# This function is used to import and  preprocess UTF encoded text
def data_importUTF(fname):
    # Import text file and clean it. This version is used for UTF encoding
    with open(fname, encoding="utf8", newline = '') as f:
        data = f.readlines()


    data = [re.sub(r'<.*?>', '', word) for word in data] # remove HTML tags - just in case some are missed.
    data = [re.sub(r'’s|\'s', '', word) for word in data] # remove possessive "s"
    data = [re.sub(r'\n|\\\\t', '', word) for word in data] # remove line breaks, tab breaks
    data = [re.sub(r'[^\w\s]|_', '', word) for word in data] # remove punctuation and underscore
    data = [re.sub(r'\w*\d\w*', '', word) for word in data] # remove character strings that contain a digit
    data = [re.sub(r'\d', '', word) for word in data] # remove digits 
    data = [word.lower() for word in data] # convert the text to lowercase
    data = [word.split() for word in data] # split the sentences into words     
    data = [sent for sent in data if sent != []] # remove empty tokens
        
    return data

# this function calculates the sentiment value for each concept passed into the function
# based on the two words representing positive and negative sentiments.
def distance_array(concepts, goodword, badword, model):
    temp_array = []
    # Calculate the distance between each concept and the good/bad words, capturing the values in an array.
    for i in range(len(concepts)):
        temp_array.append(model.wv.similarity(concepts[i], goodword) - model.wv.similarity(concepts[i], badword))
    return temp_array

# convert sentences to words and remove accents.
def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=de-accentize
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

# remove stopwords based on the list of stopwords       
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) 
             if word not in stop_words] for doc in texts]

The following callback class logs the training loss after each epoch, which was used to decide how many epochs to train for.
It is not needed when running the final script, so it is disabled (wrapped in a string literal).

In [12]:
'''
class LossLogger(CallbackAny2Vec):
    # Output loss at each epoch
    def __init__(self):
        self.epoch = 1
        self.losses = [0]

    def on_epoch_begin(self, model):
        #print(f'Epoch: {self.epoch}', end='\t')

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        lossDelta = loss - self.losses[self.epoch - 1]
        self.losses.append(loss)
        #print(f'  Loss: {lossDelta}')
        self.epoch += 1

loss_logger = LossLogger()
'''

"\nclass LossLogger(CallbackAny2Vec):\n    # Output loss at each epoch\n    def __init__(self):\n        self.epoch = 1\n        self.losses = [0]\n\n    def on_epoch_begin(self, model):\n        #print(f'Epoch: {self.epoch}', end='\t')\n\n    def on_epoch_end(self, model):\n        loss = model.get_latest_training_loss()\n        lossDelta = loss - self.losses[self.epoch - 1]\n        self.losses.append(loss)\n        #print(f'  Loss: {lossDelta}')\n        self.epoch += 1\n\nloss_logger = LossLogger()\n"

The next block of code calculates all relevant statistics for the input Bible.

In [None]:
# This is where we can specify the input text.
fname = 'WEB.txt'

# Depending on the file type, the appropriate data import function is used
#data = data_import(fname)
data = data_importUTF(fname)

# this section defines the stopwords
stop_words = stopwords.words('english')
# adding custom stopwords, mainly from medieval english
stop_words.extend(['shall', 'unto', 'thou', 'thy', 'ye', 'thee', 'upon', 'shalt', 'hath', 'also', 'us', 'hast', 'thine'])


data_words = list(sent_to_words(data))

# remove stop words
data_words = remove_stopwords(data_words)

# Define concepts to be examined
concepts = ['man', 'woman', 'angel', 'devil', 'money', 'bread', 'wine', 'home', 'sword',
            'soldier', 'goat', 'priest', 'wilderness', 'servant', 'king', 'egypt',
            'jerusalem', 'babylon', 'family', 'knowledge']

# if the concept words need to be changed, the list below will be used as a reference when changing the words back.
'''
concepts = ['man', 'woman', 'angel', 'devil', 'money', 'bread', 'wine', 'home', 'sword',
            'soldier', 'goat', 'priest', 'wilderness', 'servant', 'king', 'egypt',
            'jerusalem', 'babylon', 'family', 'knowledge']
            '''

# define the two words representing the positive and negative dimension
good_word = 'holy'
bad_word = 'evil'

# initialize the list that will contain our results
distance_matrix = []

# model params:
CONTEXT_WINDOW = 5
NEGATIVES = 15
MIN_COUNT = 1
EPOCHS = 20

# create the word2vec model 10 times, each time recording the sentiment values and storing them.
for i in range(10):
    bible_model = gensim.models.Word2Vec(sentences = data_words,
                                         workers = 4,
                                         min_count = MIN_COUNT,
                                         negative = NEGATIVES,
                                         window = CONTEXT_WINDOW,
                                         vector_size = 100,
                                         sg = 0,
                                         #callbacks = [loss_logger],
                                         #compute_loss = True,
                                         epochs = EPOCHS) 
    
    #print(bible_model.wv.most_similar('evil'))
    #print(bible_model.wv.most_similar('angel'))
    #print(bible_model.wv.similarity('holy', 'evil'))

    # update the result matrix for each model run
    distance_matrix.append(distance_array(concepts, good_word, bad_word, bible_model))

# convert the result matrix into an numpy matrix
np_matrix = np.array(distance_matrix)

# use the numpy mean function to find the mean values for sentiments
final_output = np.mean(np_matrix, axis = 0)

# print out the results to be copied into excel spreadsheet.
for i in range (len(final_output)):
    print(final_output[i])
