In [203]:
import re
import matplotlib.pyplot as plt
import numpy as np
import math
from nltk.stem import PorterStemmer
from collections import Counter, defaultdict
import time
from itertools import chain
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import sklearn
from sklearn import ensemble
from sklearn.metrics import classification_report
import scipy
from scipy import sparse
import csv
from statistics import mean
from sklearn.model_selection import train_test_split
from scipy import stats
import nltk 
from nltk.tokenize import word_tokenize

## Text analysis

In [204]:
f = open('englishST.txt', 'r')
STwords = [word.rstrip() for word in f.readlines()]
ps = PorterStemmer()

In [205]:
# corpora_map = {"OT":1, "NT":2, "Quran":3}

corpora_doc_counts = {"OT":0, "NT":0, "Quran":0}
OT_doc_tokens = [defaultdict(list)]
Quran_doc_tokens = []
NT_doc_tokens = []

inv_index = defaultdict(lambda: defaultdict(int))

with open('bible_and_quran.tsv', 'r') as f:
    reader = csv.reader(f, delimiter='\t')

    for row in reader:
        # corpora_num = corpora_map[row[0]]
        verse = row[1]
        corpora_doc_counts[row[0]] += 1


        verse_processed = [ps.stem(word) for word in re.findall(r'\w+', verse) if word.lower() not in STwords]
        #verse_processed= [word for word in verse_processed if word not in STwords]
        for word in set(verse_processed):
            inv_index[word][row[0]] += 1
            ### {bible: {OT:2 , NT:2 , Quaran:2}}

        if row[0] == "OT":
            OT_doc_tokens.append(verse_processed)
        elif row[0] == "NT":
            NT_doc_tokens.append(verse_processed)
        else:
            Quran_doc_tokens.append(verse_processed)




In [206]:
def calculate_nterms(inv_index):
    
    N11_OT, N11_NT, N11_Quran = defaultdict(int),defaultdict(int),defaultdict(int)
    N10_OT, N10_NT, N10_Quran = defaultdict(int),defaultdict(int),defaultdict(int)
    N01_OT, N01_NT, N01_Quran = defaultdict(int),defaultdict(int),defaultdict(int)
    N00_OT, N00_NT, N00_Quran = defaultdict(int),defaultdict(int),defaultdict(int)
    
    for word,val in inv_index.items():
        N11_OT[word] = val["OT"]
        N01_OT[word] = corpora_doc_counts["OT"] - val["OT"]
        N10_OT[word] = val["NT"] + val["Quran"]
        N00_OT[word] = (corpora_doc_counts["NT"]+ corpora_doc_counts["Quran"]) - (val["NT"] + val["Quran"])
        
        N11_NT[word] = val["NT"]
        N01_NT[word] = corpora_doc_counts["NT"] - val["NT"]
        N10_NT[word] = val["OT"] + val["Quran"]
        N00_NT[word] = (corpora_doc_counts["OT"]+corpora_doc_counts["Quran"]) - (val["OT"] + val["Quran"])
        
        N11_Quran[word] = val["Quran"]
        N01_Quran[word] = corpora_doc_counts["Quran"] - val["Quran"]
        N10_Quran[word] = val["OT"] + val["NT"]
        N00_Quran[word] = (corpora_doc_counts["OT"]+corpora_doc_counts["NT"]) - (val["OT"] + val["NT"])
    
    return N11_OT,N01_OT,N10_OT,N00_OT, N11_NT,N01_NT,N10_NT,N00_NT, N11_Quran,N01_Quran,N10_Quran, N00_Quran

In [207]:
def MI(N,N11,N01,N10,N00,vocab):        
        MI = {}
  
        for word in vocab:
            N1X = N11[word] + N10[word]
            N0X = N00[word] + N01[word]
            NX1 = N01[word] + N11[word]
            NX0 = N00[word] + N10[word]
            try:
                first = (N11[word]/N)*math.log2((N*N11[word])/(N1X*NX1))
            except (ValueError,ZeroDivisionError):
                first = 0
            try:
                second = (N01[word]/N)*math.log2((N*N01[word])/(N0X*NX1))
            except (ValueError,ZeroDivisionError):
                second = 0
            try:
                third = (N10[word]/N)*math.log2((N*N10[word])/(N1X*NX0))
            except (ValueError,ZeroDivisionError):
                third = 0
            try:
                fourth = (N00[word]/N)*math.log2((N*N00[word])/(N0X*NX0))
            except (ValueError,ZeroDivisionError):
                fourth = 0

            MI[word] = first+second+third+fourth
    
        return {k: v for k, v in sorted(MI.items(), key=lambda item: item[1], reverse=True)}

In [208]:
def Chi(N11,N01,N10,N00,vocab):
    Chi = {}
    
    for word in vocab:
        try:
            numTerm1 = (N11[word]+N10[word]+N01[word]+N00[word])
            numTerm2 = ((N11[word]*N00[word])-(N10[word]*N01[word]))**2
            denTerm1 = (N11[word]+N01[word])
            denTerm2 = (N11[word]+N10[word])
            denTerm3 = (N10[word]+N00[word])
            denTerm4 = (N01[word]+N00[word])
            Chi[word] = (numTerm1 * numTerm2)/(denTerm1 * denTerm2 * denTerm3 * denTerm4)
        except ZeroDivisionError:
            Chi[word] = 0
    
    return {k: v for k, v in sorted(Chi.items(), key=lambda item: item[1], reverse=True)}

In [209]:
N = corpora_doc_counts["Quran"]+corpora_doc_counts["NT"]+corpora_doc_counts["OT"]
N11_OT,N01_OT,N10_OT,N00_OT, N11_NT,N01_NT,N10_NT,N00_NT, N11_Quran,N01_Quran,N10_Quran,N00_Quran = calculate_nterms(inv_index)
MI_OT = MI(N,N11_OT,N01_OT,N10_OT,N00_OT,inv_index.keys())
Chi_OT = Chi(N11_OT,N01_OT,N10_OT,N00_OT,inv_index.keys())

MI_NT = MI(N,N11_NT,N01_NT,N10_NT,N00_NT,inv_index.keys())
Chi_NT = Chi(N11_NT,N01_NT,N10_NT,N00_NT,inv_index.keys())

MI_quran = MI(N,N11_Quran,N01_Quran,N10_Quran,N00_Quran,inv_index.keys())
Chi_quran = Chi(N11_Quran,N01_Quran,N10_Quran,N00_Quran,inv_index.keys())

In [210]:
MI_OT

{'jesu': 0.03865642296857681,
 'israel': 0.03637950500182374,
 'king': 0.03137734313453207,
 'lord': 0.03071273778930124,
 'ot': 0.02271016008478581,
 'christ': 0.020603855836814237,
 'believ': 0.018544325242376834,
 'son': 0.01639084398731795,
 'god': 0.016129394858056728,
 'muhammad': 0.0160872814929012,
 'judah': 0.01445048948172949,
 'land': 0.014392319127286,
 'torment': 0.013102893380498524,
 'faith': 0.011983021597475932,
 'hous': 0.011963278312342857,
 'receiv': 0.010504486984305934,
 'david': 0.01022726276141957,
 'discipl': 0.009691265769285064,
 'revel': 0.009386059327645115,
 'unbeliev': 0.008703141314896303,
 'egypt': 0.008338457650589532,
 'suffer': 0.00824117536620728,
 'quran': 0.007872283956816628,
 'disbeliev': 0.006954965486126062,
 'offer': 0.006930165733447997,
 'children': 0.006922942977507751,
 'citi': 0.006687922194636179,
 'preach': 0.006645454297764733,
 'unjust': 0.006616485161295923,
 'hand': 0.006609183013708409,
 'nt': 0.006562149033913036,
 'hundr': 0.006

In [211]:
MI_NT

{'jesu': 0.05662963922525349,
 'christ': 0.03449451244115645,
 'lord': 0.023783884082444827,
 'israel': 0.015377196038873421,
 'discipl': 0.015265886637701018,
 'peopl': 0.011502018060738844,
 'king': 0.011461225021200179,
 'nt': 0.01094299833428795,
 'ot': 0.010911579368191919,
 'land': 0.010318888657265943,
 'peter': 0.010287146868953833,
 'paul': 0.010287146868953833,
 'thing': 0.009253256280789877,
 'spirit': 0.007739911452335798,
 'john': 0.007180997640348571,
 'church': 0.006761343153929483,
 'judah': 0.006228182478986985,
 'pharise': 0.005781588641924426,
 'gospel': 0.0053148299185464196,
 'apostl': 0.0052018124745561636,
 'world': 0.005065462404867951,
 'grace': 0.004917685606861091,
 'jew': 0.004759512013894537,
 'simon': 0.004549995545556414,
 'assuredli': 0.004380927865368209,
 'muhammad': 0.004362347081509243,
 'immedi': 0.004318626010000206,
 'faith': 0.004273048563868168,
 'synagogu': 0.004030498555043338,
 'demon': 0.003946348785696927,
 'teacher': 0.003599404592068977,


In [212]:
MI_quran

{'god': 0.031315057358951216,
 'muhammad': 0.030213244627015343,
 'torment': 0.020586944429588798,
 'believ': 0.020228021478441268,
 'messeng': 0.01596337474190813,
 'king': 0.01585793201151665,
 'israel': 0.015573946813467326,
 'quran': 0.01473700802932229,
 'revel': 0.014482839486013775,
 'unbeliev': 0.01306507415583372,
 'guidanc': 0.01280541494832731,
 'disbeliev': 0.01265407734148562,
 'son': 0.012396106515521876,
 'deed': 0.011197876377753147,
 'unjust': 0.011099761808834853,
 'man': 0.01087477521629764,
 'forgiv': 0.010599187790579657,
 'creat': 0.010583378901595636,
 'hell': 0.009740413739005716,
 'ot': 0.008876563864178223,
 'reward': 0.008836367097524256,
 'truth': 0.008318197784199972,
 'evid': 0.008237239718973877,
 'hous': 0.008170546003651682,
 'peopl': 0.00783738851734751,
 'reveal': 0.007735957792121968,
 'repli': 0.007663938438228129,
 'guid': 0.007246825088750623,
 'book': 0.0071547766888245224,
 'suffer': 0.006693099336391879,
 'judah': 0.006479328469452591,
 'reject

In [213]:
Chi_OT

{'jesu': 1334.8698276250661,
 'lord': 1213.3493535935352,
 'israel': 1177.8435083487318,
 'king': 1044.3327373812112,
 'christ': 709.8083088299884,
 'god': 691.887103073095,
 'believ': 682.3722177293455,
 'ot': 631.651530007109,
 'son': 620.2789291542364,
 'muhammad': 553.8751401887283,
 'land': 518.2088078451267,
 'faith': 484.0123607192929,
 'torment': 465.8993361335546,
 'hous': 438.1107969275539,
 'receiv': 431.56552308666653,
 'judah': 429.1077071373241,
 'david': 350.46256904284036,
 'discipl': 339.59336220960813,
 'revel': 329.03008486795017,
 'suffer': 311.15625648261687,
 'unbeliev': 299.34272280179573,
 'quran': 270.73464806464597,
 'thing': 264.76202776355467,
 'egypt': 264.1978446658786,
 'truth': 263.7948653990638,
 'children': 260.1138955646877,
 'hand': 255.8566924611773,
 'deed': 251.42369804592133,
 'messeng': 250.29976951301984,
 'citi': 249.75056171954773,
 'forgiv': 248.44426349956478,
 'preach': 244.79425071225606,
 'offer': 244.66187044065046,
 'unjust': 239.40483

In [214]:
Chi_NT

{'jesu': 2908.4639588941686,
 'christ': 1697.6844705205804,
 'lord': 857.4801210896427,
 'discipl': 778.8954521225505,
 'nt': 539.6680471448283,
 'peter': 507.3512846102345,
 'paul': 507.3512846102345,
 'thing': 461.75895810454455,
 'israel': 458.4989706404623,
 'spirit': 406.49446398658444,
 'peopl': 386.7692709782207,
 'john': 373.19741323549994,
 'king': 363.06863147430965,
 'church': 340.552741248568,
 'land': 311.6269694046532,
 'pharise': 285.24658843252763,
 'gospel': 282.7857005270417,
 'world': 271.719829836737,
 'grace': 266.7591215705243,
 'ot': 264.09589771810636,
 'apostl': 256.65413703483637,
 'jew': 253.83633595110805,
 'immedi': 234.142013040136,
 'assuredli': 230.74501026827014,
 'simon': 224.5056740458874,
 'faith': 216.63602145406932,
 'demon': 207.20884326342903,
 'synagogu': 204.76082694199238,
 'teacher': 189.46597907748287,
 'galile': 177.63829865958365,
 'baptiz': 176.22347230074104,
 'preach': 171.444589704172,
 'judah': 169.2099682466374,
 'pilat': 167.3999756

In [215]:
Chi_quran

{'muhammad': 1667.1794155129128,
 'god': 1515.8516920306722,
 'torment': 1204.0429811331248,
 'believ': 1197.8308197898732,
 'messeng': 944.7981741649254,
 'revel': 846.7442820846965,
 'quran': 814.9187417325003,
 'unbeliev': 763.4216955655063,
 'guidanc': 730.7404634709078,
 'disbeliev': 708.9024843667343,
 'deed': 660.3567554831479,
 'unjust': 648.7707074363959,
 'forgiv': 630.6325564043611,
 'creat': 629.9579515642539,
 'hell': 579.3304616795376,
 'reward': 524.2055668050467,
 'evid': 487.8856635636424,
 'truth': 471.9140276235203,
 'reveal': 455.7700920892837,
 'repli': 442.11258440776203,
 'guid': 431.79664089982697,
 'israel': 425.0127835747377,
 'king': 424.3356067925696,
 'book': 407.4813218197649,
 'suffer': 391.0260910894634,
 'son': 386.7813386591314,
 'reject': 381.93353864200037,
 'peopl': 378.1633762185004,
 'human': 360.51508575483837,
 'exist': 350.93912978482456,
 'equal': 349.56898787426616,
 'true': 345.2098507433812,
 'grant': 344.4706803570881,
 'worship': 335.7232

## Topic analysis

In [216]:
def topic_analysis(corpus1,corpus2,corpus3):

    combined_corpus = corpus1+corpus2+corpus3
    common_texts = [list(x) for x in combined_corpus]
    common_dictionary = Dictionary(common_texts)
    common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
    
    c1 = [common_dictionary.doc2bow(text) for text in corpus1]
    c2 = [common_dictionary.doc2bow(text) for text in corpus2]
    c3 = [common_dictionary.doc2bow(text) for text in corpus3]

    num_topics = 20

    lda = LdaModel(common_corpus, num_topics=num_topics, id2word=common_dictionary, random_state = 25, iterations=500)
    
    topics_scores1 = defaultdict(float)
    topics_scores2 = defaultdict(float)
    topics_scores3 = defaultdict(float)

    for doc in c1:
        for docid, prob in lda.get_document_topics(doc):
            topics_scores1[docid] += prob
        
    for doc in c2:
        for docid, prob in lda.get_document_topics(doc):
            topics_scores2[docid] += prob
            
    for doc in c3:
        for docid, prob in lda.get_document_topics(doc):
            topics_scores3[docid] += prob
    
    topics_scores1 = {key:val/len(c1) for key,val in topics_scores1.items()}
    top_topics1 = [key for key, val in sorted(topics_scores1.items(), key=lambda item: item[1], reverse=True)]
    
    topics_scores2 = {key:val/len(c2) for key,val in topics_scores2.items()}
    top_topics2 = [key for key, val in sorted(topics_scores2.items(), key=lambda item: item[1], reverse=True)]
    
    topics_scores3 = {key:val/len(c3) for key,val in topics_scores3.items()}
    top_topics3 = [key for key, val in sorted(topics_scores3.items(), key=lambda item: item[1], reverse=True)]

    return lda, top_topics1, top_topics2, top_topics3

In [217]:
lda,t1,t2,t3 = topic_analysis(OT_doc_tokens,NT_doc_tokens,Quran_doc_tokens)
print("OT Top Topics:", t1[0])
print("NT Top Topics:",t2[0])
print("Quran Top Topics:", t3[0])

OT Top Topics: 6
NT Top Topics: 0
Quran Top Topics: 1


In [218]:
print(f"Topic {t1[0]} best scores: {lda.print_topic(t1[0])}")

Topic 6 best scores: 0.119*"son" + 0.063*"father" + 0.049*"brother" + 0.039*"law" + 0.038*"hous" + 0.025*"abraham" + 0.023*"wife" + 0.018*"wive" + 0.016*"joseph" + 0.015*"wrath"


In [219]:
print(f"Topic {t2[0]} best scores: {lda.print_topic(t2[0])}")

Topic 0 best scores: 0.059*"word" + 0.048*"call" + 0.047*"lord" + 0.044*"follow" + 0.043*"angel" + 0.042*"knowledg" + 0.041*"god" + 0.041*"spirit" + 0.034*"strive" + 0.032*"peopl"


In [220]:
print(f"Topic {t3[0]} best scores: {lda.print_topic(t3[0])}")

Topic 1 best scores: 0.270*"god" + 0.052*"earth" + 0.051*"heaven" + 0.051*"lord" + 0.043*"deed" + 0.036*"judgment" + 0.025*"thing" + 0.024*"day" + 0.020*"peopl" + 0.019*"favor"
