## Noun Corpus

In [7]:
from gensim import corpora, models, similarities
import pyLDAvis.gensim
import json
from operator import itemgetter
import pandas as pd
import spacy

### Set up

In [8]:
# Directory holding the serialized noun dictionary, corpus and LDA models loaded below
path = '../noun_corpus/'

# load metadata for later use
with open('../data/doc2metadata.json', encoding='utf8', mode='r') as f:
    doc2metadata = json.load(f)
    
# load dictionary and corpus for the noun models
dictionary = corpora.Dictionary.load(path + 'noun_corpus.dict')
corpus = corpora.MmCorpus(path + 'noun_corpus.mm')

# NOTE(review): later cells use `noun_l00`, `noun_250` and `noun_500`, none of
# which are loaded in this cell -- confirm where those models are defined.

# load noun_25 model
noun_25 = models.ldamodel.LdaModel.load(path + 'noun_25.model')

# load noun_50 model
noun_50 = models.ldamodel.LdaModel.load(path + 'noun_50.model')

# load noun_75 model
noun_75 = models.ldamodel.LdaModel.load(path + 'noun_75.model')


### 25 Topics, default hyper-parameters

In [9]:
# Prepare the pyLDAvis data for the noun_25 model from the corpus and dictionary, then render it in the notebook
noun_25_viz = pyLDAvis.gensim.prepare(noun_25, corpus, dictionary)
pyLDAvis.display(noun_25_viz)

### 50 Topics, default hyper-parameters

In [10]:
# Prepare the pyLDAvis data for the noun_50 model from the corpus and dictionary, then render it in the notebook
# NOTE(review): the saved output of this cell is a pyLDAvis ValidationError
# ("Not all rows (distributions) in doc_topic_dists sum to 1"), so no
# visualization is produced for this model -- TODO investigate the cause.
noun_50_viz = pyLDAvis.gensim.prepare(noun_50, corpus, dictionary)
pyLDAvis.display(noun_50_viz)

ValidationError: 
 * Not all rows (distributions) in doc_topic_dists sum to 1.

### 75 Topics, default hyper-parameters

In [11]:
# Prepare the pyLDAvis data for the noun_75 model from the corpus and dictionary, then render it in the notebook
noun_75_viz = pyLDAvis.gensim.prepare(noun_75, corpus, dictionary)
pyLDAvis.display(noun_75_viz)

## Clustering
One of the applications for this topic model is clustering articles from the *JBL*. One way of evaluating the model is to ask, from a big-picture perspective, how much of the corpus it is able to cluster. Here I have defined a function that reports what percentage of the corpus the model assigns to exactly one topic, to multiple topics, or to no topic at all. A document is assigned to a topic only if that topic accounts for at least 20% of the document. That is to say, at least 20% of the words used in a document must come from a particular topic if that document is to be clustered with the other documents in that topic.

In [17]:
def cluster_test(corpus, model, threshold=0.20):
    """Print what share of `corpus` the topic `model` is able to cluster.

    For every document, ``model.get_document_topics`` is called with
    ``minimum_probability=threshold`` and the document is counted as assigned
    to one topic, to several topics, or to none, depending on how many topics
    come back.

    Parameters
    ----------
    corpus : iterable
        Documents in whatever form ``model.get_document_topics`` accepts.
    model : object
        Topic model exposing ``get_document_topics(doc, minimum_probability=...)``.
    threshold : float, optional
        Minimum topic probability for a document to count as belonging to a
        topic. Defaults to 0.20, the value this function used to hard-code.

    Returns
    -------
    None
        The three percentages are printed, not returned.
    """
    docs_with_1_topic = 0
    docs_with_multiple_topics = 0
    docs_with_no_topics = 0
    for doc in corpus:
        n_topics = len(model.get_document_topics(doc, minimum_probability=threshold))
        if n_topics == 1:
            docs_with_1_topic += 1
        elif n_topics > 1:
            docs_with_multiple_topics += 1
        else:
            docs_with_no_topics += 1

    total_docs = docs_with_1_topic + docs_with_multiple_topics + docs_with_no_topics
    if total_docs == 0:
        # An empty corpus used to end in a ZeroDivisionError; report it instead.
        print('Corpus is empty: no documents to cluster.')
        return

    print('Corpus assigned to a single topic:', (docs_with_1_topic / total_docs) * 100, '%')
    print('Corpus assigned to multiple topics:', (docs_with_multiple_topics / total_docs) * 100, '%')
    print('corpus assigned to no topics:', (docs_with_no_topics / total_docs) * 100, '%')

### Clustering: Noun 100 Topics, alpha = 'symmetric'

In [12]:
# Print the clustering breakdown for the `noun_l00` model.
# NOTE(review): `noun_l00` is spelled with a lowercase "l" (not the digit 1)
# and is not loaded in the set-up cell -- confirm where it is defined.
cluster_test(corpus, noun_l00)

  gammad = self.alpha + expElogthetad * np.dot(cts / phinorm, expElogbetad.T)


Corpus assigned to a single topic: 55.54129225502782 %
Corpus assigned to multiple topics: 25.706033376123234 %
corpus assigned to no topics: 18.75267436884895 %


In [13]:
# Print the clustering breakdown for the `noun_250` model.
cluster_test(corpus, noun_250)

Corpus assigned to a single topic: 45.23962344886607 %
Corpus assigned to multiple topics: 8.932391955498503 %
corpus assigned to no topics: 45.82798459563543 %


In [15]:
# Print the clustering breakdown for the `noun_500` model.
cluster_test(corpus, noun_500)

Corpus assigned to a single topic: 28.38040222507488 %
Corpus assigned to multiple topics: 2.2036799315361573 %
corpus assigned to no topics: 69.41591784338897 %


## Information Retrieval
A final way of evaluating this model is to see whether it supports useful information retrieval. To evaluate the model in this way, it is given abstracts of a few articles from more recent issues of *JBL* that it has not yet seen, and it is judged on whether it returns similar articles from the corpus it has seen. The abstracts are taken from the following articles:
* Greene, N. E. (2017). Creation, destruction, and a Psalmist's plea: rethinking the poetic structure of Psalm 74. *Journal of Biblical Literature*, 136(1), 85-101. doi:10.15699/jbl.1361.2017.156672
* Hollenback, G. M. (2017). Who Is Doing What to Whom Revisited: Another Look at Leviticus 18:22 and 20:13. *Journal of Biblical Literature*, 136(3), 529-537. doi:10.15699/jbl.1363.2017.161166
* Dinkler, M. B. (2017). Building Character on the Road to Emmaus: Lukan Characterization in Contemporary Literary Perspective. *Journal of Biblical Literature*, 136(3), 687-706. doi:10.15699/jbl.1363.2017.292918

In [21]:
# Build indices for similarity queries, one per LDA model. Each index holds the
# corpus documents transformed into that model's topic space.
# `num_features` is passed explicitly (one feature per topic): without it,
# gensim first scans the whole transformed corpus -- i.e. runs LDA inference
# over every document an extra time -- just to discover the vector width, and
# would under-size the index if the highest topic ids never occur.
index_100 = similarities.MatrixSimilarity(noun_l00[corpus], num_features=noun_l00.num_topics)
index_250 = similarities.MatrixSimilarity(noun_250[corpus], num_features=noun_250.num_topics)
index_500 = similarities.MatrixSimilarity(noun_500[corpus], num_features=noun_500.num_topics)

  gammad = self.alpha + expElogthetad * np.dot(cts / phinorm, expElogbetad.T)


In [58]:
def retrieval_test(new_doc, lda, index, num_best=10, id2word=None, metadata=None):
    """Print the corpus articles most similar to a new, tokenized document.

    Parameters
    ----------
    new_doc : list of str
        Tokens of the query document (e.g. the output of `get_noun_lemmas`).
    lda : object
        Topic model; ``lda[bow]`` must turn a bag of words into a topic vector.
    index : object
        Similarity index; ``index[vec]`` must yield ``(doc_number, score)``
        pairs once ``index.num_best`` is set.
    num_best : int, optional
        How many matches to request from the index. Defaults to 10, the value
        this function used to hard-code.
    id2word : object, optional
        Dictionary providing ``doc2bow``. Defaults to the notebook-level
        `dictionary`.
    metadata : dict, optional
        Maps ``'doc_<n>'`` keys to dicts with ``'author'``, ``'title'`` and
        ``'pub_year'`` entries. Defaults to the notebook-level `doc2metadata`.

    Returns
    -------
    None
        One block of text per match is printed.

    Notes
    -----
    ``index.num_best`` is assigned on the index that is passed in and is left
    at that value afterwards, exactly as before.
    """
    if id2word is None:
        id2word = dictionary
    if metadata is None:
        metadata = doc2metadata

    new_bow = id2word.doc2bow(new_doc)  # change new document to bag of words representation
    new_vec = lda[new_bow]  # change new bag of words to a vector
    index.num_best = num_best  # set index to generate the requested number of best results

    for doc_number, similarity in index[new_vec]:
        key = 'doc_' + str(doc_number)
        article_dict = metadata[key]
        # str() keeps the concatenation from failing on non-string metadata
        # values (e.g. a publication year stored as an int).
        author = str(article_dict['author'])
        title = str(article_dict['title'])
        year = str(article_dict['pub_year'])
        score = str(similarity)
        print(key + ' ' + author + ' "' + title + '" ' + year + '\n\tsimilarity score -> ' + score + '\n')

In [59]:
# Load spaCy's English pipeline and stop-word set; both are read by get_noun_lemmas below.
# NOTE(review): `spacy.load('en')` and `spacy.en.STOPWORDS` look like the
# spaCy 1.x API -- confirm which spaCy version this notebook is pinned to.
nlp = spacy.load('en')
stop_words = spacy.en.STOPWORDS

def get_noun_lemmas(text):
    """Return the lemmas of the nouns in `text`, in document order.

    `text` is parsed with the notebook-level `nlp` pipeline. A token
    contributes its lemma when it is tagged 'NN', 'NNP' or 'NNS', is purely
    alphabetic, and its lemma is not in the notebook-level `stop_words`.
    """
    noun_tags = ('NN', 'NNP', 'NNS')
    lemmas = []
    for token in nlp(text):
        if token.tag_ not in noun_tags:
            continue  # not one of the noun tags kept here
        if not token.is_alpha:
            continue  # drop tokens with digits or punctuation
        lemma = token.lemma_
        if lemma not in stop_words:
            lemmas.append(lemma)
    return lemmas

In [60]:
# Each abstract is read in full, the file is closed, and the text is then
# reduced to its noun lemmas with get_noun_lemmas.

# Greene, N. E. (2017)
with open('../abstracts/greene.txt', mode='r', encoding='utf8') as f:
    text = f.read()
greene = get_noun_lemmas(text)

# Hollenback, G. M. (2017)
with open('../abstracts/hollenback.txt', mode='r', encoding='utf8') as f:
    text = f.read()
hollenback = get_noun_lemmas(text)

# Dinkler, M. B. (2017)
with open('../abstracts/dinkler.txt', mode='r', encoding='utf8') as f:
    text = f.read()
dinkler = get_noun_lemmas(text)

### Finding articles similar to Greene, N. E. (2017). Creation, destruction, and a Psalmist's plea: rethinking the poetic structure of Psalm 74

#### Noun 100 Topics, alpha = 'symmetric'

In [61]:
# Print the best matches for the Greene abstract using `noun_l00` and `index_100`.
retrieval_test(greene, noun_l00, index_100)

doc_2855 Jefferson, Helen Genevieve "psalm 93" 1952
	similarity score -> 0.8308091163635254

doc_9217 Briggs, Charles A. "an inductive study of selah" 1899
	similarity score -> 0.8224514126777649

doc_5418 Buss, Martin J. "the psalms of asaph and korah" 1963
	similarity score -> 0.8063031435012817

doc_9314 Peters, John P. "notes on some ritual uses of the psalms" 1910
	similarity score -> 0.7978718876838684

doc_7877 Allen, Leslie C. "review of the identity of the individual in the psalms" 1989
	similarity score -> 0.7976692318916321

doc_804 Peters, John P. "another folk song" 1921
	similarity score -> 0.7954691648483276

doc_1745 Hauser, Alan Jon "jonah: in pursuit of the dove" 1985
	similarity score -> 0.790578305721283

doc_1012 Morgenstern, Julian "psalm 121" 1939
	similarity score -> 0.771424412727356

doc_1411 Berry, George R. "the titles of the psalms" 1914
	similarity score -> 0.7675870656967163

doc_123 ARMSTRONG, RYAN M. "psalms dwelling together in unity: the placement of 

In [62]:
# Print the best matches for the Greene abstract using `noun_250` and `index_250`.
retrieval_test(greene, noun_250, index_250)

doc_5418 Buss, Martin J. "the psalms of asaph and korah" 1963
	similarity score -> 0.804955780506134

doc_7176 Malchow, Bruce V. "review of psalm 102 im kontext des vierten psalmenbuches" 1997
	similarity score -> 0.8040056824684143

doc_7286 Miller, Patrick D. "review of die komposition des psalters: ein formgeschichtlicher ansatz" 1997
	similarity score -> 0.7588654160499573

doc_9217 Briggs, Charles A. "an inductive study of selah" 1899
	similarity score -> 0.7568011283874512

doc_5369 Gerstenberger, Erhard "review of the psalms in israel's worship" 1963
	similarity score -> 0.7510282397270203

doc_3761 Childs, Brevard S. "review of asylie und schutzorakel am zionheiligtum" 1969
	similarity score -> 0.7487475872039795

doc_7877 Allen, Leslie C. "review of the identity of the individual in the psalms" 1989
	similarity score -> 0.7459195852279663

doc_2855 Jefferson, Helen Genevieve "psalm 93" 1952
	similarity score -> 0.7360318303108215

doc_1968 Soll, William Michael "the question o

In [63]:
# Print the best matches for the Greene abstract using `noun_500` and `index_500`.
retrieval_test(greene, noun_500, index_500)

doc_7176 Malchow, Bruce V. "review of psalm 102 im kontext des vierten psalmenbuches" 1997
	similarity score -> 0.8650175333023071

doc_123 ARMSTRONG, RYAN M. "psalms dwelling together in unity: the placement of psalms 133 and 134 in two different psalms collections" 2012
	similarity score -> 0.824644148349762

doc_2344 Allen, Leslie C. "review of the message of the psalms: a theological commentary" 1986
	similarity score -> 0.8203396201133728

doc_5418 Buss, Martin J. "the psalms of asaph and korah" 1963
	similarity score -> 0.8113301396369934

doc_2855 Jefferson, Helen Genevieve "psalm 93" 1952
	similarity score -> 0.8091059327125549

doc_7877 Allen, Leslie C. "review of the identity of the individual in the psalms" 1989
	similarity score -> 0.8033403158187866

doc_7286 Miller, Patrick D. "review of die komposition des psalters: ein formgeschichtlicher ansatz" 1997
	similarity score -> 0.7999415397644043

doc_9217 Briggs, Charles A. "an inductive study of selah" 1899
	similarity scor

### Finding articles similar to Hollenback, G. M. (2017). Who Is Doing What to Whom Revisited: Another Look at Leviticus 18:22 and 20:13.

#### Noun 100 Topics

In [65]:
# Print the best matches for the Hollenback abstract using `noun_l00` and `index_100`.
retrieval_test(hollenback, noun_l00, index_100)

doc_8946 McLay, Tim "review of the oracle of tyre: the septuagint of isaiah xxiii as version and vision" 2000
	similarity score -> 0.7071447372436523

doc_8707 Greenspoon, Leonard "review of the og and th versions of daniel" 1999
	similarity score -> 0.6857782602310181

doc_2254 Auld, A. Graeme "review of textual studies in the book of joshua" 1986
	similarity score -> 0.6741912961006165

doc_7766 Hopkins, David C. "review of property rights in the eighth-century prophets: the conflict and its background" 1989
	similarity score -> 0.6701995134353638

doc_7805 Bird, Phyllis A. "review of frauen im alten israel: eine begriffsgeschichtliche und sozialrechtliche studie zur stellung der frau im alten testament" 1993
	similarity score -> 0.6665436625480652

doc_8822 Moore, Michael S. "review of flight and freedom in the ancient near east" 2002
	similarity score -> 0.6569191813468933

doc_1908 Greenspoon, Leonard J. "review of the greek text of judges. recensional developments" 1983
	similari

#### Noun 250 Topics

In [66]:
# Print the best matches for the Hollenback abstract using `noun_250` and `index_250`.
retrieval_test(hollenback, noun_250, index_250)

doc_79 THIESSEN, MATTHEW "revisiting the προσήλυτος in "the lxx"" 2013
	similarity score -> 0.44779250025749207

doc_4258 Arieti, James A. "the vocabulary of septuagint amos" 1974
	similarity score -> 0.44036346673965454

doc_552 Meek, Theophile James " the translation of gêr in the hexateuch and its bearing on the documentary hypothesis " 1930
	similarity score -> 0.43562477827072144

doc_3825 Nida, Eugene A. "implications of contemporary linguistics for biblical scholarship" 1972
	similarity score -> 0.4244069755077362

doc_7698 Anderson, Gary "review of the purification offering in the priestly literature, its meaning and function" 1989
	similarity score -> 0.41102105379104614

doc_387 BÜCHNER, DIRK "'eξιλ" 2010
	similarity score -> 0.4053155183792114

doc_8586 Wolters, Al "semantic borrowing and inner-greek corruption in lxx zechariah 11:8" 1999
	similarity score -> 0.39218002557754517

doc_116 MOFFITT, DAVID M. "p.duk. inv. 727r: new evidence for the meaning and provenance of the 

#### Noun 500 Topics

In [67]:
# Print the best matches for the Hollenback abstract using `noun_500` and `index_500`.
retrieval_test(hollenback, noun_500, index_500)

doc_5882 Mullins, Terence Y. "topos as a new testament form" 1980
	similarity score -> 0.471707820892334

doc_2437 Enslin, Morton S. "review of the english new testament" 1949
	similarity score -> 0.4455508887767792

doc_9039 Brawley, Robert L. "review of homoeroticism in the biblical world: a historical perspective" 2001
	similarity score -> 0.44228595495224

doc_8757 Walsh, Jerome T. "leviticus 18:22 and 20:13: who is doing what to whom?" 2001
	similarity score -> 0.42464834451675415

doc_2058 Brunt, John C. "more on the topos as a new testament form" 1985
	similarity score -> 0.4179156720638275

doc_2658 Scott, R. B. Y. "review of the holy bible. vol. iii: the sapiential books, job to sirach" 1957
	similarity score -> 0.3897961378097534

doc_5809 Reumann, John "review of the new testament octapla: eight english versions of the new testament in the tyndale-king james tradition" 1962
	similarity score -> 0.3839614987373352

doc_5469 Meek, Theophile J. "old testament translation princi

### Finding articles similar to Dinkler, M. B. (2017). Building Character on the Road to Emmaus: Lukan Characterization in Contemporary Literary Perspective.

#### Noun 100 Topics

In [69]:
# Print the best matches for the Dinkler abstract using `noun_l00` and `index_100`.
retrieval_test(dinkler, noun_l00, index_100)

doc_312 Sylva, Dennis "review of dialogue and drama: elements of greek tragedy in the fourth gospel" 2006
	similarity score -> 0.904798150062561

doc_8158 Tyson, Joseph B. "review of the lukan voice: confusion and irony in the gospel of luke" 1988
	similarity score -> 0.898211658000946

doc_8712 Brodie, Thomas L. "review of the discipleship paradigm: readers and anonymous characters in the fourth gospel" 1999
	similarity score -> 0.8799789547920227

doc_9260 IVERSON, KELLY R. "a centurion's "confession": a performance-critical analysis of mark 15:39" 2011
	similarity score -> 0.8666025400161743

doc_1952 Praeder, Susan Marie "review of mark as story: an introduction to the narrative of a gospel" 1984
	similarity score -> 0.8603408932685852

doc_7467 Carroll, John T. "review of the death of the messiah: from gethsemane to the grave: a commentary on the passion narratives in the four gospels" 1996
	similarity score -> 0.8503568172454834

doc_6919 Stegner, William Richard "review of the t

#### Noun 250 Topics

In [70]:
# Print the best matches for the Dinkler abstract using `noun_250` and `index_250`.
retrieval_test(dinkler, noun_250, index_250)

doc_8158 Tyson, Joseph B. "review of the lukan voice: confusion and irony in the gospel of luke" 1988
	similarity score -> 0.8520952463150024

doc_276 Yamasaki, Gary "point of view in a gospel story: what difference does it make? luke 19:1-10 as a test case" 2006
	similarity score -> 0.7471709251403809

doc_8712 Brodie, Thomas L. "review of the discipleship paradigm: readers and anonymous characters in the fourth gospel" 1999
	similarity score -> 0.6802945137023926

doc_7507 Brown, Raymond E. "review of tiempo de anuncio: estudio de lc 1,5-2,52" 1996
	similarity score -> 0.6748663783073425

doc_2145 Fokkelman, J. P. "review of the art of biblical narrative" 1983
	similarity score -> 0.6625063419342041

doc_7626 Smith, D. Moody "review of the fourth gospel and its predecessor: from narrative source to present gospel" 1990
	similarity score -> 0.6612517237663269

doc_7261 Meadors, Edward "review of jesus' walking on the sea: an investigation of the origin of the narrative account" 1998
	

#### Noun 500 Topics

In [71]:
# Print the best matches for the Dinkler abstract using `noun_500` and `index_500`.
retrieval_test(dinkler, noun_500, index_500)

doc_5910 Beardslee, William A. "review of structural exegesis: from theory to practice: exegesis of mark 15 and 16; hermeneutical implications" 1980
	similarity score -> 0.6751855611801147

doc_7684 Matthews, Christopher R. "review of  die wir-passagen der apostelgeschicte: ein lukanisches stilmittel aus jüdischer tradition " 1991
	similarity score -> 0.6431180238723755

doc_5909 Donahue, John R. "review of mark's treatment of the jewish leaders" 1980
	similarity score -> 0.6390593647956848

doc_5911 Nardoni, Enrique "review of  la transfiguración de jesús y el diálogo sobre elías según el evangelio de san marcos " 1980
	similarity score -> 0.6070594787597656

doc_7099 Davies, Philip R. "method and madness: some remarks on doing history with the bible" 1995
	similarity score -> 0.5942044258117676

doc_8872 Landy, Francis "review of "why ask my name?" anonymity and identity in biblical narrative" 2000
	similarity score -> 0.5909191370010376

doc_3366 Grobel, Kendrick "review of form-cri