In [17]:
from gensim import corpora, models, similarities
import pyLDAvis.gensim
import spacy
import json

In [2]:
# Directory that holds the serialized general-corpus artifacts and trained models.
path = '../general_corpus/'
# gensim dictionary saved alongside the corpus.
dictionary = corpora.Dictionary.load(fname=path + 'general_corpus.dict')
# Bag-of-words corpus stored in Matrix Market format.
corpus = corpora.MmCorpus(fname=path + 'general_corpus.mm')

## Topic Coherence: General 100 Topics, alpha = 'symmetric'

In [4]:
# Load the alpha='symmetric' LDA model and render its interactive pyLDAvis view.
general_100_symmetric = models.LdaModel.load(path + 'alpha_symmetric/general100.model')
general_100_symmetric_viz = pyLDAvis.gensim.prepare(
    topic_model=general_100_symmetric,
    corpus=corpus,
    dictionary=dictionary,
)
pyLDAvis.display(general_100_symmetric_viz)

## Topic Coherence: General 100 Topics, alpha = 'asymmetric'

In [5]:
# Load the alpha='asymmetric' LDA model and render its interactive pyLDAvis view.
general_100_asymmetric = models.LdaModel.load(path + 'alpha_asymmetric/general_100_asymmetric.model')
general_100_asymmetric_viz = pyLDAvis.gensim.prepare(
    topic_model=general_100_asymmetric,
    corpus=corpus,
    dictionary=dictionary,
)
pyLDAvis.display(general_100_asymmetric_viz)

## Topic Coherence: General 100 Topics, alpha = 'auto'

In [6]:
# Load the alpha='auto' LDA model and render its interactive pyLDAvis view.
general_100_auto = models.LdaModel.load(path + 'alpha_auto/general_100_auto.model')
general_100_auto_viz = pyLDAvis.gensim.prepare(
    topic_model=general_100_auto,
    corpus=corpus,
    dictionary=dictionary,
)
pyLDAvis.display(general_100_auto_viz)

## Clustering

In [7]:
def doc_assignment_test(corpus, model, minimum_probability=0.20):
    """Print how decisively ``model`` assigns the documents in ``corpus`` to topics.

    For every document, the topics returned by
    ``model.get_document_topics(doc, minimum_probability=...)`` are counted and
    the document is classified as having exactly one, several, or no such
    topics. The share of each class is printed as a percentage of the corpus.

    Parameters
    ----------
    corpus : iterable
        Documents in whatever form ``model.get_document_topics`` accepts.
    model : object
        Topic model exposing ``get_document_topics(doc, minimum_probability=...)``.
    minimum_probability : float, optional
        Threshold forwarded to ``get_document_topics``; defaults to 0.20, the
        value this analysis was originally run with.

    Returns
    -------
    None
        Results are printed. An empty corpus prints a notice instead of
        raising ``ZeroDivisionError``.
    """
    docs_with_1_topic = 0
    docs_with_multiple_topics = 0
    docs_with_no_topics = 0
    total_docs = 0
    for doc in corpus:
        topics = model.get_document_topics(doc, minimum_probability=minimum_probability)
        total_docs += 1
        if len(topics) == 1:
            docs_with_1_topic += 1
        elif len(topics) > 1:
            docs_with_multiple_topics += 1
        else:
            docs_with_no_topics += 1

    # Guard the percentage computation: an empty corpus has no meaningful shares.
    if total_docs == 0:
        print('Corpus is empty; no topic assignments to report.')
        return

    print('Corpus assigned to a single topic:', (docs_with_1_topic / total_docs) * 100, '%')
    print('Corpus assigned to multiple topics:', (docs_with_multiple_topics / total_docs) * 100, '%')
    print('corpus assigned to no topics:', (docs_with_no_topics / total_docs) * 100, '%')

### Clustering: General 100 Topics, alpha = 'symmetric'

In [8]:
# Topic-assignment breakdown for the alpha='symmetric' model.
doc_assignment_test(corpus=corpus, model=general_100_symmetric)

Corpus assigned to a single topic: 57.53102267864784 %
Corpus assigned to multiple topics: 26.839965768078734 %
corpus assigned to no topics: 15.629011553273427 %


### Clustering: General 100 Topics, alpha = 'asymmetric'

In [9]:
# Topic-assignment breakdown for the alpha='asymmetric' model.
doc_assignment_test(corpus=corpus, model=general_100_asymmetric)

Corpus assigned to a single topic: 57.74497218656397 %
Corpus assigned to multiple topics: 22.83910997004707 %
corpus assigned to no topics: 19.41591784338896 %


### Clustering: General 100 Topics, alpha = 'auto'

In [10]:
# Topic-assignment breakdown for the alpha='auto' model.
doc_assignment_test(corpus=corpus, model=general_100_auto)

Corpus assigned to a single topic: 58.01240907145914 %
Corpus assigned to multiple topics: 24.12280701754386 %
corpus assigned to no topics: 17.864783910997005 %


## Information Retrieval
A final way of evaluating these models is to test whether they support useful information retrieval. Each model is given abstracts from three articles in more recent issues of the *Journal of Biblical Literature* (*JBL*) that it has not yet seen, and is then evaluated on whether it returns similar articles from the corpus it has seen. The abstracts are taken from the following articles:
* Greene, N. E. (2017). Creation, destruction, and a Psalmist's plea: rethinking the poetic structure of Psalm 74. *Journal of Biblical Literature*, 136(1), 85-101. doi:10.15699/jbl.1361.2017.156672
* Hollenback, G. M. (2017). Who Is Doing What to Whom Revisited: Another Look at Leviticus 18:22 and 20:13. *Journal of Biblical Literature*, 136(3), 529-537. doi:10.15699/jbl.1363.2017.161166
* Dinkler, M. B. (2017). Building Character on the Road to Emmaus: Lukan Characterization in Contemporary Literary Perspective. *Journal of Biblical Literature*, 136(3), 687-706. doi:10.15699/jbl.1363.2017.292918

In [11]:
# Build one similarity index per model over the LDA-transformed corpus.
# num_features is passed explicitly (one dimension per topic): without it,
# MatrixSimilarity first scans the whole transformed corpus just to infer the
# width, and the inferred width can fall short of the topic count if the
# highest-numbered topic never appears in any document vector.
index_symmetric = similarities.MatrixSimilarity(
    general_100_symmetric[corpus], num_features=general_100_symmetric.num_topics)
index_asymmetric = similarities.MatrixSimilarity(
    general_100_asymmetric[corpus], num_features=general_100_asymmetric.num_topics)
index_auto = similarities.MatrixSimilarity(
    general_100_auto[corpus], num_features=general_100_auto.num_topics)

In [12]:
def retrieval_test(new_doc, lda, index, num_best=5):
    """Print the corpus articles most similar to ``new_doc`` under ``lda``.

    The document is converted to bag-of-words with the module-level
    ``dictionary``, projected through ``lda``, and looked up in ``index``.
    Each hit is resolved to its metadata through the module-level
    ``doc2metadata`` mapping (keys of the form ``'doc_<n>'``) and printed as
    author, title, publication year and similarity score.

    Parameters
    ----------
    new_doc : list of str
        Tokens of the query document.
    lda : object
        Model supporting ``lda[bow]`` to transform a bag-of-words vector.
    index : object
        Similarity index supporting ``index[vec]`` and a ``num_best`` attribute.
    num_best : int, optional
        Number of matches to request from the index (default 5).

    Returns
    -------
    None
        Results are printed.

    Notes
    -----
    ``index.num_best`` is set only for the duration of the query and then
    restored, so the shared index is not left reconfigured for later callers.
    """
    new_bow = dictionary.doc2bow(new_doc)  # tokens -> bag-of-words
    new_vec = lda[new_bow]  # bag-of-words -> vector in the model's space

    previous_num_best = index.num_best
    index.num_best = num_best
    try:
        matches = index[new_vec]
    finally:
        # Undo the temporary setting even if the query raises.
        index.num_best = previous_num_best

    for doc_position, similarity in matches:
        article = doc2metadata['doc_' + str(doc_position)]
        # str() on every field so a non-string value (e.g. an integer year)
        # cannot break the concatenation.
        print(str(article['author']) + ' "' + str(article['title']) + '" '
              + str(article['pub_year']) + '\n\tsimilarity score -> ' + str(similarity) + '\n')

In [15]:
# Load spaCy's English pipeline via the 'en' shortcut; get_lemmas reuses this object for every text.
nlp = spacy.load('en')
# Stop-word collection that get_lemmas consults when filtering lemmas.
# NOTE(review): `spacy.en` and the 'en' shortcut look tied to an older spaCy release —
# confirm the installed version before re-running this cell.
stop_words = spacy.en.STOPWORDS

def get_lemmas(text):
    """Return the lemmas of the alphabetic tokens in ``text``, minus stop words.

    ``text`` is parsed with the module-level ``nlp`` pipeline. A token
    contributes its lemma only if the token itself is alphabetic and the
    lemma is not in the module-level ``stop_words`` collection. Order of
    appearance is preserved.
    """
    return [
        token.lemma_
        for token in nlp(text)
        if token.is_alpha and token.lemma_ not in stop_words
    ]

#### Load documents

In [18]:
# Read the metadata mapping that retrieval_test looks up by 'doc_<n>' key.
with open('../data/doc2metadata.json', mode='r', encoding='utf8') as metadata_file:
    doc2metadata = json.load(metadata_file)

In [19]:
# Read the Greene abstract, then reduce it to lemmas for use as a retrieval query.
with open('../abstracts/greene.txt', mode='r', encoding='utf8') as abstract_file:
    text = abstract_file.read()
greene = get_lemmas(text)

In [20]:
# Read the Hollenback abstract, then reduce it to lemmas for use as a retrieval query.
with open('../abstracts/hollenback.txt', mode='r', encoding='utf8') as abstract_file:
    text = abstract_file.read()
hollenback = get_lemmas(text)

In [21]:
# Read the Dinkler abstract, then reduce it to lemmas for use as a retrieval query.
with open('../abstracts/dinkler.txt', mode='r', encoding='utf8') as abstract_file:
    text = abstract_file.read()
dinkler = get_lemmas(text)

### Finding articles similar to Greene, N. E. (2017). Creation, destruction, and a Psalmist's plea: rethinking the poetic structure of Psalm 74

#### General 100 Topics, alpha = 'symmetric'

In [22]:
# Query the alpha='symmetric' model and its index with the Greene abstract.
retrieval_test(new_doc=greene, lda=general_100_symmetric, index=index_symmetric)

Gillingham, S. "review of the message of the psalter: an eschatological programme in the book of psalms" 1999
	similarity score -> 0.8872696757316589

Miller, Patrick D. "review of die komposition des psalters: ein formgeschichtlicher ansatz" 1997
	similarity score -> 0.8649082779884338

Malchow, Bruce V. "review of psalm 102 im kontext des vierten psalmenbuches" 1997
	similarity score -> 0.8606864213943481

Limburg, James "review of  jahwe wird kommen, zu herrschen über die erde: ps 90-110 als komposition " 1997
	similarity score -> 0.829866349697113

Jerome F. D. Creach "review of the songs of ascents (psalms 120-134): their place in israelite history and religion" 1999
	similarity score -> 0.8277723789215088



#### General 100 Topics, alpha = 'asymmetric'

In [23]:
# Query the alpha='asymmetric' model and its index with the Greene abstract.
retrieval_test(new_doc=greene, lda=general_100_asymmetric, index=index_asymmetric)

Ceresko, Anthony R. "review of  voyez de vos yeux: étude structurelle de vingt psaumes, dont le psaume 119 " 1995
	similarity score -> 0.8567506074905396

Miller, Patrick D. "review of die komposition des psalters: ein formgeschichtlicher ansatz" 1997
	similarity score -> 0.849635124206543

Allen, Leslie C. "the value of rhetorical criticism in psalm 69" 1986
	similarity score -> 0.8250746726989746

Watts, James W. "review of  merveilles à nos yeux: etude structurelle de vingt psaumes dont celui de 1 ch 16,8-36 " 1997
	similarity score -> 0.8058336973190308

Limburg, James "review of  jahwe wird kommen, zu herrschen über die erde: ps 90-110 als komposition " 1997
	similarity score -> 0.805189847946167



#### General 100 Topics, alpha = 'auto'

In [24]:
# Query the alpha='auto' model and its index with the Greene abstract.
retrieval_test(new_doc=greene, lda=general_100_auto, index=index_auto)

Limburg, James "review of  jahwe wird kommen, zu herrschen über die erde: ps 90-110 als komposition " 1997
	similarity score -> 0.8791912794113159

Watts, James W. "review of  merveilles à nos yeux: etude structurelle de vingt psaumes dont celui de 1 ch 16,8-36 " 1997
	similarity score -> 0.875058650970459

Jerome F. D. Creach "review of the songs of ascents (psalms 120-134): their place in israelite history and religion" 1999
	similarity score -> 0.8635956048965454

Allen, Leslie C. "the value of rhetorical criticism in psalm 69" 1986
	similarity score -> 0.8616266846656799

Miller, Patrick D. "review of die komposition des psalters: ein formgeschichtlicher ansatz" 1997
	similarity score -> 0.8575048446655273



### Finding articles similar to Hollenback, G. M. (2017). Who Is Doing What to Whom Revisited: Another Look at Leviticus 18:22 and 20:13.

#### General 100 Topics, alpha = 'symmetric'

In [25]:
# Query the alpha='symmetric' model and its index with the Hollenback abstract.
retrieval_test(new_doc=hollenback, lda=general_100_symmetric, index=index_symmetric)

Bird, Phyllis A. "review of frauen im alten israel: eine begriffsgeschichtliche und sozialrechtliche studie zur stellung der frau im alten testament" 1993
	similarity score -> 0.8541148900985718

Sharp, Carolyn J. "review of gender in the book of jeremiah: a feminist-literary reading" 2000
	similarity score -> 0.8219231367111206

Adams, Karin "metaphor and dissonance: a reinterpretation of hosea 4:13-14" 2008
	similarity score -> 0.7924692034721375

Collins, John J. "review of marriage as a covenant: a study of biblical law and ethics governing marriage, developed from the perspective of malachi" 1995
	similarity score -> 0.7918387651443481

Walsh, Jerome T. "leviticus 18:22 and 20:13: who is doing what to whom?" 2001
	similarity score -> 0.7850314378738403



#### General 100 Topics, alpha = 'asymmetric'

In [26]:
# Query the alpha='asymmetric' model and its index with the Hollenback abstract.
retrieval_test(new_doc=hollenback, lda=general_100_asymmetric, index=index_asymmetric)

Miller, James E. "a critical response to karin adams's reinterpretation of hosea 4:13-14" 2009
	similarity score -> 0.8336148858070374

Adams, Karin "metaphor and dissonance: a reinterpretation of hosea 4:13-14" 2008
	similarity score -> 0.8207341432571411

Sharp, Carolyn J. "review of gender in the book of jeremiah: a feminist-literary reading" 2000
	similarity score -> 0.8148511648178101

Walsh, Jerome T. "leviticus 18:22 and 20:13: who is doing what to whom?" 2001
	similarity score -> 0.7562586069107056

DAVIS, ANDREW R. "the literary effect of gender discord in the book of ruth" 2013
	similarity score -> 0.7486808896064758



#### General 100 Topics, alpha = 'auto'

In [27]:
# Query the alpha='auto' model and its index with the Hollenback abstract.
retrieval_test(new_doc=hollenback, lda=general_100_auto, index=index_auto)

Floyd, Michael H. " the <rle>מַשָּׂא<pdf> (maśśāʾ) as a type of prophetic book " 2002
	similarity score -> 0.6873390674591064

Melugin, Roy F. "review of a theology of exile: judgment/ deliverance in jeremiah and ezekiel" 1979
	similarity score -> 0.6831732392311096

Wagenaar, Jan A. "review of micah" 2001
	similarity score -> 0.6784759759902954

Knierim, Rolf "review of der bittende mensch: bittritual und klagelied des einzelnen im alten testament" 1983
	similarity score -> 0.674983561038971

Exum, J. Cheryl "promise and fulfillment: narrative art in judges 13" 1980
	similarity score -> 0.6600040197372437



### Finding articles similar to Dinkler, M. B. (2017). Building Character on the Road to Emmaus: Lukan Characterization in Contemporary Literary Perspective.

#### General 100 Topics, alpha = 'symmetric'

In [29]:
# Query the alpha='symmetric' model and its index with the Dinkler abstract.
retrieval_test(new_doc=dinkler, lda=general_100_symmetric, index=index_symmetric)

Kelber, Werner H. "review of die passion jesu als verhaltensmodell: literarische und theologische analyse der traditions-und redaktionsgeschichte der markuspassion" 1976
	similarity score -> 0.9130895733833313

Cousland, J. R. C. "review of the controversy stories in the gospel of matthew: their redaction, form and relevance for the relationship between the matthean community and formative judaism" 2003
	similarity score -> 0.9088087677955627

Landry, David "review of jesus the intercessor: prayer and christology in luke-acts" 1995
	similarity score -> 0.9065011143684387

Driggers, Ira Brent "review of reading mark: engaging the gospel" 2004
	similarity score -> 0.8984276056289673

Matthews, Christopher R. "review of the blind, the lame, and the poor: character types in luke-acts" 1999
	similarity score -> 0.8944092392921448



#### General 100 Topics, alpha = 'asymmetric'

In [31]:
# Query the alpha='asymmetric' model and its index with the Dinkler abstract.
retrieval_test(new_doc=dinkler, lda=general_100_asymmetric, index=index_asymmetric)

Campbell, William Sanger "review of "but it is not so among you": echoes of power in mark 10:32-45" 2005
	similarity score -> 0.8839129209518433

Driggers, Ira Brent "review of reading mark: engaging the gospel" 2004
	similarity score -> 0.8657383322715759

Tiede, David L. "review of urchristliche wundergeschichten: ein beitrag zur formgeschichtlichen erforschung der synoptischen evangelien" 1976
	similarity score -> 0.8544032573699951

Collins, Adela Yarbro "review of a myth of innocence: mark and christian origins" 1989
	similarity score -> 0.83297199010849

Legrand, Lucien "review of the third gospel for the third world. vol. 1, preface and infancy narrative (luke 1:1-2:52)" 1998
	similarity score -> 0.8327959775924683



#### General 100 Topics, alpha = 'auto'

In [32]:
# Query the alpha='auto' model and its index with the Dinkler abstract.
retrieval_test(new_doc=dinkler, lda=general_100_auto, index=index_auto)

Driggers, Ira Brent "review of reading mark: engaging the gospel" 2004
	similarity score -> 0.9241958856582642

Moore, Stephen D. "review of reading mark from the outside: eco and iser leave their marks" 1996
	similarity score -> 0.8905816674232483

Collins, Adela Yarbro "review of irony in mark's gospel: text and subtext" 1993
	similarity score -> 0.8775089979171753

Tyson, Joseph B. "review of the lukan voice: confusion and irony in the gospel of luke" 1988
	similarity score -> 0.8709434270858765

Stegner, William Richard "review of the transfiguration: a source- and redaction-critical study of luke 9:28-36" 1995
	similarity score -> 0.864206075668335

