### Imports

In [1]:
# processing
from operator import methodcaller
import csv
import re
import numpy as np
import pandas as pd
from pprint import pprint

# gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt

#sci-kit
from sklearn import feature_extraction



### Processing

In [2]:
blacklist = [t.strip() for t in next(csv.reader(open("blacklist.csv", 'r')))]

inPath = "raw.csv"

inFile = open(inPath, 'r')
inReader = csv.reader(inFile)

docTokens = dict()


next(inReader)
for inRow in inReader:
    term = inRow[0]
    sentence = inRow[2]
    docID = inRow[3]
    
    token = "_".join([t for t in term.split(":") if re.match(r'[^\W\d]*$', t) and not t in blacklist])
    
    if not token in blacklist and len(token) > 0:
        if docID in docTokens:
            docTokens[docID] += [token]
        else:
            docTokens[docID] = [token]

docIDs = list(docTokens.keys())
data = list(docTokens.values())

### Topic Modeling

In [4]:
dictionary = corpora.Dictionary(data)
texts = data

corpus = [dictionary.doc2bow(text) for text in texts]

In [15]:
numTops = 20

lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                       id2word=dictionary,
                                       num_topics=numTops, 
                                       random_state=100,
                                       update_every=1,
                                       chunksize=8,
                                       passes=1,
                                       alpha='auto',
                                       per_word_topics=True)


print(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0, '0.038*"ray" + 0.033*"c_bond" + 0.025*"crystal" + 0.022*"solution" + 0.020*"cell" + 0.020*"effect" + 0.019*"x_ray" + 0.016*"mixture" + 0.016*"evaporation" + 0.013*"datum"'), (1, '0.014*"intensity" + 0.012*"coordinate" + 0.010*"presence" + 0.008*"metal_atom" + 0.008*"surface" + 0.008*"reflection" + 0.007*"theory" + 0.007*"spectrum" + 0.006*"stack" + 0.006*"curve"'), (2, '0.042*"middot" + 0.030*"anion" + 0.029*"cation" + 0.022*"water" + 0.022*"atom" + 0.014*"middot_minus" + 0.012*"water_molecule" + 0.012*"middot_middot" + 0.010*"hydrogen" + 0.009*"donor"'), (3, '0.040*"dihedral_angle" + 0.039*"angle" + 0.033*"inversion" + 0.029*"configuration" + 0.026*"center" + 0.024*"plane" + 0.023*"inversion_center" + 0.021*"methylene" + 0.020*"ab_plane" + 0.017*"deviation"'), (4, '0.019*"crystal_pack" + 0.012*"n_o" + 0.012*"n_o_h" + 0.010*"substitution" + 0.009*"series" + 0.007*"region" + 0.006*"der" + 0.006*"start_material" + 0.005*"dynamic" + 0.004*"change"'), (5, '0.035*"axis" + 0.030*"solid_

### Visualization

In [None]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
pyLDAvis.show(vis)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))



Note: if you're in the IPython notebook, pyLDAvis.show() is not the best command
      to use. Consider using pyLDAvis.display(), or pyLDAvis.enable_notebook().
      See more information at http://pyLDAvis.github.io/quickstart.html .

You must interrupt the kernel to end this command

Serving to http://127.0.0.1:8889/    [Ctrl-C to exit]


127.0.0.1 - - [05/Jul/2019 13:40:46] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [05/Jul/2019 13:40:46] "GET /LDAvis.css HTTP/1.1" 200 -
127.0.0.1 - - [05/Jul/2019 13:40:46] "GET /d3.js HTTP/1.1" 200 -
127.0.0.1 - - [05/Jul/2019 13:40:46] "GET /LDAvis.js HTTP/1.1" 200 -


### Document Tagging

In [21]:
i = 1

out =  csv.writer(open("output.csv", 'w'), lineterminator = '\n')
out.writerow(["prob", "topic", "docID"])

d2t = dict()
t2d = dict()


for ID in docIDs:
    
    #print("Document " + str(i) + ": ")
    
    doc = docTokens[ID]
    
    store = list(lda_model[dictionary.doc2bow(doc)])[0]
    
    #print(store)
    
    for pair in store:
        
        out.writerow([pair[1], pair[0], ID])
        
        if ID in d2t:
            d2t[ID] += [pair]
        else:
            d2t[ID] = [pair]

        if pair[0] in t2d:
            print(pair[0])
            t2d[pair[0]] += [(ID, pair[1])]
        else:
            t2d[pair[0]] = [(ID, pair[1])]

    
    i += 1

    

2
5
8
10
11
14
16
3
8
10
11
16
1
2
3
7
8
10
11
16
0
2
3
5
8
10
11
16
0
3
5
8
10
11
16
0
2
3
5
8
10
11
16
5
8
11
16
0
2
3
4
5
7
8
10
11
14
16
0
3
5
7
8
10
11
16
0
2
3
5
8
10
11
13
16
3
8
10
11
13
16
0
5
7
8
10
11
14
16
19
0
3
4
8
10
11
12
16
0
2
3
5
7
8
10
11
12
14
16
2
6
8
9
11
12
15
16
0
2
3
5
7
8
10
11
14
16
0
1
2
3
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
0
3
8
11
12
14
16
0
1
2
3
5
6
7
8
9
10
11
13
14
15
16
17
19
0
2
3
5
8
10
11
12
13
14
16
0
1
2
3
5
7
8
10
11
14
16
0
3
7
8
10
11
12
15
16
4
8
9
11
13
14
17
19
0
2
3
8
11
16
0
1
2
3
8
10
11
13
16
18
0
2
3
5
8
9
11
16
19
0
2
3
5
6
7
8
10
11
14
16
18
0
3
4
5
8
11
16
18
0
2
3
5
6
8
10
11
14
16
0
2
3
5
8
10
11
16
17
2
3
8
9
11
16
0
3
5
8
11
16
0
2
3
5
7
8
10
11
14
16
0
2
3
7
8
10
11
15
16
0
3
5
7
8
10
11
12
15
16
0
2
3
5
6
7
8
10
11
12
16
0
3
5
6
8
11
16
0
5
6
7
8
9
14
16
17
19
0
1
2
3
4
8
12
14
16
18
1
4
8
11
16
17
18
19
0
1
3
7
8
10
11
14
16
19
0
1
3
8
9
11
16
0
2
3
8
11
16
0
1
2
3
5
6
7
8
9
10
11
14
16
17
19
0
2
3
4
5
6
7
8
10
11
14
16

0
2
6
8
10
13
15
16
17
19
0
3
5
7
8
11
15
16
0
3
5
8
10
11
12
16
2
4
5
7
8
9
10
11
14
15
16
0
2
3
5
7
8
10
11
12
16
0
1
3
5
8
11
12
14
16
18
0
3
5
8
9
13
14
15
17
19
0
3
6
8
10
11
16
0
3
4
8
11
13
15
16
0
2
3
5
8
10
11
12
16
0
3
7
8
11
16
2
4
5
8
11
16
0
3
4
5
6
8
9
13
14
15
16
17
18
19
0
2
3
4
5
6
8
10
11
16
0
1
6
8
9
13
14
16
17
19
0
2
5
7
8
10
12
16
17
18
19
0
2
3
5
8
10
11
15
16
0
1
3
5
8
9
12
13
14
16
17
18
0
5
8
9
11
13
14
15
19
0
1
3
8
11
13
14
16
17
19
0
1
2
3
5
7
8
10
11
14
16
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
0
3
5
8
10
11
12
16
17
1
3
5
7
8
9
11
15
16
17
0
1
2
4
6
8
11
14
16
0
3
6
7
8
10
11
12
14
16
0
1
3
5
6
8
11
16
18
19
0
2
3
5
7
8
10
11
16
0
2
3
5
8
10
11
16
0
2
3
5
8
10
11
12
16
0
3
4
5
7
8
11
16
18
0
3
5
6
8
11
13
16
0
2
4
5
6
7
8
9
10
11
14
16
19
0
1
5
7
8
10
11
13
14
15
16
17
19
0
2
3
7
8
9
11
12
15
16
0
3
5
7
8
11
12
16
0
3
5
8
10
11
16
0
1
5
6
8
9
13
14
16
17
18
19
0
1
3
4
5
8
11
14
16
17
18
19
0
3
5
6
7
8
9
10
11
13
14
16
18
19
0
2
3
5
7
8
10
11
16

0
3
4
5
6
8
9
11
12
13
14
17
18
19
0
1
2
3
4
5
7
8
10
11
14
15
16
17
18
19
0
1
3
5
6
7
9
11
12
14
15
17
18
0
1
5
6
9
11
12
13
14
15
17
18
19
0
1
6
8
9
13
14
17
18
19
0
1
3
5
8
14
16
17
18
19
0
1
2
5
6
8
9
11
12
13
14
17
19
0
1
4
5
6
8
9
13
14
15
17
19
0
1
2
3
5
8
10
11
16
0
2
3
5
8
9
10
11
16
17
19
4
6
7
8
9
11
13
14
15
16
17
19
0
2
3
4
8
11
12
16
0
1
2
3
5
8
10
11
16
0
2
5
6
7
8
10
11
13
14
17
19
0
1
5
8
11
12
14
16
0
2
3
4
8
9
11
16
0
2
3
5
8
9
11
12
16
0
3
4
7
8
11
12
16
0
1
7
8
10
11
13
14
15
19
0
1
2
5
6
7
8
10
14
16
17
19
2
3
4
5
6
7
8
9
11
12
16
0
2
3
8
9
11
15
16
17
0
1
2
3
5
7
8
9
10
11
12
13
16
0
1
6
7
9
10
11
14
17
19
0
1
2
3
6
8
9
10
13
14
17
18
19
0
1
2
3
4
6
7
8
10
11
13
16
0
1
3
4
8
11
14
16
19
1
2
3
5
8
9
10
11
12
16
18
19
0
1
3
4
8
9
11
14
15
16
17
0
2
3
5
8
10
11
12
13
16
18
3
4
8
9
11
14
15
16
0
1
4
5
6
7
8
9
10
11
12
13
14
15
17
0
2
3
5
7
8
10
11
13
14
16
19
0
3
6
7
8
10
11
15
16
0
7
8
10
11
13
14
16
17
18
19
2
3
7
8
11
12
16
0
3
7
8
11
12
16
0
1
2
3
5
6
7
8
9
10
11

0
2
3
5
8
10
11
12
16
0
2
3
6
7
8
9
12
13
14
16
17
18
19
0
2
3
5
6
7
8
9
11
12
14
16
17
18
2
3
5
6
8
9
10
11
15
16
18
0
1
3
6
8
11
14
16
17
3
8
10
11
12
13
16
0
3
5
7
8
10
11
12
13
15
16
18
0
3
5
7
8
11
14
16
17
0
2
4
5
6
8
9
15
16
17
0
3
8
11
14
16
17
19
0
2
3
5
6
7
8
10
11
12
14
16
0
7
8
10
11
13
14
16
17
19
0
2
3
5
6
7
8
9
10
11
12
13
14
16
17
19
0
4
5
6
10
11
14
15
16
17
18
19
2
3
7
8
10
11
12
15
16
18
0
4
6
8
13
14
15
16
17
18
19
2
3
4
5
7
8
9
10
11
12
15
16
18
3
7
8
9
10
11
12
15
16
18
0
2
3
4
7
8
10
11
12
16
0
3
7
8
9
10
11
15
16
0
2
3
5
8
11
12
16
0
1
2
4
5
6
7
8
9
10
11
14
15
16
18
19
0
1
4
5
7
8
9
11
13
14
15
16
18
0
2
3
5
7
8
10
11
16
0
1
2
3
4
5
6
7
8
9
10
15
16
17
18
0
2
3
5
7
8
10
11
16
19
0
2
3
5
6
7
8
9
11
13
16
0
2
3
6
8
10
11
16
19
0
3
4
6
7
8
10
14
15
18
19
0
1
6
8
9
10
12
14
19
0
2
3
5
8
10
11
16
0
2
3
5
6
7
8
10
11
12
14
16
1
6
7
8
11
13
14
16
19
1
8
9
13
14
17
18
19
0
2
7
8
14
16
18
19
0
2
4
5
7
8
9
11
12
13
14
16
18
0
2
3
5
8
9
10
11
14
16
17
18
0
1
4
5
6
7
8
9
1

In [23]:
import operator


    
d2tWriter = csv.writer(open("d2t.csv", 'w'), lineterminator = '\n')
t2dWriter = csv.writer(open("t2d.csv", "w"), lineterminator = "\n")    



d2tWriter.writerow(["doc ID", "topic pairs"])
for ID in d2t:
    d2tWriter.writerow([ID, sorted(d2t[ID], key=operator.itemgetter(1), reverse=True)])
    
t2dWriter.writerow(["topic", "docID pairs"])
for topic in t2d:
    t2dWriter.writerow([topic]+ sorted(t2d[topic], key=operator.itemgetter(1), reverse=True))

### Searches

In [48]:
import sys
import urllib.request
from urllib.error import HTTPError


BASE_URL = 'http://dx.doi.org/'

def getTitle(doi):
    url = BASE_URL + doi
    req = urllib.request.Request(url)
    req.add_header('Accept', 'application/x-bibtex')
    try:
        with urllib.request.urlopen(req) as f:
            bibtex = f.read().decode()
        start = bibtex.find("title = {")
        end = bibtex.find("},", start)
        return bibtex[start + 9:end]
        
        
    except HTTPError as e:
        if e.code == 404:
            return('DOI not found.')
        else:
            return('Service unavailable.')

In [81]:
search("10.1107/S0021889801004009", 0.15)


def search(docID, resistance):
    
    print("For the document: " + getTitle(docID))
    
    topics = d2t[docID]
    
    related = list()
    
    for topic in topics:
        
        if topic[1] > resistance:
            
            print("Some themes" + str(model.print_topic(0)))
            
            for newID in t2d[topic[0]]:
                if topic[1]*newID[1] > resistance:
                    related.append(getTitle(newID[0]))
    
    
    print("We found: " + str(related))
    

TypeError: slice indices must be integers or None or have an __index__ method