### Imports

In [1]:
# processing
from operator import methodcaller
import csv
import re
import numpy as np
import pandas as pd
from pprint import pprint

# gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt

#sci-kit
from sklearn import feature_extraction



### Processing

In [2]:
blacklist = [t.strip() for t in next(csv.reader(open("blacklist.csv", 'r')))]

inPath = "raw.csv"

inFile = open(inPath, 'r')
inReader = csv.reader(inFile)

docTokens = dict()


next(inReader)
for inRow in inReader:
    term = inRow[0]
    sentence = inRow[2]
    docID = inRow[3]
    
    token = "_".join([t for t in term.split(":") if re.match(r'[^\W\d]*$', t) and not t in blacklist])
    
    if not token in blacklist and len(token) > 0:
        if docID in docTokens:
            docTokens[docID] += [token]
        else:
            docTokens[docID] = [token]

docIDs = list(docTokens.keys())
data = list(docTokens.values())

### Topic Modeling

In [4]:
dictionary = corpora.Dictionary(data)
texts = data

corpus = [dictionary.doc2bow(text) for text in texts]

In [89]:
numTops = 50

lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                       id2word=dictionary,
                                       num_topics=numTops, 
                                       random_state=100,
                                       update_every=1,
                                       chunksize=8,
                                       passes=1,
                                       alpha='auto',
                                       per_word_topics=True)


print(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(45, '0.015*"solvent_molecule" + 0.015*"sum" + 0.011*"purification" + 0.011*"crystal_symmetry" + 0.009*"partial_weight_sum_experimental_total_function" + 0.008*"ether" + 0.007*"copper" + 0.006*"cyanide" + 0.006*"rigid_body" + 0.005*"oliver_et_al_previously_report_polymorph"'), (24, '0.052*"n" + 0.026*"amine" + 0.011*"family" + 0.011*"twin" + 0.010*"o_c" + 0.010*"dipeptide" + 0.009*"perturbation" + 0.009*"transport" + 0.007*"domain_ratio" + 0.007*"transmembrane_protein"'), (15, '0.041*"length" + 0.024*"dimension" + 0.014*"der" + 0.013*"chloride" + 0.011*"extension" + 0.011*"bond_length" + 0.011*"planarity" + 0.010*"literature" + 0.008*"angle_bond_length" + 0.005*"interest"'), (30, '0.026*"torsion" + 0.023*"c_torsion_angle" + 0.021*"zigzag_chain" + 0.017*"respectively_deg" + 0.014*"c_c_torsion_angle" + 0.013*"carbonyl" + 0.012*"transfer" + 0.012*"c_torsion" + 0.012*"proton" + 0.009*"x_ray_analysis"'), (13, '0.045*"occupancy" + 0.034*"solvent" + 0.015*"counter" + 0.015*"stack" + 0.010*"h

### Visualization

In [None]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
pyLDAvis.show(vis)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))



Note: if you're in the IPython notebook, pyLDAvis.show() is not the best command
      to use. Consider using pyLDAvis.display(), or pyLDAvis.enable_notebook().
      See more information at http://pyLDAvis.github.io/quickstart.html .

You must interrupt the kernel to end this command

Serving to http://127.0.0.1:8889/    [Ctrl-C to exit]


127.0.0.1 - - [05/Jul/2019 13:40:46] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [05/Jul/2019 13:40:46] "GET /LDAvis.css HTTP/1.1" 200 -
127.0.0.1 - - [05/Jul/2019 13:40:46] "GET /d3.js HTTP/1.1" 200 -
127.0.0.1 - - [05/Jul/2019 13:40:46] "GET /LDAvis.js HTTP/1.1" 200 -


### Document Tagging

In [90]:
i = 1

out =  csv.writer(open("output.csv", 'w'), lineterminator = '\n')
out.writerow(["prob", "topic", "docID"])

d2t = dict()
t2d = dict()


for ID in docIDs:
    
    #print("Document " + str(i) + ": ")
    
    doc = docTokens[ID]
    
    store = list(lda_model[dictionary.doc2bow(doc)])[0]
    
    #print(store)
    
    for pair in store:
        
        out.writerow([pair[1], pair[0], ID])
        
        if ID in d2t:
            d2t[ID] += [pair]
        else:
            d2t[ID] = [pair]

        if pair[0] in t2d:
            print(pair[0])
            t2d[pair[0]] += [(ID, pair[1])]
        else:
            t2d[pair[0]] = [(ID, pair[1])]

    
    i += 1

    

6
10
6
9
10
34
41
3
6
9
10
23
25
34
39
41
3
5
6
10
25
27
35
41
10
25
27
41
6
9
10
34
41
5
6
10
20
27
41
5
6
9
10
41
5
6
9
10
16
39
41
3
6
10
23
39
41
6
9
10
20
39
41
5
6
9
10
20
27
39
41
5
6
10
16
37
39
41
3
5
6
10
41
1
3
6
10
30
34
3
6
10
16
39
41
5
6
9
10
16
20
23
39
41
1
3
10
16
30
33
36
41
5
6
9
10
20
23
26
28
41
42
6
10
23
27
30
41
5
6
10
25
34
35
41
9
10
15
16
37
39
41
1
2
6
7
10
11
35
40
42
43
6
9
10
16
23
25
33
34
41
45
4
9
10
20
25
41
6
10
20
30
35
40
41
43
4
5
6
10
15
41
0
6
10
15
27
41
1
5
6
9
10
14
20
30
34
36
41
3
5
6
10
34
41
46
3
5
6
9
10
13
15
20
25
34
39
41
45
6
10
14
23
39
41
5
6
10
16
27
41
43
0
5
6
10
12
15
16
30
34
38
41
0
1
10
16
20
24
27
29
30
34
35
38
41
1
3
5
10
22
25
27
28
41
45
5
6
9
10
18
27
29
36
41
44
1
5
6
15
16
17
28
30
38
42
43
3
6
10
11
15
17
27
28
33
41
42
43
47
48
6
10
11
12
17
19
31
35
41
44
48
1
3
6
10
22
23
27
28
31
41
47
6
10
19
20
23
25
35
41
1
6
9
10
25
34
35
39
41
5
6
9
10
20
25
41
42
5
6
10
19
30
32
41
44
1
4
5
6
10
14
19
25
29
41
5
6
10
14
2

41
42
1
5
6
9
10
18
27
37
41
3
5
6
9
10
16
20
21
30
37
38
39
41
4
5
6
10
40
41
5
6
8
10
11
12
21
23
26
27
28
31
40
42
49
6
7
10
11
32
41
42
43
47
48
0
6
10
23
28
40
41
42
44
4
5
6
10
18
41
1
5
6
10
17
28
39
41
0
10
17
20
34
35
39
41
49
6
9
10
14
16
22
23
25
37
41
1
2
3
6
7
8
10
12
18
21
26
28
29
40
41
42
48
0
1
3
6
7
10
14
20
23
34
41
43
44
3
5
6
10
11
14
16
20
41
49
5
6
9
10
20
21
25
29
31
39
40
41
5
6
9
10
14
23
25
27
29
41
3
5
6
10
16
25
27
41
44
5
8
9
10
22
23
25
27
34
41
5
6
10
14
16
37
40
41
3
6
8
9
10
16
17
20
21
39
41
4
5
6
9
10
37
39
40
41
5
6
10
15
20
22
27
41
5
6
10
14
25
27
29
39
41
5
6
9
10
30
37
39
41
0
1
5
6
7
9
10
15
20
22
23
24
27
29
34
35
37
41
42
44
49
3
6
9
10
37
41
3
5
8
9
10
14
22
23
24
25
34
39
40
41
5
6
9
10
24
39
41
5
6
9
10
16
20
28
39
40
41
42
44
6
10
16
39
41
5
8
10
16
20
37
38
39
41
0
3
5
6
8
10
16
18
28
30
39
41
42
6
9
10
16
19
20
22
25
27
30
37
41
2
6
11
15
21
28
33
38
42
43
44
46
47
48
0
9
10
14
16
18
33
34
35
36
41
49
3
5
9
10
13
16
18
20
34
37
39
41
49

2
3
5
6
10
16
17
27
28
29
41
2
5
6
7
12
26
28
29
31
32
33
35
36
40
41
42
43
46
47
7
11
13
19
20
21
28
29
31
35
38
43
46
47
1
6
7
9
10
27
28
29
34
35
38
41
5
6
9
10
11
20
29
41
42
47
0
1
2
3
5
6
7
8
9
10
11
12
16
17
20
21
22
23
25
26
27
28
29
31
34
35
36
37
38
39
40
41
42
43
47
48
49
0
2
6
10
18
20
29
39
41
1
6
15
21
26
32
36
41
43
44
47
48
2
6
7
9
11
12
14
17
19
21
26
28
31
32
33
37
40
42
43
46
47
1
5
6
10
13
20
26
27
31
41
44
46
48
0
6
7
10
16
41
44
3
5
6
10
16
35
40
41
1
3
7
9
10
16
17
22
34
35
41
49
2
6
7
8
10
12
14
17
21
26
28
37
40
42
45
47
48
2
5
10
11
21
23
28
36
37
41
42
44
47
2
4
7
10
17
21
26
31
33
35
41
42
46
47
48
6
9
10
30
34
39
41
2
10
17
18
20
21
25
26
27
28
29
31
32
33
42
43
5
6
10
11
12
21
26
27
31
32
40
42
43
46
48
3
5
6
9
10
41
5
6
9
10
18
41
6
9
10
14
17
20
22
27
35
38
39
41
5
6
7
10
11
14
15
20
21
25
26
29
35
39
42
47
48
49
5
6
9
10
18
22
25
33
41
6
8
10
11
21
28
29
38
41
42
43
6
10
11
12
19
21
27
29
31
32
41
42
43
44
46
47
4
5
6
9
10
16
20
28
29
34
39
41
42
47
4
5

2
4
5
6
8
11
17
21
25
29
31
32
33
35
36
40
42
43
46
47
48
2
7
8
10
11
13
14
19
20
31
33
38
41
42
44
46
49
3
5
6
9
10
14
20
24
37
41
0
9
10
14
18
20
21
22
23
25
27
34
41
6
10
18
22
25
27
28
34
41
1
2
6
10
26
28
31
41
6
9
10
16
18
20
41
5
6
7
8
11
15
17
20
21
30
31
35
39
40
42
46
47
48
49
0
6
10
11
12
15
26
28
29
32
33
36
37
38
39
41
42
45
47
1
5
6
9
10
15
17
24
25
27
31
36
37
39
42
46
49
9
10
20
21
22
23
25
27
33
34
39
41
0
3
5
6
9
10
16
20
30
41
9
10
18
20
21
22
25
27
33
34
35
0
6
10
13
18
30
34
39
41
49
5
6
10
14
24
39
40
41
0
3
5
6
10
14
15
29
41
6
10
16
38
39
41
6
7
10
11
12
17
21
26
27
31
33
38
41
42
43
44
46
47
0
4
6
10
16
17
23
25
41
49
3
5
6
9
10
34
41
49
3
6
9
10
16
20
41
6
10
26
28
29
30
31
35
37
38
41
42
43
44
46
47
48
10
11
12
15
17
21
27
32
35
36
38
39
41
42
44
45
6
10
13
15
26
28
29
41
42
44
45
47
5
6
10
22
25
29
41
43
5
6
10
11
31
41
42
46
47
5
6
10
29
36
38
40
41
0
1
2
6
7
10
13
15
18
26
28
31
34
37
42
45
46
48
0
1
2
3
4
6
8
10
12
15
21
28
29
30
40
41
47
48
2
5
6
7
8
10


1
6
11
17
21
25
26
28
31
32
33
35
36
38
41
42
44
46
47
48
2
5
9
10
13
17
18
20
21
27
28
33
39
40
41
42
43
44
47
48
3
5
6
7
10
12
13
23
30
35
36
40
41
44
49
10
13
17
26
28
29
33
41
42
43
45
46
0
5
9
10
20
22
23
30
34
39
41
3
5
6
7
8
10
16
18
19
20
22
24
30
41
6
10
16
18
38
41
1
6
7
8
10
17
21
24
28
29
31
36
41
46
3
5
6
9
10
12
16
17
18
23
33
34
36
37
41
5
6
10
16
20
28
41
1
2
3
6
8
10
12
13
18
19
28
35
41
42
46
49
2
5
6
10
11
12
15
21
41
45
48
7
8
15
18
21
36
42
43
46
47
4
5
9
10
14
16
20
22
23
25
34
38
39
41
44
2
4
6
7
26
28
31
32
39
40
42
43
44
46
47
48
49
3
4
5
6
8
10
13
16
20
23
25
27
33
37
38
39
41
44
0
3
5
6
9
10
13
16
20
23
27
31
37
41
44
0
5
6
9
10
14
16
20
22
25
28
33
34
35
41
0
10
14
16
25
37
38
39
41
3
6
9
10
13
25
30
34
35
38
41
49
1
4
5
6
7
8
10
14
15
18
20
21
23
24
25
28
29
33
41
42
47
48
49
0
1
4
5
6
9
10
11
16
18
19
20
21
28
34
36
41
42
44
5
6
9
10
20
37
41
0
1
5
6
9
10
12
15
17
20
25
27
29
30
31
35
36
41
42
43
46
47
48
5
6
8
10
16
34
40
41
1
6
8
10
24
25
26
28
41
49
5
6

In [91]:
import operator


    
d2tWriter = csv.writer(open("d2t.csv", 'w'), lineterminator = '\n')
t2dWriter = csv.writer(open("t2d.csv", "w"), lineterminator = "\n")    



d2tWriter.writerow(["doc ID", "topic pairs"])
for ID in d2t:
    d2tWriter.writerow([ID, sorted(d2t[ID], key=operator.itemgetter(1), reverse=True)])
    
t2dWriter.writerow(["topic", "docID pairs"])
for topic in t2d:
    t2dWriter.writerow([topic]+ sorted(t2d[topic], key=operator.itemgetter(1), reverse=True))

### Searches

In [92]:
import sys
import urllib.request
from urllib.error import HTTPError


BASE_URL = 'http://dx.doi.org/'

def getTitle(doi):
    url = BASE_URL + doi
    req = urllib.request.Request(url)
    req.add_header('Accept', 'application/x-bibtex')
    try:
        with urllib.request.urlopen(req) as f:
            bibtex = f.read().decode()
        start = bibtex.find("title = {")
        end = bibtex.find("},", start)
        return bibtex[start + 9:end]
        
        
    except HTTPError as e:
        if e.code == 404:
            return('DOI not found.')
        else:
            return('Service unavailable.')

In [101]:
search("10.1107/S0021889803000281", 0.25)


def search(docID, resistance):
    
    print("For the document: " + getTitle(docID))
    
    topics = d2t[docID]
    
    related = list()
    
    for topic in topics:
        
        if topic[1] > resistance:
            
            for newID in t2d[topic[0]]:
                if topic[1]*newID[1] > resistance:
                    related.append(getTitle(newID[0]))
    
    
    print("We found: " + str(related))
    

For the document: Small-angle neutron scattering by porous alumina membranes made of aligned cylindrical channels
We found: ['(Dimercaptomethylenepropanedinitrilato-S,S{\\textquotesingle})(pyridine)(triphenylphosphine)palladium({II}){\\textendash}Acetonitrile (1/1)', 'Crystal chemical studies of the 5f-series of elements. I. New structure types', 'Small-angle neutron scattering by porous alumina membranes made of aligned cylindrical channels', 'Bis($\\upmu$-diethylphosphido-$\\upkappa$2P:P)bis[bis(2,4,6-trimethylphenyl)indium({III})]', 'Unprecedented conformational flexibility revealed in the ligand-binding domains of {theBovicola} ovisecdysone receptor ({EcR}) and ultraspiracle ({USP}) subunits', 'Crystal chemical studies of the 5f-series of elements. X. Sulfides and oxysulfides', 'Al14Ba8La26.3Ru18Sr53.7O167: a variant of cubic perovskite with isolated {RuO}6units']


In [123]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def get_cosine_sim(*strs): 
    vectors = [t for t in get_vectors(*strs)]
    return cosine_similarity(vectors)
    
def get_vectors(*strs):
    text = [t for t in strs]
    vectorizer = CountVectorizer(text)
    vectorizer.fit(text)
    return vectorizer.transform(text).toarray()

def search(docID, resistance):
    
    title = getTitle(docID)
    print("For the document: " + title)
    
    str1 = " ".join(docTokens[docID])
    
    found = list()
    for ID in docIDs:
        tokens = docTokens[ID]
        str2 = " ".join(tokens)
        if get_cosine_sim(str1, str2)[0][1] > resistance:
            found.append(getTitle(ID))
    
    
    found.remove(title)
    print("We found: " + str(found))
    
search("10.1107/S0021889803000281", 0.13)

For the document: Small-angle neutron scattering by porous alumina membranes made of aligned cylindrical channels
We found: ['First data acquired on the {extendedQ}-range small-angle neutron scattering ({EQ}-{SANS}) diffractometer at the Spallation Neutron Source', 'The structure of pumice by neutron diffraction', 'Small-angle scattering curves of concentrated polymer solutions', 'Illustration of the anisotropic Porod law', 'A direct observation of counterion condensation around cylindrical micelles']
