### Imports

In [30]:
# processing
from operator import methodcaller
import csv
import re
import numpy as np
import pandas as pd
from pprint import pprint
import string


# gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt


#sci-kit
from sklearn import feature_extraction

### Processing

In [37]:
blacklist = [t.strip() for t in next(csv.reader(open("blacklist.csv", 'r')))]
levels = [1, 2, 3]



inPath = "raw.csv"

inFile = open(inPath, 'r')
inReader = csv.reader(inFile)

docTokens = dict()


next(inReader)
for inRow in inReader:
    term = inRow[0]
    sentence = inRow[2]
    docID = inRow[3]
    
    
    
    
    token = "_".join([t for t in term.split(":") if re.match(r'[^\W\d]*$', t) and not t in blacklist])
    
    level = token.count("_")
    
    if level in levels and not token in blacklist and len(token) > 0:
        if docID in docTokens:
            docTokens[docID] += [token]
        else:
            docTokens[docID] = [token]

docIDs = list(docTokens.keys())
data = list(docTokens.values())

### LDA Modeling

In [38]:
dictionary = corpora.Dictionary(data)
texts = data

corpus = [dictionary.doc2bow(text) for text in texts]

In [40]:
numTops = 20

lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                       id2word=dictionary,
                                       num_topics=numTops, 
                                       random_state=100,
                                       update_every=1,
                                       chunksize=8,
                                       passes=1,
                                       alpha='auto',
                                       per_word_topics=True)


print(lda_model.print_topics())

#select dist topic
#for each dist id
#where p > bound
#for each term
#check if it's in previous list

doc_lda = lda_model[corpus]

[(0, '0.055*"crystal_structure" + 0.032*"water_molecule" + 0.029*"middot_middot" + 0.015*"hydrogen_bond" + 0.010*"stack_interaction" + 0.009*"rotation_axis" + 0.009*"uncoordinated_water_molecule" + 0.009*"perchlorate_anion" + 0.008*"c_n_bond" + 0.007*"half_occupancy"'), (1, '0.029*"unit_cell" + 0.023*"x_ray_diffraction" + 0.023*"b_axis" + 0.022*"small_angle" + 0.021*"inversion_centre" + 0.018*"c_bond" + 0.013*"o_interaction" + 0.008*"symmetry_centre" + 0.008*"dimensional_structure" + 0.007*"basal_plane"'), (2, '0.009*"ring_system" + 0.009*"special_position" + 0.007*"saxs_intensity" + 0.007*"thermal_expansion" + 0.007*"membrane_protein" + 0.005*"transmembrane_protein" + 0.005*"respectively_deg_dihedral_angle" + 0.004*"detail_analysis" + 0.004*"rotate_phenylene_ring_deg" + 0.004*"function_positive_construction"'), (3, '0.044*"aring_resolution" + 0.009*"crystal_form" + 0.008*"aromatic_system" + 0.008*"crystal_symmetry" + 0.007*"molecular_replacement" + 0.006*"active_site" + 0.006*"unit_ce

### HDP Modeling

In [42]:
from gensim.models import HdpModel

hdp = HdpModel(corpus, dictionary)


  start_time = time.clock()


In [51]:
lda_model = hdp.suggested_lda_model()


print(lda_model.print_topics())

[(148, '0.000*"controversy_subject" + 0.000*"powder_diffraction_peak" + 0.000*"ctb_cholera_toxin" + 0.000*"anion_bind_site" + 0.000*"octahedral_arrangement" + 0.000*"free_nitrate" + 0.000*"azomethine_n" + 0.000*"chloride_calcium_sodium_citrate" + 0.000*"axial_polar_general_tensor" + 0.000*"unusual_hppd_like_domain"'), (149, '0.000*"weak_hg_link_molecule" + 0.000*"central_cuii_ion" + 0.000*"x_background" + 0.000*"silicon_crystal_performance_test" + 0.000*"real_time_observation" + 0.000*"mn_distance" + 0.000*"polarization_factor" + 0.000*"vertical_rowland_cycle" + 0.000*"rna_bind_domain" + 0.000*"modify_enzyme"'), (144, '0.000*"l_idopyranosiduronic_acid" + 0.000*"backbone_type" + 0.000*"silicon_nanowire" + 0.000*"polyethylene_glycol" + 0.000*"mono_energetic_positron_beam" + 0.000*"prime_middot_hydrate" + 0.000*"plane_wave_incident" + 0.000*"substrate_entry" + 0.000*"human_expert" + 0.000*"average_ring_size"'), (147, '0.000*"high_numerical_aperture" + 0.000*"maleic_anhydride" + 0.000*"dim

In [109]:
import pandas as pd

top_words_per_topic = []
for t in range(lda_model.num_topics):
    top_words_per_topic.extend([(t, ) + x for x in lda_model.show_topic(t, topn = 5)])

pd.DataFrame(top_words_per_topic, columns=['Topic', 'Word', 'P']).to_csv("top_words.csv")


### Visualization

In [53]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
pyLDAvis.show(vis)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))



Note: if you're in the IPython notebook, pyLDAvis.show() is not the best command
      to use. Consider using pyLDAvis.display(), or pyLDAvis.enable_notebook().
      See more information at http://pyLDAvis.github.io/quickstart.html .

You must interrupt the kernel to end this command

Serving to http://127.0.0.1:8889/    [Ctrl-C to exit]


127.0.0.1 - - [08/Jul/2019 14:52:05] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [08/Jul/2019 14:52:05] "GET /LDAvis.css HTTP/1.1" 200 -
127.0.0.1 - - [08/Jul/2019 14:52:05] "GET /d3.js HTTP/1.1" 200 -
127.0.0.1 - - [08/Jul/2019 14:52:05] "GET /LDAvis.js HTTP/1.1" 200 -
127.0.0.1 - - [08/Jul/2019 14:52:05] code 404, message Not Found
127.0.0.1 - - [08/Jul/2019 14:52:05] "GET /favicon.ico HTTP/1.1" 404 -



stopping Server...


### Document Tagging

In [108]:
i = 1

out =  csv.writer(open("output.csv", 'w'), lineterminator = '\n')
out.writerow(["prob", "topic", "docID"])

d2t = dict()
t2d = dict()


for ID in docIDs:
    
    print("Document " + str(i) + ": ")
    
    doc = docTokens[ID]
    
    store = list(lda_model[dictionary.doc2bow(doc)])
    
    print(store)
    
    for pair in store:
        
        
        out.writerow([pair[1], pair[0], ID])
        
        if ID in d2t:
            d2t[ID] += [pair]
        else:
            d2t[ID] = [pair]

        if pair[0] in t2d:
            t2d[pair[0]] += [(ID, pair[1])]
        else:
            t2d[pair[0]] = [(ID, pair[1])]

    
    i += 1

    

Document 1: 
[(112, 0.950259907926981)]
Document 2: 
[(7, 0.8771954314527322)]
Document 3: 
[(7, 0.9622139789046689)]
Document 4: 
[(7, 0.9661227764848008)]
Document 5: 
[(121, 0.9336073218975408)]
Document 6: 
[(1, 0.6967561590890972), (100, 0.24612017668540614)]
Document 7: 
[(5, 0.929649975915277)]
Document 8: 
[(75, 0.9530107316502073)]
Document 9: 
[(1, 0.9250309634777899)]
Document 10: 
[(5, 0.9481628507698651)]
Document 11: 
[(139, 0.39949045948202194), (142, 0.5102050781085086)]
Document 12: 
[(0, 0.6172763654760274), (7, 0.35076715664329194)]
Document 13: 
[(137, 0.9656337396874485)]
Document 14: 
[(0, 0.942573071291536)]
Document 15: 
[(0, 0.8915269124395683)]
Document 16: 
[(50, 0.960499477242538)]
Document 17: 
[(7, 0.9106875864073858)]
Document 18: 
[(70, 0.7515985085263273)]
Document 19: 
[(117, 0.9547410332277029)]
Document 20: 
[(27, 0.8013581108986814)]
Document 21: 
[(21, 0.9504269183247248)]
Document 22: 
[(7, 0.23266076932336405), (99, 0.26102924135846406), (139, 0.

[(15, 0.9764386522335383)]
Document 327: 
[(0, 0.44225080362529656), (1, 0.2708320759447859), (7, 0.24803020361239517)]
Document 328: 
[(11, 0.9671936293693171)]
Document 329: 
[(0, 0.6482142075423968), (61, 0.3145062365313245)]
Document 330: 
[(17, 0.9590258868258271)]
Document 331: 
[(91, 0.9567480316820197)]
Document 332: 
[(109, 0.9698350534088834)]
Document 333: 
[(98, 0.9679738962785345)]
Document 334: 
[(1, 0.14418304943287966), (51, 0.8156152519863523)]
Document 335: 
[(3, 0.2926707707323766), (8, 0.6192086938560994)]
Document 336: 
[(13, 0.9689966255608778)]
Document 337: 
[(75, 0.9551466074842888)]
Document 338: 
[(1, 0.9625156943256581)]
Document 339: 
[(0, 0.16265338285478204), (90, 0.8060034903503573)]
Document 340: 
[(122, 0.9502069945088243)]
Document 341: 
[(0, 0.23431588052839805), (16, 0.21566296316584774), (17, 0.48236231820784936)]
Document 342: 
[(92, 0.9818689004403934)]
Document 343: 
[(141, 0.9233094642118501)]
Document 344: 
[(142, 0.900398908511408)]
Document 

[(107, 0.8757475744556197)]
Document 496: 
[(54, 0.36301529348456635), (63, 0.587793482387542)]
Document 497: 
[(43, 0.9792858359745303)]
Document 498: 
[(1, 0.22150319106984395), (81, 0.41530380051390764), (102, 0.31256251139713465)]
Document 499: 
[(141, 0.9783265877118464)]
Document 500: 
[(123, 0.9678922960780022)]
Document 501: 
[(22, 0.968111672499076)]
Document 502: 
[(0, 0.9800763712925469)]
Document 503: 
[(1, 0.9512704132946632)]
Document 504: 
[(36, 0.9448726147577315)]
Document 505: 
[(20, 0.9669861521650874)]
Document 506: 
[(64, 0.9237041639899907)]
Document 507: 
[(87, 0.9289641515847198)]
Document 508: 
[(71, 0.9174898109451675)]
Document 509: 
[(0, 0.9302670785211734)]
Document 510: 
[(17, 0.3076352773367037), (84, 0.6171246700614117)]
Document 511: 
[(1, 0.36805541176909945), (11, 0.5634601035080007)]
Document 512: 
[(3, 0.3980680168545838), (7, 0.5374606407863165)]
Document 513: 
[(18, 0.9241320108682249)]
Document 514: 
[(75, 0.9588843901939313)]
Document 515: 
[(36

[(54, 0.9700806383612444)]
Document 669: 
[(2, 0.565666520288078), (85, 0.41499445330038304)]
Document 670: 
[(38, 0.06348339463905593), (39, 0.36784314172348276), (42, 0.3066675121260436), (90, 0.23340987939278876)]
Document 671: 
[(15, 0.9816745072669191)]
Document 672: 
[(88, 0.9631469423931275)]
Document 673: 
[(62, 0.9715500076337591)]
Document 674: 
[(75, 0.9775733037421447)]
Document 675: 
[(93, 0.9857818426772573)]
Document 676: 
[(62, 0.9875531283397698)]
Document 677: 
[(4, 0.45813792971617956), (97, 0.5279249645810651)]
Document 678: 
[(19, 0.9739797426523186)]
Document 679: 
[(28, 0.9819686572984087)]
Document 680: 
[(21, 0.9716725247569866)]
Document 681: 
[(97, 0.9095689499104362)]
Document 682: 
[(25, 0.4135694663259973), (47, 0.5320634069386377)]
Document 683: 
[(141, 0.9715149438501159)]
Document 684: 
[(1, 0.31424644584724154), (18, 0.625698833783209)]
Document 685: 
[(55, 0.9008403603017894)]
Document 686: 
[(64, 0.21389805700507783), (97, 0.10804992061293758), (115,

[(127, 0.9414397624304677)]
Document 851: 
[(1, 0.2867621999264561), (73, 0.6323784121929615)]
Document 852: 
[(0, 0.40591739940532173), (39, 0.5457010906588233)]
Document 853: 
[(29, 0.9419312881337922)]
Document 854: 
[(79, 0.9335888309149719)]
Document 855: 
[(120, 0.2642435163177903), (137, 0.6884452550903309)]
Document 856: 
[(110, 0.9744891751374709)]
Document 857: 
[(69, 0.94182394066072)]
Document 858: 
[(59, 0.9006320227572124)]
Document 859: 
[(46, 0.9339107137123013)]
Document 860: 
[(18, 0.9342477427524882)]
Document 861: 
[(0, 0.3517972300543058), (18, 0.6228727266099963)]
Document 862: 
[(30, 0.9602865585109693)]
Document 863: 
[(5, 0.9343399775209266)]
Document 864: 
[(110, 0.9738178376410885)]
Document 865: 
[(29, 0.929487992733617)]
Document 866: 
[(0, 0.9486179726302313)]
Document 867: 
[(46, 0.9527933669373583)]
Document 868: 
[(104, 0.9448645211535696)]
Document 869: 
[(8, 0.962127202456702)]
Document 870: 
[(43, 0.917140536644949)]
Document 871: 
[(0, 0.93026730085

[(2, 0.9809388688406616)]
Document 1013: 
[(22, 0.9646950659811199)]
Document 1014: 
[(78, 0.9646031844731524)]
Document 1015: 
[(39, 0.9669542529775593)]
Document 1016: 
[(4, 0.9661773845445214)]
Document 1017: 
[(0, 0.9593225885857338)]
Document 1018: 
[(76, 0.9448781101341951)]
Document 1019: 
[(95, 0.941525367188643)]
Document 1020: 
[(73, 0.9730886219627621)]
Document 1021: 
[(0, 0.2536665558470622), (16, 0.17273195246120257), (19, 0.3289182011036546), (71, 0.21987442045491143)]
Document 1022: 
[(4, 0.8910160168658113)]
Document 1023: 
[(140, 0.9750722656432945)]
Document 1024: 
[(12, 0.8905725293880179)]
Document 1025: 
[(146, 0.9168325251110487)]
Document 1026: 
[(81, 0.9415793236195126)]
Document 1027: 
[(24, 0.9758674343593037)]
Document 1028: 
[(96, 0.9737967022722709)]
Document 1029: 
[(23, 0.5161539260949296), (50, 0.39534151711876153)]
Document 1030: 
[(27, 0.9097082322266573)]
Document 1031: 
[(37, 0.29357847741371623), (80, 0.6404970923029125)]
Document 1032: 
[(4, 0.084

[(17, 0.15636879213000585), (29, 0.5593732705785986), (145, 0.21505073365651647)]
Document 1186: 
[(145, 0.94745505089565)]
Document 1187: 
[(59, 0.8343866983099326)]
Document 1188: 
[(13, 0.7519726967714208)]
Document 1189: 
[(137, 0.9616684041781984)]
Document 1190: 
[(19, 0.9293735871991328)]
Document 1191: 
[(0, 0.9593225920101421)]
Document 1192: 
[(0, 0.33648007995484536), (70, 0.3081689597862475), (106, 0.3168449887821886)]
Document 1193: 
[(69, 0.9646788211154371)]
Document 1194: 
[(10, 0.2334735001310555), (11, 0.3525044752009597), (17, 0.20069988706592656), (105, 0.17187449476734892)]
Document 1195: 
[(1, 0.9512704166399939)]
Document 1196: 
[(107, 0.9337320397096639)]
Document 1197: 
[(0, 0.9674580737318704)]
Document 1198: 
[(11, 0.5344755670205434), (75, 0.30369680726843507)]
Document 1199: 
[(51, 0.9528452810096605)]
Document 1200: 
[(54, 0.5135933413377702), (92, 0.4371791803927895)]
Document 1201: 
[(7, 0.9727100958784121)]
Document 1202: 
[(81, 0.9738641833319972)]
Doc

In [71]:
import operator


    
d2tWriter = csv.writer(open("d2t.csv", 'w'), lineterminator = '\n')
t2dWriter = csv.writer(open("t2d.csv", "w"), lineterminator = "\n")    



d2tWriter.writerow(["doc ID", "topic pairs"])
for ID in d2t:
    d2tWriter.writerow([ID, sorted(d2t[ID], key=operator.itemgetter(1), reverse=True)])
    
t2dWriter.writerow(["topic", "docID pairs"])
for topic in t2d:
    t2dWriter.writerow([topic]+ sorted(t2d[topic], key=operator.itemgetter(1), reverse=True))

### Searches

In [72]:
import sys
import urllib.request
from urllib.error import HTTPError


BASE_URL = 'http://dx.doi.org/'

def getTitle(doi):
    url = BASE_URL + doi
    req = urllib.request.Request(url)
    req.add_header('Accept', 'application/x-bibtex')
    try:
        with urllib.request.urlopen(req) as f:
            bibtex = f.read().decode()
        start = bibtex.find("title = {")
        end = bibtex.find("},", start)
        return bibtex[start + 9:end]
        
        
    except HTTPError as e:
        if e.code == 404:
            return('DOI not found.')
        else:
            return('Service unavailable.')

In [106]:



def search(docID, resistance):
    
    title =getTitle(docID)
    print("For the document: " + title)
    
    topics = d2t[docID]
    
    related = list()
    
    for topic in topics:
        
        if topic[1] > resistance:
            
            print("Topic " + str(topic[0]) + ": " + lda_model.print_topic(topic[0]))
            
            for newID in t2d[topic[0]]:
                if topic[1]*newID[1] > resistance:
                    related.append(getTitle(newID[0]))
    
    try:
        related.remove(title)
    except:
        pass
    print("We found: " + str(related))
    
search("10.1107/S0021889803000281", 0.93)

For the document: Small-angle neutron scattering by porous alumina membranes made of aligned cylindrical channels
Topic 15: 0.000*"chelate_molecule_n_atom" + 0.000*"bipyramidal_coordination_environment" + 0.000*"o_phenol_phenol" + 0.000*"crystalline_arrangement" + 0.000*"cylindrical_shape" + 0.000*"discrete_cation" + 0.000*"symmetry_relate_bmi_imidazole" + 0.000*"thiourea_unit" + 0.000*"m_acetate_ph_buff" + 0.000*"seed_achieve_method"
We found: ['Characterization of dislocations in protein crystals by means of synchrotron double-crystal topography', 'The local domain configuration in partially ordered {AuCu}3', 'X-ray near-field speckle: implementation and critical analysis', 'An Ultra-Low-Temperature Diffractometer Based on an 3He{\\textendash}4He Dilution Refrigerator used for Synchrotron-Radiation X-ray Diffractometry and Topography', '{CTD} Code: a Combinatorial Code for Eukaryotic Transcription']


Or, if we use cosine similarity.

In [23]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def get_cosine_sim(*strs): 
    vectors = [t for t in get_vectors(*strs)]
    return cosine_similarity(vectors)
    
def get_vectors(*strs):
    text = [t for t in strs]
    vectorizer = CountVectorizer(text)
    vectorizer.fit(text)
    return vectorizer.transform(text).toarray()

def search(docID, resistance):
    
    title = getTitle(docID)
    print("For the document: " + title)
    
    str1 = " ".join(docTokens[docID])
    
    found = list()
    for ID in docIDs:
        tokens = docTokens[ID]
        str2 = " ".join(tokens)
        if get_cosine_sim(str1, str2)[0][1] > resistance:
            found.append(getTitle(ID))
    
    
    found.remove(title)
    print("We found: " + str(found))
    
search("10.1107/S0021889803000281", 0.07)

For the document: Small-angle neutron scattering by porous alumina membranes made of aligned cylindrical channels
We found: ['Structure of {PEP}{\\textendash}{PEO} block copolymer micelles: exploiting the complementarity of small-angle X-ray scattering and static light scattering', 'A new method of evaluating slit-smeared small-angle X-ray scattering data', 'First data acquired on the {extendedQ}-range small-angle neutron scattering ({EQ}-{SANS}) diffractometer at the Spallation Neutron Source', 'Comment on Misorientation-angle distribution of randomly oriented symmetric objects by Morawiec (1995)', 'The structure of pumice by neutron diffraction', 'Small-angle scattering curves of concentrated polymer solutions', 'Small-angle X-ray study of the three-dimensional collagen/mineral superstructure in intramuscular fish bone', 'Anomalous dispersion of small-angle scattering of horse-spleen ferritin at the {ironKabsorption} edge', 'Illustration of the anisotropic Porod law', 'A new method t