Searches
---

This program uses R&R terms to conduct TF-IDF nearest-neighbor searches.

#### Imports


In [1]:
# processing
import csv
import re
import numpy as np

# sci-kit
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# URLs
import sys
import urllib.request
from urllib.error import HTTPError

# interfacing
from ipywidgets import widgets, Output
from IPython.display import display, Latex, Math, clear_output, Markdown


#### Preprocessing

Specify the path to the R&R program output, the terms to blacklist, and the term levels to consider. The tokens of each document are loaded into a dictionary keyed by document ID.

In [13]:
# --- Configuration ---------------------------------------------------------
# Forward slashes are accepted by open() on Windows as well as on POSIX
# systems, whereas the previous "tools\\blacklist.csv" only resolved on Windows.
blacklistPath = "tools/blacklist.csv"

# Path to the R&R program output (CSV with a header row).
inPath = "raw.csv"

# Token levels to keep; a token's level is the number of "_" separators in it.
levels = [0, 1, 2, 3]

# --- Load the blacklist ----------------------------------------------------
# The blacklist is the first row of its CSV file. The file is opened in a
# `with` block so the handle is closed, and an empty file yields an empty list
# instead of raising StopIteration.
with open(blacklistPath, 'r', newline='') as blacklistFile:
    blacklist = [t.strip() for t in next(csv.reader(blacklistFile), [])]

# --- Build one space-separated token string per document -------------------
# docTokens maps docID -> "token token token ...", in first-seen order.
docTokens = dict()

with open(inPath, 'r', newline='') as inFile:
    inReader = csv.reader(inFile)

    # Skip the header row (tolerating a completely empty file).
    next(inReader, None)

    for inRow in inReader:
        # Blank or truncated rows lack the docID column; skip them instead of
        # failing with IndexError.
        if len(inRow) < 4:
            continue

        term = inRow[0]
        docID = inRow[3]

        # A term is a ":"-separated path. Keep only the parts that contain no
        # digits or punctuation and are not blacklisted, then join the
        # survivors with "_" to form a single token.
        token = "_".join(
            t for t in term.split(":")
            if re.match(r'[^\W\d]*$', t) and t not in blacklist
        )

        level = token.count("_")

        if level in levels and token not in blacklist and len(token) > 0:
            if docID in docTokens:
                docTokens[docID] += " " + token
            else:
                docTokens[docID] = token

# Parallel lists: docIDs[k] is the identifier of the document whose token
# string is data[k].
docIDs = list(docTokens.keys())
data = list(docTokens.values())

#### Vectorization

Transform each document's token string into a TF-IDF vector.

In [14]:
# Build the TF-IDF model over the per-document token strings.
# min_df=1 keeps every term that occurs in at least one document, which is the
# same filtering the previous integer value 0 produced; newer scikit-learn
# releases reject an integer min_df below 1 during parameter validation.
# max_df=0.5 drops terms that appear in more than half of the documents.
vectorizer = TfidfVectorizer(max_df=0.5, min_df=1, max_features=200000, use_idf=True)

# Sparse matrix with one row per document and one column per retained term.
weights = vectorizer.fit_transform(data)

# Column index -> term. get_feature_names() was removed in scikit-learn 1.2;
# prefer get_feature_names_out() and fall back for older installations.
# .tolist() converts the numpy string array into plain Python strings.
if hasattr(vectorizer, "get_feature_names_out"):
    features = vectorizer.get_feature_names_out().tolist()
else:
    features = vectorizer.get_feature_names()

#### Clustering

We fit a nearest-neighbors model to the TF-IDF document vectors.

In [15]:
# Number of neighbours requested per query. The query document is normally
# returned as its own nearest neighbour, so 6 leaves 5 other documents.
N_NEIGHBORS = 6

# kneighbors() raises when asked for more neighbours than there are fitted
# samples, so clamp the request to the corpus size for very small corpora.
nn = NearestNeighbors(n_neighbors=min(N_NEIGHBORS, weights.shape[0]), algorithm='auto')

# fit() returns the estimator itself; the separate name is kept because the
# search code refers to `nnfitted`.
nnfitted = nn.fit(weights)

#### Searches

Helper functions and a search interface.

In [16]:
BASE_URL = 'http://dx.doi.org/'

# Opening of a BibTeX ``title`` field. Whitespace around "=" is optional, and
# the look-behind stops the pattern from matching inside e.g. ``booktitle``.
_TITLE_FIELD = re.compile(r'(?<![A-Za-z])title\s*=\s*\{')


def _extract_bibtex_title(bibtex):
    """Return the brace-delimited value of the ``title`` field, or None."""
    match = _TITLE_FIELD.search(bibtex)
    if match is None:
        return None

    begin = match.end()
    depth = 1  # the opening brace of the field has already been consumed
    for pos in range(begin, len(bibtex)):
        char = bibtex[pos]
        if char == '{':
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0:
                return bibtex[begin:pos]

    # Unbalanced braces: the field never closes.
    return None


def getTitle(doi, timeout=10):
    """Look up the title of a document through DOI content negotiation.

    Parameters
    ----------
    doi : str
        The DOI to resolve; it is URL-quoted and appended to ``BASE_URL``.
    timeout : float, optional
        Seconds to wait for the resolver before giving up, so one stalled
        request cannot hang a batch of lookups.

    Returns
    -------
    str
        The title from the BibTeX record, or one of the messages
        ``'DOI not found.'`` (HTTP 404), ``'Service unavailable.'`` (any other
        HTTP, network or timeout failure) or ``'Title not found.'`` (the
        response contains no parsable ``title`` field).
    """
    # Imported locally so this function does not depend on urllib.parse being
    # loaded as a side effect of importing urllib.request.
    from urllib.parse import quote

    # DOIs may contain characters that are not legal in a URL path.
    url = BASE_URL + quote(doi)
    req = urllib.request.Request(url)
    req.add_header('Accept', 'application/x-bibtex')

    try:
        with urllib.request.urlopen(req, timeout=timeout) as f:
            bibtex = f.read().decode('utf-8', errors='replace')
    except HTTPError as e:
        if e.code == 404:
            return 'DOI not found.'
        else:
            return 'Service unavailable.'
    except OSError:
        # URLError, connection resets and socket timeouts all derive from
        # OSError; report them the same way as a failing HTTP status.
        return 'Service unavailable.'

    title = _extract_bibtex_title(bibtex)
    if title is None:
        return 'Title not found.'
    return title
        
def get_top_features(rownum, weights, features, top_k=10):
    """Return the highest-weighted feature names of one document.

    Parameters
    ----------
    rownum : int
        Row of ``weights`` that belongs to the document.
    weights : sparse matrix or array, shape (n_documents, n_features)
        Per-document feature weights.
    features : sequence of str
        Feature names, indexed by column of ``weights``.
    top_k : int, optional
        Maximum number of feature names to return.

    Returns
    -------
    list of str
        Up to ``top_k`` feature names in descending order of weight. Features
        with zero weight are omitted, since they do not occur in the document,
        so the list may be shorter than ``top_k``.
    """
    # Densify only the requested row; converting the whole matrix on every
    # call costs n_documents * n_features memory for no benefit.
    row = weights[rownum]
    if hasattr(row, "toarray"):
        weight_vec = row.toarray().ravel()
    else:
        weight_vec = np.asarray(row).ravel()

    top_idx = np.argsort(weight_vec)[::-1][:top_k]
    return [features[i] for i in top_idx if weight_vec[i] > 0]

def find_nearest_papers(row, kNNmodel, tfidf_weights, tfidf_features, data):
    """Find the neighbours and top keywords of one document.

    ``row`` selects the query document in ``tfidf_weights``. ``kNNmodel`` must
    provide ``kneighbors``. ``data`` is accepted for interface compatibility
    and is not used.

    Returns a tuple ``(neighbour_rows, keywords)``: the row indices reported
    by the model as a list, and the document's top feature names.
    """
    top_terms = get_top_features(row, tfidf_weights, tfidf_features)

    # kneighbors returns (distances, indices); only the indices are needed,
    # and they come back as a batch of one query row.
    neighbour_batch = kNNmodel.kneighbors(tfidf_weights[row, :])[1]
    neighbour_rows = list(neighbour_batch[0])

    return (neighbour_rows, top_terms)

def search(query):
    """Print the nearest documents and top keywords for one document.

    ``query`` is a document ID (DOI) from ``docIDs``. The cell output is
    cleared first. If the ID is not part of the corpus a short message is
    printed and nothing else happens. Otherwise the titles of the query and of
    its neighbours are looked up with ``getTitle`` and printed together with
    the query's keywords. Returns None.
    """
    clear_output()

    # Validate before any network access: docIDs.index() would raise
    # ValueError for an unknown ID, and only after a wasted title lookup.
    if query not in docIDs:
        print("Document not found in corpus: " + str(query))
        return

    row = docIDs.index(query)
    queryTitle = getTitle(query)

    indices, keywords = find_nearest_papers(row, nnfitted, weights, features, data)

    # The query is normally its own nearest neighbour; do not list it.
    if row in indices:
        indices.remove(row)
    titles = [getTitle(docIDs[index]) for index in indices]

    print("For your document: " + queryTitle)

    print(" ")
    print("We found the following documents: ")

    for title in titles:
        print("- " + title)

    print(" ")
    print("And the following keywords: ")
    # Same 'a', 'b' formatting as str(list).strip("[]"), but numpy string
    # scalars are shown as plain strings rather than by their own repr.
    print(", ".join(repr(str(keyword)) for keyword in keywords))


In [17]:
# Run a search for every document in the corpus and print a 1-based progress
# counter after each one. search() clears the cell output on every call, so
# only the most recent result and the counters printed after it stay visible.
for i, DOI in enumerate(docIDs, start=1):
    search(DOI)
    print(i)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43


KeyboardInterrupt: 