#### Import

We use the gensim library for topic modeling, and scikit-learn for TF-IDF features and nearest-neighbour search.

In [33]:
# processing
import operator
from operator import methodcaller
import csv
import re
import numpy as np
import pandas as pd
from pprint import pprint
import string
import math
import itertools

# gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import HdpModel

# plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt

# sci-kit
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

#### Preprocessing

Specify the path to the R&R program output. Select the terms to blacklist and the term levels to consider. The tokens are then loaded into a corpus, and a dictionary mapping each document ID to its tokens is constructed.

In [34]:
# Terms to exclude: the first row of the blacklist CSV, whitespace-stripped.
# A forward-slash path works on Windows and POSIX alike (the original "\\"
# separator only resolved on Windows). An empty file yields an empty blacklist.
with open("tools/blacklist.csv", 'r', newline='') as blacklistFile:
    blacklist = [t.strip() for t in next(csv.reader(blacklistFile), [])]

# Set copy for O(1) membership tests inside the row loop.
blacklistSet = set(blacklist)

# Token levels to keep; a token's level is the number of "_" it contains.
levels = [1, 2, 3]

inPath = "raw.csv"

# A term part is kept only when it consists solely of word characters that
# are not digits (the empty string also matches). Compiled once, used per part.
partPattern = re.compile(r'[^\W\d]*$')

# Maps each document ID to the list of tokens found for it, in file order.
docTokens = dict()

with open(inPath, 'r', newline='') as inFile:
    inReader = csv.reader(inFile)

    # Discard the first row (presumably the header); tolerate an empty file.
    next(inReader, None)

    for inRow in inReader:
        # Blank or truncated rows cannot supply column 3; skip them.
        if len(inRow) < 4:
            continue

        term = inRow[0]
        docID = inRow[3]

        # Split the term on ":", drop parts that fail the pattern or are
        # blacklisted, and re-join the remainder with "_".
        token = "_".join(
            t for t in term.split(":")
            if partPattern.match(t) and t not in blacklistSet
        )

        level = token.count("_")

        if level in levels and token not in blacklistSet and len(token) > 0:
            docTokens.setdefault(docID, []).append(token)

# Parallel lists: docIDs[i] is the document whose tokens are data[i].
docIDs = list(docTokens.keys())
data = list(docTokens.values())

In [54]:

# TF-IDF model over the per-document token lists built during preprocessing.
vectorizer = TfidfVectorizer(max_df=0.5, min_df=0.001, max_features=200000, use_idf=True)

# Each document is re-joined into one space-separated string for the vectorizer;
# the result is a sparse matrix with one row per entry of `data`.
weights = vectorizer.fit_transform([" ".join(tokens) for tokens in data])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
# Either way `features` is a plain list indexed by column number.
if hasattr(vectorizer, "get_feature_names_out"):
    features = vectorizer.get_feature_names_out().tolist()
else:
    features = vectorizer.get_feature_names()

#### Feature Extraction

Get the highest-weighted TF-IDF features for a given document.

In [55]:
def get_top_features(rownum, weights, features, top_k=1):
    """Return the names of the highest-weighted features in one matrix row.

    Parameters
    ----------
    rownum : int
        Index of the row (document) to inspect.
    weights : sparse matrix
        Document-by-feature weight matrix supporting row indexing and
        ``toarray()``.
    features : sequence
        Feature names, indexed by column number of ``weights``.
    top_k : int, optional
        How many feature names to return (default 1).

    Returns
    -------
    list
        Up to ``top_k`` feature names, ordered from highest to lowest weight.
    """
    # Densify only the requested row; converting the whole matrix just to
    # read one row is needlessly expensive for large corpora.
    weight_vec = weights[rownum].toarray().ravel()
    # argsort is ascending, so reverse it and keep the first top_k columns.
    top_idx = np.argsort(weight_vec)[::-1][:top_k]
    return [features[i] for i in top_idx]

#### Clustering

In [56]:
# Nearest-neighbour index over the TF-IDF rows: each query returns the
# 5 closest rows, with the search algorithm chosen automatically.
knn_params = {'n_neighbors': 5, 'algorithm': 'auto'}
nn = NearestNeighbors(**knn_params)

# Fit on the document-by-feature matrix; the fitted model is what later cells query.
nnfitted = nn.fit(weights)


In [57]:
import sys
import urllib.request
from urllib.error import HTTPError


# Preferred DOI resolver, over HTTPS (the legacy dx.doi.org host redirects here).
BASE_URL = 'https://doi.org/'


def _extract_bibtex_title(bibtex):
    """Return the contents of the ``title`` field of a BibTeX entry.

    Accepts both ``title = {...}`` and the compact ``title={...}`` spelling,
    does not mistake ``booktitle`` for ``title``, and balances nested braces
    so that titles such as ``The {DNA} Story`` are returned whole. Returns
    ``'Title not found.'`` when there is no brace-delimited title field.
    """
    field = re.search(r'(?<![A-Za-z])title\s*=\s*\{', bibtex, flags=re.IGNORECASE)
    if field is None:
        return 'Title not found.'

    begin = field.end()
    depth = 1  # the opening brace matched by the pattern above
    for pos in range(begin, len(bibtex)):
        ch = bibtex[pos]
        if ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                return bibtex[begin:pos]
    # Unbalanced braces: the field never closes.
    return 'Title not found.'


def getTitle(doi):
    """Look up a document title by DOI via BibTeX content negotiation.

    Parameters
    ----------
    doi : str
        The DOI to resolve; surrounding whitespace is ignored.

    Returns
    -------
    str
        The title from the returned BibTeX record, or one of the sentinel
        strings ``'DOI not found.'`` (HTTP 404), ``'Service unavailable.'``
        (any other HTTP error, or the resolver cannot be reached) or
        ``'Title not found.'`` (the record has no parsable title field).
    """
    from urllib.error import URLError
    from urllib.parse import quote

    # Percent-encode characters that are not valid in a URL path; keep "/"
    # because it separates the DOI prefix from its suffix.
    url = BASE_URL + quote(doi.strip(), safe='/')
    req = urllib.request.Request(url)
    req.add_header('Accept', 'application/x-bibtex')
    try:
        with urllib.request.urlopen(req) as f:
            bibtex = f.read().decode('utf-8', errors='replace')
    except HTTPError as e:
        if e.code == 404:
            return 'DOI not found.'
        return 'Service unavailable.'
    except URLError:
        # Connection-level failure (DNS, refused connection, timeout, ...).
        return 'Service unavailable.'

    return _extract_bibtex_title(bibtex)

In [58]:
def find_nearest_papers(row, kNNmodel, tfidf_weights, tfidf_features, data):
    """Find the nearest neighbours and top keywords for one document row.

    Parameters
    ----------
    row : int
        Row index of the query document in ``tfidf_weights``.
    kNNmodel
        Fitted model exposing ``kneighbors``.
    tfidf_weights
        Document-by-feature weight matrix.
    tfidf_features
        Feature names, indexed by column of ``tfidf_weights``.
    data
        Accepted for interface compatibility; not used here.

    Returns
    -------
    tuple
        ``(neighbour_rows, keywords)``: the neighbour row indices reported by
        the model for the query row, and the row's top feature names.
    """
    top_terms = get_top_features(row, tfidf_weights, tfidf_features)
    query_vector = tfidf_weights[row, :]
    # kneighbors returns (distances, indices); only the indices are needed,
    # and the single query occupies the first row of the result.
    _, neighbour_matrix = kNNmodel.kneighbors(query_vector)
    neighbour_rows = list(neighbour_matrix[0])
    return (neighbour_rows, top_terms)

In [None]:
# Ask for the document to look up; stray whitespace around the ID is ignored.
query = input("DocID to query?: ").strip()

# Validate the ID against the loaded corpus before doing any network lookups.
try:
    row = docIDs.index(query)
except ValueError:
    raise ValueError("DocID not found in the loaded documents: " + repr(query)) from None

queryTitle = getTitle(query)

indices, keywords = find_nearest_papers(row, nnfitted, weights, features, data)
# Drop the query document itself from its neighbours. Filtering, unlike
# list.remove, does not fail when the query row is absent from the result
# (which can happen when several documents are equally close).
indices = [index for index in indices if index != row]
titles = [getTitle(docIDs[index]) for index in indices]

print(" ")
print("For your document: " + queryTitle)

print(" ")
print("We found the following documents: ")

print(titles)

print(" ")
print("And the following keywords: ")
print(keywords)
