In [1]:
%matplotlib inline

In [1]:
import spacy
from spacy import displacy
import nltk


In [84]:
# using scikit get tfidf vectors for each document and compare them using cosine similarity.
# Return the top 10 most similar documents.
# The documents are in the directory /home/centos/data/ and are named 1.txt, 2.txt, etc.

import os
import csv
import requests
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## calculate the cosine similarity between a vector and a numpy array of vectors
cosine_similarity = lambda a, b: np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
# get top-k sorted indices
top_k = lambda a, k: np.argsort(a)[-k:]


# function to get vectorizer and vector representation
# specify max number of features to use
def get_vectorizer(corpus,max_features=5000):
    sklearn_tfidf = TfidfVectorizer(input='content', 
        encoding='utf-8', decode_error='replace', strip_accents='unicode', 
        lowercase=True, analyzer='word', stop_words='english', 
        token_pattern=r'(?u)\b[a-zA-Z_][a-zA-Z0-9_]+\b',
        ngram_range=(1, 2), max_features = max_features, 
        norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True,
        max_df=1.0, min_df=1)
    
    sklearn_representation = sklearn_tfidf.fit_transform(corpus)
    return sklearn_tfidf, sklearn_representation


# function to get the tfidf vector for the text
def get_tfidf_vector(vectorizer, text):
    sklearn_representation = vectorizer.transform([text])
    return sklearn_representation.toarray()[0]


In [33]:
import dask.bag as db
import pandas as pd
import numpy as np
import json

In [40]:
docs = db.read_text('./arxiv-metadata-oai-snapshot.json').map(json.loads)

In [41]:
docs

dask.bag<loads, npartitions=1>

In [42]:
docs.count().compute()

2196513

In [43]:
docs.take(1)

({'id': '0704.0001',
  'submitter': 'Pavel Nadolsky',
  'authors': "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan",
  'title': 'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies',
  'comments': '37 pages, 15 figures; published version',
  'journal-ref': 'Phys.Rev.D76:013009,2007',
  'doi': '10.1103/PhysRevD.76.013009',
  'report-no': 'ANL-HEP-PR-07-12',
  'categories': 'hep-ph',
  'license': None,
  'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with d

In [44]:
get_latest_version = lambda x: x['versions'][-1]['created']

trim = lambda x: {'id': x['id'],
                  'authors': x['authors'].split(','),
                  'title': x['title'],
                  'doi': x['doi'],
                  'category':x['categories'].split(' '),
                  'abstract':x['abstract'],}

docs_df = (docs.filter(lambda x: int(get_latest_version(x).split(' ')[3]) > 2019)
           .map(trim).
           compute())

docs_df = pd.DataFrame(docs_df)

In [45]:
docs_df.to_csv("arxiv_docs_after_2019.csv", index=False)

In [46]:
docs_df.head()

Unnamed: 0,id,authors,title,doi,category,abstract
0,704.0033,"[Maxim A. Yurkin, Valeri P. Maltsev, Alfons ...",Convergence of the discrete dipole approximati...,10.1364/JOSAA.23.002578 10.1364/JOSAA.32.002407,"[physics.optics, physics.comp-ph]",We performed a rigorous theoretical converge...
1,704.0038,"[Maxim A. Yurkin, Alfons G. Hoekstra]",The discrete dipole approximation: an overview...,10.1016/j.jqsrt.2007.01.034 10.1016/j.jqsrt.20...,"[physics.optics, physics.comp-ph]",We present a review of the discrete dipole a...
2,704.0479,[T.Geisser],The affine part of the Picard scheme,,"[math.AG, math.KT]",We describe the maximal torus and maximal un...
3,704.1476,[Chris Austin],TeV-scale gravity in Horava-Witten theory on a...,,[hep-th],The field equations and boundary conditions ...
4,705.0825,[Ram Gopal Vishwakarma (Zacatecas University)],Einstein's Theory of Gravity in the Presence o...,10.1007/s10509-009-0016-8,"[gr-qc, astro-ph, hep-th]",The mysterious `dark energy' needed to expla...


In [47]:
docs_df.shape

(610749, 6)

In [49]:
docs_df.loc[0].abstract

'  We performed a rigorous theoretical convergence analysis of the discrete\ndipole approximation (DDA). We prove that errors in any measured quantity are\nbounded by a sum of a linear and quadratic term in the size of a dipole d, when\nthe latter is in the range of DDA applicability. Moreover, the linear term is\nsignificantly smaller for cubically than for non-cubically shaped scatterers.\nTherefore, for small d errors for cubically shaped particles are much smaller\nthan for non-cubically shaped. The relative importance of the linear term\ndecreases with increasing size, hence convergence of DDA for large enough\nscatterers is quadratic in the common range of d. Extensive numerical\nsimulations were carried out for a wide range of d. Finally we discuss a number\nof new developments in DDA and their consequences for convergence.\n'

In [50]:
abstracts = docs_df.abstract

In [85]:
# get the vectorizer and vector representation
vectorizer, vector_np = get_vectorizer(abstracts)

In [86]:
feature_names = vectorizer.get_feature_names()

In [87]:
len(feature_names)

5000

In [88]:
vector_np.shape

(610749, 5000)

In [89]:
import pickle

In [90]:
with open("vectorizer.model","wb") as f:
    pickle.dump(vectorizer,f)

In [91]:
with open("vector_np.data","wb") as f:
    pickle.dump(vector_np,f)

In [92]:
v = get_tfidf_vector(vectorizer,docs_df.loc[0].abstract)

In [93]:
v.shape

(5000,)

In [96]:
top_k_tokens = top_k(v,70)

In [100]:
print("index,token,weight")
for index in top_k_tokens:
    weight = v[index]
    if weight > 0.001:
        print(f"{index} {feature_names[index]} {weight:.2f}")

index,token,weight
2989 new 0.07
2453 large 0.07
176 analysis 0.08
3055 number 0.08
3599 prove 0.09
1738 finally 0.09
3057 numerical 0.09
4224 small 0.09
4190 simulations 0.10
4166 significantly 0.10
4583 theoretical 0.10
1288 discuss 0.10
1656 extensive 0.11
741 common 0.11
4942 wide 0.11
235 approximation 0.11
2200 increasing 0.11
3292 performed 0.11
3821 relative 0.11
1281 discrete 0.11
2736 measured 0.12
3245 particles 0.12
2165 importance 0.12
478 bounded 0.12
3012 non 0.12
4445 sum 0.13
4943 wide range 0.13
558 carried 0.14
3061 numerical simulations 0.14
217 applicability 0.14
1109 decreases 0.14
860 consequences 0.15
3944 rigorous 0.15
1221 developments 0.15
3648 quantity 0.15
4207 size 0.16
1739 finally discuss 0.17
2550 linear 0.18
3693 range 0.19
4227 smaller 0.21
1527 errors 0.21
3636 quadratic 0.22
4557 term 0.23
936 convergence 0.23
1266 dipole 0.26
4121 shaped 0.33
