In [1]:
import pandas as pd
import numpy as np
import json
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.figure_factory as ff
from itertools import chain

# Initialise plotly's notebook integration so iplot() figures render inline.
init_notebook_mode(connected=True)

# Load the three JSON exports. They are concatenated as lists below, and every
# kept element is iterated by chain.from_iterable, so each file is expected to
# hold a list of batches of records (TODO confirm against the export script).
with open('data/descriptions1_60.json', 'r') as f:
    raw_json_1 = json.load(f)
with open('data/descriptions61_120.json', 'r') as f:
    raw_json_2 = json.load(f)
with open('data/descriptions121_180.json', 'r') as f:
    raw_json_3 = json.load(f)

# Flatten the batches into one list of records. Empty/null batches are skipped
# for *all* three files (previously only the third was filtered), so a single
# null batch in the first two files can no longer crash chain.from_iterable.
data = list(chain.from_iterable(
    batch for batch in raw_json_1 + raw_json_2 + raw_json_3 if batch))


Matplotlib is building the font cache using fc-list. This may take a moment.



In [2]:
def access_date_year(row):
    """Return the publication year of a record as a float, or None.

    The year is the text before the first '-' in ``row['datePublished']``.
    None is returned when the field is missing, is not a string, or does not
    start with something ``float()`` can parse.
    """
    try:
        return float(row['datePublished'].split('-')[0])
    except (KeyError, TypeError, AttributeError, ValueError):
        # Best-effort extraction: a malformed record yields None instead of
        # aborting the load. Unlike a bare ``except:``, this does not swallow
        # KeyboardInterrupt / SystemExit.
        return None
    
def access_abstract(row):
    """Return ``row['description']`` if its length exceeds 75, else None.

    Short descriptions are treated as "no abstract". None is also returned
    when the field is missing or has no length (e.g. it is None).
    """
    try:
        description = row['description']
        if len(description) > 75:
            return description
        return None
    except (KeyError, TypeError):
        # Best-effort: records without a usable description are dropped later
        # via the None marker. Narrowed from a bare ``except:`` so that
        # KeyboardInterrupt / SystemExit still propagate.
        return None
    
def access_title(row):
    """Return ``row['title']``, or None when the record has no such field."""
    try:
        return row['title']
    except (KeyError, TypeError):
        # KeyError: field absent; TypeError: row is not subscriptable by a
        # string key (e.g. None). Other exceptions, including
        # KeyboardInterrupt, are no longer swallowed.
        return None

def access_id(row):
    """Return ``row['id']``, or None when the record has no such field."""
    try:
        return row['id']
    except (KeyError, TypeError):
        # KeyError: field absent; TypeError: row is not subscriptable by a
        # string key (e.g. None). Other exceptions, including
        # KeyboardInterrupt, are no longer swallowed.
        return None
    
def access_topics(row):
    """Return the record's distinct top-level topics as one ', '-joined string.

    Each entry of ``row['topics']`` is cut at the first ' - ' and upper-cased;
    duplicates are removed while keeping first-seen order, so the result is
    deterministic (the previous ``set()``-based version produced an arbitrary
    order that could differ between runs). Returns '' for an empty topic list
    and None when the field is missing or not a list of strings.
    """
    try:
        seen = set()
        ordered = []
        for topic in row['topics']:
            head = topic.split(' - ')[0].upper()
            if head not in seen:
                seen.add(head)
                ordered.append(head)
    except (KeyError, TypeError, AttributeError):
        # Missing field, non-iterable value, or non-string topic entry.
        # Narrowed from a bare ``except:`` so KeyboardInterrupt propagates.
        return None
    return ', '.join(ordered)
    
def access_journal(row):
    """Return the first identifier of the record's first journal, or None.

    Reads ``row['journals'][0]['identifiers'][0]`` and trims it with
    ``str.strip('isn: ')``. Returns None when any level of that path is
    missing, empty, or of an unexpected type.
    """
    try:
        identifier = row['journals'][0]['identifiers'][0]
        # NOTE: strip() treats its argument as a *set of characters*
        # ('i', 's', 'n', ':', ' ') removed from both ends, not as a literal
        # prefix. Presumably meant to drop a leading "issn:" - TODO confirm
        # that no identifier legitimately starts or ends with those characters.
        return identifier.strip('isn: ')
    except (KeyError, IndexError, TypeError, AttributeError):
        # Narrowed from a bare ``except:`` so KeyboardInterrupt / SystemExit
        # still propagate; malformed records simply yield None.
        return None
    
def access_repository(row):
    """Return the name of the record's first repository, or None.

    Reads ``row['repositories'][0]['name']``; returns None when any level of
    that path is missing, empty, or of an unexpected type.
    """
    try:
        return row['repositories'][0]['name']
    except (KeyError, IndexError, TypeError):
        # Narrowed from a bare ``except:`` so KeyboardInterrupt / SystemExit
        # still propagate; malformed records simply yield None.
        return None
    
# Build one row per raw record. Lists (not lazy ``map`` objects) are passed so
# the construction behaves the same on Python 2 and Python 3.
jdf_raw = pd.DataFrame({
    'year': [access_date_year(row) for row in data],
    'abstract': [access_abstract(row) for row in data],
    'title': [access_title(row) for row in data],
    'id': [access_id(row) for row in data],
    'topics': [access_topics(row) for row in data],
    'journal': [access_journal(row) for row in data],
    'repository': [access_repository(row) for row in data],
})

# Keep only records with a usable abstract. ``.copy()`` makes the result an
# independent frame so the column assignments below do not trigger
# SettingWithCopyWarning.
jdf_raw = jdf_raw[jdf_raw['abstract'].notnull()].copy()

# Categorical versions of the journal / repository columns. These are now real
# columns (``jdf_raw.journal_c = ...`` only set a Python attribute), and
# ``repository_c`` is derived from ``repository`` rather than, by copy-paste
# mistake, from ``journal``. Attribute-style reads (``jdf_raw.journal_c``)
# keep working.
jdf_raw['journal_c'] = jdf_raw['journal'].astype('category')
jdf_raw['repository_c'] = jdf_raw['repository'].astype('category')

# TODO (disabled experiment): derive a 'subject' column from the first topic of
# arXiv records and a categorical version of it:
#   jdf['subject'] = jdf.topics.str.split(', ').str.get(0)[jdf.repository.str.contains('arXiv')]
#   jdf.subject_c = jdf.subject.astype('category')

# Collapse duplicate ids to a single row. GroupBy.first() takes the first
# non-null value per column within each id. The helper category columns are
# left out so ``jdf`` keeps exactly the columns it had before.
jdf = (jdf_raw
       .drop(['journal_c', 'repository_c'], axis=1)
       .groupby('id', as_index=False)
       .first())
jdf.head(10)

Unnamed: 0,id,abstract,journal,repository,title,topics,year
0,10192433,Published November 1983. Facts and recommendat...,,ScholarsArchive@OSU,Forest property taxation in western Oregon,,1983.0
1,10192527,Published March 1970. A newer revision exists....,,ScholarsArchive@OSU,Spray schedule for home orchards,,1970.0
2,10192589,Published May 1990. Facts and recommendations ...,,ScholarsArchive@OSU,Fair Labor Standards Act,,1990.0
3,10192635,Published May 1980. Facts and recommendations ...,,ScholarsArchive@OSU,Growing potatoes in the home garden,,1980.0
4,10192837,Revised December 1957. Please look for up-to-d...,,ScholarsArchive@OSU,Care of metals and kitchenware,,1957.0
5,10192914,This is the publisher’s final pdf. The publish...,,ScholarsArchive@OSU,Community ecology of invasions: direct and ind...,"IMPACT, DECLINE, AMPHIBIAN DECLINE, INTRODUCED...",2012.0
6,10193203,"Graduation date: 2000Recently, with the realiz...",,ScholarsArchive@OSU,A simple RLS-POCS solution for reduced complex...,,1999.0
7,10193467,A glaring hole exists between academic marketi...,,ScholarsArchive@OSU,Interpersonal Dependence and Efficiency of Int...,"FISHERIES ECONOMICS, MARKET COMPETITION AND CH...",2001.0
8,10193481,Published September 2012. Reviewed September 2...,,ScholarsArchive@OSU,Postharvest residue management for grass seed ...,"FULL STRAW LOAD, CLEAN NONTHERMAL, GRASS SEED",2012.0
9,10193498,This is the publisher’s final pdf. The publish...,,ScholarsArchive@OSU,On the use of the Boussinesq equation for inte...,"SLOPING, BOUSSINESQ, RECESSION ANALYSIS",2012.0


In [3]:
# Number of de-duplicated records per source repository; dropna=False also
# reports how many records have no repository at all.
jdf.repository.value_counts(dropna=False)

arXiv.org e-Print Archive                                                                 5752
Caltech Authors                                                                            896
Queensland University of Technology ePrints Archive                                        549
e-Prints Soton                                                                             272
University of Southern Queensland ePrints                                                  225
Archive of European Integration                                                            166
Organic Eprints                                                                            149
Hochschulschriftenserver - Universität Frankfurt am Main                                   148
E-LIS                                                                                      145
Lancaster E-Prints                                                                         137
Kent Academic Repository                          

In [4]:
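# Optional filter (currently disabled): keep only repositories that contribute
# more than 10 records.
#jdf = jdf.groupby('repository', as_index=False).filter(lambda x: len(x) > 10)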

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
# TF-IDF representation of the abstracts over unigrams and bigrams.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    min_df=2,            # integer: minimum number of documents containing a term
    max_df=.95,          # float: maximum fraction of documents containing a term
    sublinear_tf=True,
    use_idf=True,
    norm='l2',
    binary=False,
    # max_features=10000,
)
jtfidf = tfidf.fit_transform(jdf['abstract'].tolist())

In [6]:
# Project the TF-IDF matrix onto 100 latent components. TruncatedSVD uses a
# randomized solver by default, so the seed is pinned to make this step give
# the same result on every run for the same input.
tsvd = TruncatedSVD(n_components=100, random_state=42)
X_tsvd = tsvd.fit_transform(jtfidf)

In [7]:
# Short alias for the SVD-reduced matrix used by the following cells.
X = X_tsvd
# Line plot of the explained-variance ratio of each SVD component.
iplot([dict(y=tsvd.explained_variance_ratio_)])

In [8]:
from sklearn.cluster import KMeans
# Cluster the documents into 10 groups using SVD components 1..19 (component 0
# is skipped - NOTE(review): presumably because it mostly captures overall term
# frequency; confirm). The seed is pinned because KMeans initialisation is
# random and the cluster numbering is otherwise not stable between runs, while
# the inspection cell further down hard-codes a cluster index.
clustering = KMeans(n_clusters=10, random_state=42)
yfit = clustering.fit_predict(X[:, 1:20])

# Bar chart of how many documents fall into each cluster.
cats, counts = np.unique(yfit, return_counts=True)
iplot([go.Bar({'x': cats, 'y': counts})])

In [9]:
def make_label(t, y):
    """Build a hover label: the text ``t``, an HTML line break, then ``y`` as an integer."""
    text_part = "%s" % (t,)
    number_part = "%d" % (y,)
    return "<br>".join((text_part, number_part))

# Scatter of the documents in the plane of SVD components 2 (x) and 1 (y),
# coloured by k-means cluster id.
trace = go.Scattergl(
    x=X[:, 2],
    y=X[:, 1],
    mode='markers',
    marker=dict(size=5,
                opacity=0.3,
                color=yfit,
                colorscale='Rainbow',
                colorbar=dict(title='Colorbar')),
    # A concrete list rather than ``map(...)``: on Python 3 ``map`` is a lazy
    # iterator, which is not a valid array value for ``text``.
    text=[make_label(title, cluster) for title, cluster in zip(jdf['title'], yfit)],
)
layout = go.Layout(
    title='Truncated SVD of Abstracts',
    hovermode='closest',
    showlegend=False,
    # Axis labels so the figure can be read on its own.
    xaxis=dict(title='SVD component 2'),
    yaxis=dict(title='SVD component 1'),
)

# ``d`` was previously assigned but never used; it now feeds the figure.
d = [trace]
iplot({'data': d, 'layout': layout})

In [10]:
# Inspect one cluster: list the documents closest to its centre.
i = 7  # index of the cluster to inspect (0 .. n_clusters - 1)

# KMeans.transform returns each document's distance to every cluster centre;
# column i is the distance to cluster i, so ascending order = most central first.
X_score = clustering.transform(X[:, 1:20])[:, i]
order = np.argsort(X_score)

# ``order`` holds row *positions*, so select with .iloc. The previous
# ``series[order]`` was a label lookup that is only equivalent while ``jdf``
# has a default 0..n-1 index (e.g. it breaks once rows are filtered out).
# ``columns=`` pins the column order independently of dict ordering.
component = pd.DataFrame({'title': jdf['title'].iloc[order],
                          'score': X_score[order],
                          'topics': jdf['topics'].iloc[order],
                          'abstract': jdf['abstract'].iloc[order]},
                         columns=['abstract', 'score', 'title', 'topics'])
component.head(10)

Unnamed: 0,abstract,score,title,topics
5235,We formulate the problem of renormalization of...,0.024091,Feynman integrals and motives of configuration...,"HIGH ENERGY PHYSICS, MATHEMATICS, MATHEMATICAL..."
2719,We prove a variant of Tartar's first commutati...,0.027545,On a variant of Tartar's first commutation lemma,"MATHEMATICS, 42B15"
2570,We prove a generalization of Thom's transversa...,0.028902,A generalization of Thom's transversality theorem,"MATHEMATICS, 57R35, 57R45"
6146,The Arnold inequalities characterizing the top...,0.029822,Generalization of Arnold-Viro inequalities for...,MATHEMATICS
3497,The most important open problem in Monotone Op...,0.029863,The sum of a maximal monotone operator of type...,"MATHEMATICS, PRIMARY 47H05, SECONDARY 49N15, 5..."
2797,This is the author's 2008 thesis from the Univ...,0.031633,A generalization of the Clifford index and det...,"MATHEMATICS, 14M12"
4781,The purpose of this article is to show how the...,0.032327,On the change of root numbers under twisting a...,"11F70, MATHEMATICS"
5161,In this paper we investigate operators unitari...,0.032483,Unitary equivalence to truncated Toeplitz oper...,"MATHEMATICS, 47B35, 47B32, 47A45"
4266,We prove that a semigroup generated by a finit...,0.032978,On orbits of truncated convolution operators,MATHEMATICS
3717,Suppose that $\mathcal{X}$ is a sequentially c...,0.03352,On the stability of the first order linear rec...,"MATHEMATICS, 39B82 (PRIMARY) 39A10, 39B72 (SEC..."
