In [17]:
import pandas as pd
import numpy as np
import json
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.figure_factory as ff
from itertools import chain

init_notebook_mode(connected=True)

with open('data/descriptions1_60.json', 'r') as f:    
    raw_json_1 = json.load(f)   
with open('data/descriptions61_120.json', 'r') as f:    
    raw_json_2 = json.load(f)  
with open('data/descriptions121_180.json', 'r') as f:    
    raw_json_3 = json.load(f) 
data=[x for x in chain.from_iterable(raw_json_1+raw_json_2+[y for y in raw_json_3 if y])]

In [21]:
def access_date_year(row):
    try:
        return float(row['datePublished'].split('-')[0])
    except:
        return
    
def access_abstract(row):
    try:
        if (len(row['description'])>75):
            return row['description']
        else:
            return
    except:
        return
    
def access_title(row):
    try:
        return row['title']
    except:
        return

def access_id(row):
    try:
        return row['id']
    except:
        return
    
def access_topics(row):
    try:
        return ', '.join(set(map(lambda x: x.split(' - ')[0].upper(), row['topics'])))
    except:
        return
    
def access_journal(row):
    try:
        return(row['journals'][0]['identifiers'][0].strip('isn: '))
    except:
        return
    
def access_repository(row):
    try:
        return(row['repositories'][0]['name'])
    except:
        return
    
jdf_raw = pd.DataFrame({'year':map(access_date_year, data),
'abstract':map(access_abstract, data),
'title':map(access_title, data),
'id':map(access_id, data),
'topics':map(access_topics, data),
'journal':map(access_journal, data),
'repository':map(access_repository, data)})
jdf_raw=jdf_raw[jdf_raw['abstract'].notnull()]
jdf_raw.journal_c = jdf_raw.journal.astype('category')
jdf_raw.repository_c = jdf_raw.journal.astype('category')
#jdf['subject'] = jdf.topics.str.split(', ').str.get(0)[jdf.repository.str.contains('arXiv')]
#jdf.subject_c = jdf.subject.astype('category'
jdf = jdf_raw.groupby('id', as_index=False).first()
jdf.head()

Unnamed: 0,id,abstract,journal,repository,title,topics,year
0,10192433,Published November 1983. Facts and recommendat...,,ScholarsArchive@OSU,Forest property taxation in western Oregon,,1983.0
1,10192527,Published March 1970. A newer revision exists....,,ScholarsArchive@OSU,Spray schedule for home orchards,,1970.0
2,10192589,Published May 1990. Facts and recommendations ...,,ScholarsArchive@OSU,Fair Labor Standards Act,,1990.0
3,10192635,Published May 1980. Facts and recommendations ...,,ScholarsArchive@OSU,Growing potatoes in the home garden,,1980.0
4,10192837,Revised December 1957. Please look for up-to-d...,,ScholarsArchive@OSU,Care of metals and kitchenware,,1957.0


In [22]:
jdf['repository'].value_counts(dropna=False)

arXiv.org e-Print Archive                                                                 5752
Caltech Authors                                                                            896
Queensland University of Technology ePrints Archive                                        549
e-Prints Soton                                                                             272
University of Southern Queensland ePrints                                                  225
Archive of European Integration                                                            166
Organic Eprints                                                                            149
Hochschulschriftenserver - Universität Frankfurt am Main                                   148
E-LIS                                                                                      145
Lancaster E-Prints                                                                         137
Kent Academic Repository                          

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
tfidf = TfidfVectorizer(stop_words='english', 
                        min_df=2, 
                        max_df=.9,
                        norm='l2',
                        use_idf=True,
                        ngram_range=(1,2),
                        sublinear_tf=True,
                        #max_features = 10000,
                        binary=False)
jtfidf = tfidf.fit_transform(jdf.abstract.tolist())

In [64]:
tsvd = TruncatedSVD(n_components=100)
X_tsvd = tsvd.fit_transform(jtfidf)

In [65]:
X=X_tsvd
iplot([{'y':tsvd.explained_variance_ratio_}])

In [74]:
from sklearn.cluster import KMeans
clustering = KMeans(n_clusters=10)
yfit = clustering.fit_predict(X[:,1:20])

In [75]:
trace = go.Scattergl(
    x = X[:,1],
    y = X[:,2],
    mode = 'markers',
    marker= dict(size=5,
                 opacity= 0.3,
                 color=yfit,
                 colorscale='Jet'),
    text=jdf['title']
)
layout= go.Layout(
    title= 'Truncated SVD of Abstracts',
    hovermode= 'closest',
    showlegend= False
)

d = [trace]
iplot({'data':[trace], 'layout':layout})

In [86]:
i =6
X_score =  clustering.transform(X[:,1:20])[:,i]
order = np.argsort(X_score)
component = pd.DataFrame({'title':jdf['title'][order], 'score':X_score[order], 'topics':jdf['topics'][order], 
                          'abstract':jdf['abstract'][order]})
component.head(10)

Unnamed: 0,abstract,score,title,topics
957,The California Department of Fish and Game and...,0.041934,A survey of the marine environment near the ci...,"FISHERIES, BIOLOGY, ECOLOGY"
964,The Lake Earl/Smith River Delta area is a key ...,0.042464,Natural resources of Lake Earl and Smith River...,"ENVIRONMENT, BIOLOGY, CONSERVATION, POLLUTION"
600,"In Theodore v Mistford Pty Ltd [2005] HCA 45, ...",0.043736,Equitable mortgage by deposit of a Certificate...,"180105 COMMERCIAL AND CONTRACT LAW, S 75 OF TH..."
1060,"Through the mid 1990’s, the bait purse-seine f...",0.046261,The Bait Purse-seine Fishery for Atlantic Menh...,"FISHERIES, MANAGEMENT"
838,The carbohydrate inulin is known to reduce the...,0.046578,Chicory roots improves the taste and odour of ...,"FEEDING AND GROWTH, HEALTH AND WELFARE"
136,Khat leaves are cultivated in the highlands of...,0.047778,Drugs in Focus. Edition 21,HV5800
1417,This paper focuses on Horace McCoy’s masterpie...,0.048033,To what extent are the characters in the novel...,"PE ENGLISH, PR ENGLISH LITERATURE"
1482,Preliminary Static Tests were conducted on the...,0.049097,Static testing of DO-228 composite rudder,COMPOSITE MATERIALS
994,ENGLISH: The Nankai Regional Fisheries Researc...,0.049571,Oceanographic observations from the eastern Pa...,OCEANOGRAPHY
7,A glaring hole exists between academic marketi...,0.050148,Interpersonal Dependence and Efficiency of Int...,"FISHERIES ECONOMICS, MARKET COMPETITION AND CH..."
