In [1]:
import json
import pandas as pd
from pprint import pprint
import sklearn
import numpy as np

Load Data:

In [2]:
# Load the papers/subjects dump. The file is decoded explicitly as UTF-8
# (the standard JSON text encoding) so the result does not depend on the
# platform's default locale encoding.
with open('./papers_subjects.json', encoding='utf-8') as f:
    data = json.load(f)

# Show the top-level keys of the loaded document.
data.keys()

dict_keys(['papers'])

Create Taxonomy:

In [4]:
def get_paper_subjects(paper):
    """Return every subject id attached to ``paper`` as a list.

    The id of ``paper['primary-subject']`` comes first, followed by the ids
    of the entries in ``paper['other-subjects']`` in their original order.
    """
    return [
        paper['primary-subject']['id_subject'],
        *(entry['id_subject'] for entry in paper['other-subjects']),
    ]

# subjects_id_set collects (id_subject, name_subject) pairs taken from the
# 'other-subjects' entries of every paper; primary subjects are only counted
# here, not added to the set.
# c_p counts the papers whose primary subject id starts with 'cs.'.
subjects_id_set = set()
c_p = 0

for paper in data['papers']:
    # startswith() yields a bool, which adds to the counter as 0 or 1.
    c_p += paper['primary-subject']['id_subject'].startswith('cs.')
    subjects_id_set.update(
        (subject['id_subject'], subject['name_subject'])
        for subject in paper['other-subjects']
    )

print(c_p, len(subjects_id_set))

37368 133


In [5]:
# Display the collected (id_subject, name_subject) pairs.
subjects_id_set

{('astro-ph', 'Astrophysics'),
 ('astro-ph.CO', 'Cosmology and Nongalactic Astrophysics'),
 ('astro-ph.EP', 'Earth and Planetary Astrophysics'),
 ('astro-ph.GA', 'Astrophysics of Galaxies'),
 ('astro-ph.IM', 'Instrumentation and Methods for Astrophysics'),
 ('astro-ph.SR', 'Solar and Stellar Astrophysics'),
 ('cond-mat', 'Condensed Matter'),
 ('cond-mat.dis-nn', 'Disordered Systems and Neural Networks'),
 ('cond-mat.mes-hall', 'Mesoscale and Nanoscale Physics'),
 ('cond-mat.mtrl-sci', 'Materials Science'),
 ('cond-mat.soft', 'Soft Condensed Matter'),
 ('cond-mat.stat-mech', 'Statistical Mechanics'),
 ('cond-mat.str-el', 'Strongly Correlated Electrons'),
 ('cs.AI', 'Artificial Intelligence'),
 ('cs.AR', 'Hardware Architecture'),
 ('cs.CC', 'Computational Complexity'),
 ('cs.CE', 'Computational Engineering, Finance, and Science'),
 ('cs.CG', 'Computational Geometry'),
 ('cs.CL', 'Computation and Language'),
 ('cs.CR', 'Cryptography and Security'),
 ('cs.CV', 'Computer Vision and Pattern 

Taxonomy: https://arxiv.org

- https://arxiv.org/archive/cs
- https://arxiv.org/archive/astro-ph
- https://arxiv.org/archive/cond-mat
- https://arxiv.org/archive/econ
- https://arxiv.org/archive/eess
- https://arxiv.org/archive/hep-ex
- https://arxiv.org/archive/hep-ph
- https://arxiv.org/archive/hep-th
- https://arxiv.org/archive/math-ph
- https://arxiv.org/archive/math
- https://arxiv.org/archive/nlin
- https://arxiv.org/archive/nucl-ex
- https://arxiv.org/archive/nucl-th
- https://arxiv.org/archive/physics
- https://arxiv.org/archive/q-bio
- https://arxiv.org/archive/q-fin
- https://arxiv.org/archive/quant-ph
- https://arxiv.org/archive/stat

Create DataSet:

In [5]:
%%time
from sklearn.preprocessing import OneHotEncoder
def get_encoder(subjects):
    """Fit and return a ``OneHotEncoder`` over the subject ids.

    ``subjects`` is an iterable of sequences whose first element is the
    subject id; the sorted ids are fed to the encoder as a single column.
    """
    subject_ids = sorted(subject[0] for subject in subjects)
    id_column = np.array(subject_ids).reshape(-1, 1)
    encoder = OneHotEncoder()
    # fit() returns the encoder itself, so it can be returned directly.
    return encoder.fit(id_column)

# NOTE: this re-defines (and shadows) the identical get_paper_subjects from
# the "Create Taxonomy" cell above.
def get_paper_subjects(paper):
    """Return every subject id attached to ``paper`` as a list.

    The id of ``paper['primary-subject']`` comes first, followed by the ids
    of the entries in ``paper['other-subjects']`` in their original order.
    """
    primary_id = paper['primary-subject']['id_subject']
    secondary_ids = [entry['id_subject'] for entry in paper['other-subjects']]
    return [primary_id] + secondary_ids

def transform_paper_subjects(enc, paper_subjects):
    """Encode a paper's subject ids into a single indicator vector.

    Parameters
    ----------
    enc : fitted encoder
        Object whose ``transform`` accepts an ``(n, 1)`` array of subject ids
        and returns a matrix exposing ``toarray()`` (one row per subject).
    paper_subjects : sequence of str
        Subject ids of one paper.

    Returns
    -------
    numpy.ndarray
        1-D array with one entry per encoder category, holding the column-wise
        sum of the per-subject one-hot rows.
    """
    id_column = np.asarray(paper_subjects).reshape(-1, 1)
    # Reduce over the rows with numpy instead of Python's builtin sum(),
    # which iterates row by row and yields the int 0 (not an array) when
    # there are no rows.
    return enc.transform(id_column).toarray().sum(axis=0)


# Column layout: two identifying columns followed by one indicator column per
# subject id, in the same sorted order that get_encoder is fitted on.
columns_df = ['paper_id', 'primary-subject']
columns_df.extend(sorted([c[0] for c in subjects_id_set]))

enc = get_encoder(subjects_id_set)

# One row per paper whose primary subject id starts with 'cs.'.
data_df = []
for paper in data['papers']:
    if paper['primary-subject']['id_subject'].startswith('cs.'):
        row_df = [paper['id'], paper['primary-subject']['id_subject']]
        paper_subjects = get_paper_subjects(paper)
        row_df.extend(transform_paper_subjects(enc, paper_subjects))
        data_df.append(row_df)

# Cast only the indicator columns to uint8. Passing dtype=np.uint8 to the
# DataFrame constructor would also target the 'paper_id' and
# 'primary-subject' columns, whose values cannot be cast; older pandas
# silently skipped them, but recent pandas versions raise instead.
df = pd.DataFrame(data=data_df, columns=columns_df).astype(
    {column: np.uint8 for column in columns_df[2:]}
)
df

CPU times: user 10.5 s, sys: 171 ms, total: 10.7 s
Wall time: 11.1 s


Unnamed: 0,paper_id,primary-subject,astro-ph,astro-ph.CO,astro-ph.EP,astro-ph.GA,astro-ph.HE,astro-ph.IM,astro-ph.SR,cond-mat,...,q-fin.PM,q-fin.RM,q-fin.ST,q-fin.TR,quant-ph,stat.AP,stat.CO,stat.ME,stat.ML,stat.OT
0,41513,cs.CV,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,61821,cs.LG,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,61822,cs.LG,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,61823,cs.LG,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,61824,cs.CV,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30943,101219,cs.LG,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
30944,101220,cs.LG,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
30945,101221,cs.CV,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
30946,101222,cs.LG,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [84]:
def get_primary_subject_by_paper_id(paper_id):
    """Look up the 'primary-subject' value of the paper with ``paper_id``.

    Reads the module-level ``df``. Returns the value from the first matching
    row; indexing the empty result raises ``IndexError`` when no row matches.
    """
    matches = df.loc[df['paper_id'] == paper_id, 'primary-subject']
    return matches.to_numpy()[0]

def get_all_subjects_by_paper_id(paper_id):
    """List the column names whose value equals 1 for the paper ``paper_id``.

    Reads the module-level ``df``. The matching row is squeezed into a Series
    indexed by column name, and the labels whose entry compares equal to 1
    are returned in column order.
    """
    selected_rows = df.loc[df['paper_id'] == paper_id]
    flags = selected_rows.squeeze()
    is_set = flags == 1
    return flags.index[is_set].to_list()

#print(get_primary_subject_by_paper_id('61822'))
#print(get_all_subjects_by_paper_id('61822'))

cs.LG
['cs.CV', 'cs.LG', 'stat.ML']


In [87]:
#%%time
## check:
#count = 0
#for paper in data['papers']:
#    if paper['primary-subject']['id_subject'].startswith('cs.'):
#        if paper['primary-subject']['id_subject'] != get_primary_subject_by_paper_id(paper['id']):
#            print("Error primary-subject for %s." % paper['id'])
#        else:
#            if sorted(get_paper_subjects(paper)) != get_all_subjects_by_paper_id(paper['id']):
#                print("Error all-subject for %s." % paper['id'])
#            else:
#                count += 1
#print(count)

17278
CPU times: user 1min, sys: 94.1 ms, total: 1min
Wall time: 1min


In [93]:
#BYTES_TO_MB_DIV = 0.000001
#def print_memory_usage_of_data_frame(df):
#    mem = round(df.memory_usage().sum() * BYTES_TO_MB_DIV, 3) 
#    print("Memory usage is " + str(mem) + " MB")
#
#print_memory_usage_of_data_frame(df)

Memory usage is 2.419 MB


Visualize paper-subjects: