**Data Exploration using Dask**

In [1]:
import numpy as np 
import pandas as pd
import dask.bag as db
import plotly.express as px
import json

In [2]:
#read_text is used to read file in dask bag. The contents of the file will be read as string
lines=db.read_text("data.json") 

In [3]:
lines.take(2) ## Looks at first two records

('{"id":"0704.0001","submitter":"Pavel Nadolsky","authors":"C. Bal\\\\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan","title":"Calculation of prompt diphoton production cross sections at Tevatron and\\n  LHC energies","comments":"37 pages, 15 figures; published version","journal-ref":"Phys.Rev.D76:013009,2007","doi":"10.1103/PhysRevD.76.013009","report-no":"ANL-HEP-PR-07-12","categories":"hep-ph","license":null,"abstract":"  A fully differential calculation in perturbative quantum chromodynamics is\\npresented for the production of massive photon pairs at hadron colliders. All\\nnext-to-leading order perturbative contributions from quark-antiquark,\\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\\nall-orders resummation of initial-state gluon radiation valid at\\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\\nspecified in which the calculation is most reliable. Good agreement is\\ndemonstrated with data from the Fermilab Tevatro

In [4]:
#transform each JSON line into a Python object
records=lines.map(lambda x:json.loads(x))
records.take(2)

({'id': '0704.0001',
  'submitter': 'Pavel Nadolsky',
  'authors': "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan",
  'title': 'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies',
  'comments': '37 pages, 15 figures; published version',
  'journal-ref': 'Phys.Rev.D76:013009,2007',
  'doi': '10.1103/PhysRevD.76.013009',
  'report-no': 'ANL-HEP-PR-07-12',
  'categories': 'hep-ph',
  'license': None,
  'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with d

In [5]:
#display the total number of records in ArXiv data
records_count=records.count()
print("Number of Records in ArXiv Data is ",records_count.compute())


Number of Records in ArXiv Data is  2300172


**The Top 10 Submitters**

In [6]:
records.map(lambda x:x['submitter']).frequencies(sort=True).topk(k=10,key=1).compute()

[(None, 15189),
 ('EPTCS', 3501),
 ('The CMS Collaboration', 852),
 ('Atlas Publications', 622),
 ('The ATLAS Collaboration', 554),
 ('Delfim F. M. Torres', 423),
 ('EDA Publishing Association', 391),
 ('Cms Collaboration', 373),
 ('Wei Wang', 341),
 ('Yang Liu', 296)]

**What are the top 20 categories?**

In [7]:
records.map(lambda x:x['categories']).frequencies(sort=True).topk(k=20,key=1).compute()

[('astro-ph', 86911),
 ('hep-ph', 79400),
 ('quant-ph', 64879),
 ('hep-th', 57468),
 ('cs.CV', 47559),
 ('cond-mat.mtrl-sci', 37035),
 ('cond-mat.mes-hall', 33717),
 ('math.AP', 31557),
 ('gr-qc', 29233),
 ('astro-ph.GA', 28186),
 ('math.CO', 27387),
 ('astro-ph.SR', 26073),
 ('cond-mat.str-el', 25333),
 ('math.PR', 22472),
 ('cs.IT math.IT', 21911),
 ('astro-ph.HE', 21488),
 ('astro-ph.CO', 21066),
 ('math.NT', 20542),
 ('cs.CL', 19564),
 ('math.AG', 19212)]

**Number of Papers that have been published each Year**

In [12]:
extract_latest_version_year=lambda x:x['versions'][-1]["created"].split(" ")[3]
pub_by_year=records.map(extract_latest_version_year).frequencies().to_dataframe(columns=['submission_year','num_submissions']).compute()
pub_by_year.head()


Unnamed: 0,submission_year,num_submissions
0,2007,54123
1,2008,56933
2,2009,62207
3,2022,202140
4,2012,81397


In [13]:
pub_by_year=pub_by_year.sort_values(by="submission_year")
pub_by_year.head()

Unnamed: 0,submission_year,num_submissions
36,1986,1
34,1988,1
35,1989,5
32,1990,24
33,1991,340


In [14]:
px.line(x='submission_year',y='num_submissions',data_frame=pub_by_year,title="Distribution of Paper Published By Year",color_discrete_sequence=['#87CEEB'])

**Papers published by Year for Artificial Intelligence**

In [15]:
ai_category_list=['cs.AI']
ai_docs = (records.filter(lambda x:any(ele in x['categories'] for ele in ai_category_list)==True))
print("Total Papers published in AI ",ai_docs.count().compute())

Total Papers published in AI  67787


In [16]:
ai_docs_by_year=ai_docs.map(extract_latest_version_year).frequencies().to_dataframe(columns=['submission_year','num_submissions']).compute()
ai_docs_by_year=ai_docs_by_year.sort_values(by="submission_year")
ai_docs_by_year.head()

Unnamed: 0,submission_year,num_submissions
26,1993,6
27,1994,14
28,1995,27
29,1996,28
18,1997,20


In [17]:
px.line(x='submission_year',y='num_submissions',data_frame=ai_docs_by_year,title="AI Paper Published on Arxiv By Year",color_discrete_sequence=['#87CEEB'])

**Top Authors publishing in AI**

In [18]:
## Extracting author parsed for the first paper to look at the structure
authors=records.map(lambda x:x["authors_parsed"]).take(1)[0]
authors

[['Balázs', 'C.', ''],
 ['Berger', 'E. L.', ''],
 ['Nadolsky', 'P. M.', ''],
 ['Yuan', 'C. -P.', '']]

In [19]:
[" ".join(a) for a in authors]

['Balázs C. ', 'Berger E. L. ', 'Nadolsky P. M. ', 'Yuan C. -P. ']

In [20]:
get_authors =lambda x: [' '.join(a).strip() for a in x['authors_parsed']]
ai_authors=ai_docs.map(get_authors).flatten().frequencies(sort=True).topk(k=20,key=1).to_dataframe(columns=['authors','num_submissions']).compute()

In [21]:
ai_authors.head()

Unnamed: 0,authors,num_submissions
0,Levine Sergey,214
1,Abbeel Pieter,189
2,Liu Yang,160
3,Bansal Mohit,139
4,Tenenbaum Joshua B.,135


In [22]:
ai_authors = ai_authors.sort_values('num_submissions', ascending=True)
px.bar(y="authors",x="num_submissions",data_frame=ai_authors,orientation="h",color_discrete_sequence=['#87CEEB'])