Link to dataset: https://www.kaggle.com/datasets/Cornell-University/arxiv

In [1]:
import findspark
findspark.init()

In [2]:
# Initializing spark
from pyspark import SparkContext, SparkConf

conf1 = SparkConf().setAppName("Archive_PySpark").setMaster("local[*]")
sc = SparkContext(conf=conf1)

22/11/04 08:32:04 WARN Utils: Your hostname, pc resolves to a loopback address: 127.0.1.1; using 192.168.170.52 instead (on interface wlp3s0)
22/11/04 08:32:04 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/11/04 08:32:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
print(sc)

<SparkContext master=local[*] appName=Archive_PySpark>


In [4]:
# read and load data to spark
import json

rdd_json = sc.textFile("/home/sbn/Downloads/data/archive/arxiv-metadata-oai-snapshot.json", 100)

rdd = rdd_json.map(lambda x: json.loads(x))

# to cache the rdd so that for each action it will not run the transformation. 
# It will use the cached rdd. 

#rdd.persist() # didn't use because of low memory

In [5]:
print(sc.defaultParallelism)
print(rdd.getNumPartitions())

8
100


In [6]:
# Count elements
rdd.count()

                                                                                

2011231

In [7]:
# Get the first two records
rdd.take(2)

[{'id': '0704.0001',
  'submitter': 'Pavel Nadolsky',
  'authors': "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan",
  'title': 'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies',
  'comments': '37 pages, 15 figures; published version',
  'journal-ref': 'Phys.Rev.D76:013009,2007',
  'doi': '10.1103/PhysRevD.76.013009',
  'report-no': 'ANL-HEP-PR-07-12',
  'categories': 'hep-ph',
  'license': None,
  'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with d

In [8]:
# get all attributes
rdd.flatMap(lambda x:x.keys()).distinct().collect()

                                                                                

['authors',
 'comments',
 'title',
 'id',
 'journal-ref',
 'versions',
 'submitter',
 'categories',
 'update_date',
 'authors_parsed',
 'report-no',
 'license',
 'abstract',
 'doi']

In [9]:
# Get the names ofthe licences
rdd.map(lambda x: x["license"]).distinct().collect()

                                                                                

[None,
 'http://creativecommons.org/licenses/publicdomain/',
 'http://creativecommons.org/licenses/by-nc-nd/4.0/',
 'http://creativecommons.org/licenses/by-nc-sa/4.0/',
 'http://creativecommons.org/licenses/by-nc-sa/3.0/',
 'http://creativecommons.org/licenses/by/3.0/',
 'http://creativecommons.org/licenses/by/4.0/',
 'http://creativecommons.org/publicdomain/zero/1.0/',
 'http://arxiv.org/licenses/nonexclusive-distrib/1.0/',
 'http://creativecommons.org/licenses/by-sa/4.0/']

In [10]:
# Get the shortest and longest titles
shortest_title_rdd = rdd.map(lambda x: x["title"]).reduce(lambda x,y: x if x < y else y)
longest_title_rdd = rdd.map(lambda x: x["title"]).reduce(lambda x,y: x if x > y else y)

print("Shortest title : ", shortest_title_rdd)
print("Longest title : ", longest_title_rdd)



Shortest title :  !-Graphs with Trivial Overlap are Context-Free
Longest title :  Weyl formula for the negative dissipative eigenvalues of Maxwell's
  equations




In [11]:
# Find abbreviations with 5 or more letters in the abstract
import re

def get_abbreviations(line):
    result = re.search(r"\(([A-Za-z][^_ /\\<>]{5,})\)", line)
    if result:
        return result.group(1)

rdd.filter(lambda x: get_abbreviations(x['abstract'])).count()

                                                                                

192721

In [14]:
# Get the number of archives records per month
import datetime

def extract_date(date_in):
    d = datetime.datetime.strptime(date_in, "%Y-%m-%d")
    return d.month


rdd.map(lambda x: (extract_date(x["update_date"]),1)).reduceByKey(lambda x,y: x+y).collect()

                                                                                

[(1, 134247),
 (2, 116948),
 (3, 126458),
 (4, 117126),
 (5, 296587),
 (6, 191746),
 (7, 122649),
 (8, 138469),
 (9, 138978),
 (10, 197755),
 (11, 297963),
 (12, 132305)]

In [20]:
# Get the average number of pages

def get_page_number(line):
    search = re.findall('\d+ pages', line)
    if search:
        return int(search[0].split(" ")[0])
    else:
        return 0

rdd_average = rdd.map(lambda x: get_page_number(x["comments"] if x["comments"] != None else "None"))

rdd_average = rdd_average.filter(lambda x: x!=0)

count = rdd_average.count()
page_num_sum = rdd_average.reduce(lambda x,y: x+y)

print(count)
print(page_num_sum)
print(f"Average of pages : {page_num_sum/count}")



1184075
21139516
Average of pages : 17.85319004286046


                                                                                