# Data Extraction

The data is retrieved from Kaggle and stored locally.<br>

In [1]:
import json
import pandas as pd
import pickle
import os

from kaggle.api.kaggle_api_extended import KaggleApi
import zipfile

# Instantiate the Kaggle API client and authenticate it.
# NOTE(review): authenticate() presumably picks up locally configured Kaggle
# credentials — confirm they are set up before running this notebook.
# The `api` object is not referenced by the other cells visible in this
# notebook; the download cell shells out to the `kaggle` CLI instead.
api = KaggleApi()
api.authenticate()

In [2]:
# Download the Cornell-University/arxiv dataset archive with the Kaggle CLI.
# Per the recorded output, it is saved as arxiv.zip in the working directory.
!kaggle datasets download -d 'Cornell-University/arxiv'

Downloading arxiv.zip to /home/ubuntu/Documents/arXiv project/exploratory analysis/arxiv_exploratory_data_analysis/src
100%|██████████████████████████████████████| 1.19G/1.19G [00:30<00:00, 38.9MB/s]
100%|██████████████████████████████████████| 1.19G/1.19G [00:30<00:00, 41.7MB/s]


In [3]:
# Unpack every member of the downloaded archive into the current directory.
with zipfile.ZipFile('arxiv.zip', mode='r') as archive:
    archive.extractall(path='./')

In [4]:
# Delete the archive now that its contents have been extracted.
!rm arxiv.zip

In [5]:
# Create the data directory one level above the working directory.
# os.makedirs with exist_ok=True keeps the cell re-runnable: `!mkdir ../data`
# reports an error when the directory already exists and needs a POSIX shell.
os.makedirs("../data", exist_ok=True)

In [6]:
# Move the extracted metadata snapshot into the data directory.
# NOTE: ../data must already exist as a directory; otherwise `mv` renames the
# snapshot to a file called ../data instead of moving it inside.
!mv arxiv-metadata-oai-snapshot.json ../data

In [7]:
# Path of the metadata snapshot inside the data directory; the loading cells
# open this file.
filepath = "../data/arxiv-metadata-oai-snapshot.json"

In [8]:
## example of data content
# Each line of the snapshot is parsed as its own JSON document, so reading
# only the first line is enough to show one complete record.
# JSON text is UTF-8; naming the encoding avoids relying on the platform default.
with open(filepath, "r", encoding="utf-8") as f:
    line = f.readline()
json.loads(line)

{'id': '0704.0001',
 'submitter': 'Pavel Nadolsky',
 'authors': "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan",
 'title': 'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies',
 'comments': '37 pages, 15 figures; published version',
 'journal-ref': 'Phys.Rev.D76:013009,2007',
 'doi': '10.1103/PhysRevD.76.013009',
 'report-no': 'ANL-HEP-PR-07-12',
 'categories': 'hep-ph',
 'license': None,
 'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with data from th

In [9]:
# Parse the snapshot into a list of dicts, one per line of the file.
# The file is streamed line by line instead of calling readlines(), so the raw
# text of every line is not held in memory (and left bound in the kernel)
# alongside the parsed records.
# JSON text is UTF-8; naming the encoding avoids relying on the platform default.
with open(filepath, "r", encoding="utf-8") as f:
    items = [json.loads(raw_line) for raw_line in f]
print("Total number of items:", len(items))

Total number of items: 2326839


In [10]:
# Build one DataFrame row per parsed record; the record keys become columns.
raw_data = pd.DataFrame.from_records(items)

In [11]:
# Preview the first rows of the frame.
raw_data.head()

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed
0,704.0001,Pavel Nadolsky,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","Phys.Rev.D76:013009,2007",10.1103/PhysRevD.76.013009,ANL-HEP-PR-07-12,hep-ph,,A fully differential calculation in perturba...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2008-11-26,"[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,..."
1,704.0002,Louis Theran,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,To appear in Graphs and Combinatorics,,,,math.CO cs.CG,http://arxiv.org/licenses/nonexclusive-distrib...,"We describe a new algorithm, the $(k,\ell)$-...","[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2008-12-13,"[[Streinu, Ileana, ], [Theran, Louis, ]]"
2,704.0003,Hongjun Pan,Hongjun Pan,The evolution of the Earth-Moon system based o...,"23 pages, 3 figures",,,,physics.gen-ph,,The evolution of Earth-Moon system is descri...,"[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",2008-01-13,"[[Pan, Hongjun, ]]"
3,704.0004,David Callan,David Callan,A determinant of Stirling cycle numbers counts...,11 pages,,,,math.CO,,We show that a determinant of Stirling cycle...,"[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2007-05-23,"[[Callan, David, ]]"
4,704.0005,Alberto Torchinsky,Wael Abu-Shammala and Alberto Torchinsky,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,,"Illinois J. Math. 52 (2008) no.2, 681-689",,,math.CA math.FA,,In this paper we show how to compute the $\L...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2013-10-15,"[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]]"


In [12]:
# Summarise the column names, dtypes and memory usage of the frame.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2326839 entries, 0 to 2326838
Data columns (total 14 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   id              object
 1   submitter       object
 2   authors         object
 3   title           object
 4   comments        object
 5   journal-ref     object
 6   doi             object
 7   report-no       object
 8   categories      object
 9   license         object
 10  abstract        object
 11  versions        object
 12  update_date     object
 13  authors_parsed  object
dtypes: object(14)
memory usage: 248.5+ MB


In [13]:
# check duplicates
# Count how often each id occurs and attach that count to every row as the
# helper column "count_of_id".
id_counts = raw_data["id"].value_counts().rename("count_of_id")
raw_data = raw_data.merge(id_counts, how='left', left_on='id', right_index=True)

# Rows whose id occurs more than once, ordered by id; the cell displays the
# distinct ids among them (an empty array means there are no duplicates).
is_repeated = raw_data["count_of_id"] > 1
duplicates = raw_data[is_repeated].sort_values("id")
duplicates["id"].unique()

array([], dtype=object)

In [14]:
# remove duplicates if found
# Keep the first row for each id, renumber the index from 0, and discard the
# helper count column. Rebinding through a method chain (instead of
# inplace=True) together with errors="ignore" keeps the cell re-runnable: a
# second execution no longer raises KeyError once "count_of_id" is gone.
raw_data = (
    raw_data
    .drop_duplicates("id")
    .reset_index(drop=True)
    .drop(columns="count_of_id", errors="ignore")
)

In [15]:
# Show the frame's (rows, columns) after the de-duplication step.
raw_data.shape

(2326839, 14)

In [20]:
# Smallest and largest value of the "update_date" column. The recorded values
# are YYYY-MM-DD strings, so string ordering matches chronological ordering.
# NOTE(review): "update_date" looks like a last-update date rather than a
# publication date, so the printed label may be misleading — confirm.
update_dates = raw_data["update_date"]
from_date, to_date = min(update_dates), max(update_dates)
print(f'\nDates of publication of the articles in the dataset:\n\nFirst date: {from_date}\nLast date: {to_date}')


Dates of publication of the articles in the dataset:

First date: 2007-05-23
Last date: 2023-09-16


In [None]:
## caching file
# Persist the cleaned frame so later notebooks can load it without re-parsing
# the JSON snapshot. The ".gzip" suffix promises gzip compression, but
# to_parquet defaults to snappy; request gzip explicitly so the file matches
# its name.
raw_data.to_parquet("../data/raw.parquet.gzip", compression="gzip")