In [40]:
import pandas as pd
import numpy as np
from pylatexenc.latex2text import LatexNodes2Text
import re

import warnings
# Silence all warnings for the rest of the session: no category, message or
# module filter is given, so every warning raised after this point is hidden.
warnings.filterwarnings('ignore')

In [5]:
# Location of the arXiv metadata snapshot; it is read as JSON Lines (lines=True).
json_file = r'N:\arxiv_dataset\arxiv-metadata-oai-snapshot.json'

# Number of records parsed per chunk while streaming the file.
chunk_size = 100_000

# Because `chunksize` is set, this is a lazy reader that yields one DataFrame
# per chunk when iterated, rather than a fully loaded DataFrame.
data = pd.read_json(json_file, chunksize=chunk_size, lines=True)

In [6]:
# Metadata columns the rest of the notebook does not use.
unused_columns = [
    'id', 'authors', 'comments', 'journal-ref', 'report-no',
    'categories', 'license', 'versions', 'authors_parsed',
]

ctr = 0       # number of chunks consumed from the reader so far
chunks = []   # trimmed chunks, in file order
for ctr, chunk in enumerate(data, start=1):
    # keep only the columns that are needed downstream
    chunk = chunk.drop(columns=unused_columns)
    chunks.append(chunk)
    # stop as soon as the third chunk has been collected
    if ctr > 2:
        break

In [7]:
# Summarise the first loaded chunk (columns, non-null counts, dtypes, memory).
# NOTE(review): the saved output below still lists a float `id` column even
# though the loading loop drops `id`; it appears to predate that change, so
# re-run the notebook from a fresh kernel to refresh it.
chunks[0].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   id           100000 non-null  float64
 1   submitter    100000 non-null  object 
 2   title        100000 non-null  object 
 3   doi          61372 non-null   object 
 4   abstract     100000 non-null  object 
 5   update_date  100000 non-null  object 
dtypes: float64(1), object(5)
memory usage: 4.6+ MB


In [8]:
# Same summary for the second loaded chunk, for comparison with the first.
chunks[1].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 100000 to 199999
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   id           100000 non-null  float64
 1   submitter    100000 non-null  object 
 2   title        100000 non-null  object 
 3   doi          60048 non-null   object 
 4   abstract     100000 non-null  object 
 5   update_date  100000 non-null  object 
dtypes: float64(1), object(5)
memory usage: 4.6+ MB


In [12]:
# Drop every row that has a missing value in any column. In the summaries
# printed above, doi is the only column that actually contains nulls.
for position in (0, 1):
    chunks[position] = chunks[position].dropna()

In [10]:
# Re-inspect the first chunk now that rows with missing values are gone.
chunks[0].info()

<class 'pandas.core.frame.DataFrame'>
Index: 61372 entries, 0 to 99998
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           61372 non-null  float64
 1   submitter    61372 non-null  object 
 2   title        61372 non-null  object 
 3   doi          61372 non-null  object 
 4   abstract     61372 non-null  object 
 5   update_date  61372 non-null  object 
dtypes: float64(1), object(5)
memory usage: 3.3+ MB


In [11]:
# Re-inspect the second chunk now that rows with missing values are gone.
chunks[1].info()

<class 'pandas.core.frame.DataFrame'>
Index: 60048 entries, 100000 to 199997
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           60048 non-null  float64
 1   submitter    60048 non-null  object 
 2   title        60048 non-null  object 
 3   doi          60048 non-null  object 
 4   abstract     60048 non-null  object 
 5   update_date  60048 non-null  object 
dtypes: float64(1), object(5)
memory usage: 3.2+ MB


In [13]:
# Stack the first two chunks row-wise. This rebinds `data`, which until now
# referred to the chunked JSON reader. Row labels are carried over unchanged
# (no ignore_index), so the combined index is not renumbered.
data = pd.concat(objs=[chunks[0], chunks[1]], axis='index')

In [14]:
# Check the row count, dtypes and non-null counts of the combined frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 121420 entries, 0 to 199997
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   id           121420 non-null  float64
 1   submitter    121420 non-null  object 
 2   title        121420 non-null  object 
 3   doi          121420 non-null  object 
 4   abstract     121420 non-null  object 
 5   update_date  121420 non-null  object 
dtypes: float64(1), object(5)
memory usage: 6.5+ MB


In [15]:
# Preview the first rows of the combined frame.
data.head()

Unnamed: 0,id,submitter,title,doi,abstract,update_date
0,704.0001,Pavel Nadolsky,Calculation of prompt diphoton production cros...,10.1103/PhysRevD.76.013009,A fully differential calculation in perturba...,2008-11-26
5,704.0006,Yue Hin Pong,Bosonic characters of atomic Cooper pairs acro...,10.1103/PhysRevA.75.043613,We study the two-particle wave function of p...,2015-05-13
6,704.0007,Alejandro Corichi,Polymer Quantum Mechanics and its Continuum Limit,10.1103/PhysRevD.76.044016,A rather non-standard quantum representation...,2008-11-26
7,704.0008,Damian Swift,Numerical solution of shock and ramp compressi...,10.1063/1.2975338,A general formulation was developed to repre...,2009-02-05
8,704.0009,Paul Harvey,"The Spitzer c2d Survey of Large, Nearby, Inste...",10.1086/518646,We discuss the results from the combined IRA...,2010-03-18


In [19]:
# defining a function to clean the abstract
def clean_abstract(abstract):
    """Turn a raw abstract into plain, single-line readable text.

    Steps, in order:
      1. delete every ``\\href{...}{...}`` construct (link text included),
      2. delete every ``\\frac{...}{...}`` construct,
      3. replace newline characters with spaces,
      4. convert the remaining LaTeX markup to text with ``LatexNodes2Text``.

    Parameters
    ----------
    abstract : str
        The abstract as stored in the metadata.

    Returns
    -------
    str or None
        The cleaned text, or ``None`` if any step raised (the error message
        is printed, not re-raised).
    """
    # Constructs removed outright before the generic LaTeX conversion.
    stripped_patterns = (
        r'\\href{[^}]+}{[^}]+}',
        r'\\frac\s*{[^}]+}\s*{[^}]+}',
    )
    try:
        for pattern in stripped_patterns:
            abstract = re.sub(pattern, '', abstract)
        single_line = abstract.replace('\n', ' ')
        return LatexNodes2Text().latex_to_text(single_line)
    except Exception as e:
        # Best effort: report the problem and signal failure with None.
        print(f"Error processing text: {e}")
        return None

In [18]:
# defining a function to clean the submitter names
def clean_submitter(submitter):
    r"""Strip backslash-quote escape sequences from a submitter name.

    Every occurrence of ``\'`` and of ``\"`` is deleted; all other characters
    are left as they are.

    Parameters
    ----------
    submitter : str
        The submitter name as stored in the metadata.

    Returns
    -------
    str or None
        The cleaned name, or ``None`` if the value could not be processed
        (for example because it is not a string).
    """
    try:
        for escape_sequence in ("\\'", '\\"'):
            submitter = submitter.replace(escape_sequence, '')
    except Exception:
        # Anything that cannot be processed is mapped to None.
        return None
    return submitter

In [20]:
# defining a function to clean the titles
def clean_title(title):
    """Turn a raw title into plain, single-line readable text.

    Newline characters are replaced with spaces and the result is passed
    through ``LatexNodes2Text`` to convert LaTeX markup to text.

    Parameters
    ----------
    title : str
        The title as stored in the metadata.

    Returns
    -------
    str or None
        The cleaned title, or ``None`` if any step raised (the error message
        is printed, not re-raised).
    """
    try:
        single_line = title.replace('\n', ' ')
        return LatexNodes2Text().latex_to_text(single_line)
    except Exception as e:
        # Best effort: report the problem and signal failure with None.
        print(f"Error processing text: {e}")
        return None

In [21]:
# Run each free-text column through its dedicated cleaner, then add a fresh
# sequential integer id as the first column.
# Note: DataFrame.insert raises if an `id` column already exists, so this cell
# cannot be re-run on the same frame.
cleaners = {
    'abstract': clean_abstract,
    'submitter': clean_submitter,
    'title': clean_title,
}
for column, cleaner in cleaners.items():
    data[column] = data[column].apply(cleaner)
data.insert(loc=0, column='id', value=range(len(data)))

In [31]:
# Check dtypes and non-null counts after cleaning and after adding the new `id` column.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 121420 entries, 0 to 199997
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   id           121420 non-null  int64 
 1   submitter    121420 non-null  object
 2   title        121420 non-null  object
 3   doi          121420 non-null  object
 4   abstract     121420 non-null  object
 5   update_date  121420 non-null  object
dtypes: int64(1), object(5)
memory usage: 6.5+ MB


In [36]:
# Persist the full cleaned frame as one CSV in the working directory (row labels omitted).
data.to_csv(path_or_buf='data.csv', index=False)

In [37]:
# Target number of rows per output file.
chunk_size = 1000

# Ceiling division: one split per started block of `chunk_size` rows.
# A plain `len(data) // chunk_size + 1` would add a superfluous extra split
# whenever the row count is an exact multiple of `chunk_size`.
# max(1, ...) keeps the split count from ever being zero for an empty frame.
num_chunks = max(1, -(-len(data) // chunk_size))

In [42]:
# Save the data as `num_chunks` CSV files of near-equal size.
# The row *positions* are split rather than the DataFrame itself: calling
# np.array_split directly on a DataFrame goes through DataFrame.swapaxes, which
# pandas has deprecated. Splitting positions gives the same chunk boundaries.
for i, positions in enumerate(np.array_split(np.arange(len(data)), num_chunks)):
    chunk = data.iloc[positions]
    chunk.to_csv(rf"N:\arxiv_dataset\github_121420\chunks_1000\data\{i}.csv", index=False)