In [10]:
import arxiv
import pandas as pd
from tqdm import tqdm
import time
import requests
import re

# Base URL of the OpenAlex API.
OPENALEX = "https://api.openalex.org"


In [11]:
# Category prefixes that count as physics. is_physics() passes this tuple to
# str.startswith, so an entry matches every category string that begins with it.
PHYSICS_PREFIXES = (
    "physics.",
    "astro-ph",
    "cond-mat",
    "hep-",
    "nucl-",
    "gr-qc",
    "quant-ph",
    "math-ph",
    "nlin",
)
# Category prefix that counts as biology (checked by is_biology()).
BIOLOGY_PREFIX = "q-bio"

def is_physics(cat : str) -> bool:
    """Return True when ``cat`` begins with one of ``PHYSICS_PREFIXES``.

    Empty or ``None`` input yields False instead of raising.
    """
    if not cat:
        return False
    return cat.startswith(PHYSICS_PREFIXES)

def is_biology(cat : str) -> bool:
    """Return True when ``cat`` begins with ``BIOLOGY_PREFIX``.

    Empty or ``None`` input yields False instead of raising.
    """
    # str.startswith (not "startwith"): the misspelled method raised
    # AttributeError for every non-empty category.
    return bool(cat) and cat.startswith(BIOLOGY_PREFIX)

In [None]:
# Both patterns are compiled once here so that every later check reuses the
# same compiled object instead of going through re's pattern lookup each call.

# New-style arXiv ids: YYMM.number with an optional version suffix,
# e.g. 2105.12345 or 2105.12345v2
NEWSTYLE = re.compile(r"^\d{4}\.\d{4,5}(v\d+)?$")
# Old-style arXiv ids: category(.optional two-letter subcategory)/seven digits
# with an optional version suffix, e.g. cs.AI/0102030 (matched case-insensitively)
OLDSTYLE = re.compile(
    r"^[a-z\-]+(\.[A-Z]{2})?\/\d{7}(v\d+)?$",
    flags=re.IGNORECASE,
)


def normalize_arxiv_id(aid : str) -> str:
    """Return ``aid`` trimmed of surrounding whitespace and any trailing version tag.

    A falsy ``aid`` (``None`` or an empty string) normalizes to ``""``.
    A trailing ``v<digits>`` suffix, if present, is dropped.
    """
    trimmed = aid.strip() if aid else ""
    # Remove the optional version marker at the very end of the id.
    return re.sub(r"v\d+$", "", trimmed)

def is_valid_arxiv_id(aid : str) -> bool:
    """Return True when ``aid`` is a well-formed new-style or old-style arXiv id.

    Args:
        aid: Candidate identifier. ``None`` and ``""`` are treated as invalid
            rather than raising, matching how ``normalize_arxiv_id`` tolerates them.

    Returns:
        True if the whole string matches ``NEWSTYLE`` or ``OLDSTYLE``, else False.
    """
    if not aid:
        return False
    # fullmatch rather than match: with match, the patterns' trailing "$" also
    # succeeds just before a final newline, so "2105.12345\n" was accepted.
    return bool(NEWSTYLE.fullmatch(aid) or OLDSTYLE.fullmatch(aid))

In [None]:
# FIXME(review): unfinished definition — there is no parameter list, colon, or
# body, so this cell raises SyntaxError as written. Complete or delete it.
def get

In [9]:
def openalex_get(url, params = None, retries = 5, backoff = 1.5):
    """GET ``url`` and return the decoded JSON body, retrying retryable statuses.

    Args:
        url: URL to request.
        params: Optional query parameters forwarded to ``requests.get``.
        retries: Maximum number of requests to make.
        backoff: Base of the exponential delay; after 0-based attempt ``k``
            fails with a retryable status, wait ``backoff ** k`` seconds.

    Returns:
        The parsed JSON payload on HTTP 200. ``None`` if a non-retryable
        status is received or every attempt ends in a retryable status.

    Exceptions raised by ``requests.get`` (e.g. timeouts) are not caught here.
    """
    retryable_statuses = (429, 500, 502, 503, 504)
    for attempt in range(retries):
        r = requests.get(url, params = params, timeout = 30)
        if r.status_code == 200:
            return r.json()
        if r.status_code not in retryable_statuses:
            # Any other status (e.g. 404) is not in the retry list; previously
            # it was re-requested `retries` times back-to-back with no delay.
            return None
        if attempt < retries - 1:
            # Only wait when another attempt will actually follow.
            time.sleep(backoff ** attempt)
    return None
    

In [None]:
def extract_arxiv_id(ids_obj):
    if not ids_obj:
        return None
    