In [18]:
import arxiv
import pandas as pd
from tqdm import tqdm
import time
import requests
import re

# Base URL of the OpenAlex REST API.
OPENALEX = "https://api.openalex.org"
# NOTE(review): looks like an OpenAlex source id (the "S" prefix), presumably the
# arXiv source -- confirm. No code visible in this section references it.
ARXIVID = "S4306400194"


In [5]:
# arXiv category prefixes that this notebook counts as physics.
# Kept as a tuple because str.startswith accepts a tuple of candidate prefixes.
PHYSICS_PREFIXES = (
    "physics.",
    "astro-ph",
    "cond-mat",
    "hep-",
    "nucl-",
    "gr-qc",
    "quant-ph",
    "math-ph",
    "nlin",
)
# Category prefix that this notebook counts as biology.
BIOLOGY_PREFIX = "q-bio"

def is_physics(cat : str) -> bool:
    """Report whether a category string starts with one of PHYSICS_PREFIXES.

    Falsy input (``None`` or ``""``) is never physics.
    """
    if not cat:
        return False
    return cat.startswith(PHYSICS_PREFIXES)

def is_biology(cat : str) -> bool:
    """Report whether a category string starts with BIOLOGY_PREFIX.

    Falsy input (``None`` or ``""``) is never biology.
    """
    # The method is str.startswith; the "startwith" spelling raised
    # AttributeError for every non-empty category.
    return bool(cat) and cat.startswith(BIOLOGY_PREFIX)

In [6]:
# The patterns are compiled once at module level so they have a name and can be reused directly.
# (The re module also caches recently compiled patterns, so this is mainly for clarity.)

# Newer arxiv ids have the form YYMM.number (4 or 5 digits) with an optional version, eg 2105.12345 or 2105.12345v2
NEWSTYLE = re.compile(r"^\d{4}\.\d{4,5}(v\d+)?$")
# Old style ids have the form category(.optional two-letter subcategory)/7 digits (version optional), eg cs.AI/0102030
# re.IGNORECASE lets the category and the subcategory match in either letter case
OLDSTYLE = re.compile(r"^[a-z\-]+(\.[A-Z]{2})?\/\d{7}(v\d+)?$", re.IGNORECASE)


def normalize_arxiv_id(aid : str) -> str:
    """Return ``aid`` without surrounding whitespace or a trailing version tag.

    A falsy input such as ``None`` or ``""`` yields the empty string.
    """
    trimmed = aid.strip() if aid else ""
    # Drop a trailing "v<digits>" suffix, e.g. "2105.12345v2" -> "2105.12345"
    return re.sub(r"v\d+$", "", trimmed)

def is_valid_arxiv_id(aid : str) -> bool:
    """Check ``aid`` against the new-style and the old-style arXiv id patterns."""
    for pattern in (NEWSTYLE, OLDSTYLE):
        if pattern.match(aid) is not None:
            return True
    return False

In [None]:
# Fetches a url and returns the response body parsed as JSON (None on failure)
def get_json(url, params = None, retries = 6, backoff = 1.6):
    """GET ``url`` with ``requests`` and return the decoded JSON body.

    Parameters
    ----------
    url : str
        Address to request.
    params : dict, optional
        Query-string parameters forwarded to ``requests.get``.
    retries : int
        Maximum number of attempts.
    backoff : float
        Base of the exponential wait: a failed attempt ``k`` (0-based) is
        followed by ``backoff**k`` seconds of sleep before the next attempt.

    Returns
    -------
    The parsed JSON on HTTP 200. ``None`` when the server answers with a
    status that a retry will not change (e.g. 404), or when every attempt
    ended in a retryable status. Callers must check for ``None``.

    Raises
    ------
    requests.exceptions.ConnectionError, requests.exceptions.Timeout
        Only if the final attempt fails at the network level; earlier
        occurrences are retried with the same backoff.
    """
    transient_statuses = (429, 500, 502, 503, 504)

    for attempt in range(retries):
        is_last_attempt = attempt == retries - 1
        try:
            # each request is capped at 45 seconds
            r = requests.get(url, params = params, timeout = 45)
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
            if is_last_attempt:
                raise
            time.sleep(backoff**attempt)
            continue

        # status code 200 on successful return
        if r.status_code == 200:
            return r.json()

        # Any other non-transient status will not improve on retry, so stop
        # instead of re-sending the same request back-to-back.
        if r.status_code not in transient_statuses:
            return None

        # Transient failure: wait before trying again, but do not waste a
        # sleep after the final attempt.
        if not is_last_attempt:
            time.sleep(backoff**attempt)

    return None
    

In [35]:
# Fetch one example work through the shared base URL and the documented /works/<id> endpoint.
work = get_json(f"{OPENALEX}/works/W2626778328")
# get_json returns None when the request ultimately fails; stop here with a
# clear message rather than with an AttributeError in a later cell.
assert work is not None, "OpenAlex request for W2626778328 failed"

In [41]:
# Display the raw "locations" list of the fetched work; each entry is a dict
# with fields such as landing_page_url and pdf_url (see the output below).
work.get("locations")

[{'id': 'doi:10.65215/2q58a426',
  'is_oa': False,
  'landing_page_url': 'https://doi.org/10.65215/2q58a426',
  'pdf_url': None,
  'source': None,
  'license': None,
  'license_id': None,
  'version': 'acceptedVersion',
  'is_accepted': True,
  'is_published': False,
  'raw_source_name': None,
  'raw_type': 'posted-content'},
 {'id': 'doi:10.65215/ctdc8e75',
  'is_oa': False,
  'landing_page_url': 'https://doi.org/10.65215/ctdc8e75',
  'pdf_url': None,
  'source': None,
  'license': None,
  'license_id': None,
  'version': 'acceptedVersion',
  'is_accepted': True,
  'is_published': False,
  'raw_source_name': None,
  'raw_type': 'posted-content'},
 {'id': 'doi:10.65215/mdcm8z23',
  'is_oa': False,
  'landing_page_url': 'https://doi.org/10.65215/mdcm8z23',
  'pdf_url': None,
  'source': None,
  'license': None,
  'license_id': None,
  'version': 'acceptedVersion',
  'is_accepted': True,
  'is_published': False,
  'raw_source_name': None,
  'raw_type': 'posted-content'},
 {'id': 'doi:10.

In [45]:
# Walk the same URL fields that extract_arxiv_id_from_work inspects.
# The earlier loc.get("key") looked up the literal string "key", which is not
# one of the location fields shown in the output above, so u was always None.
for loc in (work.get("locations") or []):
    for key in ("landing_page_url", "pdf_url"):
        u = loc.get(key)

In [54]:
# NOTE(review): index 8 is hard-coded for this particular response; in the run
# recorded below it is the entry whose pdf_url points at arxiv.org. The order of
# locations is presumably not guaranteed -- confirm before relying on it.
work.get("locations")[8].get("pdf_url")

'https://arxiv.org/pdf/1706.03762'

In [None]:
#OpenAlex Functions

# Both URL patterns capture the id that follows /abs/ or /pdf/, up to the next
# "?", "#" or "/". An optional leading "category/" segment is allowed so that
# old-style ids such as hep-th/9901001 or math.GT/0309136 are captured whole
# instead of being cut off at the slash.
_ARXIV_ID_PATH = r"((?:[a-z\-]+(?:\.[a-z]{2})?/)?[^?#/]+)"
_ARXIV_ABS_RE = re.compile(r"arxiv\.org/abs/" + _ARXIV_ID_PATH, re.IGNORECASE)
_ARXIV_PDF_RE = re.compile(r"arxiv\.org/pdf/" + _ARXIV_ID_PATH, re.IGNORECASE)

# Taking the arXiv id from the URLs listed in a work's locations

def extract_arxiv_id_from_work(work):
    """Return the first valid arXiv id found in a work's location URLs.

    Parameters
    ----------
    work : dict or None
        A work record as returned by ``get_json``. Only its ``"locations"``
        list is read; each location's ``"landing_page_url"`` and ``"pdf_url"``
        are checked in that order.

    Returns
    -------
    str or None
        The id with any ``.pdf`` suffix and version tag removed (for example
        ``"1706.03762"``), or ``None`` when ``work`` is empty/None or no
        location URL contains an id accepted by ``is_valid_arxiv_id``.
    """
    # get_json returns None when a request fails, so tolerate a missing work.
    if not work:
        return None

    # An "arxiv" entry in work["ids"] was tried as a source as well, but the
    # author noted it does not seem to exist for any paper, so only the
    # locations are scanned.
    for loc in (work.get("locations") or []):
        for key in ("landing_page_url", "pdf_url"):
            u = loc.get(key)
            if not u:
                continue
            m = _ARXIV_ABS_RE.search(u) or _ARXIV_PDF_RE.search(u)
            if not m:
                continue
            # pdf links may end in ".pdf"; strip it before normalising
            aid = normalize_arxiv_id(m.group(1).replace(".pdf", ""))
            if is_valid_arxiv_id(aid):
                return aid

    return None

In [11]:
def extract_arxiv_id(ids_obj):
    if not ids_obj:
        return None
    