In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

!pip install selectolax

/kaggle/input/arxiv-cs-2025-ids/arxiv_cs_2025_sample_ids.csv
Collecting selectolax
  Downloading selectolax-0.3.32-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (6.2 kB)
Downloading selectolax-0.3.32-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (5.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.7/5.7 MB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: selectolax
Successfully installed selectolax-0.3.32


In [2]:
from selectolax.parser import HTMLParser
import time
import pandas as pd
from tqdm import tqdm
import re
import requests

In [None]:
# Allowed "show" values. Not referenced by any other cell visible in this
# notebook — presumably listing page sizes; TODO confirm or remove.
VALID_SHOW_VALUES = [25, 50, 100, 250, 500, 1000, 2000]

# HTTP headers passed to requests.get when fetching arXiv pages:
# a desktop-Chrome User-Agent string plus an English Accept-Language.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}


In [None]:
import unicodedata

# Letters that NFKD cannot split into "base letter + combining mark".
# Without an explicit fallback the ASCII encode step would silently delete
# them (e.g. 'Łukasz' -> 'ukasz', 'Søren' -> 'Sren').
_ASCII_FALLBACKS = str.maketrans({
    'ø': 'o', 'Ø': 'O',
    'ł': 'l', 'Ł': 'L',
    'đ': 'd', 'Đ': 'D',
    'ð': 'd', 'Ð': 'D',
    'þ': 'th', 'Þ': 'Th',
    'æ': 'ae', 'Æ': 'AE',
    'œ': 'oe', 'Œ': 'OE',
    'ß': 'ss',
    'ı': 'i',
})


def strip_accents(text: str) -> str:
    """
    Converts accented characters to their closest ASCII equivalent.
    Example: 'Gérard' -> 'Gerard', 'Łukasz' -> 'Lukasz'

    Args:
        text: The string to convert. Non-string values (None, NaN, numbers)
            are returned unchanged.

    Returns:
        A pure-ASCII string. Characters with neither a decomposition nor an
        entry in the fallback table (e.g. CJK characters) are dropped.
    """
    if not isinstance(text, str):
        return text
    # NFKD separates base letters from their combining diacritics.
    decomposed = unicodedata.normalize('NFKD', text)
    # Map the stroke/ligature letters that NFKD leaves intact.
    decomposed = decomposed.translate(_ASCII_FALLBACKS)
    # Whatever is still non-ASCII here (the combining marks) is discarded.
    return decomposed.encode('ascii', 'ignore').decode('ascii')


In [5]:
def _strip_label(text: str, label: str) -> str:
    """Drop one leading ``label`` (e.g. 'Title:') from ``text`` and trim whitespace."""
    text = text.strip()
    if text.startswith(label):
        text = text[len(label):]
    return text.strip()


def _labelled_text(content, selector: str, label: str):
    """Return the label-free text of the first ``selector`` match, or None if unavailable."""
    try:
        raw = content.css_first(selector).text(strip=True)
    except Exception:
        # Container or node missing (e.g. css_first returned None): stay
        # best-effort and report the field as absent instead of failing.
        return None
    return _strip_label(raw, label)


def extract_arxiv_metadata(paper_url: str, paper_id: str, domain: str = "cs", timeout: float = 30.0) -> dict:
    """
    Scrapes metadata from a single arXiv abstract page.

    Args:
        paper_url: URL of the abstract page to download.
        paper_id: Identifier copied verbatim into the result.
        domain: Domain label copied verbatim into the result.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole scrape.

    Returns:
        dict: Paper metadata with the keys 'Paper Title', 'Paper ID',
        'Authors', 'Abstract', 'Domain', 'Primary Subject', 'Subjects',
        'Date Submitted', 'Abstract URL' and 'PDF URL'. Fields that cannot
        be found on the page are None ('Subjects' is an empty list).
        Returns an empty dict if the HTTP request fails or times out.
    """
    try:
        response = requests.get(paper_url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()
        html = HTMLParser(response.text)
        content = html.css_first("div#content-inner")

        # Date
        try:
            date_text = content.css_first('div.dateline').text(strip=True)
            date_submitted = date_text.replace('[Submitted on ', '').replace(']', '')
        except Exception:
            date_submitted = None

        # Title / Authors / Abstract: only the leading label is removed, so
        # text that itself contains e.g. "Title:" is left intact.
        title = _labelled_text(content, 'h1.title', 'Title:')
        authors = _labelled_text(content, "div.authors", 'Authors:')
        abstract = _labelled_text(content, "blockquote.abstract", 'Abstract:')

        # Subjects: "Name (code); Name (code)" -> ["Name", "Name"]
        try:
            raw_subjects = content.css_first('td.subjects').text(strip=True)
            subjects = [s.strip().rsplit('(', 1)[0].strip() for s in raw_subjects.split(';')]
            # An empty cell would otherwise produce [''] and a '' primary subject.
            subjects = [s for s in subjects if s]
        except Exception:
            subjects = []

        primary_subject = subjects[0] if subjects else None

        # Swap only the '/abs/' path segment; a blanket replace of 'abs'
        # would also rewrite that substring inside the host or the paper ID.
        pdf_url = paper_url.replace('/abs/', '/pdf/', 1).strip()

        return {
            'Paper Title': title,
            'Paper ID': paper_id,
            'Authors': authors,
            'Abstract': abstract,
            'Domain': domain,
            'Primary Subject': primary_subject,
            'Subjects': subjects,
            'Date Submitted': date_submitted,
            'Abstract URL': paper_url,
            'PDF URL': pdf_url
        }

    except requests.RequestException as e:
        print(f"Request failed for {paper_url}: {e}")
        return {}

In [6]:
def scrape_arxiv_metadata_from_df(df: pd.DataFrame, domain: str = "cs", delay: float = 0.5) -> pd.DataFrame:
    """
    Scrapes arXiv metadata for every row of ``df`` and collects it in a DataFrame.

    Args:
        df: Frame with a 'paper_id' and an 'abstract_url' column; one
            abstract page is requested per row.
        domain: Domain label forwarded to ``extract_arxiv_metadata``.
        delay: Seconds to pause after each request. Use 0 to disable.

    Returns:
        pd.DataFrame: One row per successfully scraped paper, with the
        'Authors' and 'Paper Title' fields passed through ``strip_accents``.
        Rows whose request failed (empty metadata) are omitted.
    """
    results = []

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Scraping arXiv metadata"):
        paper_id = row['paper_id']
        abstract_url = row['abstract_url']

        metadata = extract_arxiv_metadata(abstract_url, paper_id, domain=domain)

        if metadata:
            # Apply accent stripping to selected fields
            for field in ('Authors', 'Paper Title'):
                if metadata.get(field):
                    metadata[field] = strip_accents(metadata[field])
            results.append(metadata)

        # Pause after every request, including failed ones: a run of errors
        # (e.g. rate limiting) must not turn into back-to-back requests.
        if delay > 0:
            time.sleep(delay)

    return pd.DataFrame(results)

In [7]:
# Quick check that strip_accents turns accented author names into plain ASCII.
print(strip_accents("Gérard Ben Arous, Cédric Gerbelot"))

Gerard Ben Arous, Cedric Gerbelot


In [None]:
# Load the sampled paper list; the scraper reads its 'paper_id' and
# 'abstract_url' columns.
# NOTE(review): the file listing printed by the first cell shows
# /kaggle/input/arxiv-cs-2025-ids/arxiv_cs_2025_sample_ids.csv, not this
# 2024 path — confirm the intended dataset is attached before running.
df_id = pd.read_csv(r'/kaggle/input/sample-ids-2024-arxiv/arxiv_cs_2024_sample_ids.csv')

# One HTTP request per row, so this step dominates the notebook's runtime.
paper_df = scrape_arxiv_metadata_from_df(df=df_id, domain="cs")

print(paper_df.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
paper_df.info()

# Written to the current working directory.
paper_df.to_csv(r'arxiv_cs_2024_papers.csv', index=False)

Scraping arXiv metadata:   0%|          | 31/10000 [00:17<1:35:13,  1.74it/s]