In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

!pip install selectolax

Collecting selectolax
  Downloading selectolax-0.3.32-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (6.2 kB)
Downloading selectolax-0.3.32-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (5.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.7/5.7 MB[0m [31m49.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: selectolax
Successfully installed selectolax-0.3.32


In [2]:
from selectolax.parser import HTMLParser
import time
import pandas as pd
from tqdm import tqdm
import re
import requests

In [3]:
VALID_SHOW_VALUES = [25, 50, 100, 250, 500, 1000, 2000]

HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/115.0.0.0 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
    }

In [None]:
def extract_arxiv_metadata(paper_url: str, paper_id: str, domain: str = "cs") -> dict:
    """
    Scrapes metadata from a single arXiv abstract page.

    Returns:
        dict: Paper metadata with error handling for missing fields.
    """
    try:
        response = requests.get(paper_url, headers=HEADERS)
        response.raise_for_status()
        html = HTMLParser(response.text)
        content = html.css_first("div#content-inner")

        # Date
        try:
            date_text = content.css_first('div.dateline').text(strip=True)
            date_submitted = date_text.replace('[Submitted on ', '').replace(']', '')
        except Exception:
            date_submitted = None

        # Title
        try:
            title = content.css_first('h1.title').text(strip=True).replace('Title:', '').strip()
        except Exception:
            title = None

        # Authors
        try:
            authors = content.css_first("div.authors").text(strip=True).replace('Authors:', '').strip()
        except Exception:
            authors = None

        # Abstract
        try:
            abstract = content.css_first("blockquote.abstract").text(strip=True).replace('Abstract:', '').strip()
        except Exception:
            abstract = None

        # Subjects
        try:
            raw_subjects = content.css_first('td.subjects').text(strip=True)
            subjects = [s.strip().rsplit('(', 1)[0].strip() for s in raw_subjects.split(';')]
        except Exception:
            subjects = []

        primary_subject = subjects[0] if len(subjects) != 0 else None

        pdf_url = paper_url.replace('abs', 'pdf').strip()

        return {
            'Paper Title': title,
            'Paper ID': paper_id,
            'Authors': authors,
            'Abstract': abstract,
            'Domain': domain,
            'Primary Subject': primary_subject,
            'Subjects': subjects,
            'Date Submitted': date_submitted,
            'Abstract URL': paper_url,
            'PDF URL': pdf_url
        }

    except requests.RequestException as e:
        print(f"Request failed for {paper_url}: {e}")
        return {}
    
    

In [5]:
# def scrape_arxiv_list_page(list_url: str, max_papers: int = 3000, show: int = 100, domain: str = "cs") -> pd.DataFrame:
#     """
#     Iterates over multiple arXiv listing pages using pagination to collect metadata.

#     Parameters:
#         list_url (str): Base URL without skip parameter but including &show=...
#         max_papers (int): Max number of papers to scrape
#         show (int): Number of papers to show per page (25–2000)

#     Returns:
#         pd.DataFrame: Extracted paper metadata
#     """
#     results = []
#     skip = 71000

#     while len(results) < max_papers:
#         paged_url = f"{list_url}&skip={skip}"
#         print(f"Scraping: {paged_url}")

#         response = requests.get(paged_url, headers=HEADERS)
#         if response.status_code != 200:
#             print(f"[ERROR] Failed to fetch: {paged_url}")
#             break

#         html = HTMLParser(response.text)
#         papers = html.css('dl#articles > dt')

#         print(f"Number of papers: {len(papers)}")
#         if not papers:
#             print("[INFO] Empty page — stopping.")
#             break

#         for paper in tqdm(papers, desc=f"Processing page starting at skip={skip}"):
#             if len(results) >= max_papers:
#                 break
#             abstract_node = paper.css_first('a[title="Abstract"]')
#             if abstract_node:
#                 abstract_href = abstract_node.attributes.get('href')
#                 full_url = f"https://arxiv.org{abstract_href}"
#                 metadata = extract_arxiv_metadata(full_url, abstract_href.strip('/abs/'), domain=domain)
#                 if metadata:
#                     results.append(metadata)
#             else:
#                 print("[WARNING] Abstract link not found.")

#             time.sleep(1)


#         skip -= show  # advance by the expected batch size, not len(papers)

#     return pd.DataFrame(results)

In [None]:
def scrape_arxiv_list_ids(list_url: str, max_papers: int = 1000, show: int = 100, domain: str = "cs") -> pd.DataFrame:
    """
    Iterates over multiple arXiv listing pages using pagination to collect metadata.

    Parameters:
        list_url (str): Base URL without skip parameter but including &show=...
        max_papers (int): Max number of papers to scrape
        show (int): Number of papers to show per page (25-2000)

    Returns:
        pd.DataFrame: Extracted paper metadata
    """
    results = []
    skip = 50000

    while len(results) < max_papers:
        paged_url = f"{list_url}&skip={skip}"
        print(f"Scraping: {paged_url}")

        response = requests.get(paged_url, headers=HEADERS)
        if response.status_code != 200:
            print(f"[ERROR] Failed to fetch: {paged_url}")
            break

        html = HTMLParser(response.text)
        papers = html.css('dl#articles > dt')

        print(f"Number of papers: {len(papers)}")
        if not papers:
            print("[INFO] Empty page — stopping.")
            break

        for paper in tqdm(papers, desc=f"Processing page starting at skip={skip}"):
            if len(results) >= max_papers:
                break
            abstract_node = paper.css_first('a[title="Abstract"]')
            if abstract_node:
                abstract_href = abstract_node.attributes.get('href')
                if abstract_href:
                    paper_id = abstract_href.replace("/abs/", "").strip()
                    abstract_url = f"https://arxiv.org{abstract_href.strip()}"
                    results.append({
                        "paper_id": paper_id,
                        "abstract_url": abstract_url
                    })
            else:
                print("[WARNING] Abstract link not found for a paper.")

            time.sleep(0.25)


        skip += show  # advance by the expected batch size, not len(papers)

    return pd.DataFrame(results)


In [None]:
def retrieve_arxiv_papers(domain: str = "cs", show: int = 500, max_papers: int = 1000, year = 2024) -> pd.DataFrame:
    """
    Validates input and initiates the arXiv scraping process.

    Parameters:
        domain (str): arXiv domain like 'cs.LG', 'cs.CL'
        show (int): Number of papers per page (25–2000)
        max_papers (int): Maximum total papers to retrieve

    Returns:
        pd.DataFrame: Extracted arXiv paper metadata
    """
    if show not in VALID_SHOW_VALUES:
        raise ValueError(f"'show' must be one of {VALID_SHOW_VALUES}")

    base_url = f'https://arxiv.org/list/{domain}/{year}?show={show}'
    return scrape_arxiv_list_ids(base_url, max_papers=max_papers, show=show, domain=domain)


In [None]:
df = retrieve_arxiv_papers("cs", show=500, year=2023, max_papers=50000)
# df = scrape_arxiv_list_page(url)
print(df.head())
print(df.info())

df.to_csv(r'arxiv_cs_2023_remaining.csv', index=False)


Scraping: https://arxiv.org/list/cs/2023?show=500&skip=50000
Number of papers: 500


Processing page starting at skip=50000:  18%|█▊        | 91/500 [00:22<01:43,  3.95it/s]