In [4]:
!pip install arxiv pandas tqdm

Collecting arxiv
  Downloading arxiv-2.1.3-py3-none-any.whl.metadata (6.1 kB)
Collecting feedparser~=6.0.10 (from arxiv)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting sgmllib3k (from feedparser~=6.0.10->arxiv)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25ldone
Downloading arxiv-2.1.3-py3-none-any.whl (11 kB)
Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
Building wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25ldone
[?25h  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6048 sha256=672ca3fa144e9aa1dba6851d69a36f5d00ca6df15917aee9a73a1ae90f0f4370
  Stored in directory: /Users/midnight_oatmeal/Library/Caches/pip/wheels/3b/25/2a/105d6a15df6914f4d15047691c6c28f9052cc1173e40285d03
Successfully built sgmllib3k
Installing collected packages: sgmllib3k, feedparser, arxiv
Successfully installed arxiv-2.1.3 feedparser-6.0.11 sgmllib3k-1.0.0


In [5]:
import requests
import xmltodict
import time

def fetch_arxiv_sets(max_attempts=3, retry_delay=2):
    """Fetch the arXiv OAI-PMH set list and print one line per set.

    Sends a ``ListSets`` request to the arXiv OAI-PMH endpoint, retrying on
    connection errors and non-200 responses, then prints the ``setSpec`` and
    ``setName`` of every set found in the response.

    Args:
        max_attempts: Maximum number of HTTP attempts before giving up.
        retry_delay: Seconds to wait between two consecutive attempts.

    Returns:
        None. Results and failures are reported through ``print``.
    """
    base_url = "http://export.arxiv.org/oai2"
    params = {"verb": "ListSets"}

    for attempt in range(max_attempts):
        try:
            print(f"📡 Attempt {attempt + 1}: Fetching available arXiv categories...")
            response = requests.get(base_url, params=params, timeout=10)  # 10s timeout

            if response.status_code == 200:
                break
            print(f"⚠️ Warning: Status {response.status_code}. Retrying...")
        except requests.exceptions.RequestException as e:
            print(f"❌ Connection error: {e}. Retrying...")

        # Only wait when another attempt will actually follow.
        if attempt < max_attempts - 1:
            time.sleep(retry_delay)
    else:
        # The loop finished without a successful (200) response.
        print("❌ Failed after multiple attempts.")
        return

    data = xmltodict.parse(response.content)

    # xmltodict represents an empty element as None, so a `.get(key, {})`
    # default is not enough: fall back explicitly at every level.
    oai_root = data.get("OAI-PMH") or {}
    list_sets_node = oai_root.get("ListSets") or {}
    list_sets = list_sets_node.get("set") or []

    if isinstance(list_sets, dict):  # Single entry case
        list_sets = [list_sets]

    if not list_sets:
        print("❌ No category sets found.")
        return

    for s in list_sets:
        set_spec = s.get("setSpec") or "N/A"
        set_name = s.get("setName") or "Unknown Name"
        print(f"✅ SetSpec: {set_spec} - {set_name}")

# Query the endpoint once; the function prints one line per set it finds.
fetch_arxiv_sets()

📡 Attempt 1: Fetching available arXiv categories...
✅ SetSpec: cs - Computer Science
✅ SetSpec: econ - Economics
✅ SetSpec: eess - Electrical Engineering and Systems Science
✅ SetSpec: math - Mathematics
✅ SetSpec: physics - Physics
✅ SetSpec: physics:astro-ph - Astrophysics
✅ SetSpec: physics:cond-mat - Condensed Matter
✅ SetSpec: physics:gr-qc - General Relativity and Quantum Cosmology
✅ SetSpec: physics:hep-ex - High Energy Physics - Experiment
✅ SetSpec: physics:hep-lat - High Energy Physics - Lattice
✅ SetSpec: physics:hep-ph - High Energy Physics - Phenomenology
✅ SetSpec: physics:hep-th - High Energy Physics - Theory
✅ SetSpec: physics:math-ph - Mathematical Physics
✅ SetSpec: physics:nlin - Nonlinear Sciences
✅ SetSpec: physics:nucl-ex - Nuclear Experiment
✅ SetSpec: physics:nucl-th - Nuclear Theory
✅ SetSpec: physics:physics - Physics (Other)
✅ SetSpec: physics:quant-ph - Quantum Physics
✅ SetSpec: q-bio - Quantitative Biology
✅ SetSpec: q-fin - Quantitative Finance
✅ SetSpec:

In [6]:

import arxiv
import pandas as pd
import json
import time
from tqdm import tqdm

def fetch_arxiv_papers(categories, max_results=2000, batch_size=200, delay=1):
    """Collect metadata for papers in the given arXiv categories.

    Runs one ``cat:<category>`` search per entry in ``categories``, sorted by
    submission date, and gathers a metadata record for every result.

    Args:
        categories: Iterable of arXiv category codes (e.g. ``"cs.LG"``).
        max_results: Maximum number of results requested per category.
        batch_size: Number of fetched results after which to pause. Values
            of zero or below disable the pause.
        delay: Length of each pause, in seconds.

    Returns:
        A list of dicts with the keys ``title``, ``abstract``, ``published``
        (``YYYY-MM-DD`` string), ``url``, ``authors`` (list of names) and
        ``category`` (the paper's primary category). Each paper appears at
        most once, keyed by its ``entry_id``.
    """
    client = arxiv.Client()
    papers = []
    # A paper cross-listed in several requested categories can be returned by
    # more than one query; remember what was already stored to avoid
    # duplicate rows in the dataset.
    seen_urls = set()

    for cat in categories:
        search = arxiv.Search(
            query=f"cat:{cat}",
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate,
        )
        fetched = 0

        for result in tqdm(client.results(search), total=max_results, desc=cat):
            fetched += 1

            if result.entry_id not in seen_urls:
                seen_urls.add(result.entry_id)
                papers.append({
                    "title": result.title,
                    "abstract": result.summary,
                    "published": result.published.strftime('%Y-%m-%d'),
                    "url": result.entry_id,
                    "authors": [author.name for author in result.authors],
                    "category": result.primary_category
                })

            # Pause periodically to go easy on the arXiv API. Guard the modulo
            # so a batch_size of 0 does not raise ZeroDivisionError.
            if batch_size > 0 and fetched % batch_size == 0:
                time.sleep(delay)

    return papers

# arXiv categories to harvest (Deep Learning focused).
categories = ["cs.LG", "cs.CL", "stat.ML"]

# Download the paper metadata, pausing after every batch of results.
dataset = fetch_arxiv_papers(
    categories,
    max_results=2000,
    batch_size=200,
    delay=1,
)

# Persist the records as pretty-printed JSON ...
with open("arxiv_papers.json", "w") as json_file:
    json_file.write(json.dumps(dataset, indent=4))

# ... and as a flat CSV table (row index omitted).
df = pd.DataFrame(dataset)
df.to_csv("arxiv_papers.csv", index=False)

print("✅ Dataset saved as arxiv_papers.json and arxiv_papers.csv")




100%|██████████████████████████████████████████████████████████████████████| 2000/2000 [01:22<00:00, 24.35it/s]

✅ Dataset saved as arxiv_papers.json and arxiv_papers.csv





In [7]:
import pandas as pd

# Reload the saved CSV and look at the first rows and the column summary.
df = pd.read_csv("arxiv_papers.csv")
print(df.head())
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()

                                               title  \
0   Low-Rank Adapting Models for Sparse Autoencoders   
1  Redefining Machine Unlearning: A Conformal Pre...   
2  Detection Is All You Need: A Feasible Optimal ...   
3  Vintix: Action Model via In-Context Reinforcem...   
4         Scalable-Softmax Is Superior for Attention   

                                            abstract   published  \
0  Sparse autoencoders (SAEs) decompose language ...  2025-01-31   
1  Machine unlearning seeks to systematically rem...  2025-01-31   
2  We study the problem of piecewise stationary b...  2025-01-31   
3  In-Context Reinforcement Learning (ICRL) repre...  2025-01-31   
4  The maximum element of the vector output by th...  2025-01-31   

                                 url  \
0  http://arxiv.org/abs/2501.19406v1   
1  http://arxiv.org/abs/2501.19403v1   
2  http://arxiv.org/abs/2501.19401v1   
3  http://arxiv.org/abs/2501.19400v1   
4  http://arxiv.org/abs/2501.19399v1   

             

In [8]:
# Clean the abstracts: strip citation markers, parenthesized text, and extra whitespace

import re

def clean_text(text):
    """Return ``text`` with citation markers, parenthesized spans and extra whitespace removed.

    Args:
        text: The raw abstract. Anything that is not a ``str`` (for example
            the NaN pandas uses for a missing value) is treated as missing.

    Returns:
        The cleaned string, or ``None`` when ``text`` is not a string so that
        a later ``dropna`` can discard the row.
    """
    if not isinstance(text, str):
        return None

    text = re.sub(r"\[\d+\]", "", text)  # remove citations like [1], [2]
    text = re.sub(r"\(.*?\)", "", text)  # remove anything in parentheses
    # Collapse whitespace runs and trim the ends. The result must be returned:
    # without it every cleaned abstract would be None.
    return re.sub(r"\s+", " ", text).strip()

# Add the cleaned abstract as a new column, then discard every row for which
# that cleaned abstract is missing.
df = (
    df
    .assign(cleaned_abstract=df["abstract"].apply(clean_text))
    .dropna(subset=["cleaned_abstract"])
)

# Write the cleaned table to disk without the row index.
df.to_csv("cleaned_arxiv_papers.csv", index=False)
print("Cleaned dataset saved as cleaned_arxiv_papers.csv")

Cleaned dataset saved as cleaned_arxiv_papers.csv


In [11]:
# Convert the cleaned dataset into instruction-style records and save them as JSONL

import json
# Build one instruction-style record per cleaned abstract. Iterating over the
# column directly avoids materialising a Series per row with iterrows().
train_data = [
    {
        "instruction": "Summarize this research paper.",
        "input": abstract,
        # NOTE(review): every record shares this same placeholder target;
        # confirm whether a real per-paper summary should be used instead.
        "output": "This paper explores advances in deep learning.",
    }
    for abstract in df["cleaned_abstract"]
]

# save as JSONL: one JSON object per line
with open("fine_tune_data.jsonl", "w") as f:
    for entry in train_data:
        f.write(json.dumps(entry) + "\n")
print("Fined-tuning dataset saved as fine_tune_data.jsonl")
        

Fined-tuning dataset saved as fine_tune_data.jsonl


In [13]:
import os

# List the current working directory to confirm the output files were written.
print(os.listdir("."))

['fine_tune_data.jsonl', 'research-gpt_fetch_metadata.ipynb', 'arxiv_papers.csv', 'arxiv_papers.json', '.ipynb_checkpoints', 'cleaned_arxiv_papers.csv']
