# Notebook 1: Data Collection

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pavannn16/BERTopic-arXiv-Analysis/blob/main/notebooks/01_data_collection.ipynb)

**Purpose:** Fetch 20,000 arXiv cs.AI paper abstracts using the arXiv API.

**Time:** ~15 minutes (API rate limited)

---

## 1. Setup and Installation

In [1]:
# Install required packages (run once in Colab)
!pip install arxiv pandas tqdm pyyaml -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/81.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m81.5/81.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone


In [2]:
# ============================================================
# PROJECT SETUP - Config-based with Train/Infer Modes
# ============================================================
import os
import yaml
from pathlib import Path

# Clone repo if running on Colab
if 'google.colab' in str(get_ipython()) and not os.path.exists('/content/BERTopic-arXiv-Analysis'):
    !git clone https://github.com/pavannn16/BERTopic-arXiv-Analysis.git /content/BERTopic-arXiv-Analysis

# Load configuration
def load_config():
    config_paths = ['config.yaml', '../config.yaml', '/content/BERTopic-arXiv-Analysis/config.yaml']
    for path in config_paths:
        if os.path.exists(path):
            with open(path, 'r') as f:
                return yaml.safe_load(f), path
    return None, None

config, config_path = load_config()
if config:
    print(f"Loaded config from {config_path}")
else:
    print("Config not found, using defaults")
    config = {'mode': 'infer', 'data': {'arxiv': {'category': 'cs.AI', 'max_results': 20000}}}

MODE = config.get('mode', 'infer')
print(f"Mode: {MODE.upper()}")

# Setup paths
if 'google.colab' in str(get_ipython()):
    if MODE == 'train':
        from google.colab import drive
        drive.mount('/content/drive')
        PROJECT_PATH = '/content/drive/MyDrive/BERTopic-arXiv-Analysis'
        print("TRAIN mode: Personal Drive mounted - will fetch fresh data")
    else:
        PROJECT_PATH = '/content/BERTopic-arXiv-Analysis'
        print("INFER mode: Using data from cloned repo")
        print("Data fetching skipped in INFER mode. Change mode='train' in config.yaml to fetch new data.")
else:
    # Running locally
    PROJECT_PATH = str(Path(os.getcwd()).parent) if 'notebooks' in os.getcwd() else os.getcwd()
    print("Running locally")

# Create directories
for folder in ['data/raw', 'data/processed', 'data/embeddings', 'models', 'results/visualizations']:
    os.makedirs(f'{PROJECT_PATH}/{folder}', exist_ok=True)

print(f"Project path: {PROJECT_PATH}")

Cloning into '/content/BERTopic-arXiv-Analysis'...
remote: Enumerating objects: 229, done.[K
remote: Counting objects: 100% (86/86), done.[K
remote: Compressing objects: 100% (48/48), done.[K
remote: Total 229 (delta 50), reused 70 (delta 38), pack-reused 143 (from 1)[K
Receiving objects: 100% (229/229), 181.22 MiB | 38.30 MiB/s, done.
Resolving deltas: 100% (118/118), done.
Updating files: 100% (66/66), done.
Loaded config from /content/BERTopic-arXiv-Analysis/config.yaml
Mode: TRAIN
Mounted at /content/drive
TRAIN mode: Personal Drive mounted - will fetch fresh data
Project path: /content/drive/MyDrive/BERTopic-arXiv-Analysis


In [3]:
# Import libraries
import arxiv
import pandas as pd
from datetime import datetime, timedelta
from tqdm import tqdm
import json
import time

print("Libraries imported successfully!")

Libraries imported successfully!


## 2. Configuration

In [4]:
# Configuration - loaded from config.yaml or defaults
ARXIV_CONFIG = config.get('data', {}).get('arxiv', {})

CONFIG = {
    'category': ARXIV_CONFIG.get('category', 'cs.AI'),
    'max_results': ARXIV_CONFIG.get('max_results', 20000),
    'months_back': ARXIV_CONFIG.get('months_back', 24),
    'batch_size': ARXIV_CONFIG.get('batch_size', 100),
    'delay_seconds': ARXIV_CONFIG.get('delay_seconds', 3.0),
}

# Calculate date range
from datetime import datetime, timedelta
end_date = datetime.now()
start_date = end_date - timedelta(days=CONFIG['months_back'] * 30)

print(f"Category: {CONFIG['category']}")
print(f"Date range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
print(f"Max results: {CONFIG['max_results']}")

# Check mode - skip fetching in INFER mode
if MODE == 'infer':
    print("\nINFER mode: Skipping data fetch. Data should already exist.")
    print("   To fetch fresh data, set mode='train' in config.yaml")

Category: cs.AI
Date range: 2023-12-14 to 2025-12-03
Max results: 20000


## 3. Fetch Papers from arXiv API

In [5]:
# ============================================================
# TRAIN MODE ONLY - Skip in INFER mode
# ============================================================
if MODE == 'infer':
    print("INFER mode: Skipping data fetch (data already exists in repo)")
    print("   Loading existing data instead...")

    # Load existing data
    df = pd.read_csv(f"{PROJECT_PATH}/data/raw/arxiv_cs_ai_raw.csv")
    papers = df.to_dict('records')
    print(f"Loaded {len(papers)} papers from existing data")
else:
    # TRAIN MODE - Fetch fresh data from arXiv
    def fetch_arxiv_papers(category, max_results, batch_size=100, delay=3.0):
        """
        Fetch papers from arXiv API.

        Args:
            category: arXiv category (e.g., 'cs.AI')
            max_results: Maximum number of papers
            batch_size: Papers per API request
            delay: Delay between requests in seconds

        Returns:
            List of paper dictionaries
        """
        query = f"cat:{category}"

        print(f"Fetching up to {max_results} papers from arXiv category: {category}")
        print(f"This may take {max_results * delay / 60 / batch_size:.1f} minutes...")

        # Configure search
        search = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate,
            sort_order=arxiv.SortOrder.Descending
        )

        # Configure client
        client = arxiv.Client(
            page_size=batch_size,
            delay_seconds=delay,
            num_retries=5
        )

        papers = []

        try:
            for result in tqdm(client.results(search), total=max_results, desc="Fetching"):
                paper = {
                    "arxiv_id": result.entry_id.split("/")[-1],
                    "title": result.title.replace("\n", " ").strip(),
                    "abstract": result.summary.replace("\n", " ").strip(),
                    "authors": ", ".join([author.name for author in result.authors[:5]]),
                    "date": result.published.strftime("%Y-%m-%d"),
                    "year_month": result.published.strftime("%Y-%m"),
                    "url": result.entry_id,
                    "categories": ", ".join(result.categories),
                    "primary_category": result.primary_category
                }
                papers.append(paper)

                if len(papers) >= max_results:
                    break

        except Exception as e:
            print(f"\nError during fetch: {e}")
            print(f"Successfully fetched {len(papers)} papers before error")

        return papers

    # Fetch papers
    papers = fetch_arxiv_papers(
        category=CONFIG['category'],
        max_results=CONFIG['max_results'],
        batch_size=CONFIG['batch_size'],
        delay=CONFIG['delay_seconds']
    )

    print(f"\nTotal papers fetched: {len(papers)}")

Fetching up to 20000 papers from arXiv category: cs.AI
This may take 10.0 minutes...


Fetching:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 10000/20000 [06:07<06:07, 27.20it/s]


Error during fetch: Page request resulted in HTTP 500 (https://export.arxiv.org/api/query?search_query=cat%3Acs.AI&id_list=&sortBy=submittedDate&sortOrder=descending&start=10000&max_results=100)
Successfully fetched 10000 papers before error

Total papers fetched: 10000





In [6]:
# ============================================================
# TRAIN MODE ONLY - Fetch additional papers
# ============================================================
if MODE == 'infer':
    print("INFER mode: Skipping additional fetch (using existing data)")
else:
    # arXiv API has a 10,000 result limit per query
    # Let's fetch additional papers by querying different date ranges

    def fetch_arxiv_by_date_range(category, start_date, end_date, max_results=10000, batch_size=100, delay=3.0):
        """Fetch papers within a specific date range."""
        start_str = start_date.strftime("%Y%m%d")
        end_str = end_date.strftime("%Y%m%d")

        query = f"cat:{category} AND submittedDate:[{start_str}0000 TO {end_str}2359]"

        print(f"Fetching papers from {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")

        search = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate,
            sort_order=arxiv.SortOrder.Descending
        )

        client = arxiv.Client(
            page_size=batch_size,
            delay_seconds=delay,
            num_retries=5
        )

        papers = []

        try:
            for result in tqdm(client.results(search), total=max_results, desc="Fetching"):
                paper = {
                    "arxiv_id": result.entry_id.split("/")[-1],
                    "title": result.title.replace("\n", " ").strip(),
                    "abstract": result.summary.replace("\n", " ").strip(),
                    "authors": ", ".join([author.name for author in result.authors[:5]]),
                    "date": result.published.strftime("%Y-%m-%d"),
                    "year_month": result.published.strftime("%Y-%m"),
                    "url": result.entry_id,
                    "categories": ", ".join(result.categories),
                    "primary_category": result.primary_category
                }
                papers.append(paper)

                if len(papers) >= max_results:
                    break

        except Exception as e:
            print(f"\nError: {e}")
            print(f"Fetched {len(papers)} papers before error")

        return papers

    # Find the earliest date we have
    earliest_date = pd.to_datetime(min([p['date'] for p in papers]))
    print(f"Earliest paper in current batch: {earliest_date.strftime('%Y-%m-%d')}")

    # Fetch earlier papers
    end_date_batch2 = earliest_date - timedelta(days=1)
    start_date_batch2 = earliest_date - timedelta(days=180)

    print(f"\nFetching additional papers from {start_date_batch2.strftime('%Y-%m-%d')} to {end_date_batch2.strftime('%Y-%m-%d')}")

    papers_batch2 = fetch_arxiv_by_date_range(
        category=CONFIG['category'],
        start_date=start_date_batch2,
        end_date=end_date_batch2,
        max_results=10000,
        batch_size=CONFIG['batch_size'],
        delay=CONFIG['delay_seconds']
    )

    print(f"\nAdditional papers fetched: {len(papers_batch2)}")

Earliest paper in current batch: 2025-09-26

Fetching additional papers from 2025-03-30 to 2025-09-25
Fetching papers from 2025-03-30 to 2025-09-25


Fetching: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 9999/10000 [05:43<00:00, 29.14it/s]


Additional papers fetched: 10000





In [7]:
# ============================================================
# TRAIN MODE ONLY - Combine batches and deduplicate
# ============================================================
if MODE == 'infer':
    print("INFER mode: Using existing data (already loaded)")
else:
    # Check what we have so far
    print(f"Batch 1 (recent): {len(papers)} papers")
    print(f"Batch 2 (earlier): {len(papers_batch2)} papers")

    # Combine and deduplicate
    all_papers = papers + papers_batch2
    seen_ids = set()
    unique_papers = []
    for p in all_papers:
        if p['arxiv_id'] not in seen_ids:
            seen_ids.add(p['arxiv_id'])
            unique_papers.append(p)

    print(f"Total unique papers: {len(unique_papers)}")

    # Check date range
    dates = [p['date'] for p in unique_papers]
    print(f"Date range: {min(dates)} to {max(dates)}")

Batch 1 (recent): 10000 papers
Batch 2 (earlier): 10000 papers
Total unique papers: 20000
Date range: 2025-07-03 to 2025-12-02


In [8]:
# ============================================================
# Finalize papers dataset
# ============================================================
if MODE == 'infer':
    # Already loaded papers from CSV in cell 8
    df = pd.read_csv(f"{PROJECT_PATH}/data/raw/arxiv_cs_ai_raw.csv")
    papers = df.to_dict('records')
    dates = [p['date'] for p in papers]
    print(f"INFER mode: Loaded {len(papers):,} papers")
    print(f"üìÖ Date range: {min(dates)} to {max(dates)}")
else:
    # Use the combined unique papers
    papers = unique_papers
    print(f"Final dataset: {len(papers)} papers")
    print(f"üìÖ Date range: {min(dates)} to {max(dates)} (recent 5 months)")
    print(f"üéØ Target achieved: 20,000 recent cs.AI papers!")

Final dataset: 20000 papers
üìÖ Date range: 2025-07-03 to 2025-12-02 (recent 5 months)
üéØ Target achieved: 20,000 recent cs.AI papers!


## 4. Create DataFrame and Explore Data

In [9]:
# Create DataFrame
df = pd.DataFrame(papers)

print(f"DataFrame shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
print(f"\nDate range: {df['date'].min()} to {df['date'].max()}")
df.head()

DataFrame shape: (20000, 9)

Columns: ['arxiv_id', 'title', 'abstract', 'authors', 'date', 'year_month', 'url', 'categories', 'primary_category']

Date range: 2025-07-03 to 2025-12-02


Unnamed: 0,arxiv_id,title,abstract,authors,date,year_month,url,categories,primary_category
0,2512.03042v1,PPTArena: A Benchmark for Agentic PowerPoint E...,"We introduce PPTArena, a benchmark for PowerPo...","Michael Ofengenden, Yunze Man, Ziqi Pang, Yu-X...",2025-12-02,2025-12,http://arxiv.org/abs/2512.03042v1,"cs.CV, cs.AI",cs.CV
1,2512.03040v1,Video4Spatial: Towards Visuospatial Intelligen...,We investigate whether video generative models...,"Zeqi Xiao, Yiwei Zhao, Lingxiao Li, Yushi Lan,...",2025-12-02,2025-12,http://arxiv.org/abs/2512.03040v1,"cs.CV, cs.AI",cs.CV
2,2512.03036v1,ViSAudio: End-to-End Video-Driven Binaural Spa...,"Despite progress in video-to-audio generation,...","Mengchen Zhang, Qi Chen, Tong Wu, Zihan Liu, D...",2025-12-02,2025-12,http://arxiv.org/abs/2512.03036v1,"cs.CV, cs.AI",cs.CV
3,2512.03028v1,SMP: Reusable Score-Matching Motion Priors for...,Data-driven motion priors that can guide agent...,"Yuxuan Mu, Ziyu Zhang, Yi Shi, Minami Matsumot...",2025-12-02,2025-12,http://arxiv.org/abs/2512.03028v1,"cs.GR, cs.AI, cs.CV, cs.RO",cs.GR
4,2512.03026v1,The Moral Consistency Pipeline: Continuous Eth...,The rapid advancement and adaptability of Larg...,"Saeid Jamshidi, Kawser Wazed Nafi, Arghavan Mo...",2025-12-02,2025-12,http://arxiv.org/abs/2512.03026v1,"cs.CL, cs.AI",cs.CL


In [10]:
# Basic statistics
print("Dataset Statistics:")
print(f"  Total papers: {len(df)}")
print(f"  Unique dates: {df['date'].nunique()}")
print(f"  Date range: {df['date'].min()} to {df['date'].max()}")

# Text length statistics
df['title_len'] = df['title'].str.len()
df['abstract_len'] = df['abstract'].str.len()

print(f"\nTitle length: mean={df['title_len'].mean():.0f}, median={df['title_len'].median():.0f}")
print(f"Abstract length: mean={df['abstract_len'].mean():.0f}, median={df['abstract_len'].median():.0f}")

Dataset Statistics:
  Total papers: 20000
  Unique dates: 153
  Date range: 2025-07-03 to 2025-12-02

Title length: mean=83, median=83
Abstract length: mean=1340, median=1341


In [11]:
# Papers per month
papers_per_month = df['year_month'].value_counts().sort_index()
print("Papers per month:")
print(papers_per_month)

Papers per month:
year_month
2025-07    3156
2025-08    3819
2025-09    4204
2025-10    4821
2025-11    3755
2025-12     245
Name: count, dtype: int64


In [12]:
# Sample abstracts
print("Sample abstracts:")
print("=" * 80)
for i, row in df.head(3).iterrows():
    print(f"\nTitle: {row['title']}")
    print(f"Date: {row['date']}")
    print(f"Abstract: {row['abstract'][:300]}...")
    print("-" * 80)

Sample abstracts:

Title: PPTArena: A Benchmark for Agentic PowerPoint Editing
Date: 2025-12-02
Abstract: We introduce PPTArena, a benchmark for PowerPoint editing that measures reliable modifications to real slides under natural-language instructions. In contrast to image-PDF renderings or text-to-slide generation, PPTArena focuses on in-place editing across 100 decks, 2125 slides, and over 800 targete...
--------------------------------------------------------------------------------

Title: Video4Spatial: Towards Visuospatial Intelligence with Context-Guided Video Generation
Date: 2025-12-02
Abstract: We investigate whether video generative models can exhibit visuospatial intelligence, a capability central to human cognition, using only visual data. To this end, we present Video4Spatial, a framework showing that video diffusion models conditioned solely on video-based scene context can perform co...
--------------------------------------------------------------------------------

Ti

## 5. Save Raw Data

In [13]:
# ============================================================
# Save raw data (TRAIN mode only)
# ============================================================
if MODE == 'infer':
    print("INFER mode: Skipping save (data already exists)")
    print(f"Existing data at: {PROJECT_PATH}/data/raw/arxiv_cs_ai_raw.csv")
else:
    # Save raw data as JSON
    raw_json_path = f"{PROJECT_PATH}/data/raw/arxiv_cs_ai_raw.json"
    with open(raw_json_path, 'w') as f:
        json.dump(papers, f, indent=2)
    print(f"Raw JSON saved to: {raw_json_path}")

    # Save as CSV (more convenient for pandas)
    raw_csv_path = f"{PROJECT_PATH}/data/raw/arxiv_cs_ai_raw.csv"
    df.to_csv(raw_csv_path, index=False)
    print(f"Raw CSV saved to: {raw_csv_path}")

    print(f"\nTotal records saved: {len(df)}")

Raw JSON saved to: /content/drive/MyDrive/BERTopic-arXiv-Analysis/data/raw/arxiv_cs_ai_raw.json
Raw CSV saved to: /content/drive/MyDrive/BERTopic-arXiv-Analysis/data/raw/arxiv_cs_ai_raw.csv

Total records saved: 20000


## 6. Data Quality Checks

In [14]:
# Check for missing values
print("Missing values:")
print(df.isnull().sum())

# Check for duplicates
n_duplicates = df['arxiv_id'].duplicated().sum()
print(f"\nDuplicate arxiv_ids: {n_duplicates}")

# Check for empty abstracts
empty_abstracts = (df['abstract'].str.len() < 50).sum()
print(f"Short abstracts (<50 chars): {empty_abstracts}")

Missing values:
arxiv_id            0
title               0
abstract            0
authors             0
date                0
year_month          0
url                 0
categories          0
primary_category    0
title_len           0
abstract_len        0
dtype: int64

Duplicate arxiv_ids: 0
Short abstracts (<50 chars): 0
