In [2]:
%pip install feedparser nltk 
import feedparser
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from datetime import datetime
import time
import warnings
import os
warnings.filterwarnings('ignore')

# Make sure the NLTK data packages this notebook relies on are present locally
# (punkt_tab backs word_tokenize; stopwords is fetched alongside it).
for resource_name in ('punkt_tab', 'stopwords'):
    nltk.download(resource_name)

# Function to fetch papers from arXiv API
def _entry_to_paper(entry):
    """Convert one parsed arXiv feed entry into a flat dict of paper details."""
    # Authors/tags are read with .get() so an entry missing either field
    # yields an empty string instead of aborting the whole fetch.
    author_names = [author.get('name', '') for author in entry.get('authors', [])]
    category_terms = [tag.get('term', '') for tag in entry.get('tags', [])]

    return {
        'id': entry.id.split('/abs/')[-1],
        'title': entry.title.replace('\n', ' ').strip(),
        'abstract': entry.summary.replace('\n', ' ').strip(),
        'authors': ', '.join(name for name in author_names if name),
        'categories': ', '.join(term for term in category_terms if term),
        'published': entry.published,
        'link': entry.link
    }


def fetch_arxiv_papers(query='cat:cs.AI+AND+all:generative+ai', max_results=100):
    """
    Fetch papers from arXiv API based on query

    Results are requested newest-first in pages of at most 100 entries. Paging
    stops once ``max_results`` have been requested or the API keeps returning
    empty pages, so fewer than ``max_results`` rows may come back.

    Parameters:
    -----------
    query : str
        arXiv query string (e.g., 'cat:cs.AI+AND+all:generative+ai')
    max_results : int
        Maximum number of results to return

    Returns:
    --------
    pandas.DataFrame
        DataFrame containing paper details, one row per paper with columns
        id, title, abstract, authors, categories, published and link.
        Empty (no columns) when nothing was fetched.
    """
    print(f"Fetching {max_results} papers with query: {query}")

    # Initialize empty list to store paper details
    papers = []

    # Set up arXiv API URL with query parameters
    base_url = 'http://export.arxiv.org/api/query?'
    start = 0
    batch_size = 100  # page size used for each request
    request_delay = 3  # seconds to wait between consecutive requests
    max_empty_retries = 2  # empty pages tolerated in a row before giving up
    empty_retries = 0

    # Loop to fetch papers in batches
    while start < max_results:
        # Calculate remaining papers to fetch
        current_batch_size = min(batch_size, max_results - start)

        # Construct the query URL
        query_url = f"{base_url}search_query={query}&start={start}&max_results={current_batch_size}&sortBy=submittedDate&sortOrder=descending"

        # Fetch data from arXiv
        response = feedparser.parse(query_url)
        entries = response.entries

        if not entries:
            # An empty page means either the result set is exhausted or the
            # request hiccupped. Retry a bounded number of times, then stop
            # instead of polling (and sleeping) for every remaining page.
            if empty_retries >= max_empty_retries:
                print(f"No more results returned at offset {start}; stopping early.")
                break
            empty_retries += 1
            time.sleep(request_delay)
            continue
        empty_retries = 0

        # Process each entry
        papers.extend(_entry_to_paper(entry) for entry in entries)

        # Update counter for next batch
        start += current_batch_size

        print(f"Fetched {len(papers)} papers so far...")

        # Sleep to prevent hitting API rate limits; skipped after the last
        # batch because no further request follows it.
        if start < max_results:
            time.sleep(request_delay)

    # Convert to DataFrame
    df = pd.DataFrame(papers)
    print(f"Total papers fetched: {len(df)}")
    return df

# Function to clean text
def clean_text(text):
    """
    Clean text by removing special characters, extra spaces, etc.

    Steps, in order: lowercase, drop LaTeX commands (a backslash followed by
    letters), drop URLs starting with "http", replace punctuation, underscores
    and digit runs with spaces, then collapse whitespace.

    Parameters:
    -----------
    text : str
        Raw text such as a paper title or abstract

    Returns:
    --------
    str
        Lowercased text containing only letters separated by single spaces
    """
    # Convert to lowercase
    text = text.lower()

    # Remove LaTeX commands
    text = re.sub(r'\\[a-zA-Z]+', '', text)

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove special characters and digits. "_" is listed explicitly because
    # it counts as a word character (\w), so [^\w\s] alone leaves it in place
    # (e.g. "x_i" from math notation would survive as one token).
    text = re.sub(r'[^\w\s]|_', ' ', text)
    text = re.sub(r'\d+', ' ', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

# Function to extract datasets mentioned in the text
def extract_datasets(text):
    """
    Extract potential dataset names from text
    This is a simple rule-based approach, can be improved with NER models

    Three heuristics are combined:
      1. acronym / mixed-case tokens such as MNIST, MS-COCO, ImageNet-1K;
      2. words ending in "Dataset", "Corpus" or "DB" (e.g. SQuADDataset);
      3. the word following "dataset called ..." / "corpus named ...".

    Parameters:
    -----------
    text : str
        Original-case text to scan (the patterns are case-sensitive)

    Returns:
    --------
    list of str
        Unique candidate names in order of first discovery. Empty when the
        input is empty or not a string (e.g. a missing value).
    """
    if not isinstance(text, str) or not text:
        return []

    # Heuristic 1. The optional tail accepts a digit run followed by letters
    # so that "ImageNet-1K" is captured whole rather than cut at "ImageNet".
    name_pattern = r'\b[A-Z0-9]+-?[A-Za-z0-9]+(?:-[0-9]+[A-Za-z]*)?\b'

    # Heuristics 2 and 3; the last pattern captures only the name itself.
    contextual_patterns = [
        r'\b[A-Za-z]+(?:Dataset|Corpus|DB)\b',       # SQuADDataset, ImageDB
        r'\b(?:dataset|corpus|database)s?\s+(?:called|named)\s+([A-Za-z0-9]+)',  # dataset called XYZ
    ]

    def looks_like_name(candidate):
        # name_pattern on its own also matches every capitalised word
        # ("Generative") and every bare number ("2023"). Keep only tokens that
        # contain a letter and are acronym-like: two or more capitals, or a
        # digit mixed in with the letters.
        has_letter = any(ch.isalpha() for ch in candidate)
        has_digit = any(ch.isdigit() for ch in candidate)
        capital_count = sum(1 for ch in candidate if ch.isupper())
        return has_letter and (capital_count >= 2 or has_digit)

    candidates = [m for m in re.findall(name_pattern, text) if looks_like_name(m)]
    for pattern in contextual_patterns:
        candidates.extend(re.findall(pattern, text))

    # Filter very short matches and common all-caps false positives
    false_positives = frozenset(
        ['THE', 'AND', 'FOR', 'WITH', 'THIS', 'THAT', 'USING', 'WE', 'THEY']
    )
    kept = [
        c for c in candidates
        if len(c) > 2 and c.upper() not in false_positives
    ]

    # dict.fromkeys de-duplicates while keeping a stable order; a set would
    # give a different order from run to run.
    return list(dict.fromkeys(kept))

# Fetch papers
papers_df = fetch_arxiv_papers(query='cat:cs.AI+AND+all:generative+ai', max_results=200)

# An empty result has no 'title'/'abstract' columns, so the steps below would
# fail with an opaque KeyError; stop here with a clear message instead.
if papers_df.empty:
    raise RuntimeError("No papers were fetched from arXiv; nothing to process.")

# Clean titles and abstracts
papers_df['cleaned_title'] = papers_df['title'].apply(clean_text)
papers_df['cleaned_abstract'] = papers_df['abstract'].apply(clean_text)

# Extract tokens
papers_df['title_tokens'] = papers_df['cleaned_title'].apply(word_tokenize)
papers_df['abstract_tokens'] = papers_df['cleaned_abstract'].apply(word_tokenize)

# Count tokens
papers_df['title_token_count'] = papers_df['title_tokens'].apply(len)
papers_df['abstract_token_count'] = papers_df['abstract_tokens'].apply(len)

# Extract potential datasets from the raw abstract: the extraction patterns
# are case-sensitive, so the lowercased cleaned text cannot be used here.
papers_df['potential_datasets'] = papers_df['abstract'].apply(extract_datasets)

# Save to CSV
output_file = 'data/cleaned/generative_ai_papers.csv'

# Create the directory if it doesn't exist
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Write the file once (the same write and message were previously repeated)
papers_df.to_csv(output_file, index=False)
print(f"Saved processed data to {output_file}")

# Display statistics
print("\nDataset Statistics:")
print(f"Number of papers: {len(papers_df)}")
print(f"Average title length: {papers_df['title_token_count'].mean():.2f} tokens")
print(f"Average abstract length: {papers_df['abstract_token_count'].mean():.2f} tokens")
print(f"Papers with potential datasets mentioned: {(papers_df['potential_datasets'].str.len() > 0).sum()}")

Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Kanchan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kanchan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Fetching 200 papers with query: cat:cs.AI+AND+all:generative+ai
Fetched 100 papers so far...
Fetched 200 papers so far...
Total papers fetched: 200
Saved processed data to data/cleaned/generative_ai_papers.csv
Saved processed data to data/cleaned/generative_ai_papers.csv

Dataset Statistics:
Number of papers: 200
Average title length: 11.16 tokens
Average abstract length: 192.43 tokens
Papers with potential datasets mentioned: 200
