# Corpus Analysis

This notebook provides descriptive analysis of the arXiv corpus stored in MongoDB.

## Setup

In [None]:
import os
import sys
from datetime import datetime
from collections import Counter

# Make the project's `src` directory importable when the kernel's working
# directory is the notebooks folder. The path is normalised (no dangling `..`)
# and only inserted when absent, so re-running this cell does not stack
# duplicate entries at the front of sys.path.
src_dir = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pymongo import MongoClient

# Notebook-wide presentation settings.

# pandas: never elide columns; cap the rendered width of text cells at 100 chars.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_colwidth', 100),
):
    pd.set_option(option_name, option_value)

# Plots: seaborn "whitegrid" theme with the "muted" palette, then a 10x6 inch
# default figure size (applied after the theme so the theme cannot override it).
sns.set_theme(style="whitegrid", palette="muted")
plt.rcParams.update({'figure.figsize': (10, 6)})

print("Libraries loaded successfully")

In [None]:
# Connection settings are read from the environment, falling back to a local
# MongoDB instance and the default corpus database name.
MONGODB_URI = os.getenv('MONGODB_URI', 'mongodb://localhost:27017')
DB_NAME = os.getenv('ARXIV_CORPUS_DB', 'arxiv_corpus')

# `client` and `db` are used by every later cell.
client = MongoClient(MONGODB_URI)
db = client[DB_NAME]

# Report what we are connected to and which collections are present.
print(f"Connected to database: {DB_NAME}")
collection_names = db.list_collection_names()
print(f"Collections: {collection_names}")

## 1. Corpus Overview

Basic statistics about the corpus size and composition.

In [None]:
# Collection counts
# The optional collections (tables, figures, term_lists) are only counted when
# they exist. The list of existing collections is fetched once up front rather
# than once per optional collection, which saves two server round trips.
existing_collections = set(db.list_collection_names())

overview = {
    'Papers': db.papers.count_documents({}),
    'Paragraphs': db.paragraphs.count_documents({}),
    'Tables': db.tables.count_documents({}) if 'tables' in existing_collections else 0,
    'Figures': db.figures.count_documents({}) if 'figures' in existing_collections else 0,
    'Search Results': db.search_results.count_documents({}),
    'Term Lists': db.term_lists.count_documents({}) if 'term_lists' in existing_collections else 0,
}

# One row per collection, in the order listed above.
overview_df = pd.DataFrame(list(overview.items()), columns=['Collection', 'Count'])
print("=== Corpus Overview ===")
display(overview_df)

In [None]:
# Papers by processing status
# Group papers by their `status` field on the server, most frequent first.
status_pipeline = [
    {'$group': {'_id': '$status', 'count': {'$sum': 1}}},
    {'$sort': {'count': -1}}
]

status_counts = list(db.papers.aggregate(status_pipeline))

# Naming the columns explicitly keeps the frame well-formed (with headers) even
# when the aggregation returns no groups at all.
status_df = (
    pd.DataFrame(status_counts, columns=['_id', 'count'])
    .rename(columns={'_id': 'Status', 'count': 'Count'})
)

# Papers whose status is missing are grouped under a null key. Give that group an
# explicit label so it is visible in the table and is not left out of the bar
# chart (seaborn omits null categories).
status_df['Status'] = status_df['Status'].where(status_df['Status'].notna(), '(missing)')

print("\n=== Papers by Processing Status ===")
display(status_df)

if len(status_df) > 0:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.barplot(data=status_df, y='Status', x='Count', ax=ax, palette='Blues_d')
    ax.set_xlabel('Number of Papers')
    ax.set_title('Papers by Processing Status')
    plt.tight_layout()
    plt.show()

## 2. Papers Analysis

In [None]:
# Load papers into DataFrame
# Only the fields this notebook uses are projected from the `papers` collection.
paper_fields = [
    'arxiv_id',
    'title',
    'authors',
    'categories',
    'published_date',
    'status',
    'occurrence_count',
    'search_queries',
    'document_metrics',
]
papers_cursor = db.papers.find({}, {field: 1 for field in paper_fields})

papers_df = pd.DataFrame(list(papers_cursor))

if len(papers_df) > 0:
    # pandas only creates a column when at least one document carries the field.
    # Add the columns this cell reads as all-null so a sparsely populated corpus
    # does not raise KeyError below. `occurrence_count` and `document_metrics`
    # are deliberately NOT added: later cells test for their presence.
    missing_columns = [
        column
        for column in ('arxiv_id', 'title', 'authors', 'categories', 'published_date', 'status')
        if column not in papers_df.columns
    ]
    if missing_columns:
        papers_df = papers_df.assign(**{column: None for column in missing_columns})

    # Extract author count (0 when `authors` is absent or not a list)
    papers_df['author_count'] = papers_df['authors'].apply(lambda x: len(x) if isinstance(x, list) else 0)

    # Extract primary category (first listed category, or 'unknown')
    papers_df['primary_category'] = papers_df['categories'].apply(
        lambda x: x[0] if isinstance(x, list) and len(x) > 0 else 'unknown'
    )

    # Extract year/month from published_date.
    # Values are parsed as UTC and then made timezone-naive, so timezone-aware or
    # mixed-offset inputs still yield a datetime column and `.dt.to_period` does
    # not warn about dropping timezone information. Unparseable values become NaT.
    papers_df['published_date'] = (
        pd.to_datetime(papers_df['published_date'], errors='coerce', utc=True)
        .dt.tz_localize(None)
    )
    papers_df['year'] = papers_df['published_date'].dt.year
    papers_df['month'] = papers_df['published_date'].dt.to_period('M')

    print(f"Loaded {len(papers_df)} papers")
    display(papers_df[['arxiv_id', 'title', 'primary_category', 'year', 'status']].head(10))
else:
    print("No papers found in database")

In [None]:
# Papers by category
# Frequency table of the 15 most common primary categories, then a horizontal
# bar chart of the same table.
if papers_df.shape[0] > 0:
    category_counts = (
        papers_df['primary_category']
        .value_counts()
        .head(15)
        .rename_axis('Category')
        .reset_index(name='Count')
    )

    print("=== Papers by Primary Category (Top 15) ===")
    display(category_counts)

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(
        data=category_counts,
        x='Count',
        y='Category',
        palette='viridis',
        ax=ax,
    )
    ax.set_title('Papers by Primary arXiv Category')
    ax.set_xlabel('Number of Papers')
    plt.tight_layout()
    plt.show()

In [None]:
# Papers by publication year
# Skipped entirely when no paper has a parseable publication date.
has_papers = len(papers_df) > 0
if has_papers and papers_df['year'].notna().any():
    # Chronological count per year; years arrive as floats (NaN-capable column)
    # and are cast to int for display.
    year_counts = (
        papers_df['year']
        .value_counts()
        .sort_index()
        .rename_axis('Year')
        .reset_index(name='Count')
        .astype({'Year': int})
    )

    print("=== Papers by Publication Year ===")
    display(year_counts)

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.barplot(
        data=year_counts,
        x='Year',
        y='Count',
        palette='Blues_d',
        ax=ax,
    )
    ax.set_title('Papers by Publication Year')
    ax.set_xlabel('Year')
    ax.set_ylabel('Number of Papers')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
# Papers by month (recent trend)
# Line chart of paper counts for the 24 most recent months that have data.
if len(papers_df) > 0 and papers_df['month'].notna().any():
    # Chronological monthly counts, trimmed to the trailing 24 entries.
    month_counts = (
        papers_df['month']
        .value_counts()
        .sort_index()
        .tail(24)
    )

    if len(month_counts) > 0:
        # Periods are rendered as strings (e.g. "2024-01") for the x axis.
        month_df = (
            month_counts
            .rename_axis('Month')
            .reset_index(name='Count')
            .astype({'Month': str})
        )

        fig, ax = plt.subplots(figsize=(12, 5))
        sns.lineplot(
            data=month_df,
            x='Month',
            y='Count',
            marker='o',
            color='steelblue',
            ax=ax,
        )
        ax.set_title('Papers by Publication Month (Last 24 Months)')
        ax.set_xlabel('Month')
        ax.set_ylabel('Number of Papers')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()

In [None]:
# Author count distribution
if len(papers_df) > 0:
    print("=== Author Count Statistics ===")
    print(papers_df['author_count'].describe())

    # The previous explicit bin edges stopped at 19, so papers with 20 or more
    # authors were silently excluded from the histogram and the counts 18 and 19
    # shared the closed final bin. Instead, counts above the cap are pooled into
    # one final bar, and discrete=True gives every integer count its own
    # unit-wide bar centred on the value.
    max_authors_shown = 20
    author_counts_capped = papers_df['author_count'].clip(upper=max_authors_shown)
    has_pooled_bar = bool((papers_df['author_count'] > max_authors_shown).any())

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.histplot(x=author_counts_capped, discrete=True,
                 ax=ax, color='steelblue', edgecolor='white')
    if has_pooled_bar:
        # Make the pooling explicit so the last bar is not misread as exactly 20.
        ax.set_xlabel(f'Number of Authors ({max_authors_shown} = {max_authors_shown} or more)')
    else:
        ax.set_xlabel('Number of Authors')
    ax.set_ylabel('Number of Papers')
    ax.set_title('Distribution of Author Count per Paper')
    plt.tight_layout()
    plt.show()

In [None]:
# Papers by occurrence count (how many search queries found each paper)
# Runs only when at least one loaded paper carries an `occurrence_count` field.
if 'occurrence_count' in papers_df.columns and len(papers_df) > 0:
    occurrence_series = papers_df['occurrence_count']

    # Number of papers per occurrence value, for the 20 smallest values.
    occurrence_counts = (
        occurrence_series
        .value_counts()
        .sort_index()
        .head(20)
        .rename_axis('Occurrence')
        .reset_index(name='Count')
    )

    # Headline numbers: counting True values of a mask equals counting the rows
    # that a boolean filter would keep.
    found_once = (occurrence_series == 1).sum()
    found_twice_plus = (occurrence_series >= 2).sum()
    found_five_plus = (occurrence_series >= 5).sum()

    print("=== Papers by Search Query Occurrence ===")
    print(f"Papers found by only 1 query: {found_once}")
    print(f"Papers found by 2+ queries: {found_twice_plus}")
    print(f"Papers found by 5+ queries: {found_five_plus}")

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.barplot(
        data=occurrence_counts,
        x='Occurrence',
        y='Count',
        palette='Blues_d',
        ax=ax,
    )
    ax.set_title('Papers by Search Query Occurrence Count')
    ax.set_xlabel('Number of Queries Finding Paper')
    ax.set_ylabel('Number of Papers')
    plt.tight_layout()
    plt.show()

## 3. Document Metrics Analysis

Analysis of document structure metrics from Docling conversion.

In [None]:
# Extract document metrics
# `metrics_df` is always defined once this cell has run: it stays an empty frame
# unless at least one paper has a non-null `document_metrics` value, in which
# case it holds one row per such paper with the metrics flattened into columns.
metrics_df = pd.DataFrame()

if len(papers_df) == 0 or 'document_metrics' not in papers_df.columns:
    print("No document metrics available")
else:
    # Keep only the papers that actually carry metrics.
    has_metrics = papers_df['document_metrics'].notna()
    papers_with_metrics = papers_df.loc[has_metrics].copy()

    if len(papers_with_metrics) == 0:
        print("No papers have document metrics yet (run 'arxiv-corpus process convert' first)")
    else:
        # Flatten each metrics dict into columns and re-attach the paper id
        # positionally (both come from the same filtered rows, in the same order).
        metrics_df = pd.json_normalize(papers_with_metrics['document_metrics'])
        metrics_df['arxiv_id'] = papers_with_metrics['arxiv_id'].values

        print(f"Papers with document metrics: {len(metrics_df)}")
        print("\n=== Document Metrics Summary ===")
        display(metrics_df.describe())

In [None]:
# Document metrics distributions
# 2x2 grid of histograms. The grid is only drawn when `num_pages` is available;
# each of the other panels is left blank if its metric column is missing.
if len(metrics_df) > 0 and 'num_pages' in metrics_df.columns:
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))

    # (metric column, bin count, panel title, x-axis label) in row-major order:
    # top-left, top-right, bottom-left, bottom-right.
    histogram_panels = [
        ('num_pages', 30, 'Distribution of Page Count', 'Pages'),
        ('num_tables', 20, 'Distribution of Table Count', 'Tables'),
        ('num_figures', 20, 'Distribution of Figure Count', 'Figures'),
        ('word_count', 30, 'Distribution of Word Count', 'Words'),
    ]

    for panel_ax, (metric, n_bins, panel_title, x_label) in zip(axes.flat, histogram_panels):
        if metric not in metrics_df.columns:
            continue
        sns.histplot(data=metrics_df, x=metric, bins=n_bins, ax=panel_ax,
                     color='steelblue', edgecolor='white')
        panel_ax.set_title(panel_title)
        panel_ax.set_xlabel(x_label)

    plt.tight_layout()
    plt.show()

## 4. Search Queries Analysis

In [None]:
# Load search results
# One row per stored search query; `search_df` is reused by the cells below.
search_results = list(db.search_results.find({}))
search_df = pd.DataFrame(search_results)

if len(search_df) > 0:
    print(f"Total search queries executed: {len(search_df)}")

    # A column only exists when at least one document carries the field, so the
    # totals and the preview are guarded the same way the later cells guard
    # `domain` and `attribute`, instead of raising KeyError.
    if 'total_results' in search_df.columns:
        print(f"Total results across all queries: {search_df['total_results'].sum()}")
        print(f"Average results per query: {search_df['total_results'].mean():.1f}")

    preview_columns = [
        column
        for column in ['query', 'base_term', 'attribute', 'domain', 'total_results']
        if column in search_df.columns
    ]
    display(search_df[preview_columns].head(10))
else:
    print("No search queries executed yet")

In [None]:
# Results by domain
# Per-domain total, mean and number of queries, rounded to one decimal and
# ordered by total results (largest first).
if 'domain' in search_df.columns and len(search_df) > 0:
    domain_stats = (
        search_df
        .groupby('domain')['total_results']
        .agg(['sum', 'mean', 'count'])
        .round(1)
    )
    domain_stats.columns = ['Total Results', 'Avg per Query', 'Num Queries']
    domain_stats = domain_stats.sort_values(by='Total Results', ascending=False)

    print("\n=== Results by Domain ===")
    display(domain_stats)

In [None]:
# Results by attribute
# Per-attribute total, mean and number of queries, rounded to one decimal and
# ordered by total results (largest first).
if 'attribute' in search_df.columns and len(search_df) > 0:
    attr_stats = (
        search_df
        .groupby('attribute')['total_results']
        .agg(['sum', 'mean', 'count'])
        .round(1)
    )
    attr_stats.columns = ['Total Results', 'Avg per Query', 'Num Queries']
    attr_stats = attr_stats.sort_values(by='Total Results', ascending=False)

    print("\n=== Results by Attribute ===")
    display(attr_stats)

In [None]:
# Heatmap: Domain x Attribute
# Sum of `total_results` for every (attribute, domain) pair; pairs with no
# queries are shown as 0.
if len(search_df) > 0 and 'domain' in search_df.columns and 'attribute' in search_df.columns:
    pivot = search_df.pivot_table(
        values='total_results',
        index='attribute',
        columns='domain',
        aggfunc='sum',
        fill_value=0
    )

    if pivot.shape[0] > 0 and pivot.shape[1] > 0:
        fig, ax = plt.subplots(figsize=(12, 8))
        # fmt='.0f' prints whole numbers for both integer and float cells. The
        # integer-only 'd' format raises ValueError as soon as the pivot holds
        # floats (e.g. when `total_results` contains missing values).
        sns.heatmap(pivot, annot=True, fmt='.0f', cmap='Blues', ax=ax,
                    linewidths=0.5, cbar_kws={'label': 'Total Results'})
        # The multiplication sign is written as an escape so it cannot be
        # re-garbled by an encoding round trip (it previously rendered as "Ã—").
        ax.set_title('Search Results: Attribute \u00d7 Domain')
        ax.set_xlabel('Domain')
        ax.set_ylabel('Attribute')
        plt.tight_layout()
        plt.show()

## 5. Paragraphs Analysis

In [None]:
# Paragraph statistics
# `para_count` is reused by the remaining paragraph and term-hit cells.
para_count = db.paragraphs.count_documents({})

if para_count > 0:
    # Element type distribution, most frequent first
    element_pipeline = [
        {'$group': {'_id': '$element_type', 'count': {'$sum': 1}}},
        {'$sort': {'count': -1}}
    ]
    element_counts = list(db.paragraphs.aggregate(element_pipeline))
    element_df = pd.DataFrame(element_counts).rename(columns={'_id': 'Element Type', 'count': 'Count'})

    # Paragraphs without an element type are grouped under a null key. Label that
    # group explicitly so it is visible in the table and is not left out of the
    # bar chart (seaborn omits null categories).
    element_df['Element Type'] = element_df['Element Type'].where(
        element_df['Element Type'].notna(), '(missing)'
    )

    print(f"Total paragraphs: {para_count}")
    print("\n=== Paragraphs by Element Type ===")
    display(element_df)

    if len(element_df) > 0:
        fig, ax = plt.subplots(figsize=(10, 5))
        sns.barplot(data=element_df, y='Element Type', x='Count', ax=ax, palette='viridis')
        ax.set_xlabel('Count')
        ax.set_title('Document Elements by Type')
        plt.tight_layout()
        plt.show()
else:
    print("No paragraphs found (run 'arxiv-corpus process extract' first)")

In [None]:
# Paragraphs per paper
# Count paragraphs per `arxiv_id` on the server, then summarise and plot the
# distribution of those per-paper counts.
if para_count > 0:
    per_paper_pipeline = [
        {'$group': {'_id': '$arxiv_id', 'count': {'$sum': 1}}},
        {'$sort': {'count': -1}},
    ]
    para_per_paper = list(db.paragraphs.aggregate(per_paper_pipeline))
    para_per_paper_df = pd.DataFrame(para_per_paper)

    paragraph_counts = para_per_paper_df['count']

    print("=== Paragraphs per Paper ===")
    print(f"Mean: {paragraph_counts.mean():.1f}")
    print(f"Median: {paragraph_counts.median():.1f}")
    print(f"Max: {paragraph_counts.max()}")

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.histplot(
        data=para_per_paper_df,
        x='count',
        bins=30,
        color='steelblue',
        edgecolor='white',
        ax=ax,
    )
    ax.set_title('Distribution of Paragraphs per Paper')
    ax.set_xlabel('Number of Paragraphs')
    ax.set_ylabel('Number of Papers')
    plt.tight_layout()
    plt.show()

In [None]:
# Section title analysis
# The 20 section titles with the most paragraphs, ignoring paragraphs whose
# `section_title` is null or absent.
if para_count > 0:
    only_titled = {'$match': {'section_title': {'$ne': None}}}
    count_per_title = {'$group': {'_id': '$section_title', 'count': {'$sum': 1}}}
    section_pipeline = [
        only_titled,
        count_per_title,
        {'$sort': {'count': -1}},
        {'$limit': 20},
    ]
    section_counts = list(db.paragraphs.aggregate(section_pipeline))

    # Nothing is printed when no paragraph has a section title.
    if len(section_counts) > 0:
        section_df = pd.DataFrame(section_counts)
        section_df = section_df.rename(columns={'_id': 'Section', 'count': 'Count'})
        print("\n=== Most Common Section Titles (Top 20) ===")
        display(section_df)

## 6. Term Hits Analysis

Analysis of term search results in paragraphs.

In [None]:
# Paragraphs with term hits
# A single server-side group over every paragraph with total_hits > 0 yields the
# number of such paragraphs and the sum of their hits.
if para_count > 0:
    with_hits = {'$match': {'total_hits': {'$gt': 0}}}
    overall_totals = {
        '$group': {
            '_id': None,
            'count': {'$sum': 1},
            'total_hits': {'$sum': '$total_hits'},
        }
    }
    hit_pipeline = [with_hits, overall_totals]
    hit_stats = list(db.paragraphs.aggregate(hit_pipeline))

    # The aggregation returns no document at all when nothing matched.
    if not hit_stats:
        print("No term hits recorded (run term search analysis first)")
    else:
        hit_totals = hit_stats[0]
        print(f"Paragraphs with term hits: {hit_totals['count']}")
        print(f"Total term hits: {hit_totals['total_hits']}")

In [None]:
# Most common terms hit
# Unwind each paragraph's `hits` array, then total the per-hit `count` and the
# number of hit entries for every term; keep the 20 terms with the most hits.
if para_count > 0:
    totals_per_term = {
        '$group': {
            '_id': '$hits.term',
            'total_count': {'$sum': '$hits.count'},
            'paragraphs': {'$sum': 1},
        }
    }
    term_pipeline = [
        {'$unwind': '$hits'},
        totals_per_term,
        {'$sort': {'total_count': -1}},
        {'$limit': 20},
    ]
    term_hits = list(db.paragraphs.aggregate(term_pipeline))

    # Nothing is printed when no paragraph has any hits recorded.
    if len(term_hits) > 0:
        display_names = {
            '_id': 'Term',
            'total_count': 'Total Hits',
            'paragraphs': 'Paragraphs',
        }
        term_df = pd.DataFrame(term_hits).rename(columns=display_names)
        print("\n=== Most Frequent Terms in Corpus (Top 20) ===")
        display(term_df)
## 7. Export Summary Statistics

In [None]:
# Create summary report
# Report label -> value of the `status` field counted for that label.
status_by_label = {
    'Papers Downloaded': 'downloaded',
    'Papers Converted': 'converted',
    'Papers with Embeddings': 'embedded',
    'Papers Processed': 'processed',
    'Papers with Errors': 'error',
}

# Rows are added in the order they should appear in the report.
summary = {
    'Report Generated': datetime.now().isoformat(),
    'Database': DB_NAME,
    'Total Papers': db.papers.count_documents({}),
}
summary.update({
    label: db.papers.count_documents({'status': status_value})
    for label, status_value in status_by_label.items()
})
summary['Total Paragraphs'] = db.paragraphs.count_documents({})
summary['Total Search Queries'] = db.search_results.count_documents({})

summary_df = pd.DataFrame(list(summary.items()), columns=['Metric', 'Value'])
print("=== Corpus Summary Report ===")
display(summary_df)

In [None]:
# Close connection
# Final cell: closes the MongoClient held in `client` and confirms on stdout.
client.close()
print("MongoDB connection closed")