In [None]:
#!/usr/bin/env python3
"""
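Extract dataset information from physics paper PDFs with the Claude API and save it to a CSV file.

Module docstring for CMS_DPOA_text_extraction.claudeExtraction.ipynb.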

Requirements:
    pip install anthropic pypdf pdfplumber pandas python-dotenv

Usage:
    python extract_datasets_from_papers.py --input_folder ./papers --output dataset_info.csv
    
    # Or with API key as argument:
    python extract_datasets_from_papers.py --input_folder ./papers --output dataset_info.csv --api_key sk-ant-...

Environment:
    Set ANTHROPIC_API_KEY environment variable or pass via --api_key argument
"""

import os
import sys
import json
import argparse
import logging
from pathlib import Path
from typing import Optional
import csv

import anthropic
import pdfplumber
import pandas as pd
from dotenv import load_dotenv

# NOTE(review): load_dotenv() is presumably python-dotenv's loader for a local
# .env file, called here so ANTHROPIC_API_KEY can be supplied that way — confirm.
load_dotenv()
# Module-level copy of the key. In the code visible in this file nothing reads
# this global: main() resolves its own key and the helper functions receive
# api_key as a parameter.
api_key = os.getenv("ANTHROPIC_API_KEY")

# Configure logging: INFO level, "timestamp - level - message" line format
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module logger shared by the functions below
logger = logging.getLogger(__name__)


# Prompt template for Claude to extract dataset information.
# The only placeholder is {paper_text}, near the end of the template.
# NOTE: the example JSON object below contains literal, unescaped "{" and "}"
# characters, so this string is not a valid str.format() template as written
# (str.format() would parse the example object as a replacement field).
EXTRACTION_PROMPT = """You are a scientific data extraction assistant specializing in high-energy physics papers.

Analyze the following physics paper text and extract ALL dataset information mentioned. Focus on:
- CMS Open Data datasets
- Monte Carlo simulation samples
- Real collision data samples
- Any datasets with DOIs or official citation paths

For EACH dataset found, extract these fields (use "null" if not available):
1. dataset_name: The name or identifier of the dataset
2. dataset_type: "Real Data" or "Simulated MC" 
3. official_path: The official citation path (e.g., /Jet/Run2010B-Apr21ReReco-v1/AOD)
4. events_total: Total number of events in the dataset
5. events_used: Number of events actually used in the analysis
6. collision_energy_tev: Center-of-mass energy in TeV
7. generator: MC generator used (e.g., Pythia, Madgraph) or "N/A (Real Data)"
8. doi: The DOI identifier (just the DOI, not the full URL)
9. size_bytes: Dataset size in bytes (convert from TB, GB, MB if needed: 1TB=1e12, 1GB=1e9, 1MB=1e6). If multiple sizes given (e.g., raw vs compressed), use the original/raw size.
10. notes: Any other important details (luminosity, run period, selection criteria, etc.)

Return your response as a JSON array of objects. Each object represents one dataset.
If the paper mentions multiple pT bins or variants of the same base dataset, list each separately.

Example output format:
[
    {
        "dataset_name": "Jet Primary Dataset",
        "dataset_type": "Real Data",
        "official_path": "/Jet/Run2010B-Apr21ReReco-v1/AOD",
        "events_total": "20022826",
        "events_used": "768687",
        "collision_energy_tev": "7",
        "generator": "N/A (Real Data)",
        "doi": "10.7483/OPENDATA.CMS.3S7F.2E9W",
        "size_bytes": "2000000000000",
        "notes": "Run 2010B, 31.8 pb-1 integrated luminosity, 2.0 TB original size"
    }
]

IMPORTANT: Return ONLY the JSON array, no other text or markdown formatting.

Paper text to analyze:
---
{paper_text}
---
"""


def extract_text_from_pdf(pdf_path: str, max_pages: Optional[int] = None) -> str:
    """
    Pull the text and table contents out of a PDF using pdfplumber.

    Every page that yields text contributes a "--- Page N ---" section, and
    every non-empty table found on a page contributes an additional
    "--- Table M on Page N ---" section whose cells are tab-separated
    (falsy cells are rendered as empty strings). Sections are joined with
    blank lines.

    Args:
        pdf_path: Path to the PDF file
        max_pages: Maximum number of pages to read; None (or 0) reads all pages

    Returns:
        The concatenated page and table sections as a single string

    Raises:
        Exception: Any error raised while opening or reading the PDF is
            logged and then re-raised unchanged.
    """
    logger.info(f"Extracting text from: {pdf_path}")

    sections = []

    try:
        with pdfplumber.open(pdf_path) as document:
            if max_pages:
                selected_pages = document.pages[:max_pages]
            else:
                selected_pages = document.pages

            for page_number, page in enumerate(selected_pages, start=1):
                body = page.extract_text()
                if body:
                    sections.append(f"--- Page {page_number} ---\n{body}")

                # Tables are rendered separately because they often hold dataset info
                for table_number, table in enumerate(page.extract_tables(), start=1):
                    if not table:
                        continue
                    rendered_rows = []
                    for row in table:
                        cells = [str(cell) if cell else "" for cell in row]
                        rendered_rows.append("\t".join(cells))
                    rendered_table = "\n".join(rendered_rows)
                    sections.append(
                        f"--- Table {table_number} on Page {page_number} ---\n{rendered_table}"
                    )

        combined = "\n\n".join(sections)
        logger.info(f"Extracted {len(combined)} characters from {len(selected_pages)} pages")
        return combined

    except Exception as e:
        logger.error(f"Error extracting text from {pdf_path}: {e}")
        raise


def _strip_code_fence(text: str) -> str:
    """Remove a surrounding Markdown code fence (``` or ```json) from text, if present."""
    if not text.startswith("```"):
        return text
    lines = text.split("\n")
    # Drop the opening fence line, and the closing fence line when there is one
    inner = lines[1:-1] if lines[-1].strip() == "```" else lines[1:]
    return "\n".join(inner)


def _coerce_dataset_list(parsed: object) -> Optional[list]:
    """Normalize parsed JSON to a list of dataset dicts; return None for any other shape."""
    if isinstance(parsed, dict):
        # A single object instead of an array: treat it as one dataset
        return [parsed]
    if isinstance(parsed, list):
        # Ignore stray non-object entries rather than failing the whole paper
        return [item for item in parsed if isinstance(item, dict)]
    return None


def extract_datasets_with_claude(
    paper_text: str,
    paper_name: str,
    api_key: Optional[str] = None,
    model: str = "claude-sonnet-4-20250514"
) -> list[dict]:
    """
    Use Claude API to extract dataset information from paper text.

    The paper text is inserted into EXTRACTION_PROMPT, sent as a single user
    message, and the reply is parsed as JSON. Each resulting dataset dict is
    tagged with a "paper" key holding paper_name.

    Args:
        paper_text: The extracted text from the paper. Text longer than
            150000 characters is shortened to its first and last halves
            with a truncation marker in between.
        paper_name: Name of the paper (for logging and output)
        api_key: Anthropic API key. When omitted the client is created
            without an explicit key (the SDK is expected to fall back to the
            ANTHROPIC_API_KEY environment variable).
        model: Claude model to use

    Returns:
        List of dictionaries containing dataset information. An empty list
        is returned when the reply is not valid JSON or is not a JSON
        array/object of datasets.

    Raises:
        Exception: Any error raised by the API call (or while reading the
            reply text) is logged and re-raised unchanged.
    """
    logger.info(f"Sending paper to Claude for analysis: {paper_name}")

    # Initialize client
    client = anthropic.Anthropic(api_key=api_key) if api_key else anthropic.Anthropic()

    # Truncate text if too long (keeping first and last parts for context)
    max_chars = 150000  # Leave room for prompt and response
    if len(paper_text) > max_chars:
        half = max_chars // 2
        paper_text = paper_text[:half] + "\n\n[... middle section truncated ...]\n\n" + paper_text[-half:]
        logger.warning(f"Paper text truncated to {max_chars} characters")

    # EXTRACTION_PROMPT embeds a literal JSON example, so str.format() would
    # treat the example's braces as a replacement field and raise KeyError.
    # Substitute the single placeholder literally instead.
    prompt = EXTRACTION_PROMPT.replace("{paper_text}", paper_text)

    try:
        message = client.messages.create(
            model=model,
            max_tokens=4096,
            messages=[
                {"role": "user", "content": prompt}
            ]
        )
        response_text = message.content[0].text.strip()
    except Exception as e:
        logger.error(f"Error calling Claude API: {e}")
        raise

    # The model is told to return bare JSON, but may still wrap it in a code fence
    response_text = _strip_code_fence(response_text)

    try:
        parsed = json.loads(response_text)
    except json.JSONDecodeError as e:
        logger.error(f"Failed to parse Claude response as JSON: {e}")
        logger.error(f"Response was: {response_text[:500]}...")
        return []

    datasets = _coerce_dataset_list(parsed)
    if datasets is None:
        logger.error(
            f"Claude response was not a JSON array of datasets (got {type(parsed).__name__})"
        )
        return []

    # Add paper name to each dataset
    for dataset in datasets:
        dataset["paper"] = paper_name

    logger.info(f"Extracted {len(datasets)} datasets from {paper_name}")
    return datasets


def process_papers_folder(
    input_folder: str,
    output_file: str,
    api_key: Optional[str] = None,
    model: str = "claude-sonnet-4-20250514",
    max_pages: Optional[int] = None
) -> pd.DataFrame:
    """
    Process all PDF papers in a folder and extract dataset information.

    Each PDF directly inside input_folder is converted to text, sent to
    Claude, and the datasets returned for all papers are combined into one
    DataFrame that is also written to output_file as a fully quoted CSV.
    A paper that fails at any step is logged and skipped; the remaining
    papers are still processed.

    Args:
        input_folder: Path to folder containing PDF papers
        output_file: Path for output CSV file (its parent directory is
            created if it does not exist)
        api_key: Anthropic API key
        model: Claude model to use
        max_pages: Maximum pages to process per PDF

    Returns:
        DataFrame with all extracted datasets, restricted to the fixed
        column order defined below. An empty DataFrame is returned (and no
        CSV is written) when no PDFs are found or no datasets are extracted.

    Raises:
        FileNotFoundError: If input_folder does not exist.
    """
    input_path = Path(input_folder)

    if not input_path.exists():
        raise FileNotFoundError(f"Input folder not found: {input_folder}")

    # Find all PDF files with one case-insensitive pass. Globbing "*.pdf" and
    # "*.PDF" separately lists every file twice on Windows (where pathlib
    # globbing is case-insensitive) and misses mixed-case extensions such as
    # ".Pdf" on case-sensitive systems. Sorting makes the order deterministic.
    candidates = input_path.iterdir() if input_path.is_dir() else ()
    pdf_files = sorted(
        entry for entry in candidates
        if entry.name.lower().endswith(".pdf") and entry.is_file()
    )

    if not pdf_files:
        logger.warning(f"No PDF files found in {input_folder}")
        return pd.DataFrame()

    logger.info(f"Found {len(pdf_files)} PDF files to process")

    all_datasets = []

    for pdf_file in pdf_files:
        try:
            logger.info(f"\n{'='*60}")
            logger.info(f"Processing: {pdf_file.name}")
            logger.info(f"{'='*60}")

            # Extract text from PDF
            paper_text = extract_text_from_pdf(str(pdf_file), max_pages)

            # Get paper name (filename without extension)
            paper_name = pdf_file.stem

            # Extract datasets using Claude
            datasets = extract_datasets_with_claude(
                paper_text=paper_text,
                paper_name=paper_name,
                api_key=api_key,
                model=model
            )

            all_datasets.extend(datasets)

        except Exception as e:
            # Best effort: one bad paper must not abort the whole batch
            logger.error(f"Failed to process {pdf_file.name}: {e}")
            continue

    if not all_datasets:
        logger.warning("No datasets extracted from any papers")
        return pd.DataFrame()

    # Create DataFrame with consistent column order
    columns = [
        "paper",
        "dataset_name",
        "dataset_type",
        "official_path",
        "events_total",
        "events_used",
        "collision_energy_tev",
        "generator",
        "doi",
        "size_bytes",
        "notes"
    ]

    df = pd.DataFrame(all_datasets)

    # Ensure all columns exist
    for col in columns:
        if col not in df.columns:
            df[col] = "N/A"

    # Reorder columns (any extra keys returned by the model are dropped here)
    df = df[columns]

    # Create the destination directory first so the results of the API calls
    # are not lost to a missing-directory error at write time
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)

    # Save to CSV
    df.to_csv(output_file, index=False, quoting=csv.QUOTE_ALL)
    logger.info(f"\nSaved {len(df)} datasets to {output_file}")

    return df


def main():
    """
    Command-line entry point.

    Parses the command-line options, resolves the API key (--api_key first,
    then the ANTHROPIC_API_KEY environment variable), runs
    process_papers_folder and prints a short summary of the result.

    Exits with status 1 when no API key is available, when no datasets were
    extracted, or when processing raises an exception.
    """
    usage_examples = """
Examples:
    # Process all PDFs in a folder
    python extract_datasets_from_papers.py --input_folder ./papers --output datasets.csv
    
    # With explicit API key
    python extract_datasets_from_papers.py --input_folder ./papers --output datasets.csv --api_key sk-ant-...
    
    # Limit pages processed per PDF (for faster processing)
    python extract_datasets_from_papers.py --input_folder ./papers --output datasets.csv --max_pages 10
        """

    cli = argparse.ArgumentParser(
        description="Extract dataset information from physics papers",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )
    cli.add_argument(
        "--input_folder", "-i",
        default=r"C:/Users/ejren/OneDrive/DPOA_papers",
        help="Path to folder containing PDF papers",
    )
    cli.add_argument(
        "--output", "-o",
        default="dataset_info.csv",
        help="Output CSV file path (default: dataset_info.csv)",
    )
    cli.add_argument(
        "--api_key", "-k",
        default=None,
        help="Anthropic API key (or set ANTHROPIC_API_KEY env var)",
    )
    cli.add_argument(
        "--model", "-m",
        default="claude-sonnet-4-20250514",
        help="Claude model to use (default: claude-sonnet-4-20250514)",
    )
    cli.add_argument(
        "--max_pages",
        type=int,
        default=None,
        help="Maximum pages to process per PDF (default: all)",
    )
    cli.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Enable verbose logging",
    )

    options = cli.parse_args()

    # Verbose mode raises the root logger to DEBUG
    if options.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # An explicit --api_key wins over the environment variable
    resolved_key = options.api_key or os.environ.get("ANTHROPIC_API_KEY")
    if not resolved_key:
        logger.error("No API key provided. Set ANTHROPIC_API_KEY environment variable or use --api_key")
        sys.exit(1)

    banner = "=" * 60

    try:
        results = process_papers_folder(
            input_folder=options.input_folder,
            output_file=options.output,
            api_key=resolved_key,
            model=options.model,
            max_pages=options.max_pages,
        )

        # sys.exit raises SystemExit, which the handler below does not catch
        if results.empty:
            print("No datasets were extracted.")
            sys.exit(1)

        print(f"\n{banner}")
        print("EXTRACTION COMPLETE")
        print(banner)
        print(f"Total datasets extracted: {len(results)}")
        print(f"Papers processed: {results['paper'].nunique()}")
        print(f"Output saved to: {options.output}")
        print("\nDataset types found:")
        print(results['dataset_type'].value_counts().to_string())
        print("\nCollision energies:")
        print(results['collision_energy_tev'].value_counts().to_string())

    except Exception as e:
        logger.error(f"Script failed: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()