# In [None]:
import pymupdf  # PyMuPDF
from pathlib import Path
import openai
import base64
from openai import OpenAI
import glob
import time
import os
from dotenv import load_dotenv
import requests
from bs4 import BeautifulSoup
import json
import csv
import re

# Load variables from a .env file so the API key does not have to be
# hard-coded in this script.
load_dotenv()

# os.getenv returns None when OPENAI_API_KEY is not set.
api_key = os.getenv("OPENAI_API_KEY")
# Module-level OpenAI client used by the functions below that call the chat API.
client = OpenAI(api_key=api_key)

#Function 1: Convert PDF pages to images
def pdf_to_images(pdf_path, output_folder="pdf_images"):
    """Render every page of a PDF to a PNG file.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.
        output_folder: Directory that receives the PNG files. It is created
            (including parents) when it does not exist yet.

    Returns:
        List of image paths in page order, each of the form
        ``<output_folder>/page_<n>.png`` with ``n`` starting at 1.
    """
    Path(output_folder).mkdir(parents=True, exist_ok=True)

    image_paths = []
    doc = pymupdf.open(pdf_path)
    try:
        # Matrix(2, 2) scales both axes by 2 so small text stays legible
        # in the rendered page image.
        zoom = pymupdf.Matrix(2, 2)
        for page_number, page in enumerate(doc, start=1):
            pix = page.get_pixmap(matrix=zoom)
            image_path = f"{output_folder}/page_{page_number}.png"
            pix.save(image_path)
            image_paths.append(image_path)
    finally:
        # Release the document even when rendering or saving a page fails.
        doc.close()

    return image_paths

#Function 2: Encode image to base64 because OpenAI API requires images in base64 format
def encode_image(image_path):
    """Return the contents of the file at image_path as a base64 text string."""
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

#Function 3: Extract CERN open data fields from PDF images using OpenAI API
def extract_cern_data(image_paths, max_pages=None):
    """Ask the OpenAI chat API to extract dataset metadata from page images.

    Args:
        image_paths: Paths of the page images (PNG) to send, in page order.
        max_pages: When truthy, only the first ``max_pages`` images are sent.

    Returns:
        The text of the model's reply (the prompt requests CSV), or None when
        no image could be attached, the request failed, or the reply carried
        no text content.
    """
    if max_pages:
        image_paths = image_paths[:max_pages]

    prompt = """You are an expert at high energy particle physics and you understand jargon like "events" and datasets. You are also very, very careful and a good explainer.
I need your help reading documents and extracting information about datasets used.

Extract the following information:
* Title of the paper
* Authors of the paper (list all authors separated by semicolons, NO commas between authors)
* Date of publication in YYYY-MM format (e.g., 2023-06)
* Dataset name (collision or MC) - be specific, include year and collision energy if mentioned
* Size in number of events (as a plain integer, no commas or scientific notation)
* Size in number of files (as a plain integer, no commas)
* Size in bytes (as a plain integer, no commas or units - convert GB/TB/PB to bytes)
* Data format (AOD, miniAOD, nanoAOD, etc)
* DOI of datasets used (full DOI string)

Important guidelines:
- For authors: Use semicolons as separators, NOT commas. Example: "John Smith; Jane Doe; Bob Johnson"
- Do NOT use "and" before the last author - use semicolons consistently throughout
- If exact numbers are not given, provide your best approximation without noting it as such
- Convert all units to the requested format (e.g., "1.5 TB" becomes "1500000000000" bytes)
- If scientific notation is used (e.g., "10^6 events"), convert to plain integer
- Look carefully at tables, captions, methodology sections, and references for dataset information
- If multiple datasets are used, create separate rows for each
- Use regular hyphens (-) not em dashes (—)

Return ONLY a CSV with this exact header:
Title,Authors,Date,Dataset name (collision or MC),Size (events),Size (files),Size (bytes),Data format,Dataset DOI

Then one or more data rows with the extracted values. Use "null" for unknown values.
NO explanations, NO markdown formatting, NO code blocks, JUST the CSV."""

    # A single user message: the instructions first, then one part per page.
    content_parts = [{"type": "text", "text": prompt}]

    print(f"    Encoding {len(image_paths)} images...")
    for img_path in image_paths:
        try:
            base64_image = encode_image(img_path)
        except OSError as e:
            # Best effort: an unreadable page is skipped, the rest still go out.
            print(f"    ✗ Error encoding image {img_path}: {e}")
            continue
        content_parts.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:image/png;base64,{base64_image}",
                "detail": "high"
            }
        })

    # Only the prompt is present: without any page image the model has
    # nothing to read, so do not spend an API call on it.
    if len(content_parts) == 1:
        print("    ✗ No images could be encoded; skipping API request")
        return None

    messages = [{"role": "user", "content": content_parts}]

    print("    Sending request to OpenAI...")
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=messages,
            max_tokens=4000,
            temperature=0
        )
        content = response.choices[0].message.content
    except Exception as e:
        # Boundary with the remote service: report and let the caller skip
        # this document instead of aborting the whole run.
        print(f"    ✗ API Error: {e}")
        return None

    if content is None:
        print("    ✗ API returned no text content")
        return None

    print(f"    Response received: {len(content)} characters")
    return content

#Function 4: Search CERN Open Data website for dataset information using dataset name or DOI to fill missing fields
def search_cern_website(dataset_name=None, doi=None):
    """Search the CERN Open Data site and have GPT extract dataset fields.

    The DOI is used as the search term when given, otherwise the dataset
    name; with neither, the site's front page is fetched instead.

    Args:
        dataset_name: Dataset name to look for, or None.
        doi: Dataset DOI to look for, or None.

    Returns:
        Dictionary parsed from the model's JSON reply (requested keys:
        dataset_name, size_events, size_files, size_bytes, data_format,
        dataset_doi), or None when the page fetch, the API call, or the JSON
        parsing fails or the reply is not a JSON object.
    """
    query = doi or dataset_name
    print(f"    Searching CERN website for: {query}...")

    try:
        if query:
            # Let requests build the query string so characters such as
            # '&', '#', '+' or spaces in the term are percent-encoded.
            http_response = requests.get(
                "https://opendata.cern.ch/search",
                params={"q": query},
                timeout=10,
            )
        else:
            # General search
            http_response = requests.get("https://opendata.cern.ch/", timeout=10)
        # Do not hand an HTTP error page to the model as if it were results.
        http_response.raise_for_status()

        soup = BeautifulSoup(http_response.content, 'html.parser')
        body = soup.body
        # Fall back to the whole document when the page has no <body>.
        page_content = str(body if body is not None else soup)

        # Use GPT to extract information from the page
        prompt = f"""You are analyzing a page from the CERN Open Data website.
Extract dataset information and return ONLY a valid JSON object (no markdown, no code blocks) with these fields:
- dataset_name (string or null)
- size_events (integer or null) - number of events
- size_files (integer or null) - number of files  
- size_bytes (integer or null) - size in bytes (convert any units)
- data_format (string or null) - e.g., AOD, miniAOD, nanoAOD
- dataset_doi (string or null) - full DOI

If searching for specific dataset: {dataset_name or doi}
Focus on finding information about this specific dataset if possible.

Return ONLY the JSON object, nothing else.

HTML content (first 15000 chars):
{page_content[:15000]}"""

        completion = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You extract data from HTML and return only valid JSON objects."},
                {"role": "user", "content": prompt}
            ],
            temperature=0,
            max_tokens=1000
        )

        result = (completion.choices[0].message.content or "").strip()
    except Exception as e:
        # Boundary with the network and the API: the lookup is optional, so
        # report the problem and let the caller keep the PDF-only values.
        print(f"    ✗ Error searching CERN website: {e}")
        return None

    # Remove markdown code blocks if present
    result = re.sub(r'^```json\s*|\s*```$', '', result, flags=re.MULTILINE).strip()

    try:
        data = json.loads(result)
    except json.JSONDecodeError as e:
        print(f"    ✗ JSON decode error: {e}")
        print(f"    Response was: {result[:200]}...")
        return None

    # Callers read the result with .get(); a JSON list or scalar would break them.
    if not isinstance(data, dict):
        print(f"    ✗ Expected a JSON object but got {type(data).__name__}")
        return None

    print("    ✓ Found data on CERN website")
    return data

#Function 5: Fill missing fields in extracted data using CERN website search results
def fill_missing_fields(row_dict, cern_data):
    """
    Complete a PDF-extracted row with values found on the CERN website.

    row_dict: extracted row (CSV column name -> value); updated in place
    cern_data: dictionary from the CERN website search, or a falsy value
    Returns: the same row_dict object
    """
    if cern_data:
        # Values that count as "not extracted" in a row.
        placeholders = (None, 'null', '', 'NULL')

        # (CSV column, key in the CERN lookup result)
        column_sources = (
            ('Size (events)', 'size_events'),
            ('Size (files)', 'size_files'),
            ('Size (bytes)', 'size_bytes'),
            ('Data format', 'data_format'),
            ('Dataset DOI', 'dataset_doi'),
            ('Dataset name (collision or MC)', 'dataset_name'),
        )

        updated = 0
        for column, source_key in column_sources:
            # Values already extracted from the PDF are never overwritten.
            if row_dict.get(column) not in placeholders:
                continue
            replacement = cern_data.get(source_key)
            if replacement:
                row_dict[column] = str(replacement)
                updated += 1

        if updated > 0:
            print(f"    ✓ Filled {updated} missing field(s) from CERN website")

    return row_dict

#Function 6: Parse CSV string to list of dictionaries
def parse_csv_to_dict(csv_content):
    """Parse CSV text (a header row plus data rows) into row dictionaries.

    Args:
        csv_content: CSV text whose first non-empty line is the header.

    Returns:
        One dictionary per data row, mapping header names to cell values
        (all whitespace-stripped strings). Blank lines, repeated header rows
        and rows whose cell count differs from the header are left out. An
        empty list is returned for empty input or a header without data.
    """
    if not csv_content or not csv_content.strip():
        return []

    # csv.reader, unlike str.split(','), keeps quoted fields that contain
    # commas (typical for paper titles) in one piece; splitlines() also
    # copes with CRLF line endings.
    reader = csv.reader(csv_content.strip().splitlines(), skipinitialspace=True)
    records = [[cell.strip() for cell in record] for record in reader]
    if len(records) < 2:
        return []

    header, *data_records = records
    rows = []
    for cells in data_records:
        if not cells or cells == [""]:
            continue  # blank line
        if cells == header:
            continue  # the header was repeated in the middle of the output
        if len(cells) != len(header):
            # Cannot be mapped onto the header reliably; say so instead of
            # dropping the row silently.
            print(f"    ⚠ Skipping CSV row with {len(cells)} fields (expected {len(header)})")
            continue
        rows.append(dict(zip(header, cells)))

    return rows

#Function 7: Process all PDFs in a directory
# Values that mean "not extracted" in a row.
_UNKNOWN_VALUES = (None, 'null', '', 'NULL')

# Columns whose absence triggers a lookup on the CERN website.
_LOOKUP_FIELDS = ('Size (events)', 'Size (files)', 'Size (bytes)', 'Data format', 'Dataset DOI')


def _is_unknown(value):
    """Return True when value is one of the "not extracted" placeholders."""
    return value in _UNKNOWN_VALUES


def _fill_row_from_cern(row_dict):
    """Look one row up on the CERN website when lookup fields are missing.

    Nothing is searched when the row is complete or when it has neither a
    dataset name nor a DOI to search for. Returns the (possibly updated) row.
    """
    missing_fields = [field for field in _LOOKUP_FIELDS if _is_unknown(row_dict.get(field))]
    if not missing_fields:
        return row_dict

    dataset_name = row_dict.get('Dataset name (collision or MC)')
    dataset_doi = row_dict.get('Dataset DOI')
    if _is_unknown(dataset_name) and _is_unknown(dataset_doi):
        return row_dict

    print(f"  Missing fields: {', '.join(missing_fields)}")
    print("  Attempting to fill from CERN website...")

    cern_data = search_cern_website(
        dataset_name=None if _is_unknown(dataset_name) else dataset_name,
        doi=None if _is_unknown(dataset_doi) else dataset_doi,
    )
    row_dict = fill_missing_fields(row_dict, cern_data)

    # Small delay between searches
    time.sleep(1)
    return row_dict


def _write_rows_to_csv(all_rows, output_csv):
    """Write the row dictionaries to output_csv with a header line."""
    # Rows from different papers may not share exactly the same keys; the
    # header is the union of all keys in first-seen order, so DictWriter
    # never rejects a row for carrying a key the first row lacked.
    fieldnames = list(dict.fromkeys(key for row in all_rows for key in row))

    with open(output_csv, 'w', encoding='utf-8', newline='') as f:
        # Cells a row does not have get the same "null" marker used elsewhere.
        writer = csv.DictWriter(f, fieldnames=fieldnames, restval='null')
        writer.writeheader()
        writer.writerows(all_rows)


def process_directory(pdf_directory, output_csv="extracted_data.csv", max_pages_per_pdf=10):
    """Extract dataset rows from every PDF in a directory into one CSV file.

    Each PDF is rendered to page images, sent to the extraction step, parsed
    into rows, and rows with missing fields are completed from the CERN
    website where possible. A failure on one PDF is reported and the run
    continues with the next one.

    Args:
        pdf_directory: Directory searched (non-recursively) for ``*.pdf``.
        output_csv: Path of the CSV file to write.
        max_pages_per_pdf: Maximum number of pages per PDF sent for extraction.

    Returns:
        None. The CSV file is only written when at least one row was extracted.
    """
    # Escape the directory so characters like '[' in its name are not read
    # as glob patterns; sort for a reproducible processing order.
    pattern = os.path.join(glob.escape(str(pdf_directory)), "*.pdf")
    pdf_files = sorted(glob.glob(pattern))

    if not pdf_files:
        print(f"No PDF files found in {pdf_directory}")
        return

    print(f"Found {len(pdf_files)} PDF files to process")
    print(f"Processing max {max_pages_per_pdf} pages per PDF\n")

    all_rows = []

    for i, pdf_path in enumerate(pdf_files, 1):
        pdf_name = Path(pdf_path).stem
        print(f"[{i}/{len(pdf_files)}] Processing {Path(pdf_path).name}...")

        try:
            # Convert PDF to images (one sub-folder per PDF)
            image_folder = Path("pdf_images") / pdf_name
            image_paths = pdf_to_images(pdf_path, output_folder=str(image_folder))
            print(f"  Converted {len(image_paths)} pages")

            # Extract data from PDF
            result = extract_cern_data(image_paths, max_pages=max_pages_per_pdf)
            if result is None:
                print("  ✗ No response from API")
                continue

            # Strip markdown code fences the model may have added anyway
            clean_result = result.replace("```csv", "").replace("```", "").strip()
            if not clean_result or len(clean_result) < 50:
                print("  ⚠ Warning: Response too short or empty")
                continue

            rows = parse_csv_to_dict(clean_result)
            if not rows:
                print("  ⚠ Warning: No parsable CSV rows in response")

            for row_dict in rows:
                all_rows.append(_fill_row_from_cern(row_dict))

            print(f"  ✓ Extraction complete ({len(rows)} dataset(s))\n")

            # Delay between PDFs
            time.sleep(1)

        except Exception as e:
            # Per-document boundary: one bad PDF must not abort the batch.
            print(f"  ✗ Error processing {Path(pdf_path).name}: {e}")
            import traceback
            traceback.print_exc()
            print()
            continue

    if not all_rows:
        print("\n✗ No data extracted from any papers")
        return

    # Write combined results to CSV
    print(f"\nWriting {len(all_rows)} rows to {output_csv}...")
    _write_rows_to_csv(all_rows, output_csv)

    print(f"\n✓ All data saved to {output_csv}")
    print(f"✓ Processed {len(pdf_files)} papers")
    print(f"✓ Wrote {len(all_rows)} dataset rows")

# Main execution
if __name__ == "__main__":
    # Directory searched for PDF papers. NOTE(review): this is an absolute
    # path on one user's machine - adjust it before running elsewhere.
    pdf_directory = r"C:/Users/ejren/OneDrive/DPOA_papers"  
    
    # Process all PDFs with CERN website fallback; at most 10 pages per PDF
    # are sent for extraction and the rows are written to cern_data_extracted.csv.
    process_directory(pdf_directory, output_csv="cern_data_extracted.csv", max_pages_per_pdf=10)