In [None]:
# =============================================================================
# PAPERCHECKER - SETUP
# =============================================================================
# This cell installs all required dependencies, clones the PaperChecker
# repository from GitHub, mounts Google Drive for PDF access, and loads
# API keys from Colab Secrets.
#
# FIRST TIME SETUP:
#   1. Click the key icon in the left sidebar
#   2. Add secret: OPENAI_API_KEY = your OpenAI API key
#   3. Add secret: GOOGLE_API_KEY = your Google AI API key
#   4. Run this cell
# =============================================================================

#@title **1. SETUP** { display-mode: "form" }

# --- Install Python packages ---
# openai: OpenAI API client for GPT models
# google-genai: Google Generative AI client for Gemini models
# pymupdf: PDF text extraction (also known as fitz)
# python-docx: Word document generation for review logs
# openpyxl: Excel file read/write for structured output
# jsonschema: JSON validation for LLM responses
!pip install -U openai google-genai pymupdf python-docx openpyxl jsonschema -q

# --- Clone or update PaperChecker repository ---
import os
REPO_PATH = '/content/paperchecker'
REPO_URL = 'https://github.com/maxrusse/paperchecker.git'

if not os.path.exists(REPO_PATH):
    # Fresh clone
    !git clone {REPO_URL} {REPO_PATH}
    print(f'[OK] Cloned repository to {REPO_PATH}')
else:
    # Update existing clone
    !cd {REPO_PATH} && git pull
    print(f'[OK] Updated repository at {REPO_PATH}')

# Change to repo directory
%cd {REPO_PATH}

# --- Mount Google Drive ---
# This allows access to PDFs stored in your Drive
from google.colab import drive
drive.mount('/content/drive')
print('[OK] Google Drive mounted at /content/drive')

# --- Load API keys from Colab Secrets ---
# Keys are stored securely and not visible in the notebook
from google.colab import userdata

OPENAI_API_KEY = None
GOOGLE_API_KEY = None

try:
    OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
    GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
    
    # Validate keys are present
    missing_keys = []
    if not OPENAI_API_KEY:
        missing_keys.append('OPENAI_API_KEY')
    if not GOOGLE_API_KEY:
        missing_keys.append('GOOGLE_API_KEY')
    
    if missing_keys:
        print(f'[WARNING] Missing API keys: {", ".join(missing_keys)}')
        print('          Add them in Colab sidebar > Secrets (key icon)')
    else:
        print('[OK] API keys loaded successfully')
        
except Exception as e:
    print(f'[ERROR] Failed to load API keys: {e}')
    print('        Add OPENAI_API_KEY and GOOGLE_API_KEY to Colab Secrets')

print('\n' + '=' * 50)
print('SETUP COMPLETE - proceed to Cell 2')
print('=' * 50)

In [None]:
# =============================================================================
# PAPERCHECKER - CONFIGURE PDF FOLDER
# =============================================================================
# Set the path to your PDF folder in Google Drive.
# The folder should contain the medical research PDFs you want to analyze.
#
# FOLDER PATH FORMAT:
#   /content/drive/MyDrive/your_folder_name
#
# ALTERNATIVE - Upload PDFs directly:
#   Uncomment the upload block below to upload PDFs from your computer
#
# RESULT:
#   pdf_paths - sorted list of PDF file paths, consumed by Cell 3
# =============================================================================

#@title **2. CONFIGURE** { display-mode: "form" }

# --- PDF folder path in Google Drive ---
PDF_FOLDER = '/content/drive/MyDrive/paperchecker/pdfs'  #@param {type:"string"}

# --- Alternative: Upload PDFs directly (uncomment to use) ---
# from google.colab import files
# print('Select PDF files to upload...')
# uploaded = files.upload()
# PDF_FOLDER = '/content'  # uploaded files go to current directory

# --- Scan for PDF files ---
import os
import glob


def find_pdf_files(folder):
    """Return the sorted paths of all PDF files directly inside `folder`.

    The extension is matched case-insensitively (`.pdf`, `.PDF`, ...), only
    regular files are returned, and the folder name is escaped so that glob
    metacharacters in it (`[`, `]`, `*`, `?`) are taken literally.
    A folder that does not exist yields an empty list.
    """
    pattern = os.path.join(glob.escape(folder), '*')
    return sorted(
        path
        for path in glob.glob(pattern)
        if path.lower().endswith('.pdf') and os.path.isfile(path)
    )


# Find all PDF files in the specified folder
pdf_paths = find_pdf_files(PDF_FOLDER)

# Display results
print(f'PDF Folder: {PDF_FOLDER}')
print(f'Found: {len(pdf_paths)} PDF(s)')
print('-' * 50)

if pdf_paths:
    # List all found PDFs with index numbers
    for idx, pdf_path in enumerate(pdf_paths, start=1):
        filename = os.path.basename(pdf_path)
        print(f'  {idx:3d}. {filename}')
else:
    # No PDFs found - show helpful error message
    print('[ERROR] No PDF files found!')
    if not os.path.isdir(PDF_FOLDER):
        # Tell a wrong path apart from a folder that is merely empty
        print(f'        Folder does not exist: {PDF_FOLDER}')
    print('')
    print('Troubleshooting:')
    print('  1. Check that PDF_FOLDER path is correct')
    print('  2. Ensure files have .pdf extension')
    print('  3. Example path: /content/drive/MyDrive/my_papers')

In [None]:
# =============================================================================
# PAPERCHECKER - RUN EXTRACTION
# =============================================================================
# Main extraction cell with hierarchical model configuration.
#
# MODEL SELECTION HIERARCHY:
#   Extractor: Provider -> Model -> Strength (reasoning/thinking level)
#   Verifier:  Provider -> Model -> Strength (reasoning/thinking level)
#
# PROVIDERS:
#   - openai: Uses OpenAI GPT models (gpt-5.2, gpt-5.1)
#   - google: Uses Google Gemini models (gemini-3-pro-preview)
#
# STRENGTH LEVELS:
#   OpenAI reasoning: none < low < medium < high < xhigh
#   Gemini thinking:  minimal < low < medium < high
#
# RECOMMENDATIONS:
#   - Extractor: Higher strength for better accuracy (medium-high)
#   - Verifier: Lower strength is usually sufficient (low-medium)
# =============================================================================

#@title **3. RUN EXTRACTION** { display-mode: "form" }

import os
import glob
import importlib
import script

# Reload script module to pick up any code changes
importlib.reload(script)

# --- Verify PDFs are available ---
# Look the name up through globals() so that a kernel where Cell 2 has never
# been run gets the actionable message below instead of a bare NameError.
if not globals().get('pdf_paths'):
    raise RuntimeError('No PDFs found! Run Cell 2 first and check PDF_FOLDER path.')

# =============================================================================
# OUTPUT CONFIGURATION
# =============================================================================
# Output files are saved to Google Drive for persistence.
# Change OUTPUT_RUN_TAG to start a fresh extraction run.
#
# Produces two paths that the rest of the notebook uses:
#   OUTPUT_XLSX - Excel workbook passed to the pipeline as `out_xlsx`
#   OUTPUT_DOCX - Word review log passed to the pipeline as `out_docx`

# Output folder on Drive; created on first use, left alone if it already exists.
OUTPUT_DIR = '/content/drive/MyDrive/paperchecker/output'
os.makedirs(OUTPUT_DIR, exist_ok=True)

#@markdown ---
#@markdown ### Output Settings
# The tag is inserted verbatim into both file names below, so it should not
# contain path separators.
# NOTE(review): the pipeline is later called with skip_existing_evals=True, so
# re-using a tag presumably resumes from the existing output files - confirm
# against script.run_pipeline.
OUTPUT_RUN_TAG = 'current'  #@param {type:"string"}

# Construct output file paths
OUTPUT_XLSX = f'{OUTPUT_DIR}/extraction_{OUTPUT_RUN_TAG}.xlsx'
OUTPUT_DOCX = f'{OUTPUT_DIR}/review_log_{OUTPUT_RUN_TAG}.docx'

# =============================================================================
# EXTRACTOR MODEL CONFIGURATION
# =============================================================================
# The extractor processes PDF content and extracts structured data.
# Higher strength = better accuracy but slower and more expensive.
#
# Settings for BOTH providers are always defined here; only the pair that
# belongs to the provider selected in EXTRACTOR_PROVIDER is copied onto the
# `script` module further down in this cell.
# Keep every assignment as a plain literal followed by its `#@param`
# annotation: the annotation is what renders the Colab form field.

#@markdown ---
#@markdown ### Extractor Settings
#@markdown Select provider, model, and reasoning strength for data extraction.

# Which backend performs the extraction: 'openai' or 'google'
EXTRACTOR_PROVIDER = 'openai'  #@param ["openai", "google"]

#@markdown **OpenAI Extractor:**
# Used only when EXTRACTOR_PROVIDER == 'openai'
EXTRACTOR_OPENAI_MODEL = 'gpt-5.2'  #@param ["gpt-5.2", "gpt-5.1"]
EXTRACTOR_OPENAI_STRENGTH = 'medium'  #@param ["none", "low", "medium", "high", "xhigh"]

#@markdown **Google Extractor:**
# Used only when EXTRACTOR_PROVIDER == 'google'
# NOTE(review): the header of this cell lists the Gemini levels as
# minimal < low < medium < high, but this dropdown offers minimal/low/high
# while the verifier dropdown below offers low/medium/high. Confirm which
# levels the selected model accepts and align the two lists.
EXTRACTOR_GOOGLE_MODEL = 'gemini-3-pro-preview'  #@param ["gemini-3-pro-preview"]
EXTRACTOR_GOOGLE_STRENGTH = 'low'  #@param ["minimal", "low", "high"]

# =============================================================================
# VERIFIER MODEL CONFIGURATION
# =============================================================================
# The verifier validates extracted data against the source PDF.
# Can use a different provider/model than the extractor for cross-validation.
#
# As with the extractor, only the settings of the provider selected in
# VERIFIER_PROVIDER are applied to the `script` module.

#@markdown ---
#@markdown ### Verifier Settings
#@markdown Select provider, model, and reasoning strength for verification.

# Which backend performs the verification: 'openai' or 'google'
VERIFIER_PROVIDER = 'openai'  #@param ["openai", "google"]

#@markdown **OpenAI Verifier:**
# Used only when VERIFIER_PROVIDER == 'openai'
VERIFIER_OPENAI_MODEL = 'gpt-5.1'  #@param ["gpt-5.2", "gpt-5.1"]
VERIFIER_OPENAI_STRENGTH = 'low'  #@param ["none", "low", "medium", "high", "xhigh"]

#@markdown **Google Verifier:**
# Used only when VERIFIER_PROVIDER == 'google'
# NOTE(review): this option list (low/medium/high) differs from the extractor's
# Google list (minimal/low/high) - confirm the intended set of levels.
VERIFIER_GOOGLE_MODEL = 'gemini-3-pro-preview'  #@param ["gemini-3-pro-preview"]
VERIFIER_GOOGLE_STRENGTH = 'low'  #@param ["low", "medium", "high"]

# =============================================================================
# APPLY MODEL CONFIGURATION TO SCRIPT
# =============================================================================
# Copy the form selections onto the matching attributes of the `script`
# module and derive the two driver flags that are handed to the pipeline.
# Only the attributes of the selected provider are written.

# Extractor: any provider other than 'openai' is treated as Google/Gemini
use_gemini_driver = EXTRACTOR_PROVIDER != 'openai'
if use_gemini_driver:
    script.GEMINI_EXTRACT_MODEL = EXTRACTOR_GOOGLE_MODEL
    script.THINKING_LEVEL_GEMINI = EXTRACTOR_GOOGLE_STRENGTH
else:
    script.OPENAI_EXTRACT_MODEL = EXTRACTOR_OPENAI_MODEL
    script.REASONING_EFFORT_OPENAI = EXTRACTOR_OPENAI_STRENGTH

# Verifier: any provider other than 'openai' is treated as Google/Gemini
use_openai_verifier = VERIFIER_PROVIDER == 'openai'
if use_openai_verifier:
    script.OPENAI_VERIFIER_MODEL = VERIFIER_OPENAI_MODEL
    script.VERIFIER_REASONING_EFFORT_OPENAI = VERIFIER_OPENAI_STRENGTH
else:
    script.GEMINI_VERIFIER_MODEL = VERIFIER_GOOGLE_MODEL
    script.VERIFIER_THINKING_LEVEL_GEMINI = VERIFIER_GOOGLE_STRENGTH

# =============================================================================
# DISPLAY CONFIGURATION SUMMARY
# =============================================================================
# Echo the effective settings before the (long-running) extraction starts.

summary_rule = '=' * 60

print(summary_rule)
print('PAPERCHECKER - EXTRACTION CONFIGURATION')
print(summary_rule)
print('')
print(f'PDFs to process: {len(pdf_paths)}')
print(f'Output Excel:    {OUTPUT_XLSX}')
print(f'Output Word:     {OUTPUT_DOCX}')
print('')

# Resolve (provider label, model, strength) for each role from the selected provider
if EXTRACTOR_PROVIDER == 'openai':
    extractor_summary = ('OpenAI', EXTRACTOR_OPENAI_MODEL, EXTRACTOR_OPENAI_STRENGTH)
else:
    extractor_summary = ('Google', EXTRACTOR_GOOGLE_MODEL, EXTRACTOR_GOOGLE_STRENGTH)

if VERIFIER_PROVIDER == 'openai':
    verifier_summary = ('OpenAI', VERIFIER_OPENAI_MODEL, VERIFIER_OPENAI_STRENGTH)
else:
    verifier_summary = ('Google', VERIFIER_GOOGLE_MODEL, VERIFIER_GOOGLE_STRENGTH)

# Both roles are printed with the same four-line layout followed by a blank line
for heading, (provider_label, model_name, strength_level) in (
    ('EXTRACTOR:', extractor_summary),
    ('VERIFIER:', verifier_summary),
):
    print(heading)
    print(f'  Provider: {provider_label}')
    print(f'  Model:    {model_name}')
    print(f'  Strength: {strength_level}')
    print('')

print(summary_rule)
print('Starting extraction...')
print(summary_rule)

# =============================================================================
# RUN EXTRACTION PIPELINE
# =============================================================================
# Hands the PDFs, output paths, API keys and driver flags to the pipeline.
# Progress messages are routed to print() so they appear in the cell output.

results = script.run_pipeline(
    pdf_paths=pdf_paths,
    out_xlsx=OUTPUT_XLSX,
    out_docx=OUTPUT_DOCX,
    openai_api_key=OPENAI_API_KEY,
    google_api_key=GOOGLE_API_KEY,
    progress_fn=print,
    skip_existing_evals=True,
    use_gemini_driver=use_gemini_driver,
    use_openai_verifier=use_openai_verifier,
)

# =============================================================================
# DISPLAY RESULTS SUMMARY
# =============================================================================
# One line per paper. `dict.get(key, default)` only applies the default when
# the key is absent, so `or` fallbacks are used to also cover keys that are
# present with a null/empty value - otherwise a single null field would crash
# the summary after the whole extraction has already finished.

print('')
print('=' * 60)
print(f'EXTRACTION COMPLETE - {len(results)} paper(s) processed')
print('=' * 60)
print('')

for idx, result in enumerate(results, start=1):
    # Extract paper identification
    paper_id = result.get('paper_id') or {}
    pmid = paper_id.get('pmid') or 'N/A'

    # Extract study metadata
    study_type = result.get('study_type') or 'Unknown'

    # Extract validation status
    validation = result.get('validation') or {}
    needs_review = validation.get('needs_human_review', False)
    review_status = 'REVIEW NEEDED' if needs_review else 'OK'

    print(f'  {idx:3d}. PMID={pmid} | Type={study_type} | Status={review_status}')

print('')
print('Results saved to:')
print(f'  Excel: {OUTPUT_XLSX}')
print(f'  Word:  {OUTPUT_DOCX}')

In [None]:
# =============================================================================
# PAPERCHECKER - DOWNLOAD RESULTS
# =============================================================================
# Sends the output files of the extraction run to your browser:
#   - Excel file: Structured extraction data
#   - Word file: Human-readable review log
#   - Audit files: every file in the output folder matching *.audit_*.json
#
# NOTE: Files are also saved to Google Drive in the output folder.
# Requires OUTPUT_XLSX, OUTPUT_DOCX and OUTPUT_DIR from Cell 3.
# =============================================================================

#@title **4. DOWNLOAD RESULTS** { display-mode: "form" }

from google.colab import files
import os
import glob

print('=' * 50)
print('DOWNLOADING RESULTS')
print('=' * 50)
print('')

# Number of files handed to the browser so far
downloaded_count = 0

# --- Download main output files ---
main_files = [OUTPUT_XLSX, OUTPUT_DOCX]

for output_path in main_files:
    # Guard clause: report and skip outputs that were never written
    if not os.path.exists(output_path):
        print(f'  [SKIP] Not found: {output_path}')
        continue
    print(f'  Downloading: {os.path.basename(output_path)}')
    files.download(output_path)
    downloaded_count += 1

# --- Download audit files ---
# Audit files are matched by name pattern inside the output folder
audit_pattern = f'{OUTPUT_DIR}/*.audit_*.json'
audit_files = glob.glob(audit_pattern)

if audit_files:
    print('')
    print(f'  Downloading {len(audit_files)} audit file(s)...')
    for audit_path in audit_files:
        files.download(audit_path)
        downloaded_count += 1

# --- Summary ---
print('')
print('=' * 50)
print(f'DOWNLOAD COMPLETE - {downloaded_count} file(s)')
print('=' * 50)

In [None]:
# =============================================================================
# PAPERCHECKER - TROUBLESHOOTING & UTILITIES
# =============================================================================
# Run this cell to print troubleshooting tips and useful commands.
# The cell only prints text; it does not read or change any notebook state.
# =============================================================================

#@title **5. TROUBLESHOOTING** { display-mode: "form" }

guide_rule = '=' * 60

# The guide, one entry per output line ('' is a blank line).
# The "{PDF_FOLDER}" / "{OUTPUT_DIR}" placeholders are intentionally plain
# strings: they are shown as commands to paste into a cell, where IPython
# substitutes the variables itself.
guide_lines = (
    guide_rule,
    'TROUBLESHOOTING GUIDE',
    guide_rule,
    '',
    'API KEYS:',
    '  1. Click the key icon in the left sidebar',
    '  2. Add secret: OPENAI_API_KEY = your OpenAI key',
    '  3. Add secret: GOOGLE_API_KEY = your Google AI key',
    '  4. Re-run Cell 1 to reload keys',
    '',
    'NO PDFs FOUND:',
    '  1. Check PDF_FOLDER path in Cell 2',
    '  2. Ensure files have .pdf extension',
    '  3. Example: /content/drive/MyDrive/my_papers',
    '',
    'MODEL ERRORS:',
    '  1. Verify your API key has access to the selected model',
    '  2. Try a different model (e.g., gpt-5.1 instead of gpt-5.2)',
    '  3. Reduce strength level if hitting rate limits',
    '',
    'UPDATE CODE:',
    '  Run this command to get the latest version:',
    '  !cd /content/paperchecker && git pull',
    '',
    'USEFUL COMMANDS:',
    '  # List PDFs in folder',
    '  !ls -la "{PDF_FOLDER}"',
    '',
    '  # Check Drive mount',
    '  !ls /content/drive/MyDrive',
    '',
    '  # View output folder',
    '  !ls -la "{OUTPUT_DIR}"',
    '',
    guide_rule,
    'For more help, visit:',
    'https://github.com/maxrusse/paperchecker',
    guide_rule,
)

for guide_line in guide_lines:
    print(guide_line)