In [1]:
# Cell 1 - Setup and imports
# eTOUR/eTOUR/03_Fix_Java_Formatting.ipynb
import os
import re
import logging
import shutil
import json
import time
from pathlib import Path
from dotenv import load_dotenv
import anthropic  # Import the official Anthropic SDK

# Configure logging with syslog-style formatting
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(name)s[%(process)d]: %(levelname)s %(message)s',
    datefmt='%b %d %H:%M:%S'
)
logger = logging.getLogger("java_formatter")

# Load environment variables (a local .env file may supply the values below)
load_dotenv()
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")
if not ANTHROPIC_API_KEY:
    # Surface the misconfiguration here rather than on the first API request
    logger.warning("ANTHROPIC_API_KEY is not set; requests to the Anthropic API are expected to fail")

# Model used for formatting; override with CLAUDE_3_7_MODEL_ID.
# The default is the Claude 3.7 Sonnet snapshot identifier.
MODEL_ID = os.getenv("CLAUDE_3_7_MODEL_ID", "claude-3-7-sonnet-20250219")

# Use relative paths
JAVA_DIR = os.path.join(".", "java")

# Create backup directory (exist_ok avoids a crash if it appears between
# the check and the creation)
BACKUP_DIR = os.path.join(".", "java_backup")
if not os.path.isdir(BACKUP_DIR):
    os.makedirs(BACKUP_DIR, exist_ok=True)
    logger.info(f"Created backup directory at {BACKUP_DIR}")

# Initialize Anthropic client
anthropic_client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)

In [2]:
# Cell 2 - Functions to interact with Claude API
# eTOUR/eTOUR/03_Fix_Java_Formatting.ipynb

def create_backup(file_path):
    """
    Create a byte-for-byte backup of the original file in BACKUP_DIR

    The backup keeps the file's base name, so an existing backup with the
    same name is overwritten.

    Args:
        file_path: Path to the file to backup

    Returns:
        bool: True if backup was successful, False otherwise
    """
    try:
        # Make sure the destination exists even if the setup cell was skipped
        os.makedirs(BACKUP_DIR, exist_ok=True)
        backup_path = os.path.join(BACKUP_DIR, os.path.basename(file_path))

        # Copy raw bytes: decoding with errors='replace' and re-encoding would
        # permanently lose any non-UTF-8 characters and rewrite line endings,
        # so the "backup" could not restore the original file.
        shutil.copy2(file_path, backup_path)

        logger.debug(f"Created backup of {file_path} at {backup_path}")
        return True
    except Exception as e:
        logger.error(f"Failed to create backup of {file_path}: {str(e)}")
        return False

def _extract_java_code(response):
    """
    Return the Java source contained in a model reply

    If the reply contains a ```java fence, the text between that fence and
    the last ``` is returned, stripped. If the closing fence is missing,
    everything after the opening fence is returned. A reply without a
    ```java fence is returned unchanged.
    """
    opening_fence = "```java"
    start_idx = response.find(opening_fence)
    if start_idx == -1:
        return response

    start_idx += len(opening_fence)
    end_idx = response.rfind("```")
    if end_idx < start_idx:
        # rfind matched the opening fence itself, so there is no closing
        # fence; slicing up to it would silently produce an empty string.
        end_idx = len(response)
    return response[start_idx:end_idx].strip()


def format_with_claude(content, max_tokens=10000):
    """
    Use Claude 3.7 to fix Java formatting with streaming

    Args:
        content: Raw Java code content as string
        max_tokens: Upper bound on the number of tokens Claude may generate

    Returns:
        Formatted Java code as string, or None if the request failed, the
        reply was cut off at max_tokens, or the reply contained no code
    """
    try:
        # Define the system prompt for formatting
        system_prompt = """You are a Java formatting expert. 
Your task is to fix formatting issues in Java code without changing functionality.
Common issues to fix include:
1. Fixing comment syntax: "/ /" should be "//", "/ **" should be "/**", "* /" should be "*/"
2. Proper indentation based on code blocks
3. Replacing parentheses ( ) used for code blocks with braces { }
4. Fixing spacing in operators like "& &" should be "&&"
5. Proper capitalization of keywords (private, public, try, etc.)
6. Proper spacing around methods, parameters, and commas

Return ONLY the formatted code without any explanation or additional text.
"""

        # Make API call using the Anthropic SDK with streaming
        with anthropic_client.messages.stream(
            model=MODEL_ID,
            system=system_prompt,
            max_tokens=max_tokens,
            messages=[
                {"role": "user", "content": f"Please format this Java code correctly:\n\n```java\n{content}\n```"}
            ]
        ) as stream:
            full_response = "".join(stream.text_stream)
            final_message = stream.get_final_message()

        # A reply that hit the token limit is an incomplete file; callers
        # write the result back to disk, so never hand them a truncated one.
        if final_message.stop_reason == "max_tokens":
            logger.error(f"Claude response was truncated at max_tokens={max_tokens}; discarding it")
            return None

        formatted_code = _extract_java_code(full_response)
        if not formatted_code.strip():
            logger.error("Claude returned no code")
            return None

        return formatted_code
    except Exception as e:
        logger.error(f"Error formatting with Claude: {str(e)}")
        return None

def process_java_file(file_path, dry_run=True, max_size_kb=90):
    """
    Process a Java file to fix formatting using Claude

    Args:
        file_path: Path to the Java file
        dry_run: If True, don't write changes, just report them
        max_size_kb: Files larger than this many KB are skipped to avoid API limits

    Returns:
        True if the file was rewritten (or, in a dry run, would be rewritten),
        False if it was skipped, unchanged, or could not be processed
    """
    try:
        # Check the size first so oversized files are never read into memory
        file_size = os.path.getsize(file_path) / 1024  # Size in KB
        logger.info(f"Processing {os.path.basename(file_path)} ({file_size:.1f} KB)")

        # Skip very large files to avoid API limits
        if file_size > max_size_kb:
            logger.warning(f"File {file_path} is too large ({file_size:.1f} KB), skipping")
            return False

        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            content = f.read()

        fixed_content = format_with_claude(content)
        if fixed_content is None:
            logger.error(f"Failed to format {file_path}")
            return False

        # Never replace a source file with nothing
        if not fixed_content.strip():
            logger.error(f"Refusing to overwrite {file_path} with empty output")
            return False

        if content == fixed_content:
            logger.debug(f"No changes needed for {file_path}")
            return False

        if dry_run:
            logger.info(f"Would fix formatting in {file_path} (dry run)")
            return True

        # Create backup before modifying; without one the file is left alone
        # and must not be counted as changed.
        if not create_backup(file_path):
            logger.warning(f"Skipped fixing {file_path} due to backup failure")
            return False

        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(fixed_content)
        logger.info(f"Fixed formatting in {file_path}")
        return True
    except Exception as e:
        logger.error(f"Error processing file {file_path}: {str(e)}")
        return False

In [None]:
# Cell 3 - Test on a single file
# eTOUR/eTOUR/03_Fix_Java_Formatting.ipynb

def show_diff(original, fixed):
    """
    Print a unified diff of two texts, colouring added and removed lines

    Lines starting with '+' are printed in green and lines starting with
    '-' in red (this includes the diff's file header lines); all other
    lines are printed as-is. Errors are logged, not raised.

    Args:
        original: Original content
        fixed: Fixed content
    """
    try:
        from difflib import unified_diff

        before_lines = original.splitlines(keepends=True)
        after_lines = fixed.splitlines(keepends=True)

        for entry in unified_diff(before_lines, after_lines,
                                  fromfile="Original", tofile="Fixed"):
            if entry.startswith('+'):
                rendered = f"\033[92m{entry}\033[0m"  # Green for additions
            elif entry.startswith('-'):
                rendered = f"\033[91m{entry}\033[0m"  # Red for removals
            else:
                rendered = entry
            print(rendered, end="")
    except Exception as e:
        logger.error(f"Error showing diff: {str(e)}")

# File used for the single-file trial run below
test_file = os.path.join(JAVA_DIR, "CostantiGlobali.java")  # This is a small file (932B)

def test_single_file(file_path):
    """
    Format one file with Claude and print the resulting diff

    Nothing is written to disk; the file is only read and the proposed
    changes are printed. Errors are logged, not raised.

    Args:
        file_path: Path to the file to test
    """
    try:
        with open(file_path, 'r', encoding='utf-8', errors='replace') as source:
            before = source.read()

        print(f"Testing Claude formatting on {file_path}...")
        after = format_with_claude(before)

        # format_with_claude signals failure with None
        if after is None:
            print("Error formatting the file. Check logs.")
            return

        print(f"Sample diff for {file_path}:")
        show_diff(before, after)

        if before != after:
            print(f"Changes would be made to {file_path}")
        else:
            print("No changes made by Claude.")
    except Exception as e:
        logger.error(f"Error testing file {file_path}: {str(e)}")

# Run the trial on the file chosen above
test_single_file(test_file)

In [4]:
# Cell 4 - Process all Java files
# eTOUR/eTOUR/03_Fix_Java_Formatting.ipynb

def process_all_files(dry_run=True, max_files=None):
    """
    Process Java files in the specified directory using Claude

    Files are taken from JAVA_DIR, smallest first, and passed one at a time
    to process_java_file.

    Args:
        dry_run: If True, don't write changes, just report them
        max_files: Maximum number of files to process (None for all;
            0 or a negative number processes no files)

    Returns:
        Tuple of (total files, files with changes); (0, 0) if an error occurs
    """
    try:
        changed_files = 0

        # Get all Java files and sort by size (smallest first)
        java_files = [os.path.join(JAVA_DIR, f) for f in os.listdir(JAVA_DIR)
                      if f.endswith('.java')]
        java_files.sort(key=os.path.getsize)

        # Compare against None explicitly: a plain truthiness test would
        # treat max_files=0 as "no limit" and process every file.
        if max_files is not None:
            java_files = java_files[:max(max_files, 0)]

        total_files = len(java_files)
        logger.info(f"Found {total_files} Java files to process")

        for file_path in java_files:
            # Skip files larger than 90KB to avoid API limits
            if os.path.getsize(file_path) / 1024 > 90:
                logger.warning(f"Skipping {file_path} (file too large)")
                continue

            if process_java_file(file_path, dry_run):
                changed_files += 1

        logger.info(f"Processed {total_files} files, {changed_files} needed changes")
        return total_files, changed_files
    except Exception as e:
        logger.error(f"Error processing files: {str(e)}")
        return 0, 0

# First do a dry run with a small number of files to test the API
# total, changed = process_all_files(dry_run=True, max_files=5)
# print(f"Dry run complete: {changed} of {total} files would be modified")

In [None]:
# Cell 5 - Run the actual formatting fix
# eTOUR/eTOUR/03_Fix_Java_Formatting.ipynb

def restore_backups():
    """
    Copy every .java file in BACKUP_DIR back into JAVA_DIR

    Existing files with the same name in JAVA_DIR are overwritten. The
    number of restored files is printed at the end. Errors are logged,
    not raised, and stop the restore at the file that failed.
    """
    try:
        restored = 0
        for entry in os.listdir(BACKUP_DIR):
            if not entry.endswith('.java'):
                continue
            shutil.copy2(os.path.join(BACKUP_DIR, entry), os.path.join(JAVA_DIR, entry))
            logger.info(f"Restored {entry} from backup")
            restored += 1
        print(f"Restored {restored} files from backup")
    except Exception as e:
        logger.error(f"Error restoring backups: {str(e)}")

# Uncomment to restore files from backup before running
# restore_backups()

# Run the actual formatting: dry_run=False writes the changes to disk.
# max_files caps how many files are processed (None processes all files).
total, changed = process_all_files(dry_run=False, max_files=150)
print(f"Formatting fixed: {changed} of {total} files were modified")

def verify_file(file_name):
    """
    Print the diff between a file's backup and its current version

    The backup is read from BACKUP_DIR and the current version from
    JAVA_DIR. Errors are logged, not raised.

    Args:
        file_name: Name of the file to verify
    """
    try:
        backup_copy = os.path.join(BACKUP_DIR, file_name)
        current_copy = os.path.join(JAVA_DIR, file_name)

        # Read the backup first, then the current file
        texts = []
        for location in (backup_copy, current_copy):
            with open(location, 'r', encoding='utf-8', errors='replace') as handle:
                texts.append(handle.read())
        original, fixed = texts

        print(f"Differences in {file_name}:")
        show_diff(original, fixed)
    except Exception as e:
        logger.error(f"Error verifying file {file_name}: {str(e)}")

# Uncomment to verify a specific file after running the fix
# verify_file("CostantiGlobali.java")

In [6]:
# Cell 6 - Batch processing to handle API rate limits
# eTOUR/eTOUR/03_Fix_Java_Formatting.ipynb

def process_files_in_batches(batch_size=5, delay_seconds=5, dry_run=True):
    """
    Run process_java_file over JAVA_DIR in fixed-size batches, pausing between them

    Only .java files of at most 90 KB are considered, smallest first. The
    pause is skipped after the final batch.

    Args:
        batch_size: Number of files to process in each batch
        delay_seconds: Delay between batches in seconds
        dry_run: If True, don't write changes, just report them

    Returns:
        Tuple of (total files, files with changes); (0, 0) if an error occurs
    """
    try:
        # Collect the Java files, smallest first
        candidates = []
        for name in os.listdir(JAVA_DIR):
            if name.endswith('.java'):
                candidates.append(os.path.join(JAVA_DIR, name))
        candidates.sort(key=os.path.getsize)

        # Drop anything above the 90 KB limit
        java_files = [path for path in candidates if os.path.getsize(path) / 1024 <= 90]

        total_files = len(java_files)
        changed_files = 0

        logger.info(f"Processing {total_files} files in batches of {batch_size}")

        for start in range(0, total_files, batch_size):
            stop = start + batch_size
            logger.info(f"Processing batch {start//batch_size + 1} of {(total_files + batch_size - 1)//batch_size}")

            for file_path in java_files[start:stop]:
                if process_java_file(file_path, dry_run):
                    changed_files += 1

            # Pause only when another batch follows, to stay under rate limits
            if stop < total_files:
                logger.info(f"Waiting {delay_seconds} seconds before next batch...")
                time.sleep(delay_seconds)

        logger.info(f"Batch processing complete: {changed_files} of {total_files} files modified")
        return total_files, changed_files
    except Exception as e:
        logger.error(f"Error in batch processing: {str(e)}")
        return 0, 0

# Uncomment to run batch processing with appropriate parameters
# process_files_in_batches(batch_size=5, delay_seconds=10, dry_run=True)