In [None]:
import pandas as pd
import json
import time
from anthropic import Anthropic
from tqdm import tqdm
import os
from datetime import datetime
from dotenv import load_dotenv


# Let python-dotenv populate the process environment before the key is read.
load_dotenv()
# API key later passed to the Anthropic client; os.getenv returns None if the
# API_KEY variable is not set.
API_KEY = os.getenv('API_KEY')

In [None]:

# Checkpoint helpers: persist partial results to disk and restore them on restart.
def save_checkpoint(df, current_idx, checkpoint_dir):
    """
    Write the DataFrame to a timestamped checkpoint CSV and record its path.

    Args:
        df: DataFrame holding the results gathered so far
        current_idx: Row index reached when the checkpoint is taken; it is
            embedded in the file name and in the log message
        checkpoint_dir: Directory for checkpoint files (created if missing)

    Side effects:
        Overwrites 'last_checkpoint.txt' inside checkpoint_dir with the path
        of the CSV just written and prints a confirmation line.
    """
    os.makedirs(checkpoint_dir, exist_ok=True)

    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_name = f'checkpoint_{stamp}_idx_{current_idx}.csv'
    checkpoint_path = os.path.join(checkpoint_dir, csv_name)
    df.to_csv(checkpoint_path, index=False)

    # The pointer file always names the newest checkpoint so a restart does
    # not have to scan the directory.
    pointer_path = os.path.join(checkpoint_dir, 'last_checkpoint.txt')
    with open(pointer_path, 'w') as pointer:
        pointer.write(checkpoint_path)

    print(f"\nCheckpoint saved at index {current_idx}: {checkpoint_path}")

def load_latest_checkpoint(checkpoint_dir):
    """
    Restore the most recent checkpoint recorded in checkpoint_dir.

    Reads the path stored in 'last_checkpoint.txt', loads that CSV and works
    out how far processing got by taking the highest row index whose
    'has_woman_first_name' cell is filled in.

    Args:
        checkpoint_dir: Directory containing the checkpoint files

    Returns:
        tuple: (DataFrame, last processed index). (None, -1) is returned when
        the directory, the pointer file, or the CSV it names does not exist;
        the index is -1 when the CSV has no filled-in row yet.
    """
    nothing_to_resume = (None, -1)

    pointer_path = os.path.join(checkpoint_dir, 'last_checkpoint.txt')
    if not (os.path.exists(checkpoint_dir) and os.path.exists(pointer_path)):
        return nothing_to_resume

    with open(pointer_path, 'r') as pointer:
        csv_path = pointer.read().strip()

    # The pointer may name a checkpoint that has since been deleted.
    if not os.path.exists(csv_path):
        return nothing_to_resume

    df = pd.read_csv(csv_path)
    analyzed_rows = df[df['has_woman_first_name'].notna()]
    last_processed = analyzed_rows.index.max()
    if pd.isna(last_processed):
        # No row carries a result yet, so processing restarts from row 0.
        last_processed = -1

    print(f"Loaded checkpoint: {csv_path}")
    return df, last_processed

def _parse_json_object(text):
    """Parse JSON from a model reply, tolerating text around the object."""
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # The reply may be wrapped in a Markdown code fence or carry a short
        # preamble; fall back to the outermost {...} span before giving up.
        start, end = text.find('{'), text.rfind('}')
        if start == -1 or end < start:
            raise
        return json.loads(text[start:end + 1])


def analyze_movie_name(client, movie_name):
    """
    Analyze a movie name using Claude API with optimized token usage.

    The request is attempted up to three times. A reply counts as a failure
    (and triggers a retry) when the API call raises, when the text cannot be
    parsed as JSON, or when the parsed object lacks any of the expected keys.

    Args:
        client: Anthropic client instance
        movie_name: String containing the movie name to analyze

    Returns:
        dict: Dictionary containing the analysis results, with the keys
        has_woman_first_name, has_man_first_name, has_last_name,
        has_complete_woman_name, has_complete_man_name and names_found;
        None if every attempt failed.
    """
    max_retries = 3
    retry_delay = 2
    expected_keys = (
        'has_woman_first_name',
        'has_man_first_name',
        'has_last_name',
        'has_complete_woman_name',
        'has_complete_man_name',
        'names_found',
    )

    # Optimized prompt with minimal instructions and example. It does not
    # change between attempts, so it is built once.
    prompt = f"""Title: "{movie_name}"
Check for name types:
-Woman first name
-Man first name
-Last name
-Full woman name
-Full man name

Return only JSON:
{{"has_woman_first_name":bool,"has_man_first_name":bool,"has_last_name":bool,"has_complete_woman_name":bool,"has_complete_man_name":bool,"names_found":[]}}"""

    for attempt in range(max_retries):
        try:
            message = client.messages.create(
                model="claude-3-5-sonnet-20240620",
                max_tokens=150,
                temperature=0,
                system="You are a name analyzer. Always respond with valid JSON only.",  # Shortened system prompt
                messages=[{"role": "user", "content": prompt}]
            )

            response = _parse_json_object(message.content[0].text)

            # Reject structurally wrong replies here so callers can index the
            # result by key without guarding every access.
            if not isinstance(response, dict):
                raise ValueError("response is not a JSON object")
            missing = [key for key in expected_keys if key not in response]
            if missing:
                raise ValueError(f"response is missing keys: {missing}")

            return response

        except Exception as e:
            # Broad on purpose: network errors, API errors and malformed
            # replies are all handled the same way (retry, then give up).
            if attempt < max_retries - 1:
                # Linear back-off; report the delay that is actually applied.
                delay = retry_delay * (attempt + 1)
                print(f"Attempt {attempt + 1} failed for '{movie_name}': {str(e)}")
                print(f"Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                print(f"All retries failed for '{movie_name}': {str(e)}")
                return None

def analyze_movie_dataset(csv_path, api_key, checkpoint_interval=100, batch_size=None):
    """
    Analyze all movie names in a CSV file using Claude API with robust checkpointing
    and optional batch processing.

    Rows are read from the 'Movie Name' column. If a checkpoint exists in the
    'checkpoints' directory, processing resumes after the last row it covers;
    otherwise the CSV is loaded fresh. Results are written to
    'movie_names_analysis.csv' at every checkpoint and again when the run
    ends, whether it finished, failed, or was interrupted. An unfinished run
    also writes a final checkpoint so the next run resumes where this one
    stopped.

    Args:
        csv_path: Path to the CSV file
        api_key: Anthropic API key
        checkpoint_interval: Number of movies to process before saving
            checkpoint; 0 or None disables periodic checkpoints
        batch_size: Optional batch size; when set, rows are processed in
            slices of this size and the progress bar advances per slice

    Returns:
        None. Prints summary statistics and, after a complete run, asks on
        stdin whether the checkpoint directory should be deleted.
    """
    # Use the key supplied by the caller rather than the module-level global.
    client = Anthropic(api_key=api_key)

    checkpoint_dir = 'checkpoints'
    output_path = 'movie_names_analysis.csv'

    results_columns = [
        'has_woman_first_name',
        'has_man_first_name',
        'has_last_name',
        'has_complete_woman_name',
        'has_complete_man_name',
        'names_found'
    ]

    checkpoint_df, last_processed_idx = load_latest_checkpoint(checkpoint_dir)

    if checkpoint_df is None:
        print("Starting new analysis...")
        df = pd.read_csv(csv_path)
        for col in results_columns:
            df[col] = None
    else:
        df = checkpoint_df
        print(f"Resuming analysis from index {last_processed_idx + 1}")

    # A result column that is entirely empty in a checkpoint CSV is read back
    # as float64, which cannot hold the booleans and lists stored below.
    for col in results_columns:
        df[col] = df[col].astype(object)

    def process_row(idx, movie_name):
        """Analyze one title, store its result, and checkpoint when due."""
        result = analyze_movie_name(client, movie_name)

        if result:
            for col in results_columns:
                # .get() leaves the cell empty instead of aborting the whole
                # run when a field is absent from the reply.
                df.at[idx, col] = result.get(col)

        if checkpoint_interval and (idx - last_processed_idx) % checkpoint_interval == 0:
            save_checkpoint(df, idx, checkpoint_dir)
            df.to_csv(output_path, index=False)

    last_done_idx = None  # index of the most recent fully handled row
    completed = False     # set only when every remaining row was handled

    try:
        remaining_df = df.iloc[last_processed_idx + 1:]
        total_remaining = len(remaining_df)

        if batch_size:
            # Process in slices; the progress bar counts slices, not rows.
            batches = range(0, total_remaining, batch_size)
            for start_idx in tqdm(batches, desc="Processing batches"):
                batch = remaining_df.iloc[start_idx:start_idx + batch_size]

                for idx, row in batch.iterrows():
                    process_row(idx, row['Movie Name'])
                    last_done_idx = idx
                    time.sleep(1)  # Rate limiting
        else:
            # Process one at a time
            with tqdm(total=total_remaining, desc="Analyzing movies") as pbar:
                for idx, row in remaining_df.iterrows():
                    process_row(idx, row['Movie Name'])
                    last_done_idx = idx
                    pbar.update(1)
                    time.sleep(1)  # Rate limiting

        completed = True

    except KeyboardInterrupt:
        print("\nProcess interrupted by user. Progress has been saved.")
    except Exception as e:
        print(f"\nAn error occurred: {str(e)}")
        print("Progress has been saved in checkpoint files.")
    finally:
        df.to_csv(output_path, index=False)
        print(f"\nFinal results saved to {output_path}")

        if not completed and last_done_idx is not None:
            # Record a resume point so rows handled since the last periodic
            # checkpoint are not sent to the API again on the next run.
            save_checkpoint(df, last_done_idx, checkpoint_dir)

        print("\nSummary Statistics:")
        total_rows = len(df)
        for col in results_columns[:-1]:
            count = df[col].sum()
            percentage = (count / total_rows) * 100 if total_rows else 0
            print(f"{col}: {count} movies ({percentage:.1f}%)")

        if completed:
            print("\nProcessing completed successfully!")
            response = input("Do you want to clean up checkpoint files? (y/n): ")
            if response.lower() == 'y':
                import shutil
                # The directory does not exist when the run was shorter than
                # one checkpoint interval; that is not an error.
                shutil.rmtree(checkpoint_dir, ignore_errors=True)
                print("Checkpoint files removed.")

# Usage example: analyze every title in catalogo.csv with the key read from
# the environment. Runs only when executed directly, not when imported.
if __name__ == "__main__":
    api_key = API_KEY  # module-level key obtained via os.getenv('API_KEY')
    csv_path = "catalogo.csv"  # input file; rows are read from its 'Movie Name' column
    analyze_movie_dataset(csv_path, api_key)