In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def _save_current_figure(path):
    """Lay out, save, and close the current pyplot figure; return ``path``."""
    try:
        plt.tight_layout()
        plt.savefig(path)
    finally:
        # Close even when saving fails so the figure is not left open.
        plt.close()
    return path


def analyze_extraction_performance(performance_csv='extraction_performance.csv',
                                   output_dir='performance_visualizations'):
    """
    Comprehensive analysis of PDF extraction performance metrics.

    Reads the metrics CSV, computes per-method summary statistics, renders a
    set of comparison plots as PNG files, and writes the summary table to a
    text file. All artifacts are written into ``output_dir``, which is created
    if it does not exist.

    The CSV is expected to contain the columns 'Extraction Method',
    'extraction_time', 'memory_usage', 'cpu_usage' and
    'extracted_text_length'. A 'Filename' column is optional; the per-PDF bar
    graph is only produced when it is present.

    Args:
        performance_csv (str): Path to performance metrics CSV file
        output_dir (str): Directory that receives the plots and the summary
            file. Defaults to 'performance_visualizations'.

    Returns:
        dict: Summary statistics and visualization details, with keys
            'summary_statistics' (DataFrame of mean/min/max/std per
            extraction method, rounded to 4 decimals), 'visualizations'
            (list of six entries: the saved PNG paths, where the last entry
            is None when the bar graph was skipped) and 'summary_file'
            (path of the text summary).
    """
    import os

    metric_columns = ['extraction_time', 'memory_usage', 'cpu_usage',
                      'extracted_text_length']
    aggregations = ['mean', 'min', 'max', 'std']

    # Read performance data
    df = pd.read_csv(performance_csv)

    # Basic summary statistics: the same four aggregations for every metric
    summary_stats = df.groupby('Extraction Method').agg(
        {column: aggregations for column in metric_columns}
    ).round(4)

    # Create visualization directory
    os.makedirs(output_dir, exist_ok=True)

    def output_path(filename):
        # Joined with '/' (not os.path.join) so the returned strings stay
        # identical to the historical hard-coded paths on every platform.
        return f'{output_dir}/{filename}'

    visualizations = []

    # 1-3. Boxplots comparing each resource metric across extraction methods
    boxplot_specs = [
        ('extraction_time', 'Extraction Time Comparison', 'Time (seconds)',
         'extraction_time_boxplot.png'),
        ('memory_usage', 'Memory Usage Comparison', 'Memory (MB)',
         'memory_usage_boxplot.png'),
        ('cpu_usage', 'CPU Usage Comparison', 'CPU Usage (%)',
         'cpu_usage_boxplot.png'),
    ]
    for column, title, ylabel, filename in boxplot_specs:
        plt.figure(figsize=(10, 6))
        sns.boxplot(x='Extraction Method', y=column, data=df)
        plt.title(title)
        plt.ylabel(ylabel)
        visualizations.append(_save_current_figure(output_path(filename)))

    # 4. Scatter Plot: Extraction Time vs Text Length
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x='extracted_text_length', y='extraction_time',
                    hue='Extraction Method', data=df)
    plt.title('Extraction Time vs Text Length')
    plt.xlabel('Extracted Text Length')
    plt.ylabel('Extraction Time (seconds)')
    visualizations.append(
        _save_current_figure(output_path('time_vs_length_scatter.png'))
    )

    # 5. Heatmap for Correlation Between Metrics
    correlation = df[metric_columns].corr()
    plt.figure(figsize=(10, 8))
    sns.heatmap(correlation, annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Correlation Heatmap of Performance Metrics')
    visualizations.append(
        _save_current_figure(output_path('performance_correlation_heatmap.png'))
    )

    # 6. Bar Graph: Compare Extracted Text Length by Method for Each PDF
    bar_columns = {'Filename', 'Extraction Method', 'extracted_text_length'}
    if bar_columns.issubset(df.columns):
        plt.figure(figsize=(12, 8))
        sns.barplot(x='Filename', y='extracted_text_length',
                    hue='Extraction Method', data=df)
        plt.title('Comparison of Extracted Text Length by Method for Each PDF')
        plt.ylabel('Extracted Text Length')
        plt.xlabel('PDF Files')
        plt.xticks(rotation=45, ha='right')
        plt.legend(title='Extraction Method')
        bar_graph_path = _save_current_figure(
            output_path('extracted_text_bar_graph.png')
        )
    else:
        bar_graph_path = None
    # Kept in the list even when None so the list always has six entries.
    visualizations.append(bar_graph_path)

    # Save summary to text file. to_string() renders the whole table, whereas
    # str() follows the pandas display options and may wrap or truncate it.
    summary_file = output_path('performance_summary.txt')
    with open(summary_file, 'w', encoding='utf-8') as f:
        f.write("PDF Extraction Performance Summary\n")
        f.write("==================================\n\n")
        f.write(summary_stats.to_string())

    return {
        'summary_statistics': summary_stats,
        'visualizations': visualizations,
        'summary_file': summary_file,
    }

# Execute the analysis with its default CSV path and keep the result dict
performance_analysis = analyze_extraction_performance()

# Show the per-method summary table, followed by a note on where outputs live
print(
    performance_analysis['summary_statistics'],
    "\nVisualizations and summary have been saved in the 'performance_visualizations' directory.",
    sep="\n",
)


                  extraction_time                          memory_usage  \
                             mean     min      max     std         mean   
Extraction Method                                                         
PDFPlumber                 7.9836  0.0198  40.1128  7.0339    1218.8409   
PyPDF2                     2.6234  0.0079  10.4922  2.5403    1212.2071   
Textract                   0.6224  0.0080   3.6047  0.6477    1214.3786   

                                                  cpu_usage               \
                         min        max       std      mean   min    max   
Extraction Method                                                          
PDFPlumber         1085.2852  1585.8438  116.5849   34.5000  10.0  100.0   
PyPDF2             1085.4102  1704.0117  117.4629   25.3613   6.2   81.1   
Textract           1085.4102  1702.4219  115.8835   23.2447   6.3   75.3   

                           extracted_text_length                           
                