# Image Download Report Visualizations

This notebook generates comprehensive visualizations for the outputs of `report.py`.

## Data Sources:
- `report.json` - Main statistics report
- `duplicates.json` - Detailed duplicate analysis

## Visualizations Include:
- Image count distributions
- File size statistics
- Image dimensions analysis
- Format and color mode distributions
- Temporal analysis
- Quality metrics
- Duplicate analysis

In [None]:
# Import required libraries
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Global look-and-feel: seaborn-flavoured matplotlib style with the "husl" palette
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Default figure size and base font size for every matplotlib figure below
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
})

print("📊 Libraries loaded successfully!")

In [None]:
# Load data from JSON files
def load_report_data():
    """Load the main report and the duplicate analysis from the working directory.

    Reads ``report.json`` and ``duplicates.json`` (relative paths) and prints a
    short status message, including the report's ``generated_at`` value when
    present.

    Returns:
        tuple: ``(report_data, duplicates_data)`` as parsed JSON objects, or
        ``(None, None)`` when either file is missing, is not valid UTF-8, or
        does not contain valid JSON.
    """
    try:
        # JSON text is UTF-8 by specification; do not rely on the platform default encoding.
        with open('report.json', 'r', encoding='utf-8') as f:
            report_data = json.load(f)

        with open('duplicates.json', 'r', encoding='utf-8') as f:
            duplicates_data = json.load(f)

    except FileNotFoundError as e:
        print(f"❌ Error: {e}")
        print("Please run 'python report.py' first to generate the data files.")
        return None, None

    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        # A truncated or corrupt file is reported the same way as a missing one,
        # so the downstream cells fall back to their "no data" branches.
        print(f"❌ Error: could not parse data file: {e}")
        print("Please re-run 'python report.py' to regenerate the data files.")
        return None, None

    print("✅ Data loaded successfully!")
    # The timestamp is informational only; its absence must not discard loaded data.
    print(f"📅 Report generated: {report_data.get('generated_at', 'unknown')}")
    return report_data, duplicates_data

report_data, duplicates_data = load_report_data()

# Pull the three top-level report sections into their own names for the cells below
if report_data:
    quant_stats, temp_stats, quality_stats = (
        report_data[section]
        for section in ('quantitative_statistics', 'temporal_statistics', 'quality_checks')
    )

## 📊 Quantitative Statistics

In [None]:
# Images per Category (Go, Grow, Glow)
if not report_data:
    print("❌ No data available for visualization")
else:
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))
    category_palette = ['#ff6b6b', '#4ecdc4', '#45b7d1']

    # Panel 1: bar chart of image counts per category
    per_category = quant_stats['images_per_category']
    categories = list(per_category)
    category_counts = list(per_category.values())

    bars = ax1.bar(categories, category_counts, color=category_palette)
    ax1.set_title('Images per Category', fontsize=14, fontweight='bold')
    ax1.set_ylabel('Number of Images')

    # Print each count just above its bar (offset = 1% of the tallest bar)
    label_offset = max(category_counts, default=0) * 0.01
    for bar, count in zip(bars, category_counts):
        ax1.text(bar.get_x() + bar.get_width()/2., bar.get_height() + label_offset,
                 f'{count:,}', ha='center', va='bottom', fontweight='bold')

    # Panel 2: the same counts as percentage shares
    ax2.pie(category_counts, labels=categories, autopct='%1.1f%%', startangle=90,
            colors=category_palette)
    ax2.set_title('Category Distribution', fontsize=14, fontweight='bold')

    # Panel 3: the ten classes with the fewest images
    classes_df = pd.DataFrame(list(quant_stats['images_per_class'].items()),
                              columns=['Class', 'Count'])
    top_classes = classes_df.nsmallest(10, 'Count')  # smallest classes, despite the name
    lowest_counts = top_classes['Count']

    bars = ax3.barh(top_classes['Class'], lowest_counts, color='skyblue')
    ax3.set_title('Lowest 10 Classes by Image Count', fontsize=14, fontweight='bold')
    ax3.set_xlabel('Number of Images')

    # Print each count to the right of its bar (offset = 1% of the longest bar)
    label_offset = max(lowest_counts, default=0) * 0.01
    for bar, count in zip(bars, lowest_counts):
        ax3.text(count + label_offset, bar.get_y() + bar.get_height()/2.,
                 f'{count:,}', ha='left', va='center')

    # Panel 4: text-only summary box (axes hidden)
    ax4.axis('off')
    bytes_per_mb = 1024*1024
    size_stats = quant_stats['file_size_bytes']
    dims = quant_stats['dimensions']
    stats_text = f"""
📈 SUMMARY STATISTICS

Total Images: {quant_stats['total_image_count']:,}
Total Categories: {len(categories)}
Total Classes: {len(quant_stats['images_per_class'])}

File Size (MB):
  • Total: {size_stats['total_bytes'] / bytes_per_mb:.1f} MB
  • Average: {size_stats['average_bytes'] / bytes_per_mb:.2f} MB
  • Min: {size_stats['min_bytes'] / bytes_per_mb:.2f} MB
  • Max: {size_stats['max_bytes'] / bytes_per_mb:.2f} MB

Image Dimensions:
  • Avg: {dims['avg_width']:.0f} × {dims['avg_height']:.0f}
  • Min: {dims['min_width']} × {dims['min_height']}
  • Max: {dims['max_width']} × {dims['max_height']}
    """
    ax4.text(0.1, 0.9, stats_text, transform=ax4.transAxes, fontsize=11,
             verticalalignment='top', fontfamily='monospace',
             bbox={'boxstyle': 'round', 'facecolor': 'lightgray', 'alpha': 0.8})

    plt.tight_layout()
    plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.patheffects as pe

if not report_data:
    print("❌ No data available for visualization")
else:
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))
    fig.patch.set_alpha(0.0)  # fully transparent figure background

    def style_axes(ax):
        """Give *ax* a white face and a thin gray frame."""
        ax.set_frame_on(True)
        ax.set_facecolor('white')
        for spine in ax.spines.values():
            spine.set_edgecolor('gray')
            spine.set_linewidth(1.2)

    def label_bars(ax, bars, counts):
        """Write each count, thousands-separated, just above its bar."""
        offset = max(counts, default=0) * 0.01  # 1% of the tallest bar
        for bar, count in zip(bars, counts):
            ax.text(bar.get_x() + bar.get_width() / 2., bar.get_height() + offset,
                    f'{count:,}', ha='center', va='bottom')

    shadow_effect = [pe.withSimplePatchShadow(offset=(2, -2), shadow_rgbFace='gray', alpha=0.3)]

    # Panel 1: image formats as a pie; counts go in the legend instead of on the wedges
    format_stats = quant_stats['formats']
    formats = list(format_stats)
    format_counts = list(format_stats.values())
    colors = plt.cm.Set3(np.linspace(0, 1, len(formats)))

    wedges, texts = ax1.pie(format_counts, startangle=90, colors=colors,
                            wedgeprops={'path_effects': shadow_effect})
    ax1.set_title('Image Format Distribution', fontsize=14, fontweight='bold')
    ax1.legend(wedges, [f"{fmt} ({n})" for fmt, n in zip(formats, format_counts)],
               title="Formats", loc="center left", bbox_to_anchor=(1, 0.5))
    style_axes(ax1)

    # Panel 2: colour modes as labelled bars
    mode_stats = quant_stats['color_modes']
    modes = list(mode_stats)
    mode_counts = list(mode_stats.values())
    bars = ax2.bar(modes, mode_counts, color='lightcoral')

    ax2.set_title('Color Mode Distribution', fontsize=14, fontweight='bold')
    ax2.set_ylabel('Number of Images')
    ax2.tick_params(axis='x', rotation=45)
    label_bars(ax2, bars, mode_counts)
    style_axes(ax2)

    # Panel 3: file-size buckets.
    # NOTE: these counts are NOT measured. They are the total image count split by
    # the fixed shares below, so the chart shows an assumed distribution only.
    size_ranges = ['<100KB', '100KB-500KB', '500KB-1MB', '1MB-5MB', '>5MB']
    assumed_shares = [0.1, 0.3, 0.4, 0.15, 0.05]
    size_counts = [int(quant_stats['total_image_count'] * share) for share in assumed_shares]
    bars = ax3.bar(size_ranges, size_counts, color='lightgreen')

    ax3.set_title('File Size Distribution (Estimated)', fontsize=14, fontweight='bold')
    ax3.set_ylabel('Number of Images')
    ax3.tick_params(axis='x', rotation=45)
    label_bars(ax3, bars, size_counts)
    style_axes(ax3)

    # Panel 4: unique vs duplicate images from the hash analysis
    hash_stats = quant_stats['hash_analysis']
    labels = ['Unique Images', 'Duplicate Images']
    sizes = [hash_stats['unique_count'], hash_stats['duplicate_count']]
    colors = ['#90EE90', '#FFB6C1']

    wedges, texts = ax4.pie(sizes, startangle=90, colors=colors,
                            wedgeprops={'path_effects': shadow_effect})
    ax4.set_title('Image Uniqueness Analysis', fontsize=14, fontweight='bold')
    ax4.legend(wedges, [f"{name} ({n})" for name, n in zip(labels, sizes)],
               title="Hash Status", loc="center left", bbox_to_anchor=(1, 0.5))
    style_axes(ax4)

    # Extra horizontal/vertical spacing so the side legends do not overlap neighbours
    plt.subplots_adjust(wspace=0.35, hspace=0.35)
    plt.show()

    # To keep a copy with a transparent background, call:
    #   fig.savefig("image_stats_report.png", transparent=True, dpi=300)


## ⏰ Temporal Analysis

In [None]:
# Temporal Statistics Visualization
# Left panel: text summary of the download time span. Right panel: rate metrics.
if report_data and temp_stats['time_span']['earliest']:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # 1. Download Timeline Summary
    ax1.axis('off')

    time_span = temp_stats['time_span']
    earliest = datetime.fromisoformat(time_span['earliest'])
    latest = datetime.fromisoformat(time_span['latest'])
    duration = time_span['duration_hours']
    avg_interval = temp_stats['average_interval']['minutes']
    total_images = quant_stats['total_image_count']

    # A zero-length span (e.g. a single image) makes the rates undefined; report
    # them as unavailable instead of dividing by zero.
    has_span = duration > 0
    images_per_hour = total_images / duration if has_span else 0.0
    images_per_day = total_images / (duration / 24) if has_span else 0.0
    rate_label = f"{images_per_hour:.1f} images/hour" if has_span else "n/a (zero-length time span)"

    timeline_text = f"""
📅 DOWNLOAD TIMELINE

Start Time: {earliest.strftime('%Y-%m-%d %H:%M:%S')}
End Time: {latest.strftime('%Y-%m-%d %H:%M:%S')}

Duration: {duration:.2f} hours
         ({duration/24:.2f} days)

Average Interval: {avg_interval:.2f} minutes
                 ({avg_interval*60:.1f} seconds)

Total Images: {total_images:,}
Download Rate: {rate_label}
    """

    ax1.text(0.1, 0.9, timeline_text, transform=ax1.transAxes, fontsize=12,
             verticalalignment='top', fontfamily='monospace',
             bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.8))

    # 2. Download Speed Metrics (note: the three bars share one axis but differ in unit)
    metrics = ['Images/Hour', 'Images/Day', 'Minutes/Image']
    values = [images_per_hour, images_per_day, avg_interval]

    bars = ax2.bar(metrics, values, color=['#ff9999', '#66b3ff', '#99ff99'])
    ax2.set_title('Download Performance Metrics', fontsize=14, fontweight='bold')
    ax2.set_ylabel('Rate')

    # Add value labels slightly above each bar (offset = 1% of the tallest bar)
    label_offset = max(values) * 0.01
    for bar, value in zip(bars, values):
        ax2.text(bar.get_x() + bar.get_width()/2., bar.get_height() + label_offset,
                 f'{value:.1f}', ha='center', va='bottom', fontweight='bold')

    plt.tight_layout()
    plt.show()
else:
    print("❌ No temporal data available for visualization")

## 🔍 Quality Analysis

In [None]:
# Quality Checks Visualization
# 2x2 grid: success-rate pie, URL counts, classes with most failures, text summary.
if report_data:
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

    # 1. Success Rate (the report stores it as a percentage)
    success_rate = quality_stats['success_rate']
    failure_rate = 100 - success_rate

    labels = ['Successfully Downloaded', 'Failed Downloads']
    sizes = [success_rate, failure_rate]
    colors = ['#90EE90', '#FFB6C1']

    wedges, texts, autotexts = ax1.pie(sizes, labels=labels, autopct='%1.1f%%',
                                       startangle=90, colors=colors)
    ax1.set_title('Download Success Rate', fontsize=14, fontweight='bold')

    # 2. URLs Found vs Downloaded
    # "Failed" is the count of URLs found but missing metadata; "downloaded" is the rest.
    urls_found = quality_stats['total_urls_found']
    urls_missing = quality_stats['urls_found_but_missing_metadata']
    urls_downloaded = urls_found - urls_missing

    categories = ['URLs Found', 'Successfully Downloaded', 'Failed Downloads']
    values = [urls_found, urls_downloaded, urls_missing]
    colors = ['#87CEEB', '#90EE90', '#FFB6C1']

    bars = ax2.bar(categories, values, color=colors)
    ax2.set_title('URL Processing Summary', fontsize=14, fontweight='bold')
    ax2.set_ylabel('Number of URLs')
    ax2.tick_params(axis='x', rotation=45)

    # Add value labels slightly above each bar (offset = 1% of the tallest bar)
    label_offset = max(values) * 0.01
    for bar, value in zip(bars, values):
        ax2.text(bar.get_x() + bar.get_width()/2., bar.get_height() + label_offset,
                 f'{value:,}', ha='center', va='bottom')

    # 3. Classes with Issues (ten classes with the most missing downloads)
    class_issues = quality_stats['classes_with_issues']
    if class_issues:
        issues_df = pd.DataFrame(class_issues)
        top_issues = issues_df.nlargest(10, 'missing_downloads')
        missing_counts = top_issues['missing_downloads']

        bars = ax3.barh(top_issues['class'], missing_counts, color='lightcoral')
        ax3.set_title('Top 10 Classes with Download Issues', fontsize=14, fontweight='bold')
        ax3.set_xlabel('Failed Downloads')

        # Add value labels to the right of each bar
        label_offset = max(missing_counts) * 0.01
        for bar, value in zip(bars, missing_counts):
            ax3.text(value + label_offset,
                     bar.get_y() + bar.get_height()/2.,
                     f'{value}', ha='left', va='center')
    else:
        ax3.text(0.5, 0.5, '✅ No Classes with Issues!',
                 ha='center', va='center', transform=ax3.transAxes,
                 fontsize=16, fontweight='bold', color='green')
        ax3.set_title('Classes with Download Issues', fontsize=14, fontweight='bold')

    # 4. Quality Metrics Summary
    ax4.axis('off')
    total_classes = len(quant_stats['images_per_class'])
    # With no classes at all the issue rate is reported as 0 instead of dividing by zero.
    issue_rate = len(class_issues) / total_classes * 100 if total_classes else 0.0
    quality_text = f"""
📊 QUALITY METRICS

Success Rate: {success_rate:.1f}%

URLs Found: {urls_found:,}
Successfully Downloaded: {urls_downloaded:,}
Failed Downloads: {urls_missing:,}

Classes with Issues: {len(class_issues)}
Total Classes: {total_classes}

Issue Rate: {issue_rate:.1f}%
    """

    ax4.text(0.1, 0.9, quality_text, transform=ax4.transAxes, fontsize=11,
             verticalalignment='top', fontfamily='monospace',
             bbox=dict(boxstyle='round', facecolor='lightyellow', alpha=0.8))

    plt.tight_layout()
    plt.show()
else:
    print("❌ No data available for visualization")

## 🔄 Duplicate Analysis

In [None]:
# Duplicate Analysis Visualization
# 2x2 grid: duplicate-type pie, summary counts, largest inter-class groups, text summary.
# quant_stats is also read here; load_report_data returns both datasets or neither,
# so it is defined whenever duplicates_data is truthy.
if duplicates_data:
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

    dup_summary = duplicates_data['duplicate_summary']

    # 1. Duplicate Types Distribution
    labels = ['Inter-Class\nDuplicates', 'Intra-Class\nDuplicates']
    sizes = [dup_summary['inter_class_duplicate_hashes'], dup_summary['intra_class_duplicate_hashes']]
    colors = ['#FFB6C1', '#FFA07A']

    if sum(sizes) > 0:
        wedges, texts, autotexts = ax1.pie(sizes, labels=labels, autopct='%1.1f%%',
                                           startangle=90, colors=colors)
    else:
        # A pie cannot be drawn from all-zero sizes; show a message instead.
        ax1.text(0.5, 0.5, '✅ No Duplicates Found!',
                 ha='center', va='center', transform=ax1.transAxes,
                 fontsize=16, fontweight='bold', color='green')
    ax1.set_title('Duplicate Types Distribution', fontsize=14, fontweight='bold')

    # 2. Duplicate Summary Stats
    metrics = ['Total\nDuplicate\nHashes', 'Inter-Class\nDuplicates', 'Intra-Class\nDuplicates', 'Total\nDuplicate\nFiles']
    values = [
        dup_summary['total_duplicate_hashes'],
        dup_summary['inter_class_duplicate_hashes'],
        dup_summary['intra_class_duplicate_hashes'],
        dup_summary['total_duplicate_files'],
    ]

    bars = ax2.bar(metrics, values, color=['#ff9999', '#FFB6C1', '#FFA07A', '#ffcc99'])
    ax2.set_title('Duplicate Statistics', fontsize=14, fontweight='bold')
    ax2.set_ylabel('Count')
    ax2.tick_params(axis='x', rotation=0)

    # Add value labels slightly above each bar (offset = 1% of the tallest bar)
    label_offset = max(values) * 0.01
    for bar, value in zip(bars, values):
        ax2.text(bar.get_x() + bar.get_width()/2., bar.get_height() + label_offset,
                 f'{value:,}', ha='center', va='bottom', fontweight='bold')

    # 3. Top Inter-Class Duplicates
    inter_class_dups = duplicates_data['inter_class_duplicates']
    if inter_class_dups:
        # Rank duplicate groups by how many files share the hash; the hashes
        # themselves are replaced by "Hash 1", "Hash 2", ... on the axis.
        counts = sorted((len(files) for files in inter_class_dups.values()), reverse=True)[:10]
        hashes = [f"Hash {rank}" for rank in range(1, len(counts) + 1)]

        bars = ax3.barh(hashes, counts, color='lightcoral')
        ax3.set_xlabel('Number of Duplicate Files')

        # Add value labels to the right of each bar
        label_offset = max(counts) * 0.01
        for bar, count in zip(bars, counts):
            ax3.text(count + label_offset, bar.get_y() + bar.get_height()/2.,
                     f'{count}', ha='left', va='center')
    else:
        ax3.text(0.5, 0.5, '✅ No Inter-Class Duplicates!',
                 ha='center', va='center', transform=ax3.transAxes,
                 fontsize=14, fontweight='bold', color='green')
    ax3.set_title('Top 10 Inter-Class Duplicates', fontsize=14, fontweight='bold')

    # 4. Duplicate Summary Text
    ax4.axis('off')
    hash_stats = quant_stats['hash_analysis']
    total_hashed = hash_stats['total_count']
    # With nothing hashed the rate is reported as 0 instead of dividing by zero.
    duplication_rate = dup_summary['total_duplicate_hashes'] / total_hashed * 100 if total_hashed else 0.0
    duplicate_text = f"""
🔄 DUPLICATE ANALYSIS

Total Unique Hashes: {hash_stats['unique_count']:,}
Total Duplicate Hashes: {dup_summary['total_duplicate_hashes']:,}

Inter-Class Duplicates: {dup_summary['inter_class_duplicate_hashes']:,}
  (Images appearing in multiple classes)

Intra-Class Duplicates: {dup_summary['intra_class_duplicate_hashes']:,}
  (Images duplicated within same class)

Total Duplicate Files: {dup_summary['total_duplicate_files']:,}

Duplication Rate: {duplication_rate:.1f}%
    """

    ax4.text(0.1, 0.9, duplicate_text, transform=ax4.transAxes, fontsize=11,
             verticalalignment='top', fontfamily='monospace',
             bbox=dict(boxstyle='round', facecolor='lightpink', alpha=0.8))

    plt.tight_layout()
    plt.show()
else:
    print("❌ No duplicate data available for visualization")

## 📈 Interactive Plotly Visualizations

In [None]:
# Interactive Dashboard with Plotly
# One 2x2 figure: category bars, format pie, URL-processing bars, duplicate-type pie.
if not report_data:
    print("❌ No data available for interactive visualization")
else:
    # Grid layout: bar charts in the left column, pies in the right column
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Images per Category', 'Format Distribution',
                        'Quality Metrics', 'Duplicate Analysis'),
        specs=[[{"type": "bar"}, {"type": "pie"}],
               [{"type": "bar"}, {"type": "pie"}]],
    )

    # Top-left: images per category
    per_category = quant_stats['images_per_category']
    categories = list(per_category)
    category_counts = list(per_category.values())
    category_bar = go.Bar(x=categories, y=category_counts, name="Categories",
                          marker_color=['#ff6b6b', '#4ecdc4', '#45b7d1'])
    fig.add_trace(category_bar, row=1, col=1)

    # Top-right: image format shares
    format_stats = quant_stats['formats']
    formats = list(format_stats)
    format_counts = list(format_stats.values())
    fig.add_trace(go.Pie(labels=formats, values=format_counts, name="Formats"),
                  row=1, col=2)

    # Bottom-left: URLs found, downloaded (found minus missing) and failed
    success_rate = quality_stats['success_rate']
    urls_found = quality_stats['total_urls_found']
    urls_missing = quality_stats['urls_found_but_missing_metadata']
    urls_downloaded = urls_found - urls_missing
    quality_bar = go.Bar(x=['URLs Found', 'Downloaded', 'Failed'],
                         y=[urls_found, urls_downloaded, urls_missing],
                         name="Quality",
                         marker_color=['#87CEEB', '#90EE90', '#FFB6C1'])
    fig.add_trace(quality_bar, row=2, col=1)

    # Bottom-right: duplicate types; the cell stays empty without duplicate data
    if duplicates_data:
        dup_summary = duplicates_data['duplicate_summary']
        inter_class = dup_summary['inter_class_duplicate_hashes']
        intra_class = dup_summary['intra_class_duplicate_hashes']

        if inter_class + intra_class > 0:
            pie_labels, pie_values = ['Inter-Class', 'Intra-Class'], [inter_class, intra_class]
        else:
            # Single placeholder wedge so the subplot still renders
            pie_labels, pie_values = ['No Duplicates'], [1]
        fig.add_trace(go.Pie(labels=pie_labels, values=pie_values, name="Duplicates"),
                      row=2, col=2)

    # Centered title, fixed height, no shared legend
    fig.update_layout(
        title_text="📊 Image Download Report Dashboard",
        title_x=0.5,
        height=800,
        showlegend=False,
    )

    fig.show()

## 💾 Export Visualizations

In [None]:
# Save all visualizations as files
import os

def save_summary_chart():
    """Render the 2x2 summary figure, save it as a PNG and display it.

    Reads the notebook-level ``report_data``, ``quant_stats`` and
    ``quality_stats``. Does nothing when ``report_data`` is empty. The image is
    written to ``visualizations/report_summary.png`` at 300 dpi; the directory
    is created if needed.
    """
    if not report_data:
        return

    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle('📊 Image Download Report Summary', fontsize=16, fontweight='bold')

    # Top-left: share of images per category
    per_category = quant_stats['images_per_category']
    categories = list(per_category)
    ax1.pie(list(per_category.values()), labels=categories, autopct='%1.1f%%', startangle=90)
    ax1.set_title('Images per Category')

    # Top-right: image count per file format
    format_stats = quant_stats['formats']
    ax2.bar(list(format_stats), list(format_stats.values()), color='lightblue')
    ax2.set_title('Image Formats')
    ax2.tick_params(axis='x', rotation=45)

    # Bottom-left: success vs failure, with success_rate taken as a percentage
    success_rate = quality_stats['success_rate']
    ax3.pie([success_rate, 100-success_rate], labels=['Success', 'Failed'],
            autopct='%1.1f%%', colors=['lightgreen', 'lightcoral'])
    ax3.set_title('Download Success Rate')

    # Bottom-right: headline numbers as monospace text on hidden axes
    ax4.axis('off')
    total_mb = quant_stats['file_size_bytes']['total_bytes']/(1024*1024)
    hash_stats = quant_stats['hash_analysis']
    summary_text = f"""
📈 KEY STATISTICS

Total Images: {quant_stats['total_image_count']:,}
Categories: {len(categories)}
Classes: {len(quant_stats['images_per_class'])}

Success Rate: {success_rate:.1f}%
Total Size: {total_mb:.1f} MB

Duplicates: {hash_stats['duplicate_count']:,}
Unique: {hash_stats['unique_count']:,}
    """

    ax4.text(0.1, 0.9, summary_text, transform=ax4.transAxes, fontsize=12,
             verticalalignment='top', fontfamily='monospace',
             bbox={'boxstyle': 'round', 'facecolor': 'lightgray', 'alpha': 0.8})

    plt.tight_layout()

    # Make sure the output directory exists, then write the PNG before showing it
    output_dir = 'visualizations'
    output_path = f'{output_dir}/report_summary.png'
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.show()

    print(f"✅ Summary chart saved to '{output_path}'")

# Generate and save summary chart
save_summary_chart()

## 📋 Data Export

In [None]:
# Export data to CSV for further analysis
def export_data_to_csv():
    """Export key report statistics to CSV files under ``exported_data/``.

    Reads the notebook-level ``report_data``, ``quant_stats`` and
    ``quality_stats``. Does nothing when ``report_data`` is empty. Writes one
    CSV each for images per class, images per category, format distribution
    and summary statistics, plus ``quality_issues.csv`` only when at least one
    class has issues. Finally prints the names of the files written in this
    call.
    """
    # Local import so this cell does not depend on another cell having imported os.
    import os

    if not report_data:
        return

    # Create data directory
    out_dir = 'exported_data'
    os.makedirs(out_dir, exist_ok=True)

    written = []

    def write_csv(frame, filename):
        """Write *frame* without its index and remember the file name."""
        frame.to_csv(f'{out_dir}/{filename}', index=False)
        written.append(filename)

    # 1. Images per class
    write_csv(pd.DataFrame(list(quant_stats['images_per_class'].items()),
                           columns=['Class', 'Image_Count']),
              'images_per_class.csv')

    # 2. Images per category
    write_csv(pd.DataFrame(list(quant_stats['images_per_category'].items()),
                           columns=['Category', 'Image_Count']),
              'images_per_category.csv')

    # 3. Format distribution
    write_csv(pd.DataFrame(list(quant_stats['formats'].items()),
                           columns=['Format', 'Count']),
              'format_distribution.csv')

    # 4. Quality issues (skipped when no class has issues)
    if quality_stats['classes_with_issues']:
        write_csv(pd.DataFrame(quality_stats['classes_with_issues']),
                  'quality_issues.csv')

    # 5. Summary statistics
    summary_data = {
        'Metric': ['Total Images', 'Total Categories', 'Total Classes',
                   'Success Rate (%)', 'Total Size (MB)', 'Unique Images', 'Duplicate Images'],
        'Value': [quant_stats['total_image_count'],
                  len(quant_stats['images_per_category']),
                  len(quant_stats['images_per_class']),
                  quality_stats['success_rate'],
                  quant_stats['file_size_bytes']['total_bytes']/(1024*1024),
                  quant_stats['hash_analysis']['unique_count'],
                  quant_stats['hash_analysis']['duplicate_count']],
    }
    write_csv(pd.DataFrame(summary_data), 'summary_statistics.csv')

    # List only the files written by this call.
    print("✅ Data exported to CSV files in 'exported_data/' directory:")
    for filename in written:
        print(f"   📄 {filename}")

export_data_to_csv()

## 🎯 Conclusion

This notebook provides comprehensive visualizations for the image download report data. The charts include:

### 📊 **Quantitative Analysis**
- Image distributions by category and class
- File format and color mode analysis
- File size statistics
- Hash uniqueness analysis

### ⏰ **Temporal Analysis**
- Download timeline and duration
- Performance metrics and rates

### 🔍 **Quality Assessment**
- Success/failure rates
- Classes with download issues
- URL processing efficiency

### 🔄 **Duplicate Analysis**
- Inter-class vs intra-class duplicates
- Duplicate distribution and statistics

### 💾 **Exports**
- High-resolution summary chart
- CSV data exports for further analysis
- Interactive Plotly dashboard (rendered inline in the notebook; not written to disk)

---

**Usage**: Run all cells in order to generate the complete set of visualizations. `report.json` and `duplicates.json` must exist in the notebook's working directory (generate them by running `python report.py`).