# Duplicate Analysis

This notebook analyzes duplicate detection results and hash efficiency:
- Duplicate detection overview
- Inter-class vs intra-class duplicates
- Hash efficiency metrics
- Storage impact analysis

## Setup and Data Loading

In [None]:
import sys
sys.path.append('../visualizers')
sys.path.append('../utils')

from duplicate_detector import create_combined_duplicate_analysis
from data_loader import load_report_data, get_duplicate_stats, get_overview_metrics
from plot_helpers import apply_global_style, display_config

# Apply the shared plot styling before any figure is created.
apply_global_style()

# Load the report once; every later cell reads from `data`.
data = load_report_data("../../sample_report.json")
generated_at = data.get("generated_at", "Unknown")  # fallback when the key is absent

print("Data loaded successfully!")
print(f"Generated at: {generated_at}")

## Duplicate Detection Summary

High-level duplicate detection metrics:

In [None]:
# Build the chart collection from the loaded report; the cells below look up
# individual figures in `charts` by key.
charts = create_combined_duplicate_analysis(data)

# Show the summary-cards figure using the shared display configuration.
summary_fig = charts['summary_cards']
summary_fig.show(config=display_config())

## Duplicate Detection Overview

Unique vs duplicate image distribution:

In [None]:
# Look up the overview figure in `charts` and show it with the shared display configuration.
overview_fig = charts['duplicate_overview']
overview_fig.show(config=display_config())

## Hash Efficiency Metrics

Effectiveness of the duplicate detection system:

In [None]:
# Look up the efficiency-metrics figure in `charts` and show it with the shared display configuration.
efficiency_fig = charts['efficiency_metrics']
efficiency_fig.show(config=display_config())

## Duplicate Type Breakdown

Detailed analysis of different duplicate types:

In [None]:
# Look up the duplicate-breakdown figure in `charts` and show it with the shared display configuration.
breakdown_fig = charts['duplicate_breakdown']
breakdown_fig.show(config=display_config())

## Inter-Class vs Intra-Class Comparison

Distribution of duplicates within and across classes:

In [None]:
# Look up the class-comparison figure in `charts` and show it with the shared display configuration.
comparison_fig = charts['class_comparison']
comparison_fig.show(config=display_config())

## Duplicate Detection Funnel

Flow from total images to duplicate identification:

In [None]:
# Look up the distribution-funnel figure in `charts` and show it with the shared display configuration.
funnel_fig = charts['distribution_funnel']
funnel_fig.show(config=display_config())

## Impact Analysis

Storage and processing impact of duplicates:

In [None]:
# Look up the impact-analysis figure in `charts` and show it with the shared display configuration.
impact_fig = charts['impact_analysis']
impact_fig.show(config=display_config())

## Duplicate Statistics Report

Comprehensive duplicate analysis:

In [None]:
# Collect the statistics printed here; `duplicate_summary`, `uniqueness_rate`
# and `duplicate_rate` are also read by the Quality Assessment cell.
duplicate_stats = get_duplicate_stats(data)
overview_metrics = get_overview_metrics(data)
duplicate_summary = duplicate_stats['duplicate_summary']

total_count = duplicate_stats['total_count']
unique_count = duplicate_stats['unique_count']
duplicate_count = duplicate_stats['duplicate_count']

print("=== DUPLICATE DETECTION RESULTS ===")
print(f"Total Images Processed: {total_count:,}")
print(f"Unique Images: {unique_count:,}")
print(f"Duplicate Hashes: {duplicate_count:,}")

if total_count > 0:
    uniqueness_rate = (unique_count / total_count) * 100
    duplicate_rate = (duplicate_count / total_count) * 100
else:
    # An empty report would otherwise raise ZeroDivisionError and leave the
    # rates undefined for the next cell. With no images there are no
    # duplicates, so report full uniqueness and a zero duplicate rate.
    uniqueness_rate = 100.0
    duplicate_rate = 0.0
    print("\nNo images were processed - rates below are defaults, not measurements.")

print(f"\nUniqueness Rate: {uniqueness_rate:.2f}%")
print(f"Duplicate Rate: {duplicate_rate:.2f}%")

print("\n=== DUPLICATE TYPE BREAKDOWN ===")
print(f"Inter-class Duplicates: {duplicate_summary.get('inter_class_duplicate_hashes', 0):,}")
print(f"Intra-class Duplicates: {duplicate_summary.get('intra_class_duplicate_hashes', 0):,}")
print(f"Total Duplicate Hashes: {duplicate_summary.get('total_duplicate_hashes', 0):,}")
print(f"Total Duplicate Files: {duplicate_summary.get('total_duplicate_files', 0):,}")

# Calculate storage impact. This is an estimate: it multiplies the number of
# duplicate files by the average file size from the overview metrics.
total_duplicate_files = duplicate_summary.get('total_duplicate_files', 0)
if total_duplicate_files > 0 and overview_metrics['total_images'] > 0:
    storage_waste = (total_duplicate_files / overview_metrics['total_images']) * 100
    avg_file_size_mb = overview_metrics['avg_file_size_mb']
    wasted_storage_mb = total_duplicate_files * avg_file_size_mb

    print("\n=== STORAGE IMPACT ===")
    print(f"Estimated Storage Waste: {storage_waste:.2f}%")
    print(f"Approximate Wasted Storage: {wasted_storage_mb:.1f} MB")

    # Repeat the figure in GB once it exceeds 1024 MB.
    if wasted_storage_mb > 1024:
        print(f"                          {wasted_storage_mb/1024:.2f} GB")

## Quality Assessment

Evaluation of duplicate detection effectiveness:

In [None]:
# Reads `uniqueness_rate`, `duplicate_rate` and `duplicate_summary` from the
# Duplicate Statistics Report cell, so that cell must be run first.
print("=== DUPLICATE DETECTION QUALITY ASSESSMENT ===")

# Grade the dataset by uniqueness rate (thresholds: 99 / 95 / 90 percent).
if uniqueness_rate >= 99:
    print("🟢 Excellent uniqueness - minimal duplicates detected")
    print("   Dataset quality: Outstanding")
elif uniqueness_rate >= 95:
    print("🟡 Good uniqueness - acceptable duplicate levels")
    print("   Dataset quality: Good")
elif uniqueness_rate >= 90:
    print("🟠 Moderate uniqueness - noticeable duplicates")
    print("   Dataset quality: Fair - consider cleanup")
else:
    print("🔴 Low uniqueness - significant duplicate problem")
    print("   Dataset quality: Poor - cleanup recommended")

# Analyze duplicate patterns
inter_class = duplicate_summary.get('inter_class_duplicate_hashes', 0)
intra_class = duplicate_summary.get('intra_class_duplicate_hashes', 0)
total_dup_hashes = duplicate_summary.get('total_duplicate_hashes', 0)

if total_dup_hashes > 0:
    # Share of each duplicate type among all duplicate hashes.
    inter_ratio = (inter_class / total_dup_hashes) * 100
    intra_ratio = (intra_class / total_dup_hashes) * 100

    print("\n=== DUPLICATE PATTERN ANALYSIS ===")
    print(f"Inter-class Duplicates: {inter_ratio:.1f}% of total duplicates")
    print(f"Intra-class Duplicates: {intra_ratio:.1f}% of total duplicates")

    # A tie falls through to the "mostly within classes" branch.
    if inter_class > intra_class:
        print("\n⚠️  High inter-class duplication detected:")
        print("   - Same images appearing in multiple classes")
        print("   - May impact model training accuracy")
        print("   - Consider reviewing class definitions")
    else:
        print("\n✅ Duplicates mostly within classes:")
        print("   - Normal pattern for class-based collection")
        print("   - Lower impact on model training")
else:
    print("\n🎉 No duplicates detected - perfect uniqueness!")

print("\n=== RECOMMENDATIONS ===")

# Recommendation tier by duplicate rate (thresholds: 1 / 5 percent).
if duplicate_rate < 1:
    print("✅ Excellent duplicate detection - no action needed")
elif duplicate_rate < 5:
    print("📊 Consider implementing duplicate removal for:")
    print("   - Storage optimization")
    print("   - Training efficiency improvement")
else:
    print("🔧 Recommend implementing duplicate cleanup:")
    print("   - Automated deduplication pipeline")
    print("   - Enhanced hash-based filtering")
    print("   - Manual review of flagged duplicates")

# Any inter-class duplicate gets its own action list, whatever the overall rate.
if inter_class > 0:
    print("\n🎯 Inter-class Duplicate Action Items:")
    print("   - Review and validate class boundaries")
    print("   - Implement cross-class deduplication")
    print("   - Consider data source overlap analysis")

## Export Options

Save the duplicate analysis charts to disk (uncomment the code below to enable):

In [None]:
# Uncomment the lines below to save the charts.
# The figure variables are created by the chart cells above, so run those first.
# NOTE: comparison_fig (the 'class_comparison' chart) is not exported here;
# add a save_plot call for it if that chart is needed as well.
# from plot_helpers import save_plot
# 
# save_plot(summary_fig, 'duplicate_summary', 'png', width=1200, height=400)
# save_plot(overview_fig, 'duplicate_overview', 'png', width=800, height=600)
# save_plot(efficiency_fig, 'hash_efficiency', 'png', width=1200, height=500)
# save_plot(breakdown_fig, 'duplicate_breakdown', 'png', width=1000, height=600)
# save_plot(funnel_fig, 'duplicate_funnel', 'png', width=800, height=600)
# save_plot(impact_fig, 'duplicate_impact', 'html')
# 
# print("Duplicate analysis charts saved to visualizations/output/")