# Quality Analysis

This notebook focuses on data quality and download success metrics:
- Overall success rate analysis
- Class-specific success rates
- Download efficiency metrics
- Quality issues identification and impact

## Setup and Data Loading

In [None]:
import sys
sys.path.append('../visualizers')
sys.path.append('../utils')

from quality_metrics import create_combined_quality_analysis
from data_loader import load_report_data, extract_quality_stats, create_quality_issues_df
from plot_helpers import apply_global_style, display_config

# Call the project's global style hook (imported from plot_helpers) before any chart is created.
apply_global_style()

# Load the report into `data`; later cells build their charts and statistics from it.
# NOTE(review): the path is relative, so it presumably resolves against the kernel's
# working directory — confirm the notebook is launched from its own folder.
data = load_report_data('../../sample_report.json')
print("Data loaded successfully!")
# `data` is queried with .get() here; 'Unknown' is shown when 'generated_at' is absent.
print(f"Generated at: {data.get('generated_at', 'Unknown')}")

## Quality Metrics Summary

High-level quality indicators:

In [None]:
# Build the chart collection once from the loaded report; the cells below
# pull individual entries out of `charts` by string key.
charts = create_combined_quality_analysis(data)

# Show the 'summary_cards' entry, passing the shared display configuration.
summary_fig = charts['summary_cards']
summary_fig.show(config=display_config())

## Overall Success Rate

Comprehensive success rate gauge:

In [None]:
# Show the 'success_rate_breakdown' entry of `charts` (built in the Quality Metrics Summary cell).
success_fig = charts['success_rate_breakdown']
success_fig.show(config=display_config())

## Download Efficiency Analysis

Funnel analysis of the download process:

In [None]:
# Show the 'efficiency_analysis' entry of `charts` (built in the Quality Metrics Summary cell).
efficiency_fig = charts['efficiency_analysis']
efficiency_fig.show(config=display_config())

## Class-Specific Success Rates

Success rate breakdown by individual classes:

In [None]:
# Show the 'class_success_rates' entry of `charts` (built in the Quality Metrics Summary cell).
class_success_fig = charts['class_success_rates']
class_success_fig.show(config=display_config())

## Download Comparison

URLs found vs successfully downloaded:

In [None]:
# Show the 'download_comparison' entry of `charts` (built in the Quality Metrics Summary cell).
comparison_fig = charts['download_comparison']
comparison_fig.show(config=display_config())

## Detailed Quality Issues

In-depth analysis of classes with issues:

In [None]:
# Show the 'quality_issues_detailed' entry of `charts` (built in the Quality Metrics Summary cell).
detailed_fig = charts['quality_issues_detailed']
detailed_fig.show(config=display_config())

## Quality Statistics Report

Detailed breakdown of quality metrics:

In [None]:
# Derive the summary statistics and the per-class issues table from the loaded report.
# Both names are reused by the Recommendations cell further down.
quality_stats = extract_quality_stats(data)
issues_df = create_quality_issues_df(data)

# Read the two totals once; they feed both the summary lines and the efficiency
# score. A key that is absent from the stats falls back to 0.
total_found = quality_stats.get('total_urls_found', 0)
total_downloaded = quality_stats.get('total_downloaded', 0)

print("=== OVERALL QUALITY METRICS ===")
print(f"Success Rate: {quality_stats.get('success_rate', 0):.2f}%")
print(f"URLs Found: {total_found:,}")
print(f"Successfully Downloaded: {total_downloaded:,}")
# NOTE(review): this line is labelled "Missing/Failed" but reads the
# 'urls_found_but_missing_metadata' stat — confirm the label matches the metric.
print(f"Missing/Failed: {quality_stats.get('urls_found_but_missing_metadata', 0):,}")

if not issues_df.empty:
    print("\n=== CLASSES WITH ISSUES ===")
    # One indented stanza per row of the issues table.
    for _, row in issues_df.iterrows():
        print(f"\n{row['class']}:")
        print(f"  URLs Found: {row['urls_found']:,}")
        print(f"  Downloaded: {row['urls_downloaded']:,}")
        print(f"  Missing: {row['missing_downloads']:,}")
        print(f"  Success Rate: {row['success_rate']:.2f}%")
else:
    print("\n✅ No quality issues detected - all classes downloaded successfully!")

# Always emit the efficiency section so a report with zero URLs is visibly
# reported as such instead of the section silently disappearing.
print("\n=== EFFICIENCY SCORE ===")
if total_found > 0:
    efficiency = (total_downloaded / total_found) * 100
    print(f"Download Efficiency: {efficiency:.2f}%")

    # Grade bands, checked from the strictest threshold down.
    if efficiency >= 99:
        print("🟢 Excellent quality - minimal data loss")
    elif efficiency >= 95:
        print("🟡 Good quality - acceptable data loss")
    else:
        print("🔴 Quality concerns - significant data loss detected")
else:
    # Division by zero is avoided; say why no score is shown.
    print("Download Efficiency: n/a (no URLs found)")

## Recommendations

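Quality improvement suggestions based on the analysis. This cell reuses `quality_stats` and `issues_df` from the Quality Statistics Report cell above, so run that cell first: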

In [None]:
# Headline figures that drive the advice below; each defaults to 0 when the
# key is absent. `quality_stats` and `issues_df` come from the previous cell.
success_rate = quality_stats.get('success_rate', 0)
missing_count = quality_stats.get('urls_found_but_missing_metadata', 0)

print("=== QUALITY RECOMMENDATIONS ===")

# Pick one advice tier from the overall success rate, strictest band first.
if success_rate >= 99:
    print("✅ Excellent data quality - no immediate action needed")
elif success_rate >= 95:
    print(
        "⚠️  Good quality with minor issues:",
        "   - Monitor failing URLs for patterns",
        "   - Consider retry mechanisms for failed downloads",
        sep="\n",
    )
else:
    print(
        "🚨 Quality concerns requiring attention:",
        "   - Investigate root causes of download failures",
        "   - Implement robust error handling and retries",
        "   - Review URL validation and filtering",
        sep="\n",
    )

# Extra guidance only when some URLs are counted as missing metadata.
if missing_count > 0:
    print(
        f"\n📊 {missing_count:,} URLs missing metadata - consider:",
        "   - Improving metadata collection processes",
        "   - Adding fallback data sources",
        "   - Implementing data validation checks",
        sep="\n",
    )

# Point at the row of the issues table with the lowest success rate.
if not issues_df.empty:
    worst_class = issues_df.loc[issues_df['success_rate'].idxmin()]
    print(f"\n🎯 Focus improvement efforts on: {worst_class['class']}")
    print(f"   Current success rate: {worst_class['success_rate']:.1f}%")

## Export Options

Save the quality analysis charts to disk (the cell below is disabled by default; uncomment it to export):

In [None]:
# Optional export step: this cell is intentionally all comments and does nothing
# until the lines below are uncommented. It relies on the figure variables
# created by the chart cells above, so run those first.
# NOTE(review): comparison_fig (Download Comparison section) has no save_plot
# line here — add one if that chart should be exported as well.
#
# from plot_helpers import save_plot
#
# save_plot(summary_fig, 'quality_summary', 'png', width=1200, height=400)
# save_plot(success_fig, 'success_rate_gauge', 'png', width=600, height=500)
# save_plot(efficiency_fig, 'download_efficiency', 'png', width=800, height=500)
# save_plot(class_success_fig, 'class_success_rates', 'png', width=1000, height=600)
# save_plot(detailed_fig, 'quality_issues_detailed', 'html')
#
# print("Quality analysis charts saved to visualizations/output/")