# Image Quality Analysis

This notebook analyzes the quality characteristics of downloaded images:
- File sizes and formats
- Image dimensions and resolutions
- Quality distributions
- Storage analysis

In [1]:
# Import required modules
import sys
import os
sys.path.append('..')

from visualizations.data_loader import (
    load_all_metadata,
    create_image_details_dataframe
)

from visualizations.plotters import (
    plot_file_characteristics
)

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

print("✅ Modules imported successfully")

✅ Modules imported successfully


In [2]:
# Load data
metadata_list = load_all_metadata()
image_df = create_image_details_dataframe(metadata_list)

# Filter to downloaded images only
downloaded_df = image_df[image_df['has_download_data'] == True].copy()

print(f"📊 Analyzing {len(downloaded_df)} downloaded images")

if len(downloaded_df) == 0:
    print("❌ No downloaded images found for analysis")
else:
    print("✅ Ready for quality analysis")

Found 100 metadata files
✅ Successfully loaded 100 metadata files
📊 Analyzing 49036 downloaded images
✅ Ready for quality analysis


In [3]:
# Overall file characteristics
if len(downloaded_df) > 0:
    plot_file_characteristics(image_df, use_plotly=True)

In [4]:
# Detailed quality metrics
def analyze_quality_metrics(df):
    if len(df) == 0:
        return
    
    # File size analysis
    file_sizes_mb = df['bytes'] / 1048576  # Convert to MB
    
    print("📊 QUALITY METRICS SUMMARY")
    print("=" * 50)
    
    # File size statistics
    print(f"\n💾 File Size Analysis:")
    print(f"   Average: {file_sizes_mb.mean():.2f} MB")
    print(f"   Median: {file_sizes_mb.median():.2f} MB")
    print(f"   Min: {file_sizes_mb.min():.2f} MB")
    print(f"   Max: {file_sizes_mb.max():.2f} MB")
    print(f"   Total Storage: {file_sizes_mb.sum():.1f} MB")
    
    # Dimension statistics
    print(f"\n📐 Dimension Analysis:")
    print(f"   Average Width: {df['width'].mean():.0f} px")
    print(f"   Average Height: {df['height'].mean():.0f} px")
    print(f"   Max Resolution: {df['width'].max()} x {df['height'].max()} px")
    print(f"   Min Resolution: {df['width'].min()} x {df['height'].min()} px")
    
    # Aspect ratio analysis
    valid_aspect_ratios = df[df['aspect_ratio'] > 0]['aspect_ratio']
    if len(valid_aspect_ratios) > 0:
        print(f"   Average Aspect Ratio: {valid_aspect_ratios.mean():.2f}:1")
        print(f"   Most Common Ratio: ~{valid_aspect_ratios.mode().iloc[0]:.2f}:1")
    
    # Format distribution
    format_counts = df['format'].value_counts()
    print(f"\n🎨 Format Distribution:")
    for fmt, count in format_counts.head().items():
        percentage = (count / len(df)) * 100
        print(f"   {fmt.upper()}: {count} images ({percentage:.1f}%)")
    
    # Color mode distribution
    mode_counts = df['mode'].value_counts()
    print(f"\n🌈 Color Mode Distribution:")
    for mode, count in mode_counts.items():
        percentage = (count / len(df)) * 100
        print(f"   {mode}: {count} images ({percentage:.1f}%)")

analyze_quality_metrics(downloaded_df)

📊 QUALITY METRICS SUMMARY

💾 File Size Analysis:
   Average: 0.22 MB
   Median: 0.10 MB
   Min: 0.00 MB
   Max: 18.53 MB
   Total Storage: 10735.9 MB

📐 Dimension Analysis:
   Average Width: 936 px
   Average Height: 816 px
   Max Resolution: 19200.0 x 7023.0 px
   Min Resolution: 0.0 x 0.0 px
   Average Aspect Ratio: 1.26:1
   Most Common Ratio: ~1.00:1

🎨 Format Distribution:
   JPEG: 39856 images (81.3%)
   WEBP: 5869 images (12.0%)
   PNG: 2439 images (5.0%)
   JPG: 809 images (1.6%)
   MPO: 32 images (0.1%)

🌈 Color Mode Distribution:
   RGB: 46101 images (94.0%)
   RGBA: 1295 images (2.6%)
   unknown: 789 images (1.6%)
   P: 785 images (1.6%)
   L: 35 images (0.1%)
   CMYK: 27 images (0.1%)
   : 4 images (0.0%)


In [5]:
# Resolution quality categorization
def analyze_resolution_quality(df):
    if len(df) == 0:
        return
    
    def categorize_resolution(row):
        width, height = row['width'], row['height']
        pixels = width * height
        
        if width >= 3840 or height >= 2160:  # 4K+
            return '4K+'
        elif width >= 1920 or height >= 1080:  # Full HD
            return 'Full HD'
        elif width >= 1280 or height >= 720:  # HD
            return 'HD'
        elif width >= 854 or height >= 480:  # SD
            return 'SD'
        elif width >= 640 or height >= 360:  # Low
            return 'Low'
        else:
            return 'Very Low'
    
    df['resolution_category'] = df.apply(categorize_resolution, axis=1)
    
    # Create resolution analysis visualization
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            'Resolution Categories',
            'Resolution by Category',
            'File Size vs Resolution',
            'Pixel Count Distribution'
        ),
        specs=[[{"type": "pie"}, {"type": "bar"}],
               [{"type": "scatter"}, {"type": "histogram"}]]
    )
    
    # Resolution categories pie chart
    res_counts = df['resolution_category'].value_counts()
    fig.add_trace(
        go.Pie(
            labels=res_counts.index,
            values=res_counts.values,
            hole=0.3
        ),
        row=1, col=1
    )
    
    # Resolution by category
    category_res = df.groupby(['category', 'resolution_category']).size().unstack(fill_value=0)
    
    for res_cat in category_res.columns:
        fig.add_trace(
            go.Bar(
                x=category_res.index,
                y=category_res[res_cat],
                name=res_cat
            ),
            row=1, col=2
        )
    
    # File size vs resolution scatter
    fig.add_trace(
        go.Scatter(
            x=df['pixel_count'],
            y=df['bytes'] / 1048576,  # MB
            mode='markers',
            text=df['filename'],
            marker=dict(
                color=df['width'],
                colorscale='Viridis',
                showscale=True,
                colorbar=dict(title="Width (px)")
            ),
            opacity=0.6
        ),
        row=2, col=1
    )
    
    # Pixel count distribution
    fig.add_trace(
        go.Histogram(
            x=df['pixel_count'] / 1000000,  # Megapixels
            nbinsx=25
        ),
        row=2, col=2
    )
    
    fig.update_layout(
        title_text="📐 Resolution Quality Analysis",
        title_x=0.5,
        height=800,
        showlegend=False
    )
    
    # Update axis labels
    fig.update_xaxes(title_text="Category", row=1, col=2)
    fig.update_yaxes(title_text="Count", row=1, col=2)
    fig.update_xaxes(title_text="Pixel Count", row=2, col=1)
    fig.update_yaxes(title_text="File Size (MB)", row=2, col=1)
    fig.update_xaxes(title_text="Megapixels", row=2, col=2)
    fig.update_yaxes(title_text="Frequency", row=2, col=2)
    
    fig.show()
    
    # Print resolution summary
    print("📐 RESOLUTION ANALYSIS:")
    print("=" * 30)
    for category, count in res_counts.items():
        percentage = (count / len(df)) * 100
        print(f"   {category}: {count} images ({percentage:.1f}%)")

if len(downloaded_df) > 0:
    analyze_resolution_quality(downloaded_df)

📐 RESOLUTION ANALYSIS:
   HD: 12744 images (26.0%)
   SD: 10648 images (21.7%)
   Low: 8918 images (18.2%)
   Full HD: 8892 images (18.1%)
   Very Low: 6137 images (12.5%)
   4K+: 1697 images (3.5%)


In [6]:
# Format efficiency analysis
def analyze_format_efficiency(df):
    if len(df) == 0:
        return
    
    # Calculate efficiency metrics by format
    format_stats = df.groupby('format').agg({
        'bytes': ['mean', 'median', 'sum', 'count'],
        'pixel_count': 'mean',
        'width': 'mean',
        'height': 'mean'
    }).round(2)
    
    # Flatten column names
    format_stats.columns = ['_'.join(col) for col in format_stats.columns]
    
    # Calculate bytes per pixel for efficiency
    format_efficiency = df.groupby('format').apply(
        lambda x: (x['bytes'] / x['pixel_count']).mean() if x['pixel_count'].sum() > 0 else 0
    ).sort_values()
    
    # Create format comparison visualization
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            'Average File Size by Format',
            'Format Efficiency (Bytes per Pixel)',
            'File Count by Format',
            'Storage Usage by Format'
        )
    )
    
    # Average file size by format
    avg_sizes = df.groupby('format')['bytes'].mean() / 1048576  # MB
    fig.add_trace(
        go.Bar(
            x=avg_sizes.index,
            y=avg_sizes.values,
            text=[f"{v:.2f} MB" for v in avg_sizes.values],
            textposition='auto'
        ),
        row=1, col=1
    )
    
    # Format efficiency
    fig.add_trace(
        go.Bar(
            x=format_efficiency.index,
            y=format_efficiency.values,
            text=[f"{v:.3f}" for v in format_efficiency.values],
            textposition='auto',
            marker_color='lightcoral'
        ),
        row=1, col=2
    )
    
    # File count by format
    format_counts = df['format'].value_counts()
    fig.add_trace(
        go.Bar(
            x=format_counts.index,
            y=format_counts.values,
            text=format_counts.values,
            textposition='auto',
            marker_color='lightblue'
        ),
        row=2, col=1
    )
    
    # Storage usage by format
    storage_by_format = df.groupby('format')['bytes'].sum() / 1048576  # MB
    fig.add_trace(
        go.Bar(
            x=storage_by_format.index,
            y=storage_by_format.values,
            text=[f"{v:.1f} MB" for v in storage_by_format.values],
            textposition='auto',
            marker_color='lightgreen'
        ),
        row=2, col=2
    )
    
    fig.update_layout(
        title_text="🎨 Format Efficiency Analysis",
        title_x=0.5,
        height=700,
        showlegend=False
    )
    
    fig.show()
    
    print("🎨 FORMAT EFFICIENCY SUMMARY:")
    print("=" * 40)
    print("\nMost efficient formats (lowest bytes per pixel):")
    for fmt, efficiency in format_efficiency.head().items():
        print(f"   {fmt.upper()}: {efficiency:.4f} bytes/pixel")
    
    print("\nLargest storage consumers:")
    for fmt, storage in storage_by_format.nlargest(5).items():
        percentage = (storage / storage_by_format.sum()) * 100
        print(f"   {fmt.upper()}: {storage:.1f} MB ({percentage:.1f}%)")

if len(downloaded_df) > 0:
    analyze_format_efficiency(downloaded_df)

🎨 FORMAT EFFICIENCY SUMMARY:

Most efficient formats (lowest bytes per pixel):
   SVG: 0.0000 bytes/pixel
   PNJ: 0.0000 bytes/pixel
   WEBP: 0.1852 bytes/pixel
   MPO: 0.4215 bytes/pixel
   GIF: 2.4079 bytes/pixel

Largest storage consumers:
   JPEG: 8186.6 MB (76.3%)
   PNG: 1463.6 MB (13.6%)
   WEBP: 847.6 MB (7.9%)
   JPG: 194.3 MB (1.8%)
   MPO: 34.5 MB (0.3%)


In [7]:
# Quality recommendations
def generate_quality_recommendations(df):
    if len(df) == 0:
        print("No data available for recommendations")
        return
    
    recommendations = []
    
    # File size analysis
    file_sizes_mb = df['bytes'] / 1048576
    very_large_files = df[file_sizes_mb > 10]  # Files larger than 10MB
    very_small_files = df[file_sizes_mb < 0.1]  # Files smaller than 100KB
    
    if len(very_large_files) > 0:
        recommendations.append({
            'type': 'Storage Optimization',
            'issue': f'{len(very_large_files)} files are very large (>10MB)',
            'impact': f'Using {file_sizes_mb[file_sizes_mb > 10].sum():.1f} MB of storage',
            'suggestion': 'Consider image compression or format conversion'
        })
    
    if len(very_small_files) > 0:
        recommendations.append({
            'type': 'Quality Check',
            'issue': f'{len(very_small_files)} files are very small (<100KB)',
            'impact': 'May indicate low-quality images or thumbnails',
            'suggestion': 'Review these files for quality and consider filtering'
        })
    
    # Resolution analysis
    low_res_images = df[(df['width'] < 640) | (df['height'] < 480)]
    if len(low_res_images) > 0:
        recommendations.append({
            'type': 'Resolution Quality',
            'issue': f'{len(low_res_images)} images have low resolution',
            'impact': 'May not be suitable for high-quality applications',
            'suggestion': 'Consider setting minimum resolution requirements'
        })
    
    # Format optimization
    jpeg_files = df[df['format'] == 'jpeg']
    png_files = df[df['format'] == 'png']
    webp_files = df[df['format'] == 'webp']
    
    if len(png_files) > len(webp_files) + len(jpeg_files):
        png_storage = (png_files['bytes'].sum() / 1048576)
        recommendations.append({
            'type': 'Format Optimization',
            'issue': f'Many PNG files ({len(png_files)}) using {png_storage:.1f} MB',
            'impact': 'PNG files are typically larger than JPEG/WebP',
            'suggestion': 'Consider converting PNGs to JPEG for photos or WebP for better compression'
        })
    
    # Aspect ratio consistency
    valid_ratios = df[df['aspect_ratio'] > 0]['aspect_ratio']
    if len(valid_ratios) > 0:
        ratio_std = valid_ratios.std()
        if ratio_std > 1.0:  # High variation in aspect ratios
            recommendations.append({
                'type': 'Consistency',
                'issue': f'High variation in aspect ratios (std: {ratio_std:.2f})',
                'impact': 'May indicate inconsistent image cropping or mixed content types',
                'suggestion': 'Review image sources and consider standardizing aspect ratios'
            })
    
    print("💡 QUALITY IMPROVEMENT RECOMMENDATIONS:")
    print("=" * 50)
    
    if not recommendations:
        print("✅ No major quality issues detected! Your images are well-optimized.")
    else:
        for i, rec in enumerate(recommendations, 1):
            print(f"\n{i}. {rec['type']}")
            print(f"   Issue: {rec['issue']}")
            print(f"   Impact: {rec['impact']}")
            print(f"   Suggestion: {rec['suggestion']}")
    
    # Overall quality score
    quality_score = 100
    
    # Deduct points for issues
    if len(very_large_files) > len(df) * 0.1:  # >10% very large files
        quality_score -= 15
    if len(very_small_files) > len(df) * 0.1:  # >10% very small files
        quality_score -= 15
    if len(low_res_images) > len(df) * 0.2:  # >20% low resolution
        quality_score -= 20
    if len(valid_ratios) > 0 and valid_ratios.std() > 1.0:
        quality_score -= 10
    
    quality_score = max(0, quality_score)  # Don't go below 0
    
    print(f"\n🎯 Overall Quality Score: {quality_score}/100")
    if quality_score >= 90:
        print("   Excellent image quality! 🌟")
    elif quality_score >= 75:
        print("   Good image quality with room for improvement 👍")
    elif quality_score >= 60:
        print("   Moderate quality - several areas need attention ⚠️")
    else:
        print("   Poor quality - significant improvements needed ❌")

if len(downloaded_df) > 0:
    generate_quality_recommendations(downloaded_df)

💡 QUALITY IMPROVEMENT RECOMMENDATIONS:

1. Storage Optimization
   Issue: 17 files are very large (>10MB)
   Impact: Using 215.0 MB of storage
   Suggestion: Consider image compression or format conversion

2. Quality Check
   Issue: 25352 files are very small (<100KB)
   Impact: May indicate low-quality images or thumbnails
   Suggestion: Review these files for quality and consider filtering

3. Resolution Quality
   Issue: 19193 images have low resolution
   Impact: May not be suitable for high-quality applications
   Suggestion: Consider setting minimum resolution requirements

🎯 Overall Quality Score: 65/100
   Moderate quality - several areas need attention ⚠️


## Image Quality Summary

This analysis provides comprehensive insights into the quality of your downloaded images:

### Key Quality Metrics
- **File Sizes**: Distribution and efficiency of storage usage
- **Resolutions**: Quality categories from low to 4K+
- **Formats**: Efficiency and optimization opportunities
- **Consistency**: Aspect ratios and dimensional consistency

### Quality Assessment
- **Storage Optimization**: Identifying files that are too large or too small
- **Resolution Quality**: Ensuring adequate resolution for intended use
- **Format Efficiency**: Optimizing file formats for better compression
- **Overall Score**: Comprehensive quality rating with improvement suggestions

### Optimization Opportunities
1. **Large Files**: Consider compression for files >10MB
2. **Small Files**: Review files <100KB for quality issues
3. **Format Conversion**: PNG to JPEG/WebP for better compression
4. **Resolution Standards**: Set minimum resolution requirements
5. **Consistency**: Standardize aspect ratios where appropriate