# Data Profiling for Drone Imagery Dataset

This notebook focuses specifically on data profiling functions and quality metrics for the drone imagery dataset. It provides detailed analysis of image characteristics and generates comprehensive profiling reports.

## Requirements Addressed:
- 1.2: Data profiling and visualization capabilities
- Analysis of drone imagery characteristics for ML pipeline optimization

In [None]:
# Import required libraries
import sys
sys.path.append('../../src')

from data.s3_utils import S3DataAccess
from data.data_profiler import DroneImageryProfiler
from data.data_validator import YOLOv11Validator

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from datetime import datetime

# Set up plotting.
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*';
# older releases only ship the un-suffixed 'seaborn' name. Use whichever one
# this installation provides so the setup cell does not fail on version drift.
plot_style = next(
    (name for name in ('seaborn-v0_8', 'seaborn') if name in plt.style.available),
    None,
)
if plot_style is not None:
    plt.style.use(plot_style)
else:
    # Styling is cosmetic only: keep going with matplotlib's defaults.
    print("⚠️ No seaborn matplotlib style available; using matplotlib defaults")
sns.set_palette("husl")

print(f"Data profiling session started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

## 1. Initialize Data Access and Profiler

In [None]:
# Configuration
BUCKET_NAME = 'lucaskle-ab3-project-pv'
AWS_PROFILE = 'ab'

# Initialize S3 access, the image profiler and the annotation validator.
# Every later cell uses `s3_access` / `profiler`, so a failure here must stop
# the run: otherwise "Run All" continues and fails further down with an
# unrelated NameError that hides the real cause.
try:
    s3_access = S3DataAccess(bucket_name=BUCKET_NAME, aws_profile=AWS_PROFILE)
    profiler = DroneImageryProfiler(s3_access)
    validator = YOLOv11Validator(s3_access)

    print("✅ Successfully initialized data access and profiling tools")

    # Display connection info
    conn_info = s3_access.get_connection_info()
    print(f"📡 Connected to: {conn_info['bucket_name']} using profile '{conn_info['aws_profile']}'")

except Exception as e:
    print(f"❌ Error initializing tools: {e}")
    print("Please ensure AWS credentials are properly configured")
    # Re-raise with the original traceback after printing the hint above.
    raise

## 2. Discover and Categorize Dataset Files

In [None]:
# Discover all files in the dataset
print("🔍 Discovering dataset files...")

# NOTE(review): max_keys=2000 presumably caps the listing; if the bucket holds
# more objects the counts below are partial. Confirm against S3DataAccess.
all_objects = s3_access.list_objects(prefix='', max_keys=2000)
print(f"📁 Found {len(all_objects)} total files")

# Filter image files
image_extensions = ['jpg', 'jpeg', 'png', 'tiff', 'tif', 'bmp']
image_files = s3_access.filter_objects_by_extension(image_extensions)

# Filter annotation files
annotation_extensions = ['txt', 'json', 'xml', 'yaml', 'yml']
annotation_files = s3_access.filter_objects_by_extension(annotation_extensions)

print(f"🖼️ Image files: {len(image_files)}")
print(f"📝 Annotation files: {len(annotation_files)}")

# Tally files per extension.
file_types = {}
for obj in all_objects:
    # Only the last path component can carry a file extension. Splitting the
    # whole key on '.' would misread a dot inside a prefix, e.g. the key
    # 'v1.0/README' would be counted as type '0/readme'.
    filename = obj['Key'].rsplit('/', 1)[-1]
    ext = filename.rsplit('.', 1)[-1].lower() if '.' in filename else 'no_extension'
    file_types[ext] = file_types.get(ext, 0) + 1

# Show the ten most common types, most frequent first.
print("\n📊 File type distribution:")
for ext, count in sorted(file_types.items(), key=lambda item: item[1], reverse=True)[:10]:
    print(f"   .{ext}: {count} files")

## 3. Comprehensive Image Profiling

In [None]:
# Profile a bounded sample of the discovered images.
# `profile_results` is left as None when there is nothing to profile so that
# the later cells can skip their work with a simple truthiness check.
if len(image_files) == 0:
    print("⚠️ No image files found for profiling")
    profile_results = None

else:
    print(f"🔬 Starting comprehensive profiling of {len(image_files)} images...")

    # Never ask for more than 100 images, nor for more than actually exist
    sample_size = min(len(image_files), 100)
    print(f"📊 Analyzing sample of {sample_size} images for detailed profiling")

    # Hand the full file list to the profiler together with the sample size
    profile_results = profiler.profile_images(image_files, sample_size=sample_size)

    # Headline processing statistics reported back by the profiler
    print("✅ Profiling completed successfully!")
    print(f"   - Images processed: {profile_results['total_images_processed']}")
    print(f"   - Processing errors: {profile_results['processing_errors']}")
    print(f"   - Success rate: {profile_results['success_rate']:.1%}")

## 4. Generate Detailed Profile Report

In [None]:
# Print the profiler's formatted report followed by its recommendations.
if not profile_results:
    print("⚠️ No profile results available for report generation")

else:
    print("📋 Generating comprehensive profile report...\n")

    # Formatted text report built by the profiler from the raw results
    report = profiler.generate_profile_report(profile_results)
    print(report)

    # Recommendations derived from the same results, shown as a numbered list
    recommendations = profiler.get_recommendations(profile_results)

    print("\n💡 PROFILING RECOMMENDATIONS")
    print("-" * 40)
    for number, recommendation in enumerate(recommendations, start=1):
        print(f"{number}. {recommendation}")

## 5. Advanced Quality Metrics Visualization

In [None]:
# Create advanced visualizations of quality metrics

# One entry per quality metric:
# (key in profile_results, display label, histogram colour)
QUALITY_METRICS = [
    ('brightness_scores', 'Brightness', 'gold'),
    ('contrast_scores', 'Contrast', 'lightblue'),
    ('sharpness_scores', 'Sharpness', 'lightgreen'),
    ('color_diversity_scores', 'Color Diversity', 'coral'),
]


def plot_score_histogram(ax, scores, label, color):
    """Draw a 30-bin histogram of ``scores`` on ``ax`` with a dashed mean line.

    Args:
        ax: Matplotlib axes to draw on.
        scores: Sequence of numeric scores, one per image.
        label: Metric name used in the axis label and the title.
        color: Fill colour of the histogram bars.
    """
    mean_score = np.mean(scores)  # computed once, used for the line and its legend
    ax.hist(scores, bins=30, alpha=0.7, color=color, edgecolor='black')
    ax.axvline(mean_score, color='red', linestyle='--', label=f'Mean: {mean_score:.3f}')
    ax.set_xlabel(f'{label} Score')
    ax.set_ylabel('Frequency')
    ax.set_title(f'{label} Distribution')
    ax.legend()
    ax.grid(True, alpha=0.3)


# All four metric lists are read below, so require every one of them to be
# present and non-empty (not just brightness) before plotting anything.
metrics_available = bool(profile_results) and all(
    len(profile_results.get(key, [])) > 0 for key, _, _ in QUALITY_METRICS
)

if metrics_available:
    print("📈 Creating advanced quality metrics visualizations...")

    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    fig.suptitle('Advanced Image Quality Metrics Analysis', fontsize=16, fontweight='bold')

    # Per-metric distributions fill the first four panels in row-major order:
    # [0, 0], [0, 1], [0, 2], [1, 0].
    for ax, (key, label, color) in zip(axes.flat, QUALITY_METRICS):
        plot_score_histogram(ax, profile_results[key], label, color)

    # Quality correlation matrix
    quality_data = pd.DataFrame(
        {label: profile_results[key] for key, label, _ in QUALITY_METRICS}
    )

    correlation_matrix = quality_data.corr()
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
                square=True, ax=axes[1, 1], cbar_kws={'shrink': 0.8})
    axes[1, 1].set_title('Quality Metrics Correlation')

    # Composite quality score: unweighted mean of the four metrics per image.
    # NOTE(review): this assumes the metrics share a comparable scale — confirm
    # against DroneImageryProfiler before interpreting the composite.
    metric_series = [profile_results[key] for key, _, _ in QUALITY_METRICS]
    overall_scores = [
        sum(per_image) / len(metric_series) for per_image in zip(*metric_series)
    ]

    plot_score_histogram(axes[1, 2], overall_scores, 'Overall Quality', 'mediumpurple')

    plt.tight_layout()
    plt.show()

else:
    print("⚠️ Insufficient data for quality metrics visualization")

## 6. Export Profiling Results

In [None]:
# Assemble the per-image profiling measurements into one table and summarise it.
if not profile_results:
    print("⚠️ No profiling results available for export")

else:
    print("💾 Exporting profiling results...")

    # The number of recorded widths decides whether there is anything to export
    if len(profile_results.get('widths', [])) == 0:
        print("⚠️ No data available for export")

    else:
        bytes_per_mb = 1024 * 1024

        # One row per profiled image; columns appear in the order given here
        summary_data = dict(
            Image_Index=range(len(profile_results['widths'])),
            Width=profile_results['widths'],
            Height=profile_results['heights'],
            Aspect_Ratio=profile_results['aspect_ratios'],
            File_Size_MB=[size / bytes_per_mb for size in profile_results['file_sizes']],
            Color_Mode=profile_results['color_modes'],
            Format=profile_results['formats'],
            Brightness=profile_results['brightness_scores'],
            Contrast=profile_results['contrast_scores'],
            Sharpness=profile_results['sharpness_scores'],
            Color_Diversity=profile_results['color_diversity_scores'],
        )

        df = pd.DataFrame(summary_data)

        # Descriptive statistics of the numeric columns
        print("\n📊 Summary Statistics:")
        print(df.describe())

        # Optional CSV export - uncomment to write a timestamped file:
        # timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        # csv_filename = f'drone_imagery_profile_{timestamp}.csv'
        # df.to_csv(csv_filename, index=False)
        # print(f"\n💾 Results exported to: {csv_filename}")

        print("\n✅ Profiling analysis completed successfully!")

## 7. Profiling Session Summary

In [None]:
# Generate session summary
print("📋 DATA PROFILING SESSION SUMMARY")
print("=" * 50)
print(f"Session completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Dataset: {BUCKET_NAME}")
print(f"AWS Profile: {AWS_PROFILE}")

if profile_results:
    print("\n📊 Processing Results:")
    print(f"   • Total images found: {len(image_files)}")
    print(f"   • Images analyzed: {profile_results['total_images_processed']}")
    print(f"   • Processing success rate: {profile_results['success_rate']:.1%}")
    print(f"   • Processing errors: {profile_results['processing_errors']}")

    if 'avg_width' in profile_results:
        # 'color_modes' and 'formats' are used as per-image columns in the
        # export cell, so joining them directly would print one entry per
        # image. Reduce them to their distinct values (first-seen order) and
        # stringify so a non-string entry cannot break the join.
        distinct_color_modes = dict.fromkeys(map(str, profile_results['color_modes']))
        distinct_formats = dict.fromkeys(map(str, profile_results['formats']))

        print("\n🖼️ Image Characteristics:")
        print(f"   • Average resolution: {profile_results['avg_width']:.0f}x{profile_results['avg_height']:.0f}")
        print(f"   • Resolution range: {profile_results['min_width']}x{profile_results['min_height']} to {profile_results['max_width']}x{profile_results['max_height']}")
        # NOTE(review): printed as MB, which assumes 'avg_file_size' is already
        # in megabytes; the export cell treats 'file_sizes' as bytes — confirm.
        print(f"   • Average file size: {profile_results['avg_file_size']:.1f} MB")
        print(f"   • Color modes: {', '.join(distinct_color_modes)}")
        print(f"   • File formats: {', '.join(distinct_formats)}")

    if 'avg_brightness' in profile_results:
        print("\n✅ Quality Metrics:")
        print(f"   • Average brightness: {profile_results['avg_brightness']:.3f}")
        print(f"   • Average contrast: {profile_results['avg_contrast']:.3f}")
        print(f"   • Average sharpness: {profile_results['avg_sharpness']:.3f}")
        print(f"   • Average color diversity: {profile_results['avg_color_diversity']:.3f}")

print(f"\n📝 Annotation files found: {len(annotation_files)}")
print("\n🎯 Ready for next steps: YOLOv11 training pipeline development")
print("=" * 50)