# Drone Imagery Dataset Analysis

This notebook explores and analyzes the drone imagery dataset stored in the S3 bucket `lucaskle-ab3-project-pv`. It covers data profiling, visualization, and validation of the data against YOLOv11 format requirements.

## Requirements Addressed:
- 1.1: Data Scientist read-only access to S3 dataset
- 1.2: Data profiling and visualization capabilities
- 1.3: Dataset exploration functionality

In [None]:
# Import required libraries
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import json
import os
from typing import Dict, List, Tuple, Optional
import warnings
# NOTE(review): this silences every warning category for the whole kernel
# session, including deprecation warnings from the libraries above. Consider
# narrowing the filter to the specific warnings that are known to be noisy.
warnings.filterwarnings('ignore')

# Set up plotting style
# NOTE(review): the 'seaborn-v0_8' style name is presumably only available in
# newer matplotlib releases — confirm against the pinned matplotlib version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Import custom utilities
import sys
# The relative path is resolved against the kernel's current working
# directory, so the project imports below only work when the notebook is
# started from a directory two levels beneath the one containing `src`.
sys.path.append('../../src')
from data.s3_utils import S3DataAccess
from data.data_profiler import DroneImageryProfiler
from data.data_validator import YOLOv11Validator

## 1. Initialize S3 Data Access

Set up the connection to the S3 bucket that contains the drone imagery data.

In [None]:
# Initialize S3 data access.
# Creates the `s3_access` object that every later cell depends on and performs
# a small listing call as a smoke test of the connection.
BUCKET_NAME = 'lucaskle-ab3-project-pv'
AWS_PROFILE = 'ab'  # Using the specified AWS CLI profile

try:
    s3_access = S3DataAccess(bucket_name=BUCKET_NAME, aws_profile=AWS_PROFILE)
    print(f"✅ Successfully connected to S3 bucket: {BUCKET_NAME}")

    # Test connection and list some objects
    sample_objects = s3_access.list_objects(prefix='', max_keys=10)
    print(f"📁 Found {len(sample_objects)} sample objects in bucket")

except Exception as e:
    print(f"❌ Error connecting to S3: {str(e)}")
    # Report the profile that was actually used rather than a hard-coded name.
    print(f"Please ensure AWS credentials are configured for profile '{AWS_PROFILE}'")
    # Re-raise so execution stops here. Swallowing the error would leave
    # `s3_access` undefined and make the next cell fail with an unrelated
    # NameError that hides the real cause.
    raise

## 2. Dataset Overview and Structure Analysis

In [None]:
# Analyze dataset structure: list the bucket contents and aggregate the file
# count and total byte size per file extension.
print("🔍 Analyzing dataset structure...")

# Get all objects in the bucket
# NOTE(review): max_keys=1000 presumably caps the listing at 1000 objects, in
# which case the totals below describe only part of a larger bucket — confirm
# against S3DataAccess.list_objects.
all_objects = s3_access.list_objects(prefix='', max_keys=1000)

# Categorize files by type
file_types = {}
total_size = 0

for obj in all_objects:
    key = obj['Key']
    size = obj['Size']
    total_size += size

    # Extract the extension from the final path component only. Splitting the
    # whole key on '.' would turn a dot inside a folder name
    # (e.g. 'run.v2/readme') into a bogus extension ('v2/readme').
    ext = os.path.splitext(key)[1][1:].lower() or 'no_extension'

    stats = file_types.setdefault(ext, {'count': 0, 'total_size': 0})
    stats['count'] += 1
    stats['total_size'] += size

# Display dataset overview
print(f"📊 Dataset Overview:")
print(f"   Total files: {len(all_objects)}")
print(f"   Total size: {total_size / (1024**3):.2f} GB")
print(f"\n📁 File types distribution:")

# Most frequent extensions first
for ext, info in sorted(file_types.items(), key=lambda x: x[1]['count'], reverse=True):
    size_mb = info['total_size'] / (1024**2)
    print(f"   .{ext}: {info['count']} files ({size_mb:.1f} MB)")

## 3. Image Data Profiling and Analysis

In [None]:
# Initialize data profiler
profiler = DroneImageryProfiler(s3_access)

# Profile image characteristics
print("🖼️ Profiling drone imagery characteristics...")

# Get image files (assuming common image extensions)
image_extensions = ['jpg', 'jpeg', 'png', 'tiff', 'tif']
image_suffixes = tuple(f'.{ext}' for ext in image_extensions)
image_files = [obj['Key'] for obj in all_objects
               if obj['Key'].lower().endswith(image_suffixes)]

print(f"📸 Found {len(image_files)} image files")

# Sample a subset for detailed analysis (to avoid processing too many files).
# A fixed seed makes the sample, and every statistic derived from it,
# reproducible across kernel restarts.
RANDOM_SEED = 42
sample_size = min(50, len(image_files))
if image_files:
    rng = np.random.default_rng(RANDOM_SEED)
    # .tolist() converts the NumPy string scalars back to plain Python str, so
    # `sample_images` is a list in both branches.
    sample_images = rng.choice(image_files, size=sample_size, replace=False).tolist()
else:
    sample_images = []

print(f"🎯 Analyzing sample of {len(sample_images)} images for detailed profiling")

In [None]:
# Perform detailed image analysis: profile the sampled images, print a text
# summary, and draw a 2x2 overview figure (resolution scatter, aspect ratio
# histogram, file size histogram, color mode pie chart).
if len(sample_images) > 0:
    image_profile = profiler.profile_images(sample_images)

    # 'color_modes' is tallied with .count() for the pie chart below, so it can
    # hold repeated entries. Collapse duplicates (in a stable, sorted order) so
    # the summary lists each mode once. 'formats' is treated the same way,
    # which is harmless if it is already unique.
    unique_color_modes = sorted(set(image_profile['color_modes']))
    unique_formats = sorted(set(image_profile['formats']))

    # Display image statistics
    print("📊 Image Characteristics Summary:")
    print(f"   Resolution range: {image_profile['min_width']}x{image_profile['min_height']} to {image_profile['max_width']}x{image_profile['max_height']}")
    print(f"   Average resolution: {image_profile['avg_width']:.0f}x{image_profile['avg_height']:.0f}")
    print(f"   Color modes: {', '.join(unique_color_modes)}")
    print(f"   File formats: {', '.join(unique_formats)}")
    # NOTE(review): 'file_sizes' is divided by 1024**2 below (treated as bytes)
    # while 'avg_file_size' is printed as MB without conversion — confirm the
    # unit returned by the profiler.
    print(f"   Average file size: {image_profile['avg_file_size']:.1f} MB")

    # Create visualizations
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('Drone Imagery Dataset Analysis', fontsize=16, fontweight='bold')

    # Resolution distribution
    axes[0, 0].scatter(image_profile['widths'], image_profile['heights'], alpha=0.6)
    axes[0, 0].set_xlabel('Width (pixels)')
    axes[0, 0].set_ylabel('Height (pixels)')
    axes[0, 0].set_title('Image Resolution Distribution')
    axes[0, 0].grid(True, alpha=0.3)

    # Aspect ratio distribution; entries with a zero height are skipped so a
    # single bad record cannot abort the cell with ZeroDivisionError.
    aspect_ratios = [w / h for w, h in zip(image_profile['widths'], image_profile['heights']) if h]
    axes[0, 1].hist(aspect_ratios, bins=20, alpha=0.7, edgecolor='black')
    axes[0, 1].set_xlabel('Aspect Ratio (W/H)')
    axes[0, 1].set_ylabel('Frequency')
    axes[0, 1].set_title('Aspect Ratio Distribution')
    axes[0, 1].grid(True, alpha=0.3)

    # File size distribution
    file_sizes_mb = [size / (1024**2) for size in image_profile['file_sizes']]
    axes[1, 0].hist(file_sizes_mb, bins=20, alpha=0.7, edgecolor='black')
    axes[1, 0].set_xlabel('File Size (MB)')
    axes[1, 0].set_ylabel('Frequency')
    axes[1, 0].set_title('File Size Distribution')
    axes[1, 0].grid(True, alpha=0.3)

    # Color mode distribution. Iterating the sorted modes keeps the slice order
    # identical between runs; the dict views are materialized as lists so
    # pie() receives plain sequences.
    color_mode_counts = {mode: image_profile['color_modes'].count(mode) for mode in unique_color_modes}
    axes[1, 1].pie(list(color_mode_counts.values()), labels=list(color_mode_counts.keys()), autopct='%1.1f%%')
    axes[1, 1].set_title('Color Mode Distribution')

    plt.tight_layout()
    plt.show()

else:
    print("⚠️ No image files found for analysis")

## 4. YOLOv11 Format Validation

In [None]:
# Build the YOLOv11 validator on top of the shared S3 accessor
validator = YOLOv11Validator(s3_access)

print("🎯 Validating data for YOLOv11 format requirements...")

# Candidate annotation files are picked purely by file suffix
# (common formats: .txt, .json, .xml, plus YAML)
annotation_suffixes = tuple(f'.{ext}' for ext in ['txt', 'json', 'xml', 'yaml', 'yml'])
annotation_files = [
    obj['Key']
    for obj in all_objects
    if obj['Key'].lower().endswith(annotation_suffixes)
]

print(f"📝 Found {len(annotation_files)} potential annotation files")

# Check the image/annotation listing against the YOLOv11 layout
validation_results = validator.validate_dataset_structure(image_files, annotation_files)

print("\n✅ YOLOv11 Format Validation Results:")
result_rows = (
    ("Images suitable for YOLOv11", 'valid_images'),
    ("Images requiring preprocessing", 'images_need_preprocessing'),
    ("Annotation files found", 'annotation_files_found'),
    ("Recommended input size", 'recommended_input_size'),
)
for label, result_key in result_rows:
    print(f"   {label}: {validation_results[result_key]}")

# Optional bullet lists: each is printed only when the validator returned entries
bullet_sections = (
    ("\n⚠️ Issues identified:", 'issues'),
    ("\n💡 Recommendations:", 'recommendations'),
)
for heading, result_key in bullet_sections:
    entries = validation_results[result_key]
    if entries:
        print(heading)
        for entry in entries:
            print(f"   - {entry}")

## 5. Data Quality Assessment

In [None]:
# Run the quality assessment on the sampled images and report the results
print("🔍 Performing data quality assessment...")

quality_report = validator.assess_data_quality(sample_images)

print("\n📊 Data Quality Report:")
overall_score = quality_report['overall_score']
print(f"   Overall quality score: {overall_score:.1f}/10")
for level, count_key in (
    ("good", 'high_quality_count'),
    ("medium", 'medium_quality_count'),
    ("low", 'low_quality_count'),
):
    print(f"   Images with {level} quality: {quality_report[count_key]}")

# Per-metric breakdown, three decimals each
print("\n📈 Quality Metrics:")
for metric_name, metric_value in quality_report['metrics'].items():
    print(f"   {metric_name}: {metric_value:.3f}")

# Histogram of per-image scores, drawn only when the report provides them;
# the dashed red line marks the overall score.
if 'quality_scores' in quality_report:
    quality_fig, quality_ax = plt.subplots(figsize=(10, 6))
    quality_ax.hist(quality_report['quality_scores'], bins=20, alpha=0.7, edgecolor='black')
    quality_ax.axvline(
        overall_score,
        color='red',
        linestyle='--',
        label=f'Average Score: {overall_score:.1f}',
    )
    quality_ax.set_xlabel('Quality Score')
    quality_ax.set_ylabel('Number of Images')
    quality_ax.set_title('Image Quality Score Distribution')
    quality_ax.legend()
    quality_ax.grid(True, alpha=0.3)
    plt.show()

## 6. Sample Image Visualization

In [None]:
# Display sample images from the dataset: download four randomly chosen
# sample images and show them in a 2x2 grid. A panel whose image cannot be
# loaded shows an error message instead.
if len(sample_images) >= 4:
    print("🖼️ Displaying sample images from the dataset...")

    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    fig.suptitle('Sample Drone Imagery from Dataset', fontsize=16, fontweight='bold')

    # Display 4 random sample images
    sample_to_display = np.random.choice(sample_images, 4, replace=False)

    # axes.flat walks the grid row by row, so each image gets its own panel.
    # The panel is resolved BEFORE the try block: if the download fails, the
    # error handler must write to this image's panel, not to the previous one
    # (or hit an undefined variable on the first iteration).
    for ax, img_key in zip(axes.flat, sample_to_display):
        try:
            # Download and display image
            img_data = s3_access.download_file_to_memory(img_key)
            img = Image.open(img_data)

            ax.imshow(img)
            ax.set_title(f'{os.path.basename(img_key)}\n{img.size[0]}x{img.size[1]}')
            ax.axis('off')

        except Exception as e:
            # Best effort: keep the remaining panels and flag this one
            ax.text(0.5, 0.5, f'Error loading\n{os.path.basename(img_key)}',
                    ha='center', va='center', transform=ax.transAxes)
            ax.set_title(f'Error: {str(e)[:30]}...')

    plt.tight_layout()
    plt.show()

else:
    print("⚠️ Not enough sample images available for visualization")

## 7. Summary and Recommendations

In [None]:
# Print the closing summary report. Sections that depend on results from
# earlier cells are emitted only when those results exist in the namespace.
divider = "=" * 50
print("📋 Dataset Analysis Summary Report")
print(divider)

print("\n🗂️ Dataset Overview:")
print(f"   • Total files in bucket: {len(all_objects)}")
print(f"   • Image files identified: {len(image_files)}")
print(f"   • Annotation files found: {len(annotation_files)}")
print(f"   • Total dataset size: {total_size / (1024**3):.2f} GB")

if len(sample_images) > 0 and 'image_profile' in locals():
    print("\n🖼️ Image Characteristics:")
    smallest = f"{image_profile['min_width']}x{image_profile['min_height']}"
    largest = f"{image_profile['max_width']}x{image_profile['max_height']}"
    print(f"   • Resolution range: {smallest} to {largest}")
    print(f"   • Average resolution: {image_profile['avg_width']:.0f}x{image_profile['avg_height']:.0f}")
    # The dominant mode is the value that occurs most often in the list
    mode_list = image_profile['color_modes']
    dominant_mode = max(set(mode_list), key=mode_list.count)
    print(f"   • Dominant color mode: {dominant_mode}")
    print(f"   • Average file size: {image_profile['avg_file_size']:.1f} MB")

if 'validation_results' in locals():
    print("\n🎯 YOLOv11 Readiness:")
    for label, result_key in (
        ("Images ready for YOLOv11", 'valid_images'),
        ("Images needing preprocessing", 'images_need_preprocessing'),
        ("Recommended input size", 'recommended_input_size'),
    ):
        print(f"   • {label}: {validation_results[result_key]}")

if 'quality_report' in locals():
    print("\n✅ Data Quality:")
    print(f"   • Overall quality score: {quality_report['overall_score']:.1f}/10")
    print(f"   • High quality images: {quality_report['high_quality_count']}")
    print(f"   • Images requiring attention: {quality_report['low_quality_count']}")

next_steps = [
    "Review data quality issues and consider filtering low-quality images",
    "Implement data preprocessing pipeline for YOLOv11 format conversion",
    "Create or validate annotation files for supervised learning",
    "Consider data augmentation strategies for training enhancement",
    "Set up data versioning and lineage tracking for MLOps pipeline",
]
print("\n💡 Next Steps:")
for step_number, step_text in enumerate(next_steps, start=1):
    print(f"   {step_number}. {step_text}")

print(f"\n{divider}")
print("✅ Dataset analysis completed successfully!")