# Task 6: Dataset Analysis

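This notebook analyzes the caption annotations of the COCO 2017 training set (image and caption counts, caption-length statistics, and sample image–caption pairs) for text-to-image generation tasks.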


In [None]:
import matplotlib.pyplot as plt
import json
from PIL import Image
import os
import numpy as np
import pandas as pd


In [None]:
# Load the COCO caption annotations, print summary statistics, plot the
# caption-length distribution, and show up to three sample images with their
# first caption.
# Update the two paths below to point at your local copy of the dataset.
annotations_file = "annotations/captions_train2017.json"  # Update with your path
image_dir = "train2017/"  # Update with your path

print("Dataset Analysis - COCO Dataset")
print("=" * 50)

# Run the analysis only when the annotation file and the image directory are
# both present; otherwise print setup instructions instead of failing.
if os.path.isfile(annotations_file) and os.path.isdir(image_dir):
    # JSON is UTF-8 by specification; without an explicit encoding, open()
    # falls back to the platform default and may mis-decode non-ASCII captions.
    with open(annotations_file, 'r', encoding='utf-8') as f:
        coco_data = json.load(f)

    annotations = coco_data['annotations']
    images = coco_data['images']
    captions = [ann['caption'] for ann in annotations]
    # Caption length is measured in whitespace-separated words.
    caption_lengths = [len(caption.split()) for caption in captions]

    print(f"Number of images: {len(images)}")
    print(f"Number of captions: {len(captions)}")

    if caption_lengths:
        print(f"Average caption length: {np.mean(caption_lengths):.2f} words")
        print(f"Min caption length: {np.min(caption_lengths)} words")
        print(f"Max caption length: {np.max(caption_lengths)} words")

        # Visualize caption length distribution
        plt.figure(figsize=(10, 5))
        plt.hist(caption_lengths, bins=30, edgecolor='black')
        plt.title("Caption Length Distribution")
        plt.xlabel("Number of Words")
        plt.ylabel("Frequency")
        plt.grid(True, alpha=0.3)
        plt.show()
    else:
        # np.min / np.max raise ValueError on an empty sequence, so skip the
        # statistics and the histogram when there is nothing to summarize.
        print("No captions found in the annotations file")

    # Index the first caption of every image in a single pass so the sample
    # loop below does not re-scan the full annotation list per image.
    # setdefault keeps the earliest caption in annotation order.
    first_caption_by_image = {}
    for ann in annotations:
        first_caption_by_image.setdefault(ann['image_id'], ann['caption'])

    # Display sample images with captions (at most the first three images)
    for img_info in images[:3]:
        img_path = os.path.join(image_dir, img_info['file_name'])

        # Silently skip images listed in the annotations but missing on disk.
        if not os.path.exists(img_path):
            continue

        caption = first_caption_by_image.get(img_info['id'], "No caption")

        # The context manager closes the underlying image file once the
        # figure has been rendered.
        with Image.open(img_path) as img:
            plt.figure(figsize=(8, 8))
            plt.imshow(img)
            plt.title(caption, fontsize=10, wrap=True)
            plt.axis('off')
            plt.tight_layout()
            plt.show()
else:
    print("Note: Please update the paths to your COCO dataset files")
    print("Expected structure:")
    print("  - annotations/captions_train2017.json")
    print("  - train2017/ (directory with images)")
