# Sample images: original and annotated



In [None]:
import os
import numpy as np
from PIL import Image  # Pillow for image loading
import matplotlib.pyplot as plt
from google.colab import drive
from collections import Counter
import glob  # To find files easily

# Mount Google Drive at /content/drive so the dataset paths used below
# (built under '/content/drive/MyDrive/') can be resolved.
# Running this cell will prompt for authorization.
drive.mount('/content/drive')


In [None]:

# IMPORTANT: point these at your own copy of the Cityscapes dataset on Drive.
# The layout expected below is <root>/leftImg8bit/<split>/ and <root>/gtFine/<split>/.
DRIVE_PATH = '/content/drive/MyDrive/'  # Change if your Drive contents live elsewhere
CITYSCAPES_ROOT = os.path.join(DRIVE_PATH, 'cityscape')  # Dataset folder name; adjust 'cityscape' if yours differs

# Images and fine annotations of the 'train' split, used as the example split.
IMG_DIR = os.path.join(CITYSCAPES_ROOT, 'leftImg8bit', 'train')
LABEL_DIR = os.path.join(CITYSCAPES_ROOT, 'gtFine', 'train')

# Report only the first missing directory (root, then images, then labels);
# the layout is declared okay only when all three exist.
_required_dirs = (
    (CITYSCAPES_ROOT, f"ERROR: Cityscapes root directory not found at {CITYSCAPES_ROOT}"),
    (IMG_DIR, f"ERROR: Image directory not found at {IMG_DIR}"),
    (LABEL_DIR, f"ERROR: Label directory not found at {LABEL_DIR}"),
)
_first_problem = next(
    (message for directory, message in _required_dirs if not os.path.isdir(directory)),
    None,
)
print(_first_problem if _first_problem is not None else "Dataset paths seem okay.")


In [None]:

# Show a few image/mask pairs from one city folder.
# Cityscapes keeps one subfolder per city inside each split, so a single city
# is picked and every '<name>_leftImg8bit.png' image is paired with a gtFine
# label file of the same base name.
#
# These names are initialised up front because later cells reuse them; without
# the defaults they would be undefined whenever the dataset is missing or empty.
city_folders = []
image_files = []
label_path = None

if os.path.isdir(IMG_DIR):
    # Sorted so the "first" city is the same on every run (os.listdir order is arbitrary).
    city_folders = sorted(
        f for f in os.listdir(IMG_DIR) if os.path.isdir(os.path.join(IMG_DIR, f))
    )

if city_folders:
    example_city_img_dir = os.path.join(IMG_DIR, city_folders[0])
    example_city_label_dir = os.path.join(LABEL_DIR, city_folders[0])  # Corresponding label city folder
    # glob also returns files in arbitrary order; sort for reproducible samples.
    image_files = sorted(glob.glob(os.path.join(example_city_img_dir, '*.png')))

    if image_files:
        num_samples_to_show = 3
        sample_paths = image_files[:num_samples_to_show]
        # Report how many samples are actually shown (the folder may hold fewer than requested).
        print(f"\n--- Visualizing {len(sample_paths)} Sample Images and Masks ---")

        for img_path in sample_paths:
            # Construct the corresponding label path.
            base_name = os.path.basename(img_path).replace('_leftImg8bit.png', '')
            label_name_options = [
                f'{base_name}_gtFine_labelIds.png',   # Standard label ID file
                f'{base_name}_gtFine_color.png',      # Color visualization file
                f'{base_name}_gtFine_instanceIds.png' # Instance ID file
            ]

            # Use the first candidate that exists (labelIds is preferred).
            label_path = None
            for name in label_name_options:
                potential_path = os.path.join(example_city_label_dir, name)
                if os.path.exists(potential_path):
                    label_path = potential_path
                    print(f"Found label: {os.path.basename(label_path)}")
                    break

            if label_path:
                try:
                    # Context managers release both file handles once the figure is drawn.
                    with Image.open(img_path) as img, Image.open(label_path) as label:
                        fig, axes = plt.subplots(1, 2, figsize=(12, 6))
                        axes[0].imshow(img)
                        axes[0].set_title(f"Image: {os.path.basename(img_path)}")
                        axes[0].axis('off')

                        axes[1].imshow(label)  # The mask may hold grayscale IDs or colors
                        axes[1].set_title(f"Mask: {os.path.basename(label_path)}")
                        axes[1].axis('off')

                        plt.tight_layout()
                        plt.show()
                except Exception as e:
                    # Best-effort preview: report the failing pair and move on to the next sample.
                    print(f"Could not load or display image/label pair: {img_path}, {label_path}. Error: {e}")
            else:
                print(f"Could not find a matching label for image: {img_path}")
    else:
        print(f"No image files found in example directory: {example_city_img_dir}")
else:
    print(f"No city subfolders found in image directory: {IMG_DIR}")


In [None]:

print("\n--- Checking Image Dimensions (First few images) ---")
image_dims = set()
# Reuses the file list gathered by the visualization cell; only the first
# five files are opened.
if city_folders and image_files:
    for sample_path in image_files[:5]:
        try:
            with Image.open(sample_path) as opened:
                width_height = opened.size  # PIL reports (width, height)
        except Exception as e:
            print(f"Could not read image {sample_path}: {e}")
        else:
            image_dims.add(width_height)

    if not image_dims:
        print("Could not determine image dimensions.")
    else:
        print(f"Found image dimensions (width, height): {image_dims}")
        if len(image_dims) != 1:
            print("Images have varying dimensions.")
        else:
            print("All checked images have the same dimensions.")


In [None]:

print("\n--- Class Distribution Analysis (Outline) ---")
print("To analyze class distribution:")
print("1. Find all relevant label files (e.g., '*_gtFine_labelIds.png') across all cities/splits.")
print("2. Load each label image (e.g., using Pillow or OpenCV).")
print("3. Convert the image to a NumPy array.")
print("4. Count the occurrences of each pixel value (class ID) using numpy.unique or collections.Counter.")
print("5. Aggregate counts across all images.")
print("6. Map the IDs to class names (refer to Cityscapes documentation or 'labels.py' from cityscapesscripts).")
print("7. Visualize the distribution (e.g., using matplotlib bar chart).")

# Example snippet for counting in ONE label file.
# `label_path` is left over from the visualization cell. It is looked up
# defensively so this cell still runs (and simply skips the example) when that
# cell was not executed or found no label.
example_label_path = globals().get('label_path')
if example_label_path:
    example_label_name = os.path.basename(example_label_path)
    # Test the file name only, so a directory whose name contains 'labelIds'
    # cannot make a color/instance file look like a label-ID file.
    if 'labelIds' in example_label_name:
        try:
            print(f"\nExample count for one file: {example_label_name}")
            with Image.open(example_label_path) as label_img:
                label_array = np.array(label_img)
            unique_ids, counts = np.unique(label_array, return_counts=True)
            # Plain ints so the dict prints as {7: 123, ...} instead of NumPy scalar reprs.
            pixel_counts = {int(class_id): int(n) for class_id, n in zip(unique_ids, counts)}
            print("Pixel counts (ID: count):", pixel_counts)
            # You would need to aggregate this across all files.
            # Remember to consult Cityscapes docs for what each ID means!
        except Exception as e:
            print(f"Could not process label file {example_label_path} for counts: {e}")
    else:
        print("\nSkipping single file count example as the found label was not a 'labelIds' file.")


# Useful distribution

In [None]:

import os
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from collections import Counter
import glob
import random
import cv2  # OpenCV for brightness analysis

# Dataset locations for this part of the notebook (make sure they match your Drive setup).
DRIVE_PATH = '/content/drive/MyDrive/'
CITYSCAPES_ROOT = os.path.join(DRIVE_PATH, 'cityscape')

# Each kind of data ('leftImg8bit' images, 'gtFine' labels) has a 'train' and a 'val' split folder.
IMG_DIR_TRAIN, IMG_DIR_VAL = (
    os.path.join(CITYSCAPES_ROOT, 'leftImg8bit', split) for split in ('train', 'val')
)
LABEL_DIR_TRAIN, LABEL_DIR_VAL = (
    os.path.join(CITYSCAPES_ROOT, 'gtFine', split) for split in ('train', 'val')
)


In [None]:

def get_label_files(label_dir, file_pattern="*_gtFine_labelIds.png", limit=None):
    """Collect files matching a pattern from every city subdirectory.

    Args:
        label_dir: Split-level directory that contains one subfolder per city.
        file_pattern: Glob pattern applied inside each city folder.
        limit: Maximum number of paths to return. ``None`` means "no limit";
            ``0`` returns an empty list. When more files exist than ``limit``,
            a random sample of exactly ``limit`` paths is returned.

    Returns:
        A list of matching paths. When no sampling takes place the list is
        ordered by city name, then file name, so repeated calls agree. An
        empty list (after printing a warning) if ``label_dir`` is not a
        directory.

    Raises:
        ValueError: Propagated from ``random.sample`` for a negative ``limit``.
    """
    if not os.path.isdir(label_dir):
        print(f"Warning: Label directory not found: {label_dir}")
        return []

    all_files = []
    # os.listdir and glob return entries in arbitrary order; sort both levels
    # so the unsampled result is deterministic.
    for city in sorted(os.listdir(label_dir)):
        city_path = os.path.join(label_dir, city)
        if os.path.isdir(city_path):
            all_files.extend(sorted(glob.glob(os.path.join(city_path, file_pattern))))

    # Compare against None explicitly: a plain truthiness test would treat
    # limit=0 as "no limit" and return every file.
    if limit is not None and len(all_files) > limit:
        return random.sample(all_files, limit)
    return all_files


In [None]:

print("\n--- 1. Calculating Class Pixel Distribution (Subset) ---")
# Process a limited number of label files for efficiency
NUM_FILES_FOR_DIST = 100  # Adjust as needed for performance vs accuracy
TOP_N_CLASSES = 20        # How many of the most frequent IDs are printed and plotted
label_files_subset = get_label_files(LABEL_DIR_TRAIN, limit=NUM_FILES_FOR_DIST)

if label_files_subset:
    print(f"Analyzing {len(label_files_subset)} label files...")
    total_pixel_counts = Counter()
    processed_files = 0
    for label_path in label_files_subset:
        try:
            # Read the pixels, then release the file handle right away.
            with Image.open(label_path) as label_img:
                label_array = np.array(label_img)
            unique_ids, counts = np.unique(label_array, return_counts=True)
            # Accumulate with plain Python ints so keys and totals print cleanly
            # and match the int keys of the name map below.
            total_pixel_counts.update(
                {int(class_id): int(n) for class_id, n in zip(unique_ids, counts)}
            )
            processed_files += 1
        except Exception as e:
            # Best-effort aggregation: report the bad file and keep going.
            print(f"Could not process label file {label_path}: {e}")

    if processed_files > 0:
        print(f"\nAggregated Pixel Counts (Top {TOP_N_CLASSES} IDs) from {processed_files} files:")
        # All IDs ordered by pixel count (descending); only the head is displayed.
        sorted_counts = dict(total_pixel_counts.most_common())
        top_items = list(sorted_counts.items())[:TOP_N_CLASSES]

        # --- Plotting the distribution ---
        # Label IDs as listed in cityscapesscripts' labels.py. IDs missing from
        # this map fall back to "ID_<n>" below.
        id_to_name_map = {
            0: 'unlabeled', 1: 'ego vehicle', 2: 'rect border', 3: 'out of roi', 4: 'static',
            5: 'dynamic', 6: 'ground', 7: 'road', 8: 'sidewalk', 9: 'parking', 10: 'rail track',
            11: 'building', 12: 'wall', 13: 'fence', 14: 'guard rail', 15: 'bridge', 16: 'tunnel',
            17: 'pole', 18: 'polegroup', 19: 'traffic light', 20: 'traffic sign', 21: 'vegetation',
            22: 'terrain', 23: 'sky', 24: 'person', 25: 'rider', 26: 'car', 27: 'truck',
            28: 'bus', 29: 'caravan', 30: 'trailer', 31: 'train', 32: 'motorcycle', 33: 'bicycle',
            -1: 'license plate',
        }

        # Prepare data for plotting (use names if available, else IDs)
        plot_labels = [id_to_name_map.get(class_id, f"ID_{class_id}") for class_id, _ in top_items]
        plot_counts = [count for _, count in top_items]
        for class_name, (class_id, count) in zip(plot_labels, top_items):
            print(f"  {class_name} ({class_id}): {count}")

        # Plotting the distribution using a log scale on the y-axis
        if plot_labels:
            plt.figure(figsize=(12, 8))
            plt.bar(plot_labels, plot_counts)
            plt.xlabel("Class Name (or ID)")
            plt.ylabel("Total Pixel Count (Log Scale)")
            plt.title(f"Pixel Class Distribution (Top {len(plot_labels)} from {processed_files} Train Images)")
            plt.xticks(rotation=90)
            plt.yscale('log')  # Use log scale to better visualize large differences
            plt.tight_layout()
            plt.show()
        else:
            print("No plottable data generated for class distribution.")
    else:
        print("No label files were successfully processed.")
else:
    print("No label files found to calculate distribution.")


In [None]:

print("\n--- 3. Analysis per City (Example: Image Count) ---")

def count_images_per_city(image_base_dir):
    """Count the PNG files in each city subfolder of a split directory.

    Returns a Counter mapping each subfolder name to the number of '*.png'
    files directly inside it. If `image_base_dir` is not a directory, a
    warning is printed and an empty Counter is returned.
    """
    if not os.path.isdir(image_base_dir):
        print(f"Warning: Image directory not found: {image_base_dir}")
        return Counter()

    per_city = Counter()
    for entry in os.listdir(image_base_dir):
        city_dir = os.path.join(image_base_dir, entry)
        if not os.path.isdir(city_dir):
            continue  # Only subfolders count as cities; loose files are ignored
        per_city[entry] = len(glob.glob(os.path.join(city_dir, '*.png')))
    return per_city

train_city_counts = count_images_per_city(IMG_DIR_TRAIN)
val_city_counts = count_images_per_city(IMG_DIR_VAL)

# One report per split: a heading, then either one line per city or a
# fallback message when no counts are available.
_split_reports = (
    ("\nImage Counts per City (Train Split):", train_city_counts, "  Could not retrieve train counts."),
    ("\nImage Counts per City (Validation Split):", val_city_counts, "  Could not retrieve validation counts."),
)
for heading, split_counts, fallback in _split_reports:
    print(heading)
    if not split_counts:
        print(fallback)
        continue
    for city, count in split_counts.items():
        print(f"  {city}: {count}")


In [None]:

print("\n--- 4. Comparing Train vs Validation Splits ---")
print("To compare splits (e.g., Train vs Val):")
print("1. Define functions for specific analyses (like pixel distribution or brightness).")
print("2. Call these functions separately for the Train and Validation directories.")
print("3. Plot the results side-by-side or overlayed.")


def _collect_split_images(image_dir):
    """Return every '*.png' path inside the city subfolders of `image_dir` ([] if it is missing)."""
    if not os.path.isdir(image_dir):
        print(f"Warning: Image directory not found: {image_dir}")
        return []
    paths = []
    for city in sorted(os.listdir(image_dir)):
        city_path = os.path.join(image_dir, city)
        if os.path.isdir(city_path):
            paths.extend(sorted(glob.glob(os.path.join(city_path, '*.png'))))
    return paths


def _sample_paths(paths, limit):
    """Return `paths` unchanged if it has at most `limit` entries, else a random sample of `limit`."""
    return random.sample(paths, limit) if len(paths) > limit else paths


def _mean_brightness_of(image_paths):
    """Return the mean grayscale intensity of each readable image in `image_paths`."""
    values = []
    skipped = 0
    for path in image_paths:
        try:
            gray = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        except Exception as e:
            print(f"Could not read image {path}: {e}")
            gray = None
        if gray is None:  # A failed read yields None rather than an array
            skipped += 1
            continue
        values.append(float(np.mean(gray)))
    if skipped:
        # Surface skipped files instead of silently shrinking the sample.
        print(f"Skipped {skipped} unreadable image(s).")
    return values


# The subset size and the train brightness values are not defined in this cell.
# Reuse them when an earlier brightness cell has already produced them, and
# fall back to computing them here so the comparison never fails with a NameError.
brightness_sample_size = globals().get('NUM_FILES_FOR_BRIGHTNESS', 100)

# Calculate brightness for validation images (on a subset)
all_val_image_files = _collect_split_images(IMG_DIR_VAL)
val_image_files_subset = _sample_paths(all_val_image_files, brightness_sample_size)
val_mean_brightness_values = _mean_brightness_of(val_image_files_subset)
val_processed_count = len(val_mean_brightness_values)

if 'mean_brightness_values' not in globals():
    mean_brightness_values = _mean_brightness_of(
        _sample_paths(_collect_split_images(IMG_DIR_TRAIN), brightness_sample_size)
    )
# The legend count is the number of values actually plotted.
train_processed_count = len(mean_brightness_values)

# Plotting brightness comparison between Train and Validation images
if mean_brightness_values and val_mean_brightness_values:
    plt.figure(figsize=(12, 7))
    plt.hist(mean_brightness_values, bins=50, alpha=0.7, label=f'Train (n={train_processed_count})', density=True)
    plt.hist(val_mean_brightness_values, bins=50, alpha=0.7, label=f'Validation (n={val_processed_count})', density=True)
    plt.xlabel("Mean Pixel Brightness (0-255)")
    plt.ylabel("Density")
    plt.title("Comparison of Mean Image Brightness Distribution (Train vs Val Subsets)")
    plt.legend()
    plt.grid(axis='y', alpha=0.75)
    plt.show()
else:
    print("\nCould not generate brightness comparison plot (missing data for train or val).")
