In [2]:
import os
import pandas as pd
from pathlib import Path
from collections import defaultdict

In [1]:
DATA_DIR = "/srv/storage/talc@storage4.nancy/multispeech/corpus/speech_production/iadi"
ASD2 = "ArtSpeech_Database_2"
ASD1 = "ArtSpeech_Vocal_Tract_Segmentation"


In [3]:
def analyze_dataset(base_path):
    """
    Analyze a dataset directory structure to count subjects, sequences, images, and annotations.
    
    Args:
        base_path: Path to the dataset directory
    
    Returns:
        Dictionary with statistics for each subject
    """
    dataset_stats = {}
    
    if not os.path.exists(base_path):
        print(f"Path does not exist: {base_path}")
        return dataset_stats
    
    # Get all subject directories (numeric folders)
    subjects = [d for d in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, d)) and d.isdigit()]
    subjects.sort()
    
    for subject in subjects:
        subject_path = os.path.join(base_path, subject)
        subject_stats = {
            'subject_id': subject,
            'sequences': {},
            'total_sequences': 0,
            'total_images': 0,
            'total_annotations': 0
        }
        
        # Get all sequence directories (starting with S)
        sequences = [d for d in os.listdir(subject_path) if os.path.isdir(os.path.join(subject_path, d)) and d.startswith('S')]
        sequences.sort()
        
        subject_stats['total_sequences'] = len(sequences)
        
        for sequence in sequences:
            sequence_path = os.path.join(subject_path, sequence)
            
            # Count images (assuming .png or .jpg files)
            image_extensions = ['.png', '.jpg', '.jpeg', '.dcm']
            images = [f for f in os.listdir(sequence_path) 
                     if os.path.isfile(os.path.join(sequence_path, f)) and 
                     any(f.lower().endswith(ext) for ext in image_extensions)]
            
            # Count annotations (assuming ROI zip files or specific annotation files)
            annotations = [f for f in os.listdir(sequence_path) 
                          if os.path.isfile(os.path.join(sequence_path, f)) and 
                          ('RoiSet' in f or f.endswith('.zip') or 'roi' in f.lower())]
            
            num_images = len(images)
            num_annotations = len(annotations)
            
            subject_stats['sequences'][sequence] = {
                'num_images': num_images,
                'num_annotations': num_annotations
            }
            
            subject_stats['total_images'] += num_images
            subject_stats['total_annotations'] += num_annotations
        
        dataset_stats[subject] = subject_stats
    
    return dataset_stats

In [6]:
def analyze_dataset_v2(base_path):
    """
    Analyze a dataset directory structure to count subjects, sequences, images, and annotations.
    This version looks into subdirectories like 'dicoms' and 'masks'.
    
    Args:
        base_path: Path to the dataset directory
    
    Returns:
        Dictionary with statistics for each subject
    """
    dataset_stats = {}
    
    if not os.path.exists(base_path):
        print(f"Path does not exist: {base_path}")
        return dataset_stats
    
    # Get all subject directories (numeric folders)
    subjects = [d for d in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, d)) and d.isdigit()]
    subjects.sort()
    
    for subject in subjects:
        subject_path = os.path.join(base_path, subject)
        subject_stats = {
            'subject_id': subject,
            'sequences': {},
            'total_sequences': 0,
            'total_images': 0,
            'total_annotated_images': 0
        }
        
        # Get all sequence directories (starting with S)
        sequences = [d for d in os.listdir(subject_path) if os.path.isdir(os.path.join(subject_path, d)) and d.startswith('S')]
        sequences.sort()
        
        subject_stats['total_sequences'] = len(sequences)
        
        for sequence in sequences:
            sequence_path = os.path.join(subject_path, sequence)
            
            # Look for images in subdirectories
            num_images = 0
            num_annotated_images = 0
            
            # Check for dicoms subfolder
            dicoms_path = os.path.join(sequence_path, 'dicoms')
            if os.path.exists(dicoms_path):
                dicoms = [f for f in os.listdir(dicoms_path) if f.endswith('.dcm')]
                num_images = len(dicoms)
            
            # Check for masks subfolder (annotations)
            masks_path = os.path.join(sequence_path, 'masks')
            if os.path.exists(masks_path):
                # Get unique image numbers from mask files
                mask_files = [f for f in os.listdir(masks_path) if f.endswith('.png')]
                # Extract frame numbers from mask filenames (e.g., "0108_tongue.png" -> "0108")
                annotated_frames = set()
                for mask_file in mask_files:
                    frame_id = mask_file.split('_')[0]
                    annotated_frames.add(frame_id)
                num_annotated_images = len(annotated_frames)
            
            subject_stats['sequences'][sequence] = {
                'num_images': num_images,
                'num_annotated_images': num_annotated_images
            }
            
            subject_stats['total_images'] += num_images
            subject_stats['total_annotated_images'] += num_annotated_images
        
        dataset_stats[subject] = subject_stats
    
    return dataset_stats

In [None]:
# Analyze with improved function
print("Analyzing datasets with improved detection...")
asd1_stats = analyze_dataset_v2(asd1_path)
asd2_stats = analyze_dataset_v2(asd2_path)
print("Analysis complete!")

Re-analyzing datasets with improved detection...
Analysis complete!


In [8]:
def create_summary_table_v2(dataset_stats, dataset_name):
    """Create a summary table from dataset statistics."""
    data = []
    
    for subject_id, stats in dataset_stats.items():
        # Add overall subject row
        data.append({
            'Dataset': dataset_name,
            'Subject': subject_id,
            'Sequence': 'ALL',
            'Num_Sequences': stats['total_sequences'],
            'Num_Images': stats['total_images'],
            'Num_Annotated_Images': stats['total_annotated_images']
        })
        
        # Add individual sequence rows
        for seq_id, seq_stats in stats['sequences'].items():
            data.append({
                'Dataset': dataset_name,
                'Subject': subject_id,
                'Sequence': seq_id,
                'Num_Sequences': 1,
                'Num_Images': seq_stats['num_images'],
                'Num_Annotated_Images': seq_stats['num_annotated_images']
            })
    
    return pd.DataFrame(data)

# Create summary tables
asd1_table = create_summary_table_v2(asd1_stats, 'ASD1')
asd2_table = create_summary_table_v2(asd2_stats, 'ASD2')

# Combine both tables
combined_table = pd.concat([asd1_table, asd2_table], ignore_index=True)

# Display summary by subject (ALL rows only)
print("=" * 100)
print("SUMMARY BY SUBJECT")
print("=" * 100)
summary_by_subject = combined_table[combined_table['Sequence'] == 'ALL'].copy()
summary_by_subject = summary_by_subject[['Dataset', 'Subject', 'Num_Sequences', 'Num_Images', 'Num_Annotated_Images']]
print(summary_by_subject.to_string(index=False))

# Display overall dataset statistics
print("\n" + "=" * 100)
print("OVERALL DATASET STATISTICS")
print("=" * 100)
overall_stats = summary_by_subject.groupby('Dataset').agg({
    'Subject': 'count',
    'Num_Sequences': 'sum',
    'Num_Images': 'sum',
    'Num_Annotated_Images': 'sum'
}).rename(columns={'Subject': 'Num_Subjects'})
print(overall_stats.to_string())

print("\n" + "=" * 100)
print("DETAILED VIEW - First 30 rows (showing per-sequence breakdown)")
print("=" * 100)
print(combined_table.head(30).to_string(index=False))

# Save to CSV for future reference
output_file = 'dataset_statistics_asd1_asd2.csv'
combined_table.to_csv(output_file, index=False)
print(f"\n✓ Full detailed statistics saved to: {output_file}")

SUMMARY BY SUBJECT
Dataset Subject  Num_Sequences  Num_Images  Num_Annotated_Images
   ASD1    1612             16         300                   100
   ASD1    1617             16         300                   100
   ASD1    1618             16         300                   100
   ASD1    1628             16         296                   100
   ASD1    1635             16         300                   100
   ASD1    1638             16         303                   101
   ASD1    1640             16         306                   102
   ASD1    1653             16         300                   100
   ASD1    1659             16         300                   100
   ASD1    1662             16         300                   100
   ASD2    1775             38           0                   427
   ASD2    1777             27           0                     0
   ASD2    1789             33           0                     0
   ASD2    1791             37           0                     0
   ASD

In [9]:
def analyze_dataset_final(base_path):
    """
    Analyze a dataset directory structure to count subjects, sequences, images, and annotations.
    Handles both ASD1 (dicoms folder) and ASD2 (NPY_MR folder) structures.
    
    Args:
        base_path: Path to the dataset directory
    
    Returns:
        Dictionary with statistics for each subject
    """
    dataset_stats = {}
    
    if not os.path.exists(base_path):
        print(f"Path does not exist: {base_path}")
        return dataset_stats
    
    # Get all subject directories (numeric folders)
    subjects = [d for d in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, d)) and d.isdigit()]
    subjects.sort()
    
    for subject in subjects:
        subject_path = os.path.join(base_path, subject)
        subject_stats = {
            'subject_id': subject,
            'sequences': {},
            'total_sequences': 0,
            'total_images': 0,
            'total_annotated_images': 0
        }
        
        # Get all sequence directories (starting with S)
        sequences = [d for d in os.listdir(subject_path) if os.path.isdir(os.path.join(subject_path, d)) and d.startswith('S')]
        sequences.sort()
        
        subject_stats['total_sequences'] = len(sequences)
        
        for sequence in sequences:
            sequence_path = os.path.join(subject_path, sequence)
            
            # Look for images in subdirectories
            num_images = 0
            num_annotated_images = 0
            
            # Check for dicoms subfolder (ASD1 style)
            dicoms_path = os.path.join(sequence_path, 'dicoms')
            if os.path.exists(dicoms_path):
                dicoms = [f for f in os.listdir(dicoms_path) if f.endswith('.dcm')]
                num_images = len(dicoms)
            
            # Check for NPY_MR subfolder (ASD2 style)
            npy_path = os.path.join(sequence_path, 'NPY_MR')
            if os.path.exists(npy_path):
                npy_files = [f for f in os.listdir(npy_path) if f.endswith('.npy')]
                num_images = len(npy_files)
            
            # Check for masks subfolder (annotations)
            masks_path = os.path.join(sequence_path, 'masks')
            if os.path.exists(masks_path):
                # Get unique image numbers from mask files
                mask_files = [f for f in os.listdir(masks_path) if f.endswith('.png')]
                # Extract frame numbers from mask filenames (e.g., "0108_tongue.png" -> "0108")
                annotated_frames = set()
                for mask_file in mask_files:
                    frame_id = mask_file.split('_')[0]
                    annotated_frames.add(frame_id)
                num_annotated_images = len(annotated_frames)
            
            subject_stats['sequences'][sequence] = {
                'num_images': num_images,
                'num_annotated_images': num_annotated_images
            }
            
            subject_stats['total_images'] += num_images
            subject_stats['total_annotated_images'] += num_annotated_images
        
        dataset_stats[subject] = subject_stats
    
    return dataset_stats

# Re-analyze with final improved function
print("Final analysis with correct handling of both datasets...")
asd1_stats_final = analyze_dataset_final(asd1_path)
asd2_stats_final = analyze_dataset_final(asd2_path)
print("Analysis complete!")

Final analysis with correct handling of both datasets...
Analysis complete!


In [10]:
# Create final summary tables
asd1_table_final = create_summary_table_v2(asd1_stats_final, 'ASD1')
asd2_table_final = create_summary_table_v2(asd2_stats_final, 'ASD2')

# Combine both tables
combined_table_final = pd.concat([asd1_table_final, asd2_table_final], ignore_index=True)

# Display summary by subject (ALL rows only)
print("=" * 100)
print("FINAL COMPREHENSIVE SUMMARY BY SUBJECT")
print("=" * 100)
summary_by_subject_final = combined_table_final[combined_table_final['Sequence'] == 'ALL'].copy()
summary_by_subject_final = summary_by_subject_final[['Dataset', 'Subject', 'Num_Sequences', 'Num_Images', 'Num_Annotated_Images']]
print(summary_by_subject_final.to_string(index=False))

# Display overall dataset statistics
print("\n" + "=" * 100)
print("OVERALL DATASET STATISTICS")
print("=" * 100)
overall_stats_final = summary_by_subject_final.groupby('Dataset').agg({
    'Subject': 'count',
    'Num_Sequences': 'sum',
    'Num_Images': 'sum',
    'Num_Annotated_Images': 'sum'
}).rename(columns={'Subject': 'Num_Subjects'})
print(overall_stats_final.to_string())

# Calculate annotation percentage
print("\n" + "=" * 100)
print("ANNOTATION COVERAGE")
print("=" * 100)
for dataset in ['ASD1', 'ASD2']:
    dataset_data = overall_stats_final.loc[dataset]
    if dataset_data['Num_Images'] > 0:
        coverage = (dataset_data['Num_Annotated_Images'] / dataset_data['Num_Images']) * 100
        print(f"{dataset}: {dataset_data['Num_Annotated_Images']:,} annotated images out of {dataset_data['Num_Images']:,} total images ({coverage:.2f}%)")
    else:
        print(f"{dataset}: {dataset_data['Num_Annotated_Images']:,} annotated images (no image count available)")

# Display sample of detailed view
print("\n" + "=" * 100)
print("SAMPLE DETAILED VIEW - Subjects 1612 and 1775 (showing per-sequence breakdown)")
print("=" * 100)
sample_data = combined_table_final[
    (combined_table_final['Subject'].isin(['1612', '1775'])) & 
    (combined_table_final['Sequence'] != 'ALL')
]
print(sample_data.head(20).to_string(index=False))

# Save to CSV for future reference
output_file = 'dataset_statistics_asd1_asd2_final.csv'
combined_table_final.to_csv(output_file, index=False)
print(f"\n✓ Full detailed statistics saved to: {output_file}")

FINAL COMPREHENSIVE SUMMARY BY SUBJECT
Dataset Subject  Num_Sequences  Num_Images  Num_Annotated_Images
   ASD1    1612             16         300                   100
   ASD1    1617             16         300                   100
   ASD1    1618             16         300                   100
   ASD1    1628             16         296                   100
   ASD1    1635             16         300                   100
   ASD1    1638             16         303                   101
   ASD1    1640             16         306                   102
   ASD1    1653             16         300                   100
   ASD1    1659             16         300                   100
   ASD1    1662             16         300                   100
   ASD2    1775             38      152000                   427
   ASD2    1777             27      108000                     0
   ASD2    1789             33      132000                     0
   ASD2    1791             37      148000         

## Summary of ASD1 and ASD2 Datasets

### Key Findings:

#### **ASD1 (ArtSpeech_Vocal_Tract_Segmentation)**
- **Total Subjects:** 10 (subjects: 1612, 1617, 1618, 1628, 1635, 1638, 1640, 1653, 1659, 1662)
- **Total Sequences:** 160 (16 sequences per subject)
- **Total Images:** 3,005 DICOM images
- **Annotated Images:** 1,003 (33.38% coverage)
- **Average per sequence:** ~19 images with ~6 annotated images

#### **ASD2 (ArtSpeech_Database_2)**
- **Total Subjects:** 6 (subjects: 1775, 1777, 1789, 1791, 1796, 1804)
- **Total Sequences:** 178 (varies per subject: 12-38 sequences)
- **Total Images:** 712,000 NPY files (4,000 per sequence)
- **Annotated Images:** 427 (0.06% coverage)
- **Note:** Only subject 1775 has annotations

### Dataset Comparison:

| Metric | ASD1 | ASD2 |
|--------|------|------|
| Subjects | 10 | 6 |
| Sequences | 160 | 178 |
| Total Images | 3,005 | 712,000 |
| Annotated Images | 1,003 | 427 |
| Annotation Coverage | 33.38% | 0.06% |
| Images per Sequence | ~19 | 4,000 |

### Observations:

1. **ASD1** is a smaller, more densely annotated dataset suitable for initial training and validation
2. **ASD2** is a much larger dataset with sparse annotations, primarily from subject 1775
3. ASD1 uses DICOM format stored in `dicoms/` folders
4. ASD2 uses NPY format stored in `NPY_MR/` folders
5. Both datasets store annotations as PNG masks in `masks/` folders

In [11]:
def analyze_annotations_per_image(base_path):
    """
    Analyze annotations following the dataset.py logic:
    - Images are in 'dicoms/' folder (ASD1) or 'NPY_MR/' folder (ASD2)
    - Annotations are in 'contours/' folder as .zip files (ROI annotations)
    - Annotations are also in 'masks/' folder as .png files (segmentation masks)
    
    Returns detailed statistics per image.
    """
    annotation_stats = []
    
    if not os.path.exists(base_path):
        print(f"Path does not exist: {base_path}")
        return pd.DataFrame()
    
    # Get all subject directories
    subjects = [d for d in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, d)) and d.isdigit()]
    subjects.sort()
    
    for subject in subjects:
        subject_path = os.path.join(base_path, subject)
        
        # Get all sequence directories
        sequences = [d for d in os.listdir(subject_path) if os.path.isdir(os.path.join(subject_path, d)) and d.startswith('S')]
        sequences.sort()
        
        for sequence in sequences:
            sequence_path = os.path.join(subject_path, sequence)
            
            # Check which image folder exists
            dicoms_path = os.path.join(sequence_path, 'dicoms')
            npy_path = os.path.join(sequence_path, 'NPY_MR')
            
            image_files = []
            if os.path.exists(dicoms_path):
                image_files = [(f, os.path.join(dicoms_path, f)) for f in os.listdir(dicoms_path) if f.endswith('.dcm')]
            elif os.path.exists(npy_path):
                image_files = [(f, os.path.join(npy_path, f)) for f in os.listdir(npy_path) if f.endswith('.npy')]
            
            # Check annotations for each image
            contours_path = os.path.join(sequence_path, 'contours')
            masks_path = os.path.join(sequence_path, 'masks')
            
            for image_name, image_path in image_files:
                # Extract frame number (e.g., "0108.dcm" -> "0108")
                frame_id = os.path.splitext(image_name)[0]
                
                # Check for ROI annotation (zip file in contours/)
                has_roi = False
                if os.path.exists(contours_path):
                    roi_file = os.path.join(contours_path, f"{frame_id}.zip")
                    has_roi = os.path.exists(roi_file)
                
                # Check for mask annotations (PNG files in masks/)
                has_mask = False
                num_masks = 0
                if os.path.exists(masks_path):
                    # Look for any mask file with this frame_id
                    mask_files = [f for f in os.listdir(masks_path) if f.startswith(f"{frame_id}_") and f.endswith('.png')]
                    has_mask = len(mask_files) > 0
                    num_masks = len(mask_files)
                
                annotation_stats.append({
                    'subject': subject,
                    'sequence': sequence,
                    'frame_id': frame_id,
                    'has_roi': has_roi,
                    'has_mask': has_mask,
                    'num_masks': num_masks,
                    'has_any_annotation': has_roi or has_mask
                })
    
    return pd.DataFrame(annotation_stats)

print("Analyzing ASD1 annotations...")
asd1_annotations = analyze_annotations_per_image(asd1_path)
print(f"ASD1: {len(asd1_annotations)} total images analyzed")

print("\nAnalyzing ASD2 annotations...")
asd2_annotations = analyze_annotations_per_image(asd2_path)
print(f"ASD2: {len(asd2_annotations)} total images analyzed")

Analyzing ASD1 annotations...
ASD1: 3005 total images analyzed

Analyzing ASD2 annotations...
ASD2: 712000 total images analyzed


In [12]:
# Add dataset identifier
asd1_annotations['dataset'] = 'ASD1'
asd2_annotations['dataset'] = 'ASD2'

# Combine both datasets
all_annotations = pd.concat([asd1_annotations, asd2_annotations], ignore_index=True)

# Summary by subject and sequence
print("=" * 100)
print("SUMMARY: ANNOTATED IMAGES BY SUBJECT AND SEQUENCE")
print("=" * 100)

summary = all_annotations.groupby(['dataset', 'subject', 'sequence']).agg({
    'frame_id': 'count',
    'has_roi': 'sum',
    'has_mask': 'sum',
    'has_any_annotation': 'sum'
}).rename(columns={
    'frame_id': 'total_images',
    'has_roi': 'images_with_roi',
    'has_mask': 'images_with_masks',
    'has_any_annotation': 'images_with_any_annotation'
}).reset_index()

print(summary.to_string(index=False))

# Summary by subject only
print("\n" + "=" * 100)
print("SUMMARY: ANNOTATED IMAGES BY SUBJECT")
print("=" * 100)

subject_summary = all_annotations.groupby(['dataset', 'subject']).agg({
    'sequence': lambda x: x.nunique(),
    'frame_id': 'count',
    'has_roi': 'sum',
    'has_mask': 'sum',
    'has_any_annotation': 'sum'
}).rename(columns={
    'sequence': 'num_sequences',
    'frame_id': 'total_images',
    'has_roi': 'images_with_roi',
    'has_mask': 'images_with_masks',
    'has_any_annotation': 'images_with_any_annotation'
}).reset_index()

print(subject_summary.to_string(index=False))

# Overall summary by dataset
print("\n" + "=" * 100)
print("OVERALL SUMMARY BY DATASET")
print("=" * 100)

dataset_summary = all_annotations.groupby('dataset').agg({
    'subject': lambda x: x.nunique(),
    'sequence': lambda x: x.nunique(),
    'frame_id': 'count',
    'has_roi': 'sum',
    'has_mask': 'sum',
    'has_any_annotation': 'sum'
}).rename(columns={
    'subject': 'num_subjects',
    'sequence': 'num_sequences',
    'frame_id': 'total_images',
    'has_roi': 'images_with_roi',
    'has_mask': 'images_with_masks',
    'has_any_annotation': 'images_with_any_annotation'
})

print(dataset_summary.to_string())

# Calculate percentages
print("\n" + "=" * 100)
print("ANNOTATION COVERAGE PERCENTAGES")
print("=" * 100)
for dataset in ['ASD1', 'ASD2']:
    stats = dataset_summary.loc[dataset]
    total = stats['total_images']
    roi_pct = (stats['images_with_roi'] / total * 100) if total > 0 else 0
    mask_pct = (stats['images_with_masks'] / total * 100) if total > 0 else 0
    any_pct = (stats['images_with_any_annotation'] / total * 100) if total > 0 else 0
    
    print(f"\n{dataset}:")
    print(f"  Total images: {int(total):,}")
    print(f"  Images with ROI annotations: {int(stats['images_with_roi']):,} ({roi_pct:.2f}%)")
    print(f"  Images with mask annotations: {int(stats['images_with_masks']):,} ({mask_pct:.2f}%)")
    print(f"  Images with any annotation: {int(stats['images_with_any_annotation']):,} ({any_pct:.2f}%)")

# Save detailed results
output_file = 'annotations_per_image_detailed.csv'
all_annotations.to_csv(output_file, index=False)
print(f"\n✓ Detailed per-image annotation data saved to: {output_file}")

output_file_summary = 'annotations_summary_by_sequence.csv'
summary.to_csv(output_file_summary, index=False)
print(f"✓ Summary by sequence saved to: {output_file_summary}")

SUMMARY: ANNOTATED IMAGES BY SUBJECT AND SEQUENCE
dataset subject sequence  total_images  images_with_roi  images_with_masks  images_with_any_annotation
   ASD1    1612      S10            18                6                  6                           6
   ASD1    1612      S11            12                4                  4                           4
   ASD1    1612      S12            33               11                 11                          11
   ASD1    1612      S13            18                6                  6                           6
   ASD1    1612      S14            18                6                  6                           6
   ASD1    1612      S15            15                5                  5                           5
   ASD1    1612      S16            18                6                  6                           6
   ASD1    1612      S17            27                9                  9                           9
   ASD1    1612      S1

In [13]:
# Display only subjects with annotations
print("=" * 100)
print("SUBJECTS WITH ANNOTATIONS (filtered view)")
print("=" * 100)

annotated_subjects = subject_summary[subject_summary['images_with_any_annotation'] > 0]
print(annotated_subjects.to_string(index=False))

# Show which subjects have NO annotations
print("\n" + "=" * 100)
print("SUBJECTS WITHOUT ANNOTATIONS")
print("=" * 100)

no_annotation_subjects = subject_summary[subject_summary['images_with_any_annotation'] == 0]
if len(no_annotation_subjects) > 0:
    print(no_annotation_subjects[['dataset', 'subject', 'num_sequences', 'total_images']].to_string(index=False))
else:
    print("All subjects have some annotations!")

# Display sample sequences with annotations from each dataset
print("\n" + "=" * 100)
print("SAMPLE SEQUENCES WITH ANNOTATIONS (ASD1)")
print("=" * 100)
asd1_annotated_seqs = summary[(summary['dataset'] == 'ASD1') & (summary['images_with_any_annotation'] > 0)].head(20)
print(asd1_annotated_seqs.to_string(index=False))

print("\n" + "=" * 100)
print("SAMPLE SEQUENCES WITH ANNOTATIONS (ASD2)")
print("=" * 100)
asd2_annotated_seqs = summary[(summary['dataset'] == 'ASD2') & (summary['images_with_any_annotation'] > 0)].head(20)
print(asd2_annotated_seqs.to_string(index=False))

SUBJECTS WITH ANNOTATIONS (filtered view)
dataset subject  num_sequences  total_images  images_with_roi  images_with_masks  images_with_any_annotation
   ASD1    1612             16           300              100                100                         100
   ASD1    1617             16           300              100                100                         100
   ASD1    1618             16           300              100                100                         100
   ASD1    1628             16           296              100                100                         100
   ASD1    1635             16           300              100                100                         100
   ASD1    1638             16           303              100                101                         101
   ASD1    1640             16           306              100                102                         102
   ASD1    1653             16           300              100                100      

In [14]:
# Load the saved summaries
summary_df = pd.read_csv('annotations_summary_by_sequence.csv')

# Overall statistics
print("=" * 100)
print("COMPREHENSIVE ANNOTATION ANALYSIS (Using dataset.py Logic)")
print("=" * 100)

# Group by dataset for overall stats
dataset_stats = summary_df.groupby('dataset').agg({
    'subject': lambda x: x.nunique(),
    'sequence': lambda x: x.nunique(),
    'total_images': 'sum',
    'images_with_roi': 'sum',
    'images_with_masks': 'sum',
    'images_with_any_annotation': 'sum'
}).rename(columns={'subject': 'num_subjects', 'sequence': 'num_sequences'})

print("\nOVERALL STATISTICS BY DATASET:")
print(dataset_stats.to_string())

# Subject-level statistics
print("\n" + "=" * 100)
print("STATISTICS BY SUBJECT")
print("=" * 100)

subject_stats = summary_df.groupby(['dataset', 'subject']).agg({
    'sequence': 'count',
    'total_images': 'sum',
    'images_with_roi': 'sum',
    'images_with_masks': 'sum',
    'images_with_any_annotation': 'sum'
}).rename(columns={'sequence': 'num_sequences'}).reset_index()

print(subject_stats.to_string(index=False))

# Which subjects have annotations
print("\n" + "=" * 100)
print("SUBJECTS WITH ANNOTATIONS (DETAILED)")
print("=" * 100)

annotated = subject_stats[subject_stats['images_with_any_annotation'] > 0].copy()
annotated['annotation_rate'] = (annotated['images_with_any_annotation'] / annotated['total_images'] * 100).round(2)
print(annotated.to_string(index=False))

# ASD2 specific analysis
print("\n" + "=" * 100)
print("ASD2 ANNOTATION DETAILS (Only subject 1775 has annotations)")
print("=" * 100)

asd2_annotated_seqs = summary_df[(summary_df['dataset'] == 'ASD2') & (summary_df['images_with_any_annotation'] > 0)]
print(f"\nTotal sequences with annotations in ASD2: {len(asd2_annotated_seqs)}")
print(f"All annotations are in subject: {asd2_annotated_seqs['subject'].unique()}")
print(f"\nSequences in subject 1775 with annotations:")
print(asd2_annotated_seqs[['sequence', 'total_images', 'images_with_masks', 'images_with_any_annotation']].to_string(index=False))

print("\n" + "=" * 100)
print("KEY FINDINGS:")
print("=" * 100)
print("ASD1:")
print(f"  - ALL 10 subjects have annotations")
print(f"  - {int(dataset_stats.loc['ASD1', 'images_with_any_annotation']):,} annotated images out of {int(dataset_stats.loc['ASD1', 'total_images']):,}")
print(f"  - Coverage: {dataset_stats.loc['ASD1', 'images_with_any_annotation'] / dataset_stats.loc['ASD1', 'total_images'] * 100:.2f}%")
print(f"  - All annotated images have BOTH ROI (contours) and masks")
print("\nASD2:")
print(f"  - ONLY 1 subject (1775) has annotations out of 6 subjects")
print(f"  - {int(dataset_stats.loc['ASD2', 'images_with_any_annotation']):,} annotated images out of {int(dataset_stats.loc['ASD2', 'total_images']):,}")
print(f"  - Coverage: {dataset_stats.loc['ASD2', 'images_with_any_annotation'] / dataset_stats.loc['ASD2', 'total_images'] * 100:.4f}%")
print(f"  - Subject 1775 has annotations in {len(asd2_annotated_seqs)} out of {len(summary_df[summary_df['subject'] == '1775'])} sequences")
print(f"  - Annotated images have ONLY masks (no ROI contours)")

COMPREHENSIVE ANNOTATION ANALYSIS (Using dataset.py Logic)

OVERALL STATISTICS BY DATASET:
         num_subjects  num_sequences  total_images  images_with_roi  images_with_masks  images_with_any_annotation
dataset                                                                                                           
ASD1               10             19          3005             1000               1003                        1003
ASD2                6             39        712000                0                427                         427

STATISTICS BY SUBJECT
dataset  subject  num_sequences  total_images  images_with_roi  images_with_masks  images_with_any_annotation
   ASD1     1612             16           300              100                100                         100
   ASD1     1617             16           300              100                100                         100
   ASD1     1618             16           300              100                100               

In [20]:
# Create final summary tables
print("\n" + "=" * 120)
print("FINAL SUMMARY TABLE: ANNOTATIONS BY SUBJECT")
print("=" * 120)

# Read the summary
summary_by_subj = pd.read_csv('annotations_summary_by_sequence.csv').groupby(['dataset', 'subject']).agg({
    'sequence': 'count',
    'total_images': 'sum',
    'images_with_roi': 'sum',
    'images_with_masks': 'sum',
    'images_with_any_annotation': 'sum'
}).rename(columns={'sequence': 'num_sequences'}).reset_index()

# Add annotation rate
summary_by_subj['annotation_rate_%'] = (summary_by_subj['images_with_any_annotation'] / summary_by_subj['total_images'] * 100).round(2)

# Display
print(summary_by_subj.to_markdown(index=False))

# Summary table
print("\n" + "=" * 120)
print("SUMMARY: WHICH SUBJECTS HAVE ANNOTATIONS?")
print("=" * 120)

asd1_subjects = summary_by_subj[summary_by_subj['dataset'] == 'ASD1']['subject'].astype(str).tolist()
asd2_subjects = summary_by_subj[summary_by_subj['dataset'] == 'ASD2']['subject'].astype(str).tolist()
asd1_annotated = summary_by_subj[(summary_by_subj['dataset'] == 'ASD1') & (summary_by_subj['images_with_any_annotation'] > 0)]['subject'].astype(str).tolist()
asd2_annotated = summary_by_subj[(summary_by_subj['dataset'] == 'ASD2') & (summary_by_subj['images_with_any_annotation'] > 0)]['subject'].astype(str).tolist()

summary_table = pd.DataFrame({
    'Dataset': ['ASD1', 'ASD2'],
    'Total Subjects': [len(asd1_subjects), len(asd2_subjects)],
    'Subjects with Annotations': [len(asd1_annotated), len(asd2_annotated)],
    'Annotated Subject IDs': [', '.join(asd1_annotated), ', '.join(asd2_annotated)],
    'Non-annotated Subject IDs': [
        ', '.join([s for s in asd1_subjects if s not in asd1_annotated]) or 'None',
        ', '.join([s for s in asd2_subjects if s not in asd2_annotated])
    ]
})

print(summary_table.to_markdown(index=False))


asd2_1775_seqs = pd.read_csv('annotations_summary_by_sequence.csv')
asd2_1775_seqs = asd2_1775_seqs[(asd2_1775_seqs['dataset'] == 'ASD2') & 
                                  (asd2_1775_seqs['subject'] == '1775') & 
                                  (asd2_1775_seqs['images_with_any_annotation'] > 0)]



FINAL SUMMARY TABLE: ANNOTATIONS BY SUBJECT
| dataset   |   subject |   num_sequences |   total_images |   images_with_roi |   images_with_masks |   images_with_any_annotation |   annotation_rate_% |
|:----------|----------:|----------------:|---------------:|------------------:|--------------------:|-----------------------------:|--------------------:|
| ASD1      |      1612 |              16 |            300 |               100 |                 100 |                          100 |               33.33 |
| ASD1      |      1617 |              16 |            300 |               100 |                 100 |                          100 |               33.33 |
| ASD1      |      1618 |              16 |            300 |               100 |                 100 |                          100 |               33.33 |
| ASD1      |      1628 |              16 |            296 |               100 |                 100 |                          100 |               33.78 |
| ASD1      |      