In [None]:
import sys
import os
sys.path.append('../src')

import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from collections import defaultdict, Counter
from PIL import Image
import cv2
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Set style
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*';
# older releases only know the un-prefixed 'seaborn' name. Pick whichever one
# this installation provides so the first cell does not fail on an older
# Matplotlib.
PLOT_STYLE = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(PLOT_STYLE)
sns.set_palette("husl")

# Configure display: passing None lifts pandas' limits on the number of
# columns shown, the overall output width, and the per-cell text width.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

print("Setup complete!")


In [None]:
# Dataset paths
DATA_DIR = Path('../data')


def load_json(path):
    """Read one JSON file and return the decoded Python object.

    The file is opened explicitly as UTF-8 so the result does not depend on
    the platform's default locale encoding, and the handle is closed as soon
    as parsing finishes.

    Args:
        path: Location of the JSON file (anything accepted by ``open``).

    Returns:
        Whatever ``json.load`` produces for the file's contents.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Load dataset information
# NOTE(review): the *_dataset_infor.json files are presumably person -> list of
# images and the *_twin_pairs.json files collections of pairs -- confirm
# against the data; here only len() is taken of each.
train_info = load_json(DATA_DIR / 'train_dataset_infor.json')
test_info = load_json(DATA_DIR / 'test_dataset_infor.json')
train_pairs = load_json(DATA_DIR / 'train_twin_pairs.json')
test_pairs = load_json(DATA_DIR / 'test_twin_pairs.json')

print(f"Train dataset: {len(train_info)} people")
print(f"Test dataset: {len(test_info)} people")
print(f"Train twin pairs: {len(train_pairs)} pairs")
print(f"Test twin pairs: {len(test_pairs)} pairs")


In [None]:
def analyze_dataset_stats(dataset_info, dataset_name):
    """Summarise how many images each person has in one dataset split.

    Each entry of ``dataset_info`` is counted as one person, and the length of
    its value as that person's number of images. The summary is printed and
    also returned.

    Args:
        dataset_info: Mapping whose values are sized collections (``len`` is
            taken of every value).
        dataset_name: Label used in the header of the printed report.

    Returns:
        A ``(stats, images_per_person)`` tuple. ``stats`` is a dict with the
        keys ``total_people``, ``total_images``, ``min_images_per_person``,
        ``max_images_per_person``, ``avg_images_per_person``,
        ``median_images_per_person`` and ``std_images_per_person``.
        ``images_per_person`` is the list of per-person counts in the
        mapping's iteration order.

        For an empty mapping every statistic is reported as zero instead of
        raising.
    """

    # Count images per person
    images_per_person = [len(images) for images in dataset_info.values()]

    if images_per_person:
        distribution = {
            'min_images_per_person': min(images_per_person),
            'max_images_per_person': max(images_per_person),
            'avg_images_per_person': np.mean(images_per_person),
            'median_images_per_person': np.median(images_per_person),
            'std_images_per_person': np.std(images_per_person)
        }
    else:
        # min()/max() raise ValueError on an empty sequence and the numpy
        # reductions return NaN with a RuntimeWarning; report zeros so an
        # empty split still yields a printable, well-formed summary.
        distribution = {
            'min_images_per_person': 0,
            'max_images_per_person': 0,
            'avg_images_per_person': 0.0,
            'median_images_per_person': 0.0,
            'std_images_per_person': 0.0
        }

    # Total statistics first, then the distribution figures (same key order
    # as before so tables built from this dict keep their column order).
    stats = {
        'total_people': len(dataset_info),
        'total_images': sum(images_per_person),
        **distribution
    }

    print(f"\n{dataset_name} Dataset Statistics:")
    print(f"  Total people: {stats['total_people']}")
    print(f"  Total images: {stats['total_images']}")
    print(f"  Images per person: {stats['min_images_per_person']} - {stats['max_images_per_person']}")
    print(f"  Average images per person: {stats['avg_images_per_person']:.2f}")
    print(f"  Median images per person: {stats['median_images_per_person']:.2f}")
    print(f"  Std images per person: {stats['std_images_per_person']:.2f}")

    return stats, images_per_person

# Compute (and print) the per-person image statistics for each split
train_stats, train_images_per_person = analyze_dataset_stats(
    dataset_info=train_info,
    dataset_name="Train",
)
test_stats, test_images_per_person = analyze_dataset_stats(
    dataset_info=test_info,
    dataset_name="Test",
)


In [None]:
# Create comprehensive visualization of dataset statistics:
# a 2x2 figure with one histogram per split (top row), a train-vs-test box
# plot (bottom left) and a summary table (bottom right).
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Top row: images-per-person histogram for each split
hist_panels = [
    (axes[0, 0], train_images_per_person, 'Train', 'skyblue'),
    (axes[0, 1], test_images_per_person, 'Test', 'lightcoral'),
]
for ax, counts, split_name, color in hist_panels:
    ax.hist(counts, bins=20, alpha=0.7, color=color, edgecolor='black')
    ax.set_title(f'{split_name} Dataset: Images per Person Distribution')
    ax.set_xlabel('Number of Images')
    ax.set_ylabel('Number of People')
    ax.grid(True, alpha=0.3)

# Combined box plot
# boxplot's `labels` keyword is deprecated since Matplotlib 3.9 (renamed to
# `tick_labels`, which older releases do not accept). Labelling the ticks
# afterwards works on every version.
data_to_plot = [train_images_per_person, test_images_per_person]
axes[1, 0].boxplot(data_to_plot)
axes[1, 0].set_xticklabels(['Train', 'Test'])
axes[1, 0].set_title('Images per Person: Train vs Test')
axes[1, 0].set_ylabel('Number of Images')
axes[1, 0].grid(True, alpha=0.3)

# Summary statistics table: one column per key, one row per split
summary_data = {
    'Dataset': ['Train', 'Test'],
    'People': [train_stats['total_people'], test_stats['total_people']],
    'Images': [train_stats['total_images'], test_stats['total_images']],
    'Min Images/Person': [train_stats['min_images_per_person'], test_stats['min_images_per_person']],
    'Max Images/Person': [train_stats['max_images_per_person'], test_stats['max_images_per_person']],
    'Avg Images/Person': [f"{train_stats['avg_images_per_person']:.2f}", f"{test_stats['avg_images_per_person']:.2f}"]
}

# Create summary table
# Transpose the column lists into rows with zip() so the number of rows
# follows the data instead of a hard-coded range(2).
table_rows = [list(row) for row in zip(*summary_data.values())]
axes[1, 1].axis('tight')
axes[1, 1].axis('off')
table = axes[1, 1].table(cellText=table_rows,
                        colLabels=list(summary_data.keys()),
                        cellLoc='center',
                        loc='center')
# Fix the font size ourselves; auto-sizing is switched off first so the
# explicit size is honoured.
table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1.2, 1.5)
axes[1, 1].set_title('Dataset Summary Statistics')

plt.tight_layout()
plt.show()


In [None]:
# Final text report: a banner, two data-driven sections built from the
# statistics computed earlier, three fixed advice sections, and a closing line.
rule = "=" * 60

print(rule)
print("DATASET ANALYSIS SUMMARY AND RECOMMENDATIONS")
print(rule)

# Sections whose lines are filled in from train_stats / test_stats / pair lists
computed_sections = [
    ("\n1. DATASET STATISTICS:", [
        f"   - Train: {train_stats['total_people']} people, {train_stats['total_images']} images",
        f"   - Test: {test_stats['total_people']} people, {test_stats['total_images']} images",
        f"   - Train twin pairs: {len(train_pairs)}",
        f"   - Test twin pairs: {len(test_pairs)}",
    ]),
    ("\n2. KEY OBSERVATIONS:", [
        f"   - Images per person range: {train_stats['min_images_per_person']}-{train_stats['max_images_per_person']} (train), {test_stats['min_images_per_person']}-{test_stats['max_images_per_person']} (test)",
        f"   - Average images per person: {train_stats['avg_images_per_person']:.1f} (train), {test_stats['avg_images_per_person']:.1f} (test)",
        f"   - Standard deviation: {train_stats['std_images_per_person']:.1f} (train), {test_stats['std_images_per_person']:.1f} (test)",
    ]),
]

# Sections with fixed, hand-written guidance (not derived from the data)
fixed_sections = [
    ("\n3. RECOMMENDATIONS:", [
        "   - Image size: Use 224x224 for initial training, 448x448 for final models",
        "   - Batch size: 16 for local training, 8 for Kaggle",
        "   - Data augmentation: Face-preserving augmentations (rotation, flip, color jitter)",
        "   - Sampling strategy: Balanced positive/negative pairs with hard negative mining",
        "   - Validation split: 20% of training data for validation",
    ]),
    ("\n4. TRAINING STRATEGY:", [
        "   - Start with lower resolution (224x224) for faster experimentation",
        "   - Use multiple similarity functions (cosine, euclidean, learned)",
        "   - Implement attention visualization for interpretability",
        "   - Monitor both same-twin and different-twin accuracies",
        "   - Use transfer learning from pre-trained Vision Transformers",
    ]),
    ("\n5. POTENTIAL CHALLENGES:", [
        "   - High similarity between twins requires fine-grained discrimination",
        "   - Variable image quality and lighting conditions",
        "   - Limited training data compared to standard face recognition datasets",
        "   - Need for robust evaluation metrics (EER, ROC-AUC)",
    ]),
]

for heading, bullets in computed_sections + fixed_sections:
    print(heading)
    for bullet in bullets:
        print(bullet)

print("\n" + rule)
print("Dataset exploration complete! Ready for model training.")
