## 1. Install Dependencies

In [None]:
import subprocess
import sys

# Packages required by this project's notebooks.
packages = [
    'torch',
    'torchvision',
    'pytorch-lightning',
    'anomalib',
    'opencv-python',
    'numpy',
    'pillow',
    'matplotlib',
    'scikit-learn',
]

# Install everything in ONE pip invocation so the dependency resolver sees all
# requirements together. Installing one package per call lets a later install
# silently replace versions picked by an earlier one (e.g. a different torch
# pulled in as a dependency), leaving an inconsistent environment.
print(f"Installing {len(packages)} packages: {', '.join(packages)}")
# check_call raises subprocess.CalledProcessError if pip exits non-zero.
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', *packages])

print("\n✓ All packages installed successfully!")

: 

## 2. Import Libraries & Set Paths

In [None]:
import os
import cv2
import numpy as np
from pathlib import Path
import torch
import torchvision
from PIL import Image
import matplotlib.pyplot as plt

# Dataset folders: <root>/train/good, <root>/test/good, <root>/test/defect
DATASET_ROOT = Path("dataset")
TRAIN_GOOD = DATASET_ROOT.joinpath("train", "good")
TEST_GOOD = DATASET_ROOT.joinpath("test", "good")
TEST_DEFECT = DATASET_ROOT.joinpath("test", "defect")

# Report where the dataset is expected and which PyTorch/CUDA setup is active.
print(
    f"Dataset root: {DATASET_ROOT.absolute()}",
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    sep="\n",
)
if torch.cuda.is_available():
    # Name of the first CUDA device (index 0).
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## 3. Validate Dataset Structure

In [None]:
def validate_dataset(folders=None):
    """
    Validate dataset structure and count images in each folder.

    Prints one status line per folder and returns the counts.

    Args:
        folders: Optional iterable of ``(name, path)`` pairs to check, where
            ``path`` is a ``pathlib.Path``. When omitted, the three standard
            folders are used: train/good, test/good and test/defect.

    Returns:
        dict mapping each folder name to the number of image files found
        (0 when the folder is missing or is not a directory).
    """
    if folders is None:
        # Resolved at call time so the module-level paths can be changed
        # before validating.
        folders = [("train/good", TRAIN_GOOD),
                   ("test/good", TEST_GOOD),
                   ("test/defect", TEST_DEFECT)]

    image_suffixes = {".png", ".jpg", ".jpeg"}
    results = {}

    for name, path in folders:
        # is_dir() (rather than exists()) also rejects a regular file that
        # happens to sit where the folder should be.
        if not path.is_dir():
            print(f"✗ Missing: {name}")
            results[name] = 0
            continue

        # Compare the lower-cased suffix so files such as "IMG_01.JPG" are
        # counted too; glob("*.jpg") is case-sensitive on POSIX systems.
        image_count = sum(
            1
            for entry in path.iterdir()
            if entry.is_file() and entry.suffix.lower() in image_suffixes
        )
        results[name] = image_count
        print(f"✓ {name}: {image_count} images")

    return results

# Count the images per folder; `counts` is reused by the final summary cell.
counts = validate_dataset()

# Assemble the report first, then emit it with a single print call.
print("\n".join([
    f"\n{'='*40}",
    f"Total training good images: {counts['train/good']}",
    f"Total test good images: {counts['test/good']}",
    f"Total test defect images: {counts['test/defect']}",
    f"Total images: {sum(counts.values())}",
    f"{'='*40}",
]))

## 4. Check Image Sizes & Validity

In [None]:
def check_image_validity():
    """
    Check if all images can be loaded and report sizes.

    Scans the train/good, test/good and test/defect folders, tries to decode
    every .png/.jpg/.jpeg file (extension matched case-insensitively) with
    OpenCV, and prints size statistics for the images that loaded. Every file
    that fails to load is listed by path. Folders that do not exist are
    skipped. Nothing is returned; all results are printed.
    """
    image_suffixes = {".png", ".jpg", ".jpeg"}
    all_sizes = []
    invalid_count = 0

    for folder in [TRAIN_GOOD, TEST_GOOD, TEST_DEFECT]:
        if not folder.is_dir():
            continue

        # Lower-cased suffix comparison also picks up files like "X.PNG";
        # sorting keeps the report order stable between runs.
        images = sorted(
            entry
            for entry in folder.iterdir()
            if entry.is_file() and entry.suffix.lower() in image_suffixes
        )

        for img_path in images:
            try:
                img = cv2.imread(str(img_path))
            except Exception as e:
                # Best-effort scan: report the failure and keep going.
                print(f"Error reading {img_path}: {e}")
                invalid_count += 1
                continue

            # imread signals an undecodable file by returning None rather
            # than raising, so name the file here or it goes unnoticed.
            if img is None:
                print(f"Could not decode {img_path}")
                invalid_count += 1
                continue

            h, w = img.shape[:2]
            all_sizes.append((w, h))

    if not all_sizes:
        print("No valid images found!")
        # Still surface how many files were present but unreadable.
        if invalid_count:
            print(f"  Invalid images: {invalid_count}")
        return

    # Column 0 holds widths, column 1 holds heights.
    sizes_array = np.array(all_sizes)
    print("Image size statistics:")
    print(f"  Valid images: {len(all_sizes)}")
    print(f"  Invalid images: {invalid_count}")
    print(f"  Width range: {sizes_array[:, 0].min()} - {sizes_array[:, 0].max()}")
    print(f"  Height range: {sizes_array[:, 1].min()} - {sizes_array[:, 1].max()}")
    print(f"  Mean size: {sizes_array.mean(axis=0).astype(int)}")

# Run the image check over the dataset folders; results are printed, nothing is returned.
check_image_validity()

## 5. Quick Data Visualization

In [None]:
def visualize_samples():
    """
    Display a few sample images from each folder.

    Draws a 2x3 grid: one column per folder (train/good, test/good,
    test/defect) and up to two images per column. Images are taken in sorted
    filename order so the same samples are shown on every run. A cell with no
    image to show (missing folder, too few images, or a file OpenCV cannot
    decode) gets a short placeholder text instead of an empty plot.
    """
    image_suffixes = {".png", ".jpg", ".jpeg"}
    # Placeholder text per row when the folder has too few images.
    missing_text = ("No image", "No second image")

    fig, axes = plt.subplots(2, 3, figsize=(12, 8))
    fig.suptitle("Dataset Samples", fontsize=14, fontweight="bold")

    folders = [(TRAIN_GOOD, "Train/Good"),
               (TEST_GOOD, "Test/Good"),
               (TEST_DEFECT, "Test/Defect")]

    for col, (folder, label) in enumerate(folders):
        if folder.is_dir():
            # Case-insensitive extension match; sorted for reproducibility.
            images = sorted(
                entry
                for entry in folder.iterdir()
                if entry.is_file() and entry.suffix.lower() in image_suffixes
            )
        else:
            images = []

        for row in range(2):
            ax = axes[row, col]
            # Hide ticks/frames everywhere, including cells without an image.
            ax.axis("off")

            if row >= len(images):
                ax.text(0.5, 0.5, missing_text[row], ha="center", va="center")
                continue

            img = cv2.imread(str(images[row]))
            # imread returns None for undecodable files; passing that to
            # cvtColor would raise, so show a placeholder instead.
            if img is None:
                ax.text(0.5, 0.5, f"Unreadable: {images[row].name}",
                        ha="center", va="center")
                continue

            # OpenCV loads images as BGR; matplotlib expects RGB.
            ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
            ax.set_title(f"{label} ({row + 1}/2)")

    plt.tight_layout()
    plt.show()

# Render the sample grid; the figure is displayed via plt.show() inside the function.
visualize_samples()

## 6. Summary & Next Steps

In [None]:
# Final status report. `counts` comes from validate_dataset() earlier in the
# notebook. Success is only claimed when every dataset folder actually
# contains images, so a missing or empty folder is not reported as complete.
empty_folders = [name for name, n_images in counts.items() if n_images == 0]
setup_ok = not empty_folders

print("\n" + "="*50)
print("✓ SETUP COMPLETE" if setup_ok else "✗ SETUP INCOMPLETE")
print("="*50)
print("\n✓ Dependencies installed")
if setup_ok:
    print("✓ Dataset validated")
    print("✓ Images accessible")
else:
    print(f"✗ No images found in: {', '.join(empty_folders)}")
print("\n📊 Dataset Summary:")
print(f"   Training (good): {counts['train/good']} images")
print(f"   Testing (good): {counts['test/good']} images")
print(f"   Testing (defect): {counts['test/defect']} images")
if setup_ok:
    print("\n🚀 Next: Run 02_anomalib_train_patchcore.ipynb")
else:
    print("\nAdd images to the folders listed above, then re-run this notebook.")
print("="*50)