In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
from PIL import Image
from pathlib import Path
# Dark-background style to match a dark-themed IDE; with the default style the
# figures would look colour-inverted there.
plt.style.use('dark_background')

In [None]:
# Directory holding the processed split manifests, relative to the working directory.
PROCESSED_DATA_DIR = Path("../data/processed")
# Manifest (CSV) of the training split.
TRAIN_MANIFEST_PATH = PROCESSED_DATA_DIR.joinpath("train.csv")

In [None]:
# Read the three split manifests into dataframes. The training manifest is read
# through TRAIN_MANIFEST_PATH, which points at the same train.csv file.
train_df, test_df, val_df = (
    pd.read_csv(TRAIN_MANIFEST_PATH),
    pd.read_csv(PROCESSED_DATA_DIR / "test.csv"),
    pd.read_csv(PROCESSED_DATA_DIR / "val.csv"),
)

In [None]:
# Preview the first five rows of the training manifest.
train_df.head(n=5)

In [None]:
# Preview the first five rows of the test manifest.
test_df.head(n=5)

In [None]:
# Preview the first five rows of the validation manifest.
val_df.head(n=5)

In [None]:
# Training split summary: distinct labels (a missing label counts as one value,
# exactly as len(unique()) does) and total number of rows.
print(f"Train num classes: {train_df['label'].nunique(dropna=False)}")
print(f"Train num samples: {train_df.shape[0]}")

In [None]:
# Horizontal bar chart of samples per label in the training split, ordered by
# value_counts() (most frequent label first).
fig, ax = plt.subplots(figsize=(12, 10))
sns.countplot(
    data=train_df,
    y='label',
    order=train_df['label'].value_counts().index,
    ax=ax,
)
ax.set_title('Training Set Class Distribution')
ax.set_xlabel('Number of Samples')
ax.set_ylabel('Breed')
fig.tight_layout()
plt.show()

In [None]:
# Test split summary: distinct labels (a missing label counts as one value,
# exactly as len(unique()) does) and total number of rows.
print(f"Test num classes: {test_df['label'].nunique(dropna=False)}")
print(f"Test num samples: {test_df.shape[0]}")

In [None]:
# Horizontal bar chart of samples per label in the test split, ordered by
# value_counts() (most frequent label first).
fig, ax = plt.subplots(figsize=(12, 10))
sns.countplot(
    data=test_df,
    y='label',
    order=test_df['label'].value_counts().index,
    ax=ax,
)
ax.set_title('Test Set Class Distribution')
ax.set_xlabel('Number of Samples')
ax.set_ylabel('Breed')
fig.tight_layout()
plt.show()

In [None]:
# Validation split summary: distinct labels (a missing label counts as one
# value, exactly as len(unique()) does) and total number of rows.
print(f"Val num classes: {val_df['label'].nunique(dropna=False)}")
print(f"Val num samples: {val_df.shape[0]}")

In [None]:
# Horizontal bar chart of samples per label in the validation split, ordered by
# value_counts() (most frequent label first).
fig, ax = plt.subplots(figsize=(12, 10))
sns.countplot(
    data=val_df,
    y='label',
    order=val_df['label'].value_counts().index,
    ax=ax,
)
ax.set_title('Validation Set Class Distribution')
ax.set_xlabel('Number of Samples')
ax.set_ylabel('Breed')
fig.tight_layout()
plt.show()

The classes are well-balanced in all three splits, so no special handling (e.g., resampling or class weighting) is needed.

In [None]:
def plot_sample_images_of_class(df: pd.DataFrame, class_name: str, n_samples: int = 9, random_state=None):
    """
    Plot random sample images of one class.

    Used to inspect intra-class variance (poses, lighting, color, etc.)

    Parameters
    ----------
    df: pd.DataFrame
        Dataframe containing samples. It must have a 'label' column (compared
        against ``class_name``) and a 'filepath' column (paths opened with PIL).
    class_name: string
        Name of class
    n_samples: int
        Number of samples to show. Capped at the number of rows available for
        the class. Up to 9 images use a 3x3 grid; larger values grow the
        (square) grid so every image gets a cell.
    random_state: int, optional
        Seed forwarded to ``Series.sample`` so the same images can be drawn
        again. ``None`` (default) keeps the selection random on every call.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. Nothing is plotted when there
        are no images to show.
    """
    class_df = df[df['label'] == class_name]

    if len(class_df) < n_samples:
        print(f"Not enough samples for class '{class_name}'. Found {len(class_df)}.")
        n_samples = len(class_df)

    # Nothing to draw (e.g. unknown class name): skip creating an empty figure.
    if n_samples == 0:
        return

    # Get random samples. Done before the figure is created so that a sampling
    # error does not leave a dangling empty figure behind.
    sample_paths = class_df['filepath'].sample(n_samples, random_state=random_state)

    # 3x3 grid by default; enlarge it when more than 9 images are requested,
    # otherwise plt.subplot would reject indices beyond the grid.
    grid_side = 3
    while grid_side * grid_side < n_samples:
        grid_side += 1

    plt.figure(figsize=(10, 10))

    for i, img_path in enumerate(sample_paths):
        plt.subplot(grid_side, grid_side, i + 1)
        # The context manager closes the underlying file; convert() returns a
        # fully loaded copy, so the image stays usable after the file is closed.
        with Image.open(img_path) as img:
            plt.imshow(img.convert("RGB"))
        plt.title(f"{class_name}")
        plt.axis("off")

    plt.suptitle(f"Intra-Class Variation for: '{class_name}'", fontsize=16)
    # Leave the top 4% of the figure free for the suptitle.
    plt.tight_layout(rect=[0, 0, 1, 0.96])
    plt.show()

In [None]:
# Random training images of a single class: beagle.
plot_sample_images_of_class(df=train_df, class_name="beagle")

In [None]:
# Random training images of a single class: havanese.
plot_sample_images_of_class(df=train_df, class_name="havanese")

In [None]:
# Random training images of a single class: Abyssinian.
plot_sample_images_of_class(df=train_df, class_name="Abyssinian")

There are many variations in lighting, pose, color, background objects, positioning, etc., which necessitate a robust augmentation pipeline. For example, color jitter (to handle the high color variation), rotation (to handle the varied poses), and random resized cropping (to handle subjects that are not fully in the image).

In [None]:
def plot_inter_class_similarity(df: pd.DataFrame, class_names: list[str], n_samples_per_class: int = 4, random_state=None):
    """
    Plot inter-class similarity. Used to inspect inter-class variance.

    Each class gets one column of the figure, with its randomly sampled
    images stacked in the rows of that column.

    Parameters
    ----------
    df: pd.DataFrame
        Dataframe containing samples. It must have a 'label' column (compared
        against each class name) and a 'filepath' column (paths opened with PIL).
    class_names: list
        Names of classes
    n_samples_per_class: int
        Number of images to show per class (default 4). Classes with fewer
        rows show the images they have and leave the remaining cells blank.
    random_state: int, optional
        Seed forwarded to ``Series.sample`` so the same images can be drawn
        again. ``None`` (default) keeps the selection random on every call.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. Nothing is plotted when no
        classes or no samples are requested.
    """
    n_classes = len(class_names)

    # A figure with zero columns or rows cannot be laid out; bail out early.
    if n_classes == 0 or n_samples_per_class <= 0:
        print("Nothing to plot: no classes or no samples requested.")
        return

    plt.figure(figsize=(n_classes * 4, n_samples_per_class * 4))

    for i, class_name in enumerate(class_names):
        class_df = df[df['label'] == class_name]

        # Sampling more rows than exist raises, so cap at what is available.
        n_available = min(n_samples_per_class, len(class_df))
        if n_available < n_samples_per_class:
            print(f"Not enough samples for class '{class_name}'. Found {len(class_df)}.")
        sample_paths = class_df['filepath'].sample(n_available, random_state=random_state)

        for j, img_path in enumerate(sample_paths):
            # Row j, column i of the grid (subplot indices are 1-based, row-major).
            ax_idx = j * n_classes + i + 1
            plt.subplot(n_samples_per_class, n_classes, ax_idx)
            # Convert to RGB, as plot_sample_images_of_class does, so grayscale
            # or palette images are not drawn through a colormap. The context
            # manager closes the file; convert() returns a loaded copy.
            with Image.open(img_path) as img:
                plt.imshow(img.convert("RGB"))
            plt.title(class_name)
            plt.axis("off")

    plt.suptitle("Inter-Class Similarity", fontsize=16)
    # Leave the top 4% of the figure free for the suptitle.
    plt.tight_layout(rect=[0, 0, 1, 0.96])
    plt.show()

In [None]:
# Breeds to compare side by side (one column per breed).
similar_breeds = [
    'american_bulldog',
    'american_pit_bull_terrier',
    'staffordshire_bull_terrier',
]
plot_inter_class_similarity(df=train_df, class_names=similar_breeds)

In [None]:
# Breeds to compare side by side (one column per breed).
similar_breeds = [
    'samoyed',
    'keeshond',
    'pomeranian',
]
plot_inter_class_similarity(df=train_df, class_names=similar_breeds)

In [None]:
# Breeds to compare side by side (one column per breed).
similar_breeds = [
    'basset_hound',
    'beagle',
]
plot_inter_class_similarity(df=train_df, class_names=similar_breeds)

The classes look very similar, indicating low inter-class variance. High intra-class variance combined with low inter-class variance makes this a fine-grained classification problem, which is why using a DeiT-Ti backbone fine-tuned with MoE-LoRA is justified.