# Deepchecks
This notebook explores how Deepchecks can be used to validate our training and test datasets.

In [1]:
import os
import polars as pl

# Collect the parquet shards of each split from the raw data directory.
# A shard goes to the training split when its file name contains "train" and
# to the test split when it contains "validation".
train_dfs = []
test_dfs = []

for base_dir, sub_dirs, files in os.walk("../data/raw/food101/data"):
    # os.walk yields entries in arbitrary, filesystem-dependent order. Sorting
    # the directory list in place and the file names fixes the order in which
    # shards are concatenated, so the seeded shuffle below is reproducible.
    sub_dirs.sort()
    for filename in sorted(files):
        if not filename.endswith(".parquet"):
            continue
        shard_path = os.path.join(base_dir, filename)
        if "train" in filename:
            train_dfs.append(pl.read_parquet(shard_path))
        elif "validation" in filename:
            test_dfs.append(pl.read_parquet(shard_path))

# Fail early with an actionable message instead of letting pl.concat choke on
# an empty list when the data directory is missing or incomplete.
if not train_dfs or not test_dfs:
    raise FileNotFoundError(
        "Expected train and validation parquet shards under "
        "'../data/raw/food101/data' "
        f"(found {len(train_dfs)} train, {len(test_dfs)} validation)."
    )

train_data = pl.concat(train_dfs)
train_data = train_data.sample(fraction=1.0, shuffle=True, seed=42) # Shuffle the dataset

test_data = pl.concat(test_dfs)
test_data = test_data.sample(fraction=1.0, shuffle=True, seed=42) # Shuffle the dataset

# Preview the first rows of the shuffled training split.
display(train_data.head(3))

image,label
struct[2],i64
"{b""\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00H\x00H\x00\x00\xff\xe1\x03\x12Exif\x00\x00MM\x00*\x00\x00\x00\x08\x00\x0a\x01\x0f\x00\x02\x00\x00\x00\x06\x00\x00\x00\x86\x01\x10\x00\x02\x00\x00\x00\x09""…,""2632284.jpg""}",44
"{b""\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00\x90\x00\x90\x00\x00\xff\xe1\x02\xc2Exif\x00\x00MM\x00*\x00\x00\x00\x08\x00\x06\x01\x12\x00\x03\x00\x00\x00\x01\x00\x01\x00\x00\x01\x1a\x00\x05\x00\x00\x00\x01""…,""3560567.jpg""}",48
"{b""\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00\xb4\x00\xb4\x00\x00\xff\xed\x1f\xaaPhotoshop\x203.0\x008BIM\x04\x04\x00\x00\x00\x00\x00\x11\x1c\x02\x00\x00\x02\x00\x02\x1c\x02P""…,""1702116.jpg""}",30


In [2]:
from deepchecks.vision.vision_data import VisionData
from src.labels import LABELS
from src.data.food_dataset import FoodDataset, dc_collate
from torch.utils.data import DataLoader

# Index -> class-name mapping handed to Deepchecks; built a single time.
label_map = dict(enumerate(LABELS))
print("Num classes:", len(LABELS))

# Both splits are wrapped the same way: same class names, same column names.
_dataset_kwargs = dict(class_names=LABELS, image_col="image", label_col="label")
train_dataset = FoodDataset(train_data, **_dataset_kwargs)
test_dataset = FoodDataset(test_data, **_dataset_kwargs)

# Loader settings common to both splits; only the training loader shuffles.
# dc_collate (from src.data.food_dataset) is expected to shape each batch the
# way Deepchecks consumes it -- see that module for the exact format.
_loader_kwargs = dict(batch_size=32, num_workers=0, collate_fn=dc_collate)
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **_loader_kwargs)

# Hand the loaders to Deepchecks as classification datasets.
_vision_kwargs = dict(task_type="classification", label_map=label_map)
train_ds = VisionData(batch_loader=train_loader, **_vision_kwargs)
test_ds = VisionData(batch_loader=test_loader, **_vision_kwargs)

Num classes: 101


In [3]:
from deepchecks.vision.suites import train_test_validation

# Build the Deepchecks train/test validation suite and run it on the two
# wrapped splits; the outcome is kept in `result` for display and export.
suite = train_test_validation()
result = suite.run(train_dataset=train_ds, test_dataset=test_ds)


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`



In [4]:
# Show the suite result as this cell's output (rendered as an interactive
# widget in the notebook).
result

Accordion(children=(VBox(children=(HTML(value='\n<h1 id="summary_XAJI0Y6DPBHSAHXTHV3A3ZMF8">Train Test Validat…

In [10]:
from datetime import datetime, timezone

# Export the suite result as a standalone HTML report. The file name carries a
# UTC timestamp (YYYYMMDDHHMMSS) so successive runs do not overwrite each other.
report_dir = "../reports/figures"
# Create the target directory if needed so the export also works on a fresh
# checkout where the reports folder has not been created yet.
os.makedirs(report_dir, exist_ok=True)
timestamp = datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S")
# Kept as the last expression so the cell still displays the returned value.
result.save_as_html(f"{report_dir}/deepchecks{timestamp}.html")

'../reports/figures/deepchecks20251009102658.html'