Dataset

In [33]:
import kagglehub
root_path = Path(kagglehub.dataset_download("klemenko/kitti-dataset")).resolve()
print("Dataset downloaded to:", root_path)
from pathlib import Path

Dataset downloaded to: C:\Users\nico3\.cache\kagglehub\datasets\klemenko\kitti-dataset\versions\1


In [34]:
from pathlib import Path
import kagglehub


# Download dataset and point to train folders
#root_path = Path(kagglehub.dataset_download("klemenko/kitti-dataset")).resolve()
image_train_folder = root_path / "data_object_image_2" / "training" / "image_2"
label_train_folder = root_path / "data_object_label_2" / "training" / "label_2"

# List image and label files once
image_files = sorted(image_train_folder.glob("*.png"))
label_files = sorted(label_train_folder.glob("*.txt"))

# Keep only these classes
class_keep = {"Pedestrian": 0, "Cyclist": 1, "Car": 2}

def parse_label_file(path):
    objects = []
    for line in open(path):
        parts = line.split()
        if not parts:
            continue  # If the line is empty we skip

        # If the line dosn't contain Pedestrian/Cyclist/Car then skip
        cls = parts[0]
        if cls not in class_keep:
            continue

        # if too truncated(how much of the object is cut off at the image border) / occluded (how much the object is blocked by others)
        trunc, occ = float(parts[1]), int(parts[2])
        if trunc > 0.7 or occ > 2:
            continue
              
        x1, y1, x2, y2 = map(float, parts[4:8])
        objects.append({
            "class_name": cls,
            "class_id": class_keep[cls],
            "bbox": [x1, y1, x2, y2],
        })
    return objects

# Parse all label files
label_data = [parse_label_file(p) for p in label_files]

# Build list of object samples
samples = [
    {
        "image_path": img_path,
        "class_id": obj["class_id"],
        "bbox": obj["bbox"],
    }
    for img_path, objects in zip(image_files, label_data)
    for obj in objects
]

print("Total number of objects:", len(samples))
print("Example sample:", samples[0])


Total number of objects: 32037
Example sample: {'image_path': WindowsPath('C:/Users/nico3/.cache/kagglehub/datasets/klemenko/kitti-dataset/versions/1/data_object_image_2/training/image_2/000000.png'), 'class_id': 0, 'bbox': [712.4, 143.0, 810.73, 307.92]}


In [35]:
from torch.utils.data import Dataset
from PIL import Image

class KittiCropClassificationDataset(Dataset):
    def __init__(self, samples, transform=None):
        self.samples = samples
        self.transform = transform

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, index):
        sample = self.samples[index]
        image = Image.open(sample["image_path"]).convert("RGB")
        x1, y1, x2, y2 = sample["bbox"]
        image = image.crop((x1, y1, x2, y2))
        if self.transform is not None:
            image = self.transform(image)
        label = sample["class_id"]
        return image, label


In [46]:
import random
import torchvision as tv

# KITTI full resolution
# the original image is 1242x375, but we are only using crops of the bounding boxes so 244x244 is sufficient
full_width  = 244 #1242
full_height = 244 #375

batch_size = 16
epochs = 5
learning_rate = 1e-4
weight_decay = 1e-4
max_samples = 1500 # Set to a lower number for quick testing, e.g., 5000

random.seed(42)
random.shuffle(samples)

if len(samples) > max_samples:
    max_samples = samples[:max_samples]

print("Using total samples:", len(samples))
# -----------------------------

# 80/20 train/val split
split_index = int(0.8 * len(max_samples))
train_samples = max_samples[:split_index]
val_samples   = max_samples[split_index:]

print("Train samples:", len(train_samples))
print("Val samples:  ", len(val_samples))

# -----------------------------
# Transforms
# -----------------------------
train_transform = tv.transforms.Compose([
    tv.transforms.Resize((full_height, full_width)),    
    tv.transforms.RandomHorizontalFlip(),
    tv.transforms.ColorJitter(0.2, 0.2, 0.2, 0.1),
    tv.transforms.ToTensor(),
    tv.transforms.Normalize([0.485, 0.456, 0.406],
                            [0.229, 0.224, 0.225]),
])

val_transform = tv.transforms.Compose([
    tv.transforms.Resize((full_height, full_width)),
    tv.transforms.ToTensor(),
    tv.transforms.Normalize([0.485, 0.456, 0.406],
                            [0.229, 0.224, 0.225]),
])

train_dataset = KittiCropClassificationDataset(train_samples, transform=train_transform)
val_dataset   = KittiCropClassificationDataset(val_samples,   transform=val_transform)

from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_dataset,   batch_size=batch_size, shuffle=False)

len(train_dataset), len(val_dataset)


Using total samples: 32037
Train samples: 1200
Val samples:   300


(1200, 300)

In [47]:
import torch
from torch import nn
import torchvision.models as models

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
print("CUDA device name: ", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None")

num_classes = len(class_keep)


#weights = models.ResNet50_Weights.IMAGENET1K_V2
#model = models.resnet50(weights=weights)
#model.fc = nn.Linear(model.fc.in_features, num_classes)

weights = models.VGG16_Weights.IMAGENET1K_V1
model = models.vgg16(weights=weights)
in_features = model.classifier[6].in_features
model.classifier[6] = nn.Linear(in_features, num_classes)
model = model.to(device)


Using device: cuda
CUDA device name:  NVIDIA GeForce RTX 3050 6GB Laptop GPU


In [48]:
from tqdm.notebook import tqdm

In [49]:
from sklearn.metrics import accuracy_score, f1_score

def train_one_epoch(model, data_loader, optimizer, loss_function, epoch):
    model.train()
    total_loss = 0.0
    total_samples = 0

    for images, labels in tqdm(data_loader, desc=f"Training Epoch {epoch}", leave=False):
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = loss_function(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * images.size(0)
        total_samples += images.size(0)

    average_loss = total_loss / max(1, total_samples)
    return average_loss



@torch.no_grad()
def evaluate_classifier(model, data_loader):
    model.eval()
    all_predictions = []
    all_labels = []

    for images, labels in tqdm(data_loader, desc="Validating", leave=False):
        images = images.to(device)
        labels = labels.to(device)

        outputs = model(images)
        predicted_labels = outputs.argmax(dim=1)

        all_predictions.extend(predicted_labels.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())

    accuracy = accuracy_score(all_labels, all_predictions)
    macro_f1 = f1_score(all_labels, all_predictions, average="macro")
    return accuracy, macro_f1



In [50]:
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=learning_rate,
                              weight_decay=weight_decay)
loss_function = nn.CrossEntropyLoss()

best_validation_f1 = 0.0
best_validation_accuracy = 0.0

for epoch in range(1, epochs + 1):
    training_loss = train_one_epoch(model, train_loader, optimizer, loss_function, epoch)
    validation_accuracy, validation_macro_f1 = evaluate_classifier(model, val_loader)

    print(f"Epoch {epoch:02d} | "
          f"train_loss={training_loss:.4f} | "
          f"val_acc={validation_accuracy:.3f} | "
          f"val_f1={validation_macro_f1:.3f}")

    if validation_macro_f1 > best_validation_f1:
        best_validation_f1 = validation_macro_f1
        best_validation_accuracy = validation_accuracy

print("\nBest validation accuracy:", best_validation_accuracy)
print("Best validation macro F1:", best_validation_f1)


Training Epoch 1:   0%|          | 0/75 [00:00<?, ?it/s]

Validating:   0%|          | 0/19 [00:00<?, ?it/s]

Epoch 01 | train_loss=0.2059 | val_acc=0.963 | val_f1=0.636


Training Epoch 2:   0%|          | 0/75 [00:00<?, ?it/s]

Validating:   0%|          | 0/19 [00:00<?, ?it/s]

Epoch 02 | train_loss=0.1480 | val_acc=0.960 | val_f1=0.763


Training Epoch 3:   0%|          | 0/75 [00:00<?, ?it/s]

KeyboardInterrupt: 