In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found anywhere below /kaggle/input,
# in the same order os.walk visits the directories.
kaggle_input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
# Print one path per line.
for input_file in kaggle_input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Real-Time Object Detection for Autonomous Vehicles  
## Milestone 1: Data Collection, Exploration & Preprocessing

**Dataset:** KITTI (YOLO formatted)  

This notebook documents each step (tasks that are not needed are explicitly marked).


### Environment setup & copy dataset

In [3]:
# Cell: Setup - set DATASET_NAME below to your Kaggle dataset's folder name (visible in the right Data pane)
import os
import shutil
from pathlib import Path

# Directory under which the dataset folder is looked up.
KAGGLE_INPUT_DIR = "/kaggle/input"

# Folder name of the dataset inside KAGGLE_INPUT_DIR; change this to match your dataset.
DATASET_NAME = "kitti-dataset"

input_dataset_path = os.path.join(KAGGLE_INPUT_DIR, DATASET_NAME)
working_root = "/kaggle/working/kitti"

print("Input dataset path:", input_dataset_path)
print("Working copy path:", working_root)

# Copy to working directory (faster read/write).
# NOTE: only the existence of the working folder is checked, so a copy that was
# interrupted on an earlier run is treated as complete. Delete the folder and
# re-run this cell if the counts verified later look wrong.
if os.path.exists(working_root):
    print("Working folder already exists — skipping copy.")
else:
    # Fail early with an actionable message instead of an error from deep inside copytree.
    if not os.path.isdir(input_dataset_path):
        raise FileNotFoundError(
            f"Dataset folder not found: {input_dataset_path!r}. "
            "Check that DATASET_NAME matches the dataset attached to this notebook."
        )
    print("Copying dataset to working directory (this may take a while)...")
    shutil.copytree(input_dataset_path, working_root)
    print("Copy complete.")

# list top-level files
print("Top-level files in working dir:")
print(os.listdir(working_root))


Input dataset path: /kaggle/input/kitti-dataset
Working copy path: /kaggle/working/kitti
Copying dataset to working directory (this may take a while)...
Copy complete.
Top-level files in working dir:
['images', 'kitti.yaml', 'labels']


### Data Collection note
- **SKIPPED:** Download step — the dataset is already uploaded to Kaggle and formatted for YOLO (it contains `images/` and `labels/` folders, each split into `train` and `val`, plus the `kitti.yaml` config).
- We will still **verify** dataset structure and counts, then proceed to exploration and preprocessing.


### Confirm dataset structure & counts

In [4]:
from glob import glob
from pprint import pprint

# Index the working copy: the dataset YAML plus the sorted image and label
# files of each split.
root = Path(working_root)
yaml_file = root / "kitti.yaml"

images_train = sorted((root / "images" / "train").glob("*.*"))
images_val = sorted((root / "images" / "val").glob("*.*"))
labels_train = sorted((root / "labels" / "train").glob("*.txt"))
labels_val = sorted((root / "labels" / "val").glob("*.txt"))

# Report whether the YAML is present, then the file count of every split.
print("Found yaml:", yaml_file.exists(), yaml_file)
for split_label, split_files in (
    ("Train images:", images_train),
    ("Val images:", images_val),
    ("Train labels:", labels_train),
    ("Val labels:", labels_val),
):
    print(split_label, len(split_files))

# Show the first five file names of the training images and labels.
sample_names = dict(
    images_train_sample=[p.name for p in images_train[:5]],
    labels_train_sample=[p.name for p in labels_train[:5]],
)
pprint(sample_names)


Found yaml: True /kaggle/working/kitti/kitti.yaml
Train images: 5985
Val images: 1496
Train labels: 5985
Val labels: 1496
{'images_train_sample': ['000002.png',
                         '000003.png',
                         '000004.png',
                         '000005.png',
                         '000006.png'],
 'labels_train_sample': ['000002.txt',
                         '000003.txt',
                         '000004.txt',
                         '000005.txt',
                         '000006.txt']}
