In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))
root = "/kaggle/input"
for item in os.listdir(root):
    print(item)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

solesensei_bdd100k


In [2]:
base_path = "/kaggle/input/solesensei_bdd100k"
print(os.listdir(base_path))

['bdd100k_labels_release', 'bdd100k_seg', 'bdd100k']


In [3]:
for folder in os.listdir(base_path):
    print(folder, "->", os.listdir(os.path.join(base_path, folder))[:5])

bdd100k_labels_release -> ['bdd100k']
bdd100k_seg -> ['bdd100k']
bdd100k -> ['bdd100k']


# **1- Check for Missing or Corrupted Images**

In [4]:
import cv2

image_dir = "/kaggle/input/solesensei_bdd100k/bdd100k/bdd100k/images/100k/train"

missing_or_broken = []

for img_file in os.listdir(image_dir):
    path = os.path.join(image_dir, img_file)
    img = cv2.imread(path)
    if img is None:
        missing_or_broken.append(img_file)

print("Total broken/missing images:", len(missing_or_broken))
print("Examples:", missing_or_broken[:10])

Total broken/missing images: 4
Examples: ['testB', 'testA', 'trainB', 'trainA']


# **2- Check Labels for Missing Fields**

In [5]:
import json

label_path = "/kaggle/input/solesensei_bdd100k/bdd100k_labels_release/bdd100k/labels/bdd100k_labels_images_train.json"

with open(label_path, "r") as f:
    labels = json.load(f)

print("Total label entries:", len(labels))
print("First entry example:\n", labels[0])


Total label entries: 69863
First entry example:
 {'name': '0000f77c-6257be58.jpg', 'attributes': {'weather': 'clear', 'scene': 'city street', 'timeofday': 'daytime'}, 'timestamp': 10000, 'labels': [{'category': 'traffic light', 'attributes': {'occluded': False, 'truncated': False, 'trafficLightColor': 'green'}, 'manualShape': True, 'manualAttributes': True, 'box2d': {'x1': 1125.902264, 'y1': 133.184488, 'x2': 1156.978645, 'y2': 210.875445}, 'id': 0}, {'category': 'traffic light', 'attributes': {'occluded': False, 'truncated': False, 'trafficLightColor': 'green'}, 'manualShape': True, 'manualAttributes': True, 'box2d': {'x1': 1156.978645, 'y1': 136.637417, 'x2': 1191.50796, 'y2': 210.875443}, 'id': 1}, {'category': 'traffic sign', 'attributes': {'occluded': False, 'truncated': False, 'trafficLightColor': 'none'}, 'manualShape': True, 'manualAttributes': True, 'box2d': {'x1': 1101.731743, 'y1': 211.122087, 'x2': 1170.79037, 'y2': 233.566141}, 'id': 2}, {'category': 'traffic sign', 'attri

In [7]:
clean_labels = []
invalid_boxes = []

for lbl in labels:
    img_name = lbl["name"]
    valid_objs = []

    for obj in lbl.get("labels", []):
        # Only process objects with bounding boxes
        if "box2d" in obj and obj.get("category", "") != "":
            x1, y1, x2, y2 = obj["box2d"].values()
            
            if x1 < x2 and y1 < y2:
                valid_objs.append({
                    "category": obj["category"],
                    "box2d": obj["box2d"]
                })
            else:
                invalid_boxes.append((img_name, obj))

    if len(valid_objs) > 0:
        clean_labels.append({
            "name": img_name,
            "labels": valid_objs
        })

print("Original entries:", len(labels))
print("After cleaning:", len(clean_labels))
print("Invalid boxes:", len(invalid_boxes))


Original entries: 69863
After cleaning: 69863
Invalid boxes: 0


# **3- Flattening JSON → DataFrame**

In [8]:
import pandas as pd

# Convert clean_labels to DataFrame
rows = []
for lbl in clean_labels:
    img = lbl["name"]
    for obj in lbl["labels"]:
        x1, y1, x2, y2 = obj["box2d"].values()
        rows.append([img, obj["category"], x1, y1, x2, y2])

df = pd.DataFrame(rows, columns=["image", "category", "x1", "y1", "x2", "y2"])

print("Total annotations:", len(df))

# check duplicate rows
dup_annots = df.duplicated().sum()
print("Duplicate annotations:", dup_annots)

# check duplicate images
dup_imgs = df["image"].duplicated().sum()
print("Duplicate image names:", dup_imgs)

Total annotations: 1286871
Duplicate annotations: 1
Duplicate image names: 1217008


# **5- Handling Duplicates**

In [9]:
# Drop exact duplicate annotation rows
df_cleaned = df.drop_duplicates()

print("After removing exact duplicates:", len(df_cleaned))

After removing exact duplicates: 1286870


# **6- Handling Duplicates**

In [10]:
# Compute width & height
df_cleaned["width"] = df_cleaned["x2"] - df_cleaned["x1"]
df_cleaned["height"] = df_cleaned["y2"] - df_cleaned["y1"]

# Filter out invalid or extreme boxes
df_cleaned = df_cleaned[
    (df_cleaned["width"] > 1) & 
    (df_cleaned["height"] > 1) & 
    (df_cleaned["width"] < 2000) & 
    (df_cleaned["height"] < 2000)
]

print("After removing outliers:", len(df_cleaned))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned["width"] = df_cleaned["x2"] - df_cleaned["x1"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned["height"] = df_cleaned["y2"] - df_cleaned["y1"]


After removing outliers: 1286782
