In [1]:
import mmcv
from mmdet3d.apis import init_model, inference_detector
import torch
import open3d as o3d
import numpy as np
from math import cos, sin
import numpy as np
import pandas as pd 
import os
from sklearn.metrics import precision_score, recall_score, f1_score



## Data preprocessing

#### Data Loading
In this section, three main datasets are loaded:
- df_images: Contains the image data for object detection, including file paths for image_id, image_name, and corresponding metadata.
- df_labels: Contains the ground truth labels for object detection, including object_class, bounding_box_coordinates, and image_id.
- df_calibration: Includes calibration parameters such as camera matrix, distortion coefficients, and extrinsic parameters for aligning image and sensor data.

In [2]:
image_path  = "./data/kitti/training/image_2/"

In [3]:
# List all image files
image_files = os.listdir(image_path)

# Create a DataFrame to store image file paths
df_images = pd.DataFrame({
    'image_id': range(len(image_files)),
    'image_name': image_files,
    'file_path': [os.path.join(image_path, img) for img in image_files]
})

In [4]:
# Print the first few entries of df_images
print(df_images.head())

   image_id  image_name                                 file_path
0         0  004863.png  ./data/kitti/training/image_2/004863.png
1         1  006912.png  ./data/kitti/training/image_2/006912.png
2         2  006906.png  ./data/kitti/training/image_2/006906.png
3         3  004877.png  ./data/kitti/training/image_2/004877.png
4         4  005599.png  ./data/kitti/training/image_2/005599.png


In [5]:
# Define the path to your KITTI dataset labels
label_path = "./data/kitti/training/label_2/"

# Function to load and parse a label file
def load_labels(label_file):
    labels = []
    with open(label_file, 'r') as f:
        for line in f:
            parts = line.strip().split()
            obj_class = parts[0]
            x_min = int(float(parts[4]))
            y_min = int(float(parts[5]))
            x_max = int(float(parts[6]))
            y_max = int(float(parts[7]))
            labels.append({'object_class': obj_class, 'bounding_box': (x_min, y_min, x_max, y_max)})
    return labels

# List all label files and accumulate label data
label_files = os.listdir(label_path)
label_data = []

for label_file in label_files:
    file_path = os.path.join(label_path, label_file)
    
    # Use the filename without the extension as the image_id (e.g., '000001')
    image_id = os.path.splitext(label_file)[0]
    
    labels = load_labels(file_path)
    for label in labels:
        label_data.append({
            'image_id': image_id,  # Use image_id derived from the filename
            'object_class': label['object_class'],
            'bounding_box_coordinates': label['bounding_box']
        })

# Create a DataFrame using pd.concat
df_labels = pd.DataFrame(label_data)

# Print the first few entries of df_labels
print(df_labels.head())

  image_id object_class bounding_box_coordinates
0   006145          Car     (595, 177, 637, 217)
1   006145          Car       (0, 185, 198, 277)
2   006145          Car    (812, 169, 1023, 293)
3   006145          Car     (764, 172, 947, 261)
4   006145          Car     (708, 177, 852, 243)


In [6]:
# Check that bounding_box_coordinates are unique per object
print(df_labels.head(100))

   image_id object_class bounding_box_coordinates
0    006145          Car     (595, 177, 637, 217)
1    006145          Car       (0, 185, 198, 277)
2    006145          Car    (812, 169, 1023, 293)
3    006145          Car     (764, 172, 947, 261)
4    006145          Car     (708, 177, 852, 243)
..      ...          ...                      ...
95   003997   Pedestrian        (0, 152, 88, 336)
96   003997     DontCare        (0, 159, 93, 349)
97   004020          Car     (600, 177, 657, 229)
98   004020          Car     (585, 178, 606, 192)
99   004020     DontCare     (718, 171, 726, 178)

[100 rows x 3 columns]


In [7]:
# Define the path to your KITTI dataset calibration files
calibration_path = "./data/kitti/training/calib/"

# Function to load calibration parameters
def load_calibration(calib_file):
    calib_data = {}
    with open(calib_file, 'r') as f:
        for line in f:
            line = line.strip()
            
            # Skip empty lines or comments
            if not line:
                continue

            # Check if the line has the expected ':' delimiter
            if ':' in line:
                key, value = line.split(':', 1)
                calib_data[key.strip()] = [float(x) for x in value.strip().split()]
            else:
                print(f"Skipping malformed line: {line}")
    
    return calib_data

# List all calibration files and accumulate calibration data
calibration_files = os.listdir(calibration_path)
calibration_data = []

for i, calib_file in enumerate(calibration_files):
    file_path = os.path.join(calibration_path, calib_file)
    calib_data = load_calibration(file_path)
    calibration_data.append({
        'image_id': i,
        'calibration_data': calib_data
    })

# Create a DataFrame using pd.concat
df_calibration = pd.DataFrame(calibration_data)

# Print the first few entries of df_calibration
print(df_calibration.head())

   image_id                                   calibration_data
0         0  {'P0': [721.5377, 0.0, 609.5593, 0.0, 0.0, 721...
1         1  {'P0': [721.5377, 0.0, 609.5593, 0.0, 0.0, 721...
2         2  {'P0': [721.5377, 0.0, 609.5593, 0.0, 0.0, 721...
3         3  {'P0': [721.5377, 0.0, 609.5593, 0.0, 0.0, 721...
4         4  {'P0': [721.5377, 0.0, 609.5593, 0.0, 0.0, 721...


Check the shape of each DataFrame

In [8]:
print("Images DataFrame shape:", df_images.shape)
print("Labels DataFrame shape:", df_labels.shape)
print("Calibration DataFrame shape:", df_calibration.shape)

Images DataFrame shape: (7481, 3)
Labels DataFrame shape: (51865, 3)
Calibration DataFrame shape: (7481, 2)


Check for missing values in each DataFrame

In [9]:
print("Missing values in Images DataFrame:\n", df_images.isna().sum())
print("Missing values in Labels DataFrame:\n", df_labels.isna().sum())
print("Missing values in Calibration DataFrame:\n", df_calibration.isna().sum())

Missing values in Images DataFrame:
 image_id      0
image_name    0
file_path     0
dtype: int64
Missing values in Labels DataFrame:
 image_id                    0
object_class                0
bounding_box_coordinates    0
dtype: int64
Missing values in Calibration DataFrame:
 image_id            0
calibration_data    0
dtype: int64


Get a summary of the data types and basic info

In [10]:
print("Images DataFrame info:\n")
df_images.info()

print("Labels DataFrame info:\n")
df_labels.info()

print("Calibration DataFrame info:\n")
df_calibration.info()

Images DataFrame info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7481 entries, 0 to 7480
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   image_id    7481 non-null   int64 
 1   image_name  7481 non-null   object
 2   file_path   7481 non-null   object
dtypes: int64(1), object(2)
memory usage: 175.5+ KB
Labels DataFrame info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51865 entries, 0 to 51864
Data columns (total 3 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   image_id                  51865 non-null  object
 1   object_class              51865 non-null  object
 2   bounding_box_coordinates  51865 non-null  object
dtypes: object(3)
memory usage: 1.2+ MB
Calibration DataFrame info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7481 entries, 0 to 7480
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dt

## Deep learning model

In [11]:
# Step 1: Set the device (GPU or CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [12]:
# Step 2: Define paths for the config and checkpoint files
config_file = './configs/pointpillars/pointpillars_hv_secfpn_8xb6-160e_kitti-3d-3class.py'
checkpoint_file = './checkpoints/hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class_20220301_150306-37dc2420.pth'

In [13]:
# Step 3: Initialize the model
model = init_model(config_file, checkpoint_file, device=device)

Loads checkpoint by local backend from path: ./checkpoints/hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class_20220301_150306-37dc2420.pth




In [14]:
# Step 6: Load and visualize the point cloud using Open3D
def load_kitti_bin_file(bin_file):
    point_cloud = np.fromfile(bin_file, dtype=np.float32).reshape(-1, 4)
    print(f"Loaded point cloud shape: {point_cloud.shape}")  # Check if the point cloud is loaded correctl
    return point_cloud

In [15]:
# Step 4: Load the LiDAR point cloud sample (replace the path to your point cloud file)
pcd_file = './data/kitti/training/velodyne/005137.bin'

In [16]:
# Step 5: Perform inference (object detection)
result, _  = inference_detector(model, pcd_file)

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


In [17]:
# Load the point cloud from the .bin file
point_cloud = load_kitti_bin_file(pcd_file)

Loaded point cloud shape: (117660, 4)


Check the shape of each DataFrame

In [18]:
# Convert to Open3D point cloud format
pcd = o3d.geometry.PointCloud()
pcd.points = o3d.utility.Vector3dVector(point_cloud[:, :3])

In [19]:
# Step 7: Process and display the detected objects
class_names = model.dataset_meta['classes']  # Classes for the dataset (e.g., Car, Pedestrian, Cyclist)
threshold = 0.5  # Confidence threshold for displaying the detected objects

print(f"Detected objects in {pcd_file}:")

# Convert to Open3D point cloud format
pcd = o3d.geometry.PointCloud()
pcd.points = o3d.utility.Vector3dVector(point_cloud[:, :3])  # Only use x, y, z

Detected objects in ./data/kitti/training/velodyne/005137.bin:


In [20]:
# Helper function to compute the 8 corners of a bounding box
def compute_bounding_box_corners(bbox):
    # Access the underlying tensor from LiDARInstance3DBoxes
    bbox = bbox.tensor.cpu().numpy()  # Convert to numpy array
    
    # Unpack the individual bounding boxes
    all_corners = []
    for box in bbox:
        x, y, z, dx, dy, dz, heading = box

        # Rotation matrix for heading
        rot_matrix = np.array([[cos(heading), -sin(heading)], [sin(heading), cos(heading)]])
        
        # 3D bounding box corners before rotation
        corners = np.array([
            [-dx / 2, -dy / 2, -dz / 2], [dx / 2, -dy / 2, -dz / 2], 
            [dx / 2, dy / 2, -dz / 2], [-dx / 2, dy / 2, -dz / 2], 
            [-dx / 2, -dy / 2, dz / 2], [dx / 2, -dy / 2, dz / 2], 
            [dx / 2, dy / 2, dz / 2], [-dx / 2, dy / 2, dz / 2]
        ])
        
        # Apply rotation to the x and y dimensions
        rotated_corners = np.dot(corners[:, [0, 1]], rot_matrix)
        corners[:, 0:2] = rotated_corners
        
        # Add the center of the box to all the corners to translate it
        corners[:, 0] += x
        corners[:, 1] += y
        corners[:, 2] += z

        all_corners.append(corners)

    return all_corners

In [21]:
# Helper function to create a 3D bounding box in Open3D
def create_bounding_box_lines(bbox):
    all_bbox_corners = compute_bounding_box_corners(bbox)  # Get the 8 corners of the 3D bounding box

    line_sets = []
    for bbox_corners in all_bbox_corners:
        # Define the 12 edges of the bounding box
        edges = [
            [0, 1], [1, 2], [2, 3], [3, 0],  # Bottom face
            [4, 5], [5, 6], [6, 7], [7, 4],  # Top face
            [0, 4], [1, 5], [2, 6], [3, 7]   # Vertical edges
        ]

        # Create LineSet object for bounding box
        line_set = o3d.geometry.LineSet()
        line_set.points = o3d.utility.Vector3dVector(bbox_corners)
        line_set.lines = o3d.utility.Vector2iVector(edges)
        
        # Assign a color to the lines (blue in this case)
        line_set.colors = o3d.utility.Vector3dVector([[0, 0, 1] for _ in range(len(edges))])
        
        line_sets.append(line_set)
    
    return line_sets

In [22]:
 # Prepare bounding boxes for visualization
bounding_boxes = []
pred_instances = result.pred_instances_3d
for i in range(len(pred_instances.bboxes_3d)):
    bbox = pred_instances.bboxes_3d[i]  # Get the bounding box tensor
    score = pred_instances.scores_3d[i].cpu().numpy()  # Get confidence score
    label = pred_instances.labels_3d[i].cpu().numpy()  # Get class label

    if score >= threshold:
        # Get the class name corresponding to the label
        class_name = class_names[label]
        
        # Print the class name and score
        print(f"Detected {class_name} with confidence {score}")

        # Create and visualize the bounding box lines
        bounding_box_lines = create_bounding_box_lines(bbox)  # Create the bounding box lines
        bounding_boxes.extend(bounding_box_lines)

Detected Pedestrian with confidence 0.5604645609855652
Detected Cyclist with confidence 0.9474270343780518
Detected Cyclist with confidence 0.8887242078781128
Detected Car with confidence 0.9626887440681458
Detected Car with confidence 0.9603883624076843
Detected Car with confidence 0.9451578855514526
Detected Car with confidence 0.921866774559021
Detected Car with confidence 0.809282660484314
Detected Car with confidence 0.7989785075187683
Detected Car with confidence 0.7863332033157349
Detected Car with confidence 0.7257812023162842
Detected Car with confidence 0.6600877046585083
Detected Car with confidence 0.6276063919067383


In [23]:
# Step 8: Visualize the point cloud with bounding boxes using Open3D
vis = o3d.visualization.Visualizer()
vis.create_window()
vis.add_geometry(pcd)





True

In [24]:
# Add all bounding boxes to the visualizer
for bbox in bounding_boxes:
    vis.add_geometry(bbox)

# Non-blocking visualization
while True:
    vis.poll_events()
    vis.update_renderer()



KeyboardInterrupt: 

## Model Evaluation Metrics


#### Overfitting/Underfitting Analysis

Due to the nature of this report, which utilizes a pre-trained model, we do not have access to the training loss metrics. The model has not been retrained in this context, and therefore, we cannot evaluate the training loss or monitor overfitting/underfitting behavior through this metric. If retraining was conducted, tracking training and validation loss over epochs would provide insights into the model's generalization capabilities.


#### Precision, Recall, and F1-Score

Using the ground truth and predicted labels, we calculate precision, recall, and F1-score to evaluate the model's performance in detecting objects accurately.


In [25]:
velodyne_dir = './data/kitti/training/velodyne/'  # Training point cloud directory
label_dir = './data/kitti/training/label_2/'      # Ground truth label directory

In [37]:
# Step 4: Iterate over a subset of point cloud files in the training set and store predictions
all_predictions = []
all_labels = []
subset_size = 500  # Define the subset size (e.g., first 100 point clouds)

# List all point cloud files in the training directory
pointcloud_files = [f for f in os.listdir(velodyne_dir) if f.endswith('.bin')]

# Iterate over a subset of the point cloud files
for pointcloud_file in pointcloud_files[:subset_size]:
    image_id = os.path.splitext(pointcloud_file)[0]  # Get image ID from the filename

    # Load the point cloud
    pcd_file = os.path.join(velodyne_dir, pointcloud_file)
    
    # Perform inference
    result, _ = inference_detector(model, pcd_file)
    
    # Store the predictions
    pred_instances = result.pred_instances_3d
    for i in range(len(pred_instances.bboxes_3d)):
        predicted_label = pred_instances.labels_3d[i].cpu().numpy()  # Get predicted class label
        score = pred_instances.scores_3d[i].cpu().numpy()  # Get confidence score
        bbox = pred_instances.bboxes_3d[i].tensor.cpu().numpy()[0]  # Get bounding box coordinates

        # Convert 3D to 2D bounding box (x_min, y_min, x_max, y_max)
        x_min = int(bbox[0] - bbox[3] / 2)
        y_min = int(bbox[1] - bbox[4] / 2)
        x_max = int(bbox[0] + bbox[3] / 2)
        y_max = int(bbox[1] + bbox[4] / 2)

        all_predictions.append({
            'image_id': image_id,
            'object_class': predicted_label,
            'bounding_box': (x_min, y_min, x_max, y_max),
            'confidence_score': score 
        })
    
    # Step 6: Load corresponding labels
    label_file = os.path.join(label_path, image_id + '.txt')
    labels = load_labels(label_file)

    # Store the labels with the corresponding image_id
    for label in labels:
        all_labels.append({
            'image_id': image_id,
            'object_class': label['object_class'],
            'bounding_box_coordinates': label['bounding_box']
        })

# Step 7: Convert predictions and labels to DataFrames
df_predictions_1 = pd.DataFrame(all_predictions)
df_labels1 = pd.DataFrame(all_labels)

# Step 8: Print the first few entries of predictions and labels
print(df_predictions_1.head())
print(df_labels1.head())

# Step 9: Save the predictions and labels to CSV files for later use
df_predictions_1.to_csv('kitti_train_subset_predictions.csv', index=False)
df_labels1.to_csv('kitti_train_subset_labels.csv', index=False)


  image_id object_class      bounding_box confidence_score
0   001372            0  (18, -4, 19, -3)       0.78117496
1   001372            0    (5, 20, 6, 21)       0.34733957
2   001372            0  (47, -4, 48, -3)       0.23190214
3   001372            0    (16, 8, 17, 9)       0.22101979
4   001372            0  (14, -9, 15, -8)       0.16573866
  image_id object_class bounding_box_coordinates
0   001372          Car      (91, 191, 369, 323)
1   001372      Cyclist     (328, 171, 426, 310)
2   001372          Car     (459, 181, 528, 219)
3   001372          Car     (660, 174, 707, 212)
4   001372          Car     (628, 169, 659, 188)


In [38]:
df_labels1.isna().sum()

image_id                    0
object_class                0
bounding_box_coordinates    0
dtype: int64

In [39]:
df_labels1['object_class'].unique()

array(['Car', 'Cyclist', 'DontCare', 'Pedestrian', 'Van', 'Truck', 'Tram',
       'Misc', 'Person_sitting'], dtype=object)

In [40]:
# Create a copy of df_labels1 to preserve the original data
df_labels2 = df_labels1.copy()

class_name_to_label = {
    'Car': 0, 'Pedestrian': 1, 'Cyclist': 2, 'Van': 0
}

# Apply the mapping to ground truth object classes
df_labels2['object_class'] = df_labels2['object_class'].map(class_name_to_label)

# Check for NaN values in 'object_class' after mapping
print("Number of NaN values in object_class after mapping:", df_labels2['object_class'].isna().sum())

# Drop rows with NaN values in 'object_class' (these correspond to 'DontCare' or other unmapped classes)
df_labels2 = df_labels2.dropna(subset=['object_class'])

# Verify that no NaN values remain after dropping
print(df_labels2['object_class'].unique())  # Should only show 0, 1, 2 (Car, Pedestrian, Cyclist)
print(df_labels2.head())

Number of NaN values in object_class after mapping: 886
[0. 2. 1.]
  image_id  object_class bounding_box_coordinates
0   001372           0.0      (91, 191, 369, 323)
1   001372           2.0     (328, 171, 426, 310)
2   001372           0.0     (459, 181, 528, 219)
3   001372           0.0     (660, 174, 707, 212)
4   001372           0.0     (628, 169, 659, 188)


In [41]:
df_labels1.head()

Unnamed: 0,image_id,object_class,bounding_box_coordinates
0,1372,Car,"(91, 191, 369, 323)"
1,1372,Cyclist,"(328, 171, 426, 310)"
2,1372,Car,"(459, 181, 528, 219)"
3,1372,Car,"(660, 174, 707, 212)"
4,1372,Car,"(628, 169, 659, 188)"


In [42]:
df_labels2.isna().sum()

image_id                    0
object_class                0
bounding_box_coordinates    0
dtype: int64

In [43]:
# Ensure 'image_id' is a string in both DataFrames
df_labels2['image_id'] = df_labels2['image_id'].astype(str)
df_predictions_1['image_id'] = df_predictions_1['image_id'].astype(str)

# Rename bounding box columns in both DataFrames to the same name
df_labels2.rename(columns={'bounding_box_coordinates': 'bounding_box'}, inplace=True)

# Try merging again
df_merged = pd.merge(df_labels2, df_predictions_1, on=['image_id'], how='inner', suffixes=('_true', '_pred'))

# Check the first few rows of the merged DataFrame
print(df_merged.head())

  image_id  object_class_true    bounding_box_true object_class_pred  \
0   001372                0.0  (91, 191, 369, 323)                 0   
1   001372                0.0  (91, 191, 369, 323)                 0   
2   001372                0.0  (91, 191, 369, 323)                 0   
3   001372                0.0  (91, 191, 369, 323)                 0   
4   001372                0.0  (91, 191, 369, 323)                 0   

  bounding_box_pred confidence_score  
0  (18, -4, 19, -3)       0.78117496  
1    (5, 20, 6, 21)       0.34733957  
2  (47, -4, 48, -3)       0.23190214  
3    (16, 8, 17, 9)       0.22101979  
4  (14, -9, 15, -8)       0.16573866  


In [44]:
print(df_merged)

      image_id  object_class_true    bounding_box_true object_class_pred  \
0       001372                0.0  (91, 191, 369, 323)                 0   
1       001372                0.0  (91, 191, 369, 323)                 0   
2       001372                0.0  (91, 191, 369, 323)                 0   
3       001372                0.0  (91, 191, 369, 323)                 0   
4       001372                0.0  (91, 191, 369, 323)                 0   
...        ...                ...                  ...               ...   
33684   001359                0.0  (19, 201, 271, 301)                 2   
33685   001359                0.0  (19, 201, 271, 301)                 2   
33686   001359                0.0  (19, 201, 271, 301)                 2   
33687   001359                0.0  (19, 201, 271, 301)                 2   
33688   001359                0.0  (19, 201, 271, 301)                 2   

        bounding_box_pred confidence_score  
0        (18, -4, 19, -3)       0.78117496

In [32]:
# Check for missing values in the merged DataFrame
print(df_merged.isna().sum())

# Check if any NaN values are present in y_true or y_pred
print("NaN in y_true:", df_merged['object_class_true'].isna().sum())
print("NaN in y_pred:", df_merged['object_class_pred'].isna().sum())

image_id             0
object_class_true    0
bounding_box_true    0
object_class_pred    0
bounding_box_pred    0
dtype: int64
NaN in y_true: 0
NaN in y_pred: 0


In [45]:
y_true = df_merged['object_class_true'].astype(int)
y_pred = df_merged['object_class_pred'].astype(int)

precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

Precision: 0.46553751879069294
Recall: 0.22624595565318056
F1-Score: 0.28564660296210825


## Intersection over Union (IoU) Calculation

To compare the predicted and ground truth bounding boxes, we calculate the Intersection over Union (IoU) metric. The IoU measures how much the predicted bounding box overlaps with the ground truth bounding box.


In [35]:
# Function to calculate IoU for two bounding boxes
def calculate_iou(box_true, box_pred):
    x_min_true, y_min_true, x_max_true, y_max_true = box_true
    x_min_pred, y_min_pred, x_max_pred, y_max_pred = box_pred

    # Calculate intersection
    x_min_inter = max(x_min_true, x_min_pred)
    y_min_inter = max(y_min_true, y_min_pred)
    x_max_inter = min(x_max_true, x_max_pred)
    y_max_inter = min(y_max_true, y_max_pred)
    
    inter_area = max(0, x_max_inter - x_min_inter) * max(0, y_max_inter - y_min_inter)
    
    # Calculate areas of the two boxes
    true_area = (x_max_true - x_min_true) * (y_max_true - y_min_true)
    pred_area = (x_max_pred - x_min_pred) * (y_max_pred - y_min_pred)
    
    # Calculate IoU
    iou = inter_area / float(true_area + pred_area - inter_area)
    return iou

# Add IoU calculations to the merged DataFrame
df_merged['iou'] = df_merged.apply(lambda row: calculate_iou(row['bounding_box_true'], row['bounding_box_pred']), axis=1)

# Print the first few entries with IoU values
print(df_merged[['image_id', 'object_class_true', 'object_class_pred', 'iou']].head())

  image_id  object_class_true object_class_pred  iou
0   001372                0.0                 0  0.0
1   001372                0.0                 0  0.0
2   001372                0.0                 0  0.0
3   001372                0.0                 0  0.0
4   001372                0.0                 0  0.0


In [46]:
from sklearn.metrics import roc_auc_score

#### ROC AUC score (Receiver Operating Characteristic - Area Under the Curve) 

The ROC AUC score evaluates the ability of a model to distinguish between classes by plotting the True Positive Rate (TPR or Recall) against the False Positive Rate (FPR) at different threshold levels.

In the context of object detection, it can be computed for each class using the confidence scores of predictions.

In [47]:
# Convert object classes to binary labels for a "one-vs-rest" ROC AUC calculation
class_name_to_label = {
    'Car': 0, 'Pedestrian': 1, 'Cyclist': 2, 
    'Van': 0
}

# If you have confidence scores from predictions
y_true = df_merged['object_class_true'].astype(int)
y_pred_confidence = df_merged['confidence_score']  # Assuming this exists
y_pred_label = df_merged['object_class_pred'].astype(int)

# One-vs-rest ROC AUC for each class
roc_auc_scores = {}
for class_label in np.unique(y_true):
    # Convert y_true and y_pred into binary labels (1 for class, 0 for others)
    y_true_binary = (y_true == class_label).astype(int)
    
    # Compute the ROC AUC score for this class
    try:
        auc_score = roc_auc_score(y_true_binary, y_pred_confidence)
        roc_auc_scores[class_label] = auc_score
        print(f'ROC AUC Score for class {class_label}: {auc_score}')
    except ValueError as e:
        print(f'Could not calculate AUC for class {class_label}: {e}')

# Overall (macro-average) ROC AUC
macro_roc_auc = np.mean(list(roc_auc_scores.values()))
print(f'Macro-Averaged ROC AUC: {macro_roc_auc}')

ROC AUC Score for class 0: 0.6232038267568941
ROC AUC Score for class 1: 0.37072089871098945
ROC AUC Score for class 2: 0.4412394296687697
Macro-Averaged ROC AUC: 0.4783880517122177


El código proporcionado demuestra un sólido enfoque para el procesamiento de datos de imágenes en tareas de detección de objetos. La elección de pandas para manejar los datos estructurados y la orientación hacia algoritmos de detección de objetos como Faster R-CNN o YOLO son decisiones acertadas. La capacidad de pandas para gestionar grandes conjuntos de datos y la eficiencia de estos algoritmos en tareas de visión por computadora hacen de esta combinación una solución efectiva para este tipo de problemas.
Como equipo de data scientists, hemos analizado a fondo el código que nos has proporcionado. Nuestro primer acercamiento nos ha permitido identificar que este código se enfoca en la etapa inicial y fundamental de cualquier proyecto de detección de objetos: el preprocesamiento de datos.

Hemos detectado que se utiliza Pandas para estructurar y manejar eficientemente la información de las imágenes y sus correspondientes etiquetas. Esta herramienta es ideal para este tipo de tareas, ya que nos permite crear DataFrames que facilitan la exploración y manipulación de los datos. Además, hemos identificado que se han sentado las bases para la visualización de los datos, aunque esta parte aún no está completamente implementada.

Como equipo, hemos logrado un avance significativo al implementar un modelo preentrenado en el conjunto de datos KITTI. Sin embargo, al evaluar el modelo en un subconjunto de los datos de entrenamiento, hemos observado que la precisión aún no cumple con nuestras expectativas. Esto nos indica que hay margen de mejora y que debemos profundizar en nuestro análisis.

Una de las principales limitantes que hemos identificado es la falta de evaluación en el conjunto de pruebas oficial de KITTI. Al no contar con las etiquetas correspondientes, no podemos obtener métricas precisas y comparables con otros modelos. A pesar de esta restricción, los resultados preliminares nos brindan una valiosa línea de base para futuras investigaciones.

A partir de estos hallazgos, hemos definido los siguientes pasos:

Evaluación en el conjunto de pruebas oficial: Tan pronto como tengamos acceso a las etiquetas del conjunto de pruebas, realizaremos una evaluación exhaustiva del modelo para obtener resultados más confiables.
Exploración de diferentes arquitecturas: Investigaremos otras arquitecturas de redes neuronales convolucionales (CNN) que puedan ser más adecuadas para nuestra tarea.
Ajuste de hiperparámetros: Experimentaremos con diferentes valores de hiperparámetros para optimizar el rendimiento del modelo.
Aumento de datos: Consideraremos técnicas de aumento de datos para mejorar la generalización del modelo.
Estamos convencidos de que al abordar estos puntos, podremos mejorar significativamente el rendimiento de nuestro modelo y obtener resultados más precisos en la detección de objetos en imágenes de KITTI. Esta experiencia nos ha permitido consolidar nuestros conocimientos en el área de visión por computadora y ha sentado las bases para futuros proyectos más ambiciosos.