# MobileNetV2 Object Detection Training - GPU Accelerated

**Environment:**
- TensorFlow 2.17.0 with GPU support
- CUDA 12+ with cuDNN
- RTX 5050 GPU with XLA acceleration
- NVIDIA NGC TensorFlow Container

This notebook trains a MobileNetV2-based SSD object detection model with GPU acceleration and XLA optimization.

In [None]:
# Verify GPU availability and configure TensorFlow for this notebook kernel
import tensorflow as tf
import os

print("=" * 60)
print("GPU CONFIGURATION")
print("=" * 60)
print(f"TensorFlow Version: {tf.__version__}")
print(f"Built with CUDA: {tf.test.is_built_with_cuda()}")

# List physical devices
gpus = tf.config.list_physical_devices('GPU')
print(f"\nAvailable GPUs: {len(gpus)}")
for gpu in gpus:
    print(f"  - {gpu}")

if gpus:
    # Enable memory growth to prevent TF from allocating all GPU memory at once.
    # TensorFlow only accepts this setting before the GPUs are initialised and
    # raises RuntimeError afterwards (for example when this cell is re-run in a
    # live kernel), so report that case instead of aborting the whole cell.
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("\n✓ GPU memory growth enabled")
    except RuntimeError as err:
        print(f"\n⚠ GPU memory growth left unchanged: {err}")

    # Get GPU details
    gpu_details = tf.config.experimental.get_device_details(gpus[0])
    print(f"\nGPU Details: {gpu_details}")

    # Enable XLA compilation for better performance
    tf.config.optimizer.set_jit(True)
    print("✓ XLA compilation enabled")
else:
    print("\n⚠ WARNING: No GPU detected! Training will use CPU.")

# Enable mixed precision for faster training on modern GPUs.
# NOTE(review): this policy (like the XLA flag above) is set inside the notebook
# kernel, while the training cell launches a separate `python` process -
# presumably that process does not inherit these settings; confirm.
from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy('mixed_float16')
print("✓ Mixed precision (float16) enabled for faster training")

print("=" * 60)

## Install Dependencies

Install required packages for TensorFlow Object Detection API and dataset handling.

In [None]:
# Install dependencies
import sys

# Uninstall Cython to avoid conflicts
!pip uninstall Cython -y

# Install required packages
!pip install -q roboflow protobuf==3.20.3 tensorflow-model-optimization

print("âœ“ Dependencies installed successfully")

In [None]:
# Clone TensorFlow models repository
import os

models_dir = '/workspace/models'
if not os.path.exists(models_dir):
    print("Cloning TensorFlow models repository...")
    !git clone --depth 1 https://github.com/tensorflow/models {models_dir}
    print("âœ“ Repository cloned")
else:
    print("âœ“ TensorFlow models repository already exists")

In [None]:
# Setup Object Detection API
import os

research_dir = '/workspace/models/research'
os.chdir(research_dir)

# Compile protobuf files
print("Compiling protobuf files...")
!protoc object_detection/protos/*.proto --python_out=.

# Copy setup.py
!cp object_detection/packages/tf2/setup.py .

print("âœ“ Object Detection API setup complete")

In [None]:
# Install Object Detection API
import os

os.chdir('/workspace/models/research')

# Install the API
print("Installing Object Detection API...")
!python -m pip install . --quiet

print("âœ“ Object Detection API installed")

In [None]:
# Test Object Detection API installation
# Runs the model-builder test script shipped in the models repository; its
# output in the cell shows whether the installation works.
print("Testing Object Detection API installation...")
# The script path below is relative to the research directory.
# `os` is not imported in this cell; it comes from an earlier cell.
os.chdir('/workspace/models/research')
!python object_detection/builders/model_builder_tf2_test.py

## Prepare Dataset

Download dataset using Roboflow and prepare TFRecords.

In [None]:
# Download dataset from Roboflow
import os

from roboflow import Roboflow

# The API key is a secret: read it from the environment instead of storing it
# in the notebook, where it would be shared with every copy of this file.
ROBOFLOW_API_KEY = os.environ.get("ROBOFLOW_API_KEY")
if not ROBOFLOW_API_KEY:
    raise RuntimeError(
        "Set the ROBOFLOW_API_KEY environment variable before running this cell"
    )

# Select workspace, project and dataset version
rf = Roboflow(api_key=ROBOFLOW_API_KEY)
project = rf.workspace("hvac-whaik").project("ai-hvac-nameplate-focus-kcnb5")
version = project.version(12)

# Download dataset to workspace
dataset = version.download("tensorflow", location="/workspace/dataset")
print(f"✓ Dataset downloaded to: {dataset.location}")

In [None]:
# Setup dataset paths and create label map
import os
import pandas as pd
import tensorflow as tf
from object_detection.utils import dataset_util
from PIL import Image
import io

# Dataset paths
# The download cell passes location="/workspace/dataset".
# NOTE(review): Roboflow appears to extract the splits straight into that
# directory rather than into a "<project>-<version>" sub-folder - confirm.
# Use the sub-folder only when it really holds the training split.
_DOWNLOAD_ROOT = "/workspace/dataset"
_VERSIONED_DIR = os.path.join(_DOWNLOAD_ROOT, "ai-hvac-nameplate-focus-12")
DATASET_DIR = (
    _VERSIONED_DIR
    if os.path.isdir(os.path.join(_VERSIONED_DIR, 'train'))
    else _DOWNLOAD_ROOT
)
TRAIN_DIR = os.path.join(DATASET_DIR, 'train')
TEST_DIR = os.path.join(DATASET_DIR, 'test')
VALID_DIR = os.path.join(DATASET_DIR, 'valid')

# Label map: class id -> class name
label_map = {
    1: 'HVAC_Spec_Label'
}

def create_label_map_file(label_map, output_path):
    """Write ``label_map`` to ``output_path`` in pbtxt format.

    Args:
        label_map: Mapping of class id to class name.
        output_path: File to create; an existing file is overwritten.

    One ``item { id: ... name: "..." }`` entry is written per mapping entry,
    in the mapping's iteration order.
    """
    # Explicit encoding keeps the file identical across platforms.
    with open(output_path, 'w', encoding='utf-8') as f:
        # `class_id` rather than `id`, which would shadow the builtin.
        for class_id, name in label_map.items():
            f.write(f'item {{\n  id: {class_id}\n  name: "{name}"\n}}\n')

# Write the label map next to the dataset splits
LABEL_MAP_PATH = os.path.join(DATASET_DIR, 'label_map.pbtxt')
create_label_map_file(label_map, LABEL_MAP_PATH)

print("✓ Label map created")
print(f"  Classes: {list(label_map.values())}")

In [None]:
# Convert CSV annotations to TFRecord format
def create_tf_example(row, image_dir):
    """Build the ``tf.train.Example`` for one image and all of its boxes.

    Args:
        row: The annotation(s) of a single image: either one CSV row (a
            ``pandas.Series``) or a ``pandas.DataFrame`` with one row per
            bounding box. The columns read are ``filename``, ``class``,
            ``xmin``, ``xmax``, ``ymin`` and ``ymax``; coordinates are divided
            by the image width/height, so they are expected in pixels.
        image_dir: Directory containing the image named in ``filename``.

    Returns:
        A ``tf.train.Example`` with the encoded image and every box in ``row``.

    Raises:
        ValueError: If a class name is not one of the values of ``label_map``.
    """
    # A single row is still accepted so per-row callers keep working.
    boxes = pd.DataFrame([row]) if isinstance(row, pd.Series) else row

    filename = boxes['filename'].iloc[0]
    img_path = os.path.join(image_dir, filename)

    with tf.io.gfile.GFile(img_path, 'rb') as fid:
        encoded_image = fid.read()

    # Only the header is needed: the size normalises the boxes and the format
    # distinguishes JPEG from PNG instead of assuming every file is a JPEG.
    with Image.open(io.BytesIO(encoded_image)) as image:
        width, height = image.size
        image_format = (image.format or 'jpeg').lower().encode('utf8')

    filename_bytes = filename.encode('utf8')

    # Reverse lookup name -> id; the first id wins if a name appears twice.
    name_to_id = {}
    for class_id, name in label_map.items():
        name_to_id.setdefault(name, class_id)

    class_names = boxes['class'].tolist()
    for name in class_names:
        if name not in name_to_id:
            raise ValueError(f"{filename}: class {name!r} is not in label_map")

    xmins = (boxes['xmin'] / width).tolist()
    xmaxs = (boxes['xmax'] / width).tolist()
    ymins = (boxes['ymin'] / height).tolist()
    ymaxs = (boxes['ymax'] / height).tolist()
    classes_text = [name.encode('utf8') for name in class_names]
    classes = [name_to_id[name] for name in class_names]

    tf_example = tf.train.Example(features=tf.train.Features(feature={
        'image/height': dataset_util.int64_feature(height),
        'image/width': dataset_util.int64_feature(width),
        'image/filename': dataset_util.bytes_feature(filename_bytes),
        'image/source_id': dataset_util.bytes_feature(filename_bytes),
        'image/encoded': dataset_util.bytes_feature(encoded_image),
        'image/format': dataset_util.bytes_feature(image_format),
        'image/object/bbox/xmin': dataset_util.float_list_feature(xmins),
        'image/object/bbox/xmax': dataset_util.float_list_feature(xmaxs),
        'image/object/bbox/ymin': dataset_util.float_list_feature(ymins),
        'image/object/bbox/ymax': dataset_util.float_list_feature(ymaxs),
        'image/object/class/text': dataset_util.bytes_list_feature(classes_text),
        'image/object/class/label': dataset_util.int64_list_feature(classes),
    }))
    return tf_example


def csv_to_tfrecord(csv_path, image_dir, output_path):
    """Convert a CSV annotation file into a TFRecord file.

    The CSV rows are grouped by ``filename`` so that each image is written
    once, together with all of its boxes. Writing one record per row would
    store an image with several objects as several single-box copies.

    Args:
        csv_path: Annotation CSV (see ``create_tf_example`` for the columns).
        image_dir: Directory containing the annotated images.
        output_path: TFRecord file to create.
    """
    df = pd.read_csv(csv_path)

    print(f"Converting {len(df)} annotations from {os.path.basename(csv_path)}...")

    # The context manager closes the writer even if an image fails to convert.
    with tf.io.TFRecordWriter(output_path) as writer:
        # sort=False keeps the images in their CSV order.
        for _, image_rows in df.groupby('filename', sort=False):
            tf_example = create_tf_example(image_rows, image_dir)
            writer.write(tf_example.SerializeToString())

    print(f"✓ Created {output_path}")

# Create TFRecords for train, test, and validation sets
print("\nCreating TFRecord files...")

# Annotation CSV of each split
TRAIN_CSV = os.path.join(TRAIN_DIR, '_annotations.csv')
TEST_CSV = os.path.join(TEST_DIR, '_annotations.csv')
VALID_CSV = os.path.join(VALID_DIR, '_annotations.csv')

# Output TFRecord of each split, written next to the split folders
TRAIN_TFRECORD = os.path.join(DATASET_DIR, 'train.tfrecord')
TEST_TFRECORD = os.path.join(DATASET_DIR, 'test.tfrecord')
VALID_TFRECORD = os.path.join(DATASET_DIR, 'valid.tfrecord')

csv_to_tfrecord(TRAIN_CSV, TRAIN_DIR, TRAIN_TFRECORD)
csv_to_tfrecord(TEST_CSV, TEST_DIR, TEST_TFRECORD)
csv_to_tfrecord(VALID_CSV, VALID_DIR, VALID_TFRECORD)

print("\n✓ All TFRecord files created successfully")

## Download Pre-trained Model

Download MobileNetV2 SSD FPNLite pre-trained on COCO dataset.

In [None]:
# Download pre-trained model
import os

MODEL_DIR = '/workspace/models_checkpoints'
MODEL_NAME = 'ssd_mobilenet_v2_fpnlite_320x320_coco17_tpu-8'
MODEL_URL = f'http://download.tensorflow.org/models/object_detection/tf2/20200711/{MODEL_NAME}.tar.gz'

os.makedirs(MODEL_DIR, exist_ok=True)

# Download and extract
print(f"Downloading {MODEL_NAME}...")
!wget -q {MODEL_URL} -O /tmp/{MODEL_NAME}.tar.gz
!tar -xf /tmp/{MODEL_NAME}.tar.gz -C {MODEL_DIR}
!rm /tmp/{MODEL_NAME}.tar.gz

print(f"âœ“ Model downloaded to {MODEL_DIR}/{MODEL_NAME}")

## Configure Training Pipeline

Update the pipeline configuration with dataset paths and GPU-optimized settings.

In [None]:
# Configure pipeline for GPU training
import re

PIPELINE_CONFIG_PATH = os.path.join(MODEL_DIR, MODEL_NAME, 'pipeline.config')

# Read the pipeline config
with open(PIPELINE_CONFIG_PATH, 'r') as f:
    config = f.read()

# Pre-trained weights to fine-tune from
checkpoint_path = os.path.join(MODEL_DIR, MODEL_NAME, 'checkpoint', 'ckpt-0')


def _set_quoted_field(text, field, values):
    """Rewrite every `field: "..."` entry of `text`, in order of appearance.

    The n-th occurrence receives values[n]; once `values` runs out, its last
    element is reused. A callable replacement is used so that backslashes in
    the inserted value are not interpreted by re.sub.
    """
    pending = list(values)

    def _next_value(_match):
        value = pending.pop(0) if len(pending) > 1 else pending[0]
        return f'{field}: "{value}"'

    return re.sub(rf'{re.escape(field)}: ".*?"', _next_value, text)


# Replace placeholders with actual values
config = re.sub(r'num_classes: \d+', 'num_classes: 1', config)
# NOTE(review): this rewrites every batch_size entry, including the one in the
# eval section if present - confirm that is intended.
config = re.sub(r'batch_size: \d+', 'batch_size: 16', config)  # Increased batch size for GPU
config = _set_quoted_field(config, 'fine_tune_checkpoint', [checkpoint_path])
config = _set_quoted_field(config, 'fine_tune_checkpoint_type', ['detection'])

# Train and eval readers share the label map but need different records.
# Two successive count=1 substitutions would both hit the FIRST occurrence,
# leaving the training reader on the validation file and the eval reader
# unconfigured; assigning by order of appearance avoids that.
# (Assumes the train reader precedes the eval reader in the file.)
config = _set_quoted_field(config, 'label_map_path', [LABEL_MAP_PATH])
config = _set_quoted_field(config, 'input_path', [TRAIN_TFRECORD, VALID_TFRECORD])

# Fail early rather than start a training run with an unconfigured path
if 'PATH_TO_BE_CONFIGURED' in config:
    raise ValueError(
        f"{PIPELINE_CONFIG_PATH} still contains PATH_TO_BE_CONFIGURED placeholders"
    )

# Switch the bfloat16 flag on when the config carries it.
# NOTE(review): bfloat16 is usually a TPU setting - confirm it has the intended
# effect for GPU training.
if 'use_bfloat16: false' in config:
    config = config.replace('use_bfloat16: false', 'use_bfloat16: true')

# Write back the modified config
with open(PIPELINE_CONFIG_PATH, 'w') as f:
    f.write(config)

print("✓ Pipeline configuration updated for GPU training")
print(f"  - Batch size: 16 (optimized for GPU)")
print(f"  - Mixed precision: Enabled")
print(f"  - Fine-tune checkpoint: {checkpoint_path}")

## Train the Model 🚀

Train with GPU acceleration and XLA optimization. The training will automatically use:
- GPU acceleration
- Mixed precision (float16/float32)
- XLA compilation for optimized operations
- Increased batch size for better GPU utilization

**Note:** Training will save checkpoints to `/workspace/models_checkpoints/trained_model` every few minutes. You can resume training by running this cell again.

In [None]:
# Train the model with GPU acceleration
import os

# Output directory for trained model
TRAINED_MODEL_DIR = os.path.join(MODEL_DIR, 'trained_model')
os.makedirs(TRAINED_MODEL_DIR, exist_ok=True)

# Training parameters
NUM_TRAIN_STEPS = 15000  # Adjust based on your needs
CHECKPOINT_EVERY_N = 500  # Save checkpoint every N steps

print("=" * 60)
print("STARTING GPU TRAINING")
print("=" * 60)
print(f"Pipeline config: {PIPELINE_CONFIG_PATH}")
print(f"Model output dir: {TRAINED_MODEL_DIR}")
print(f"Training steps: {NUM_TRAIN_STEPS}")
print(f"GPU: {gpus[0] if gpus else 'CPU (No GPU detected)'}")
print("=" * 60)
print()

# Change to research directory
os.chdir('/workspace/models/research')

# Run training with GPU
# The model_main_tf2.py script will automatically use GPU if available
!python object_detection/model_main_tf2.py \
    --pipeline_config_path={PIPELINE_CONFIG_PATH} \
    --model_dir={TRAINED_MODEL_DIR} \
    --alsologtostderr \
    --num_train_steps={NUM_TRAIN_STEPS} \
    --sample_1_of_n_eval_examples=1 \
    --checkpoint_every_n={CHECKPOINT_EVERY_N}

## Monitor Training with TensorBoard

Launch TensorBoard to monitor training progress in real-time.

In [None]:
# Launch TensorBoard
# Load the notebook extension, then start TensorBoard on port 6006 reading
# from the directory the training cell passes as --model_dir.
%load_ext tensorboard
%tensorboard --logdir {TRAINED_MODEL_DIR} --port 6006

## Export Trained Model

Export the trained model to SavedModel format and convert to TFLite for deployment.

In [None]:
# Export to TFLite format
import os

OUTPUT_DIR = '/workspace/exported_model'
os.makedirs(OUTPUT_DIR, exist_ok=True)

TFLITE_MODEL_PATH = os.path.join(OUTPUT_DIR, 'model.tflite')

print("Exporting model for TFLite...")
os.chdir('/workspace/models/research')

# Export TFLite graph
!python object_detection/export_tflite_graph_tf2.py \
    --trained_checkpoint_dir {TRAINED_MODEL_DIR} \
    --output_directory {OUTPUT_DIR} \
    --pipeline_config_path {PIPELINE_CONFIG_PATH}

print("âœ“ TFLite graph exported")

In [None]:
# Convert the exported SavedModel to a float TFLite model
import tensorflow as tf

saved_model_dir = os.path.join(OUTPUT_DIR, 'saved_model')

print("Converting SavedModel to TFLite...")
print(f"SavedModel directory: {saved_model_dir}")

converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
tflite_model = converter.convert()

# convert() returns the serialized model as bytes
with open(TFLITE_MODEL_PATH, 'wb') as f:
    f.write(tflite_model)

# Get model size
model_size_mb = os.path.getsize(TFLITE_MODEL_PATH) / (1024 * 1024)
print(f"✓ TFLite model saved to: {TFLITE_MODEL_PATH}")
print(f"  Model size: {model_size_mb:.2f} MB")

In [None]:
# Verify TFLite model: load it and print its input/output tensor descriptions
import tensorflow as tf

interpreter = tf.lite.Interpreter(model_path=TFLITE_MODEL_PATH)
interpreter.allocate_tensors()

print("✓ TFLite model loaded successfully")
print("\nInput details:")
for detail in interpreter.get_input_details():
    print(f"  {detail}")

print("\nOutput details:")
for detail in interpreter.get_output_details():
    print(f"  {detail}")

## Create Quantized Model (INT8)

Create an optimized INT8 quantized model for faster inference on edge devices.

In [None]:
# Create quantized INT8 model
import glob
import random
import tensorflow as tf
import cv2
import numpy as np

# Destination of the quantized model
QUANT_MODEL_PATH = os.path.join(OUTPUT_DIR, 'model_quant.tflite')

# Training images (.jpg first, then .png) used as the representative dataset
train_images = [
    image_file
    for pattern in ('*.jpg', '*.png')
    for image_file in glob.glob(os.path.join(TRAIN_DIR, pattern))
]

print(f"Using {len(train_images)} training images for quantization")

# Read the input size from the float model: shape[1] is taken as the height
# and shape[2] as the width
interpreter = tf.lite.Interpreter(TFLITE_MODEL_PATH)
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()
input_shape = input_details[0]['shape']
height, width = input_shape[1], input_shape[2]

def representative_data_gen():
    """Yield calibration samples for post-training quantization.

    Up to 300 distinct training images are decoded, resized to the model's
    input size and scaled to [-1, 1] - the same normalisation the inference
    cells of this notebook apply to float inputs - so calibration sees the
    value range the model is later run on.

    Yields:
        A single-element list holding one float32 image tensor with a leading
        batch dimension.
    """
    num_samples = 300
    sample_count = min(num_samples, len(train_images))
    # Sample without replacement so every calibration image is distinct.
    for image_path in random.sample(train_images, sample_count):
        raw = tf.io.read_file(image_path)
        # decode_image detects the format from the file content, so files
        # with an unexpected extension are decoded as well;
        # expand_animations=False guarantees a 3-D image tensor.
        image = tf.io.decode_image(raw, channels=3, expand_animations=False)
        # tf.image.resize expects the size as [new_height, new_width].
        image = tf.image.resize(image, [height, width])
        image = (tf.cast(image, tf.float32) - 127.5) / 127.5
        image = tf.expand_dims(image, 0)
        yield [image]

print("Creating quantized model...")

# Initialize converter
converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)

# Enable integer quantization calibrated with the representative dataset
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.representative_dataset = representative_data_gen
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.uint8
# Outputs stay float32: the benchmark and visualization cells compare scores
# with 0.5 and scale box coordinates by the image size, which only makes
# sense for real-valued outputs.
converter.inference_output_type = tf.float32

# Convert
tflite_quant_model = converter.convert()

# Save
with open(QUANT_MODEL_PATH, 'wb') as f:
    f.write(tflite_quant_model)

# Compare sizes
float_size = os.path.getsize(TFLITE_MODEL_PATH) / (1024 * 1024)
quant_size = os.path.getsize(QUANT_MODEL_PATH) / (1024 * 1024)
reduction = (1 - quant_size/float_size) * 100

print(f"✓ Quantized model saved to: {QUANT_MODEL_PATH}")
print(f"\nModel Comparison:")
print(f"  Float32 model: {float_size:.2f} MB")
print(f"  INT8 model: {quant_size:.2f} MB")
print(f"  Size reduction: {reduction:.1f}%")

## Test Inference Performance

Compare inference speed and detection confidence between the float32 and INT8 quantized models.

In [None]:
# Benchmark models on GPU vs CPU
import time
import numpy as np
import glob
import random

# Get test images: glob returns files in arbitrary order, so sort before
# slicing to benchmark the same 20 images on every run
test_images = sorted(glob.glob(os.path.join(TEST_DIR, '*.jpg')))[:20]

def benchmark_model(model_path, images, device='/GPU:0'):
    """Time TFLite inference over ``images`` and summarise the scores.

    Args:
        model_path: Path of the ``.tflite`` model to load.
        images: Image file paths; files OpenCV cannot read are skipped.
        device: Not used (the TFLite interpreter is not placed on a device
            here); kept so existing calls remain valid.

    Returns:
        A tuple ``(avg_time_ms, avg_confidence, num_detections)``:
        mean time of set_tensor + invoke + get_tensor in milliseconds (NaN if
        no image was processed), mean of the scores above 0.5 (0 if there are
        none) and the number of such scores.
    """
    import cv2

    interpreter = tf.lite.Interpreter(model_path)
    interpreter.allocate_tensors()

    input_detail = interpreter.get_input_details()[0]
    # As in the rest of this notebook, output 0 is taken to hold the scores.
    score_detail = interpreter.get_output_details()[0]

    height = input_detail['shape'][1]
    width = input_detail['shape'][2]
    float_input = (input_detail['dtype'] == np.float32)

    # (scale, zero_point); the scale is 0 for outputs that are not quantized.
    score_scale, score_zero_point = score_detail['quantization']
    quantized_scores = score_detail['dtype'] != np.float32 and bool(score_scale)

    times = []
    confidences = []

    for img_path in images:
        # Load and preprocess
        image = cv2.imread(img_path)
        if image is None:
            # cv2.imread returns None instead of raising on unreadable files.
            print(f"Skipping unreadable image: {img_path}")
            continue
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image_resized = cv2.resize(image_rgb, (width, height))
        input_data = np.expand_dims(image_resized, axis=0)

        if float_input:
            input_data = (np.float32(input_data) - 127.5) / 127.5
        else:
            input_data = np.uint8(input_data)

        # Time inference
        start = time.perf_counter()
        interpreter.set_tensor(input_detail['index'], input_data)
        interpreter.invoke()
        scores = interpreter.get_tensor(score_detail['index'])[0]
        end = time.perf_counter()

        if quantized_scores:
            # Map integer outputs back to real values before thresholding.
            scores = (scores.astype(np.float32) - score_zero_point) * score_scale

        times.append(end - start)
        confidences.extend(s for s in scores if s > 0.5)

    # Explicit NaN avoids numpy's empty-mean warning when nothing was timed.
    avg_time = np.mean(times) * 1000 if times else float('nan')  # ms
    avg_conf = np.mean(confidences) if confidences else 0

    return avg_time, avg_conf, len(confidences)

# Benchmark both models on the same images and print a side-by-side report
rule = "=" * 60

print(rule)
print("INFERENCE BENCHMARK")
print(rule)

# Float32 model
print("\nBenchmarking Float32 model...")
float_time, float_conf, float_dets = benchmark_model(TFLITE_MODEL_PATH, test_images)

# INT8 model
print("Benchmarking INT8 quantized model...")
quant_time, quant_conf, quant_dets = benchmark_model(QUANT_MODEL_PATH, test_images)

# Report
print("\n" + rule)
print("RESULTS")
print(rule)

report_rows = (
    ("Float32 Model", float_time, float_conf, float_dets),
    ("INT8 Quantized Model", quant_time, quant_conf, quant_dets),
)
for model_title, avg_ms, avg_score, detection_count in report_rows:
    print(f"\n{model_title}:")
    print(f"  Average inference time: {avg_ms:.2f} ms")
    print(f"  Average confidence: {avg_score:.4f}")
    print(f"  Total detections: {detection_count}")

# Ratio of the two average inference times
speedup = float_time/quant_time
print(f"\nSpeedup: {speedup:.2f}x faster")
print(rule)

## Visualize Detection Results

Test the model on sample images and visualize the results.

In [None]:
# Visualize detections
import cv2
import numpy as np
import glob
import random
import matplotlib.pyplot as plt
from tensorflow.lite.python.interpreter import Interpreter

def detect_and_visualize(model_path, image_paths, num_images=5, threshold=0.5):
    """Run a TFLite detector on random images and plot the detections.

    Args:
        model_path: Path of the ``.tflite`` model to load.
        image_paths: Candidate image files; ``num_images`` of them (or all, if
            fewer) are picked at random. Unreadable files are skipped.
        num_images: Maximum number of images to display.
        threshold: Detections are drawn when ``threshold < score <= 1.0``.

    Each selected image is shown in its own matplotlib figure with a box and
    a "<class>: <score>%" caption per detection.
    """
    # Load model
    interpreter = Interpreter(model_path)
    interpreter.allocate_tensors()

    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()

    height = input_details[0]['shape'][1]
    width = input_details[0]['shape'][2]
    float_input = (input_details[0]['dtype'] == np.float32)

    # NOTE(review): the TFLite detection post-processing is understood to
    # report zero-based class indices with no background slot, so index 0 is
    # the first real class - confirm against a known detection.
    labels = ['HVAC_Spec_Label']

    def _read_output(position):
        """Return output tensor `position` for the first batch item as real values."""
        detail = output_details[position]
        values = interpreter.get_tensor(detail['index'])[0]
        # (scale, zero_point); the scale is 0 for outputs that are not quantized.
        scale, zero_point = detail['quantization']
        if detail['dtype'] != np.float32 and scale:
            values = (values.astype(np.float32) - zero_point) * scale
        return values

    # Select random images
    selected_images = random.sample(image_paths, min(num_images, len(image_paths)))

    for img_path in selected_images:
        # Load image
        image = cv2.imread(img_path)
        if image is None:
            # cv2.imread returns None instead of raising on unreadable files.
            print(f"Skipping unreadable image: {img_path}")
            continue
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        imH, imW, _ = image.shape

        # Preprocess
        image_resized = cv2.resize(image_rgb, (width, height))
        input_data = np.expand_dims(image_resized, axis=0)

        if float_input:
            input_data = (np.float32(input_data) - 127.5) / 127.5
        else:
            input_data = np.uint8(input_data)

        # Inference
        interpreter.set_tensor(input_details[0]['index'], input_data)
        interpreter.invoke()

        # Output order used throughout this notebook: 0 scores, 1 boxes, 3 classes
        boxes = _read_output(1)
        classes = _read_output(3)
        scores = _read_output(0)

        # Draw detections
        for box, class_index, score in zip(boxes, classes, scores):
            if not (threshold < score <= 1.0):
                continue

            # Boxes are (ymin, xmin, ymax, xmax) fractions of the image size
            ymin = int(max(1, box[0] * imH))
            xmin = int(max(1, box[1] * imW))
            ymax = int(min(imH, box[2] * imH))
            xmax = int(min(imW, box[3] * imW))

            # Draw box
            cv2.rectangle(image, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)

            # Draw label; unknown indices get a generic name instead of an IndexError
            class_index = int(class_index)
            if 0 <= class_index < len(labels):
                class_name = labels[class_index]
            else:
                class_name = f'class {class_index}'
            label = f'{class_name}: {int(score*100)}%'
            # Keep the caption inside the image for boxes touching the top edge
            label_y = max(ymin - 10, 20)
            cv2.putText(image, label, (xmin, label_y),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

        # Display (OpenCV draws on the BGR image; matplotlib expects RGB)
        plt.figure(figsize=(12, 8))
        plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        plt.title(f'Detection: {os.path.basename(img_path)}')
        plt.axis('off')
        plt.show()

# Show detections of both exported models on the test split
test_images = glob.glob(os.path.join(TEST_DIR, '*.jpg'))

for heading, model_file in (
    ("Visualizing Float32 model detections:", TFLITE_MODEL_PATH),
    ("\nVisualizing INT8 quantized model detections:", QUANT_MODEL_PATH),
):
    print(heading)
    detect_and_visualize(model_file, test_images, num_images=5, threshold=0.5)

## Training Summary

**Key Features:**
- ✓ GPU acceleration with CUDA
- ✓ Mixed precision training (float16/float32)
- ✓ XLA compilation for optimized operations
- ✓ Increased batch size for better GPU utilization
- ✓ TFLite export for deployment
- ✓ INT8 quantization for edge devices

**Model Files:**
- Float32 TFLite: `/workspace/exported_model/model.tflite`
- INT8 TFLite: `/workspace/exported_model/model_quant.tflite`
- Checkpoints: `/workspace/models_checkpoints/trained_model/`

**Next Steps:**
1. Deploy the TFLite model to your target device
2. Fine-tune hyperparameters if needed
3. Collect more training data to improve accuracy
4. Experiment with different model architectures