# Test the Full Pipeline: Detection and Classification

This notebook runs the complete vehicle processing pipeline:
1. **Detection**: It uses the fine-tuned YOLO model to detect vehicles in the `traffic_test.mp4` video.
2. **Classification**: For each detected vehicle, it uses the fine-tuned `EfficientNetB4_CBAM` classifier to identify the car's make and model.

The final output is a video with bounding boxes and class labels drawn on each frame.

### Clone Model Code Repository

In [2]:
# Fetch the project files from the feat/pretrained-model branch into a scratch
# directory named `code`.
!git clone -b feat/pretrained-model https://github.com/luthfiarifin/hybrid-envit-car-retrieval.git code
# Move the checkout's contents into the working directory so that later cells can
# import `models...` and open `traffic_test.mp4` via relative paths.
# Note: the `code/*` glob does not match dotfiles, so hidden entries such as `.git`
# stay behind inside `code/`.
!mv code/* .
# List the working directory to confirm the project files are in place.
!ls

Cloning into 'code'...
remote: Enumerating objects: 266, done.[K
remote: Counting objects: 100% (1/1), done.[K
remote: Total 266 (delta 0), reused 0 (delta 0), pack-reused 265 (from 2)[K
Receiving objects: 100% (266/266), 500.85 MiB | 47.01 MiB/s, done.
Resolving deltas: 100% (99/99), done.
Updating files: 100% (70/70), done.
1_run_scraper_into_dataset.ipynb	code		 requirements.txt
2_finetune_the_detection_model.ipynb	data_processing  traffic_test.mp4
3_train_the_classification_model.ipynb	logs
4_final_traffic_cam_test.ipynb		models


In [4]:
import cv2
import torch
from ultralytics import YOLO
from PIL import Image
from torchvision import transforms
from IPython.display import Video, display
import os

# Import the model definition from your project files
from models.classification.model import EfficientNetB4_CBAM

# --- Configuration ---
# All paths are relative to the notebook's working directory.
DETECTION_MODEL_PATH = "models/detection/yolo_finetune/vehicle_detection/weights/best.pt"
CLASSIFICATION_MODEL_PATH = "models/classification/results/car_classifier_model_20250624_014531_best_acc.pth"
VIDEO_PATH = "traffic_test.mp4"
OUTPUT_VIDEO_PATH = "traffic_test_classified.mp4"

# Class labels, one per line. This must be the same list (in the same order) that was
# used to train the classifier, because predictions are mapped to names by index.
CLASS_NAME_PATH = "models/classification/class_names.txt"
with open(CLASS_NAME_PATH, "r", encoding="utf-8") as f:
    # Skip blank lines (e.g. a stray empty line at the end of the file): each one would
    # otherwise become an empty class name and inflate NUM_CLASSES, making the
    # classifier head a different size from the one stored in the checkpoint.
    CLASS_NAMES = [line.strip() for line in f if line.strip()]
if not CLASS_NAMES:
    raise ValueError(f"No class names found in {CLASS_NAME_PATH}")
NUM_CLASSES = len(CLASS_NAMES)

### Load the Models

In [5]:
# Load the YOLO detection model from the configured weights file.
print(f"Loading detection model from: {DETECTION_MODEL_PATH}")
detection_model = YOLO(DETECTION_MODEL_PATH)

# Load the EfficientNetB4_CBAM classification model.
print(f"Loading classification model from: {CLASSIFICATION_MODEL_PATH}")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
classification_model = EfficientNetB4_CBAM(num_classes=NUM_CLASSES)

# map_location=device remaps every stored tensor onto the device we will actually run
# on. This covers both the CPU-only case and a checkpoint that was saved from a GPU
# index that does not exist on this machine, so no CUDA/CPU branching is needed.
# NOTE(security): torch.load unpickles the file; only load checkpoints you trust
# (consider weights_only=True if the installed torch version supports it).
state_dict = torch.load(CLASSIFICATION_MODEL_PATH, map_location=device)
classification_model.load_state_dict(state_dict)
classification_model.to(device)
classification_model.eval()  # Inference mode: disables dropout / batch-norm updates

print("Models loaded successfully.")

Loading detection model from: models/detection/yolo_finetune/vehicle_detection/weights/best.pt
Loading classification model from: models/classification/results/car_classifier_model_20250624_014531_best_acc.pth
Models loaded successfully.


### Define Image Transformations

In [6]:
# Preprocessing applied to every vehicle crop before it is fed to the classifier.
# It is intended to mirror the pipeline used when the classification model was trained:
# resize to 224x224, convert to a tensor, then normalise each channel with the
# commonly used ImageNet mean / standard deviation.
transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            [0.485, 0.456, 0.406],
            [0.229, 0.224, 0.225],
        ),
    ]
)

### Process the Video

In [7]:
from tqdm import tqdm
import torch.nn.functional as F  # For softmax


def classify_vehicle(crop_bgr):
    """Classify a single vehicle crop.

    Args:
        crop_bgr: Non-empty image array in OpenCV's BGR channel order.

    Returns:
        A ``(class_name, confidence)`` tuple: the top-1 entry of ``CLASS_NAMES`` and
        its softmax probability as a Python float.
    """
    pil_img = Image.fromarray(cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2RGB))
    input_tensor = transform(pil_img).unsqueeze(0).to(device)
    with torch.no_grad():
        outputs = classification_model(input_tensor)
        probabilities = F.softmax(outputs[0], dim=0)
        top1_prob, predicted_idx = torch.topk(probabilities, 1)
    return CLASS_NAMES[predicted_idx.item()], top1_prob.item()


cap = cv2.VideoCapture(VIDEO_PATH)
out = None
try:
    if not cap.isOpened():
        raise OSError(f"Could not open input video: {VIDEO_PATH}")

    # Keep the frame rate as a float: truncating e.g. 29.97 to 29 makes the output
    # play at a different speed from the source.
    fps = cap.get(cv2.CAP_PROP_FPS)
    if not fps > 0:
        # Some containers report 0 (or NaN); fall back to an arbitrary but sane rate
        # so the writer can still be created.
        fps = 30.0
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(OUTPUT_VIDEO_PATH, fourcc, fps, (width, height))
    if not out.isOpened():
        raise OSError(f"Could not open output video for writing: {OUTPUT_VIDEO_PATH}")

    # The reported frame count can be missing or wrong for some files; let tqdm run
    # without a total in that case.
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    print(f"Processing video: {VIDEO_PATH}...")

    with tqdm(total=total_frames if total_frames > 0 else None, desc="Processing frames") as pbar:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            frame_h, frame_w = frame.shape[:2]
            # Draw on a copy so that crops are always taken from clean pixels; otherwise
            # boxes/labels drawn for earlier detections leak into later, overlapping crops.
            annotated = frame.copy()

            # 1. Run detection
            detection_results = detection_model(frame, verbose=False)

            # 2. Process each detection
            for result in detection_results:
                for box in result.boxes:
                    # Bounding box corners, clamped to the frame. Negative indices
                    # would otherwise wrap around when slicing.
                    x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
                    x1, x2 = max(0, x1), min(frame_w, x2)
                    y1, y2 = max(0, y1), min(frame_h, y2)
                    if x2 <= x1 or y2 <= y1:
                        # Zero-area box after truncation/clamping: nothing to classify.
                        continue

                    # 3. Classify the cropped vehicle
                    class_name, confidence_score = classify_vehicle(frame[y1:y2, x1:x2])

                    # 4. Draw bounding box and class label
                    label = f'Car: {class_name} ({confidence_score:.2f})'
                    cv2.rectangle(annotated, (x1, y1), (x2, y2), (0, 255, 0), 2)
                    # Keep the text baseline inside the frame for boxes touching the top edge.
                    label_y = max(y1 - 10, 20)
                    cv2.putText(annotated, label, (x1, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

            # Write the annotated frame to the output video
            out.write(annotated)
            pbar.update(1)
finally:
    # Always release the capture and finalise the output file, even if a frame fails.
    # No HighGUI windows are opened in this notebook, so there is nothing to destroy.
    cap.release()
    if out is not None:
        out.release()

print(f"Finished processing. Classified video saved to: {OUTPUT_VIDEO_PATH}")

Processing video: traffic_test.mp4...


Processing frames: 100%|██████████| 5920/5920 [18:04<00:00,  5.46it/s]

Finished processing. Classified video saved to: traffic_test_classified.mp4



