# **FLOATING OBJECT DETECTION**

**About the dataset**


1. Dataset size?
2. Size of images?
3. How many categories?
4. Exist annotation file with no data
5. Six categories: human, wind/sup-board, boat, bouy, sailboat, kayak


**[Download dataset](https://www.kaggle.com/datasets/jangsienicajzkowy/afo-aerial-dataset-of-floating-objects/data)**

In [1]:
import shutil
from pathlib import Path

**Data path**

In [2]:
# Image path of PART 1,2,3
img_path_1 = 'dataset/PART_1/PART_1/images/'
img_path_2 = 'dataset/PART_2/PART_2/images/'
img_path_3 = 'dataset/PART_3/PART_3/images/'

# Categories path
# Categories: human, wind/sup-board, boat, bouy, sailboat, kayak
categories_path = 'dataset/PART_1/PART_1/6categories/'

**Split Data into Train, Test & Validation**

In [3]:
# Split into three parts: the training (67,4% of objects), the test (19,12% of objects),
# and the validation set (13,48% of objects). In order to prevent overfitting of the model to the given data,
# the test set contains selected frames from nine videos that were not used in either the training or validation sets.

# Split image to : dataset/working/images
# Split annotation to: dataset/working/labels

def split_data(file_list, img_path, ann_path, mode):
    #Check if we have our mode folders
    images_working_folder = Path( 'dataset/working/images/'+  mode)
    if not images_working_folder.exists():
        print(f"Path {images_working_folder} does not exit")
        os.makedirs(images_working_folder)

    labels_working_folder = Path('dataset/working/labels/' + mode)
    if not labels_working_folder.exists():
        print(f"Path {labels_working_folder} does not exit")
        os.makedirs(labels_working_folder)

    #Creates the name of our label file from the img name and creates our source file
    for file in file_list:
        name = file.replace('.jpg', '')
        img_src_file = str(img_path) + '/' + name + '.jpg'
        annot_src_file = str(ann_path) + '/' + name + '.txt'
        
        if Path(img_src_file).exists() and Path(annot_src_file).exists():
            #move image
            IMG_DIR = 'dataset/working/images/' + mode
            img_dest_file = str(IMG_DIR) + '/' + name + '.jpg'
            if os.path.isfile(img_src_file) and not Path(img_dest_file).exists():
                shutil.move(img_src_file, img_dest_file)
    
            # Copy annotations
            ANNOT_DIR = 'dataset/working/labels/' + mode
            annot_dest_file = str(ANNOT_DIR) + '/' + name + '.txt'
            if os.path.isfile(annot_src_file) and not Path(annot_dest_file).exists():
                shutil.move(annot_src_file, annot_dest_file)

In [4]:
#Get our images list
train_imgs = 'dataset/PART_1/PART_1/train.txt'
test_imgs = 'dataset/PART_1/PART_1/test.txt'
val_imgs = 'dataset/PART_1/PART_1/validation.txt'
with open(train_imgs, 'r') as f:
    train_img_list = [line.strip() for line in f.readlines()]

with open(test_imgs, 'r') as f:
    test_img_list = [line.strip() for line in f.readlines()]

with open(val_imgs, 'r') as f:
    val_img_list = [line.strip() for line in f.readlines()]

print(train_img_list[0], test_img_list[0], val_img_list[0])

a_102.jpg k2_38.jpg a_101.jpg


In [5]:
# Root path
root_img_path = Path('dataset/images/')
root_ann_path = Path('dataset/annotations/')

#Split Data
split_data(train_img_list, root_img_path, root_ann_path, 'train')
split_data(test_img_list, root_img_path, root_ann_path, 'test')
split_data(val_img_list, root_img_path, root_ann_path, 'val')

In [6]:
import glob
import os
working_image_path = 'dataset/working/images/'
working_labels_path = 'dataset/working/labels/'

# Images
img_test_path = glob.glob(os.path.join(working_image_path + '/test/' , "*.jpg"))
print(f'img_test_path: {len(img_test_path)}')

img_train_path = glob.glob(os.path.join(working_image_path + '/train/' , "*.jpg"))
print(f'img_train_path: {len(img_train_path)}')

img_val_path = glob.glob(os.path.join(working_image_path + '/val/' , "*.jpg"))
print(f'img_val_path: {len(img_val_path)}')

# Labels
label_test_path = glob.glob(os.path.join(working_labels_path + '/test/' , "*.txt"))
print(f'label_test_path: {len(label_test_path)}')

label_train_path = glob.glob(os.path.join(working_labels_path + '/train/' , "*.txt"))
print(f'label_train_path: {len(label_train_path)}')

label_val_path = glob.glob(os.path.join(working_image_path + '/val/' , "*.txt"))
print(f'label_val_path: {len(label_val_path)}')

img_test_path: 514
img_train_path: 2787
img_val_path: 339
label_test_path: 514
label_train_path: 2787
label_val_path: 0


### **Train model**

In [7]:
import os
import cv2
import numpy as np
import pickle
from tqdm import tqdm
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from skimage.feature import hog
from skimage import color
from joblib import Parallel, delayed


class ImprovedObjectDetector:
    """
    Enhanced object detector for floating objects using HOG features, 
    color histograms, PCA, and SVM with multi-scale detection.
    """
    
    def __init__(self, class_names, pca_components=100, n_jobs=-1):
        self.class_names = class_names
        self.num_classes = len(class_names)
        self.pca_components = pca_components
        self.n_jobs = n_jobs
        
        # Define colors for visualization
        self.colors = [
            (255, 0, 0),    # Red for human
            (0, 255, 0),    # Green for wind/sup-board  
            (0, 0, 255),    # Blue for boat
            (255, 255, 0),  # Yellow for buoy
            (255, 0, 255),  # Magenta for sailboat
            (0, 255, 255)   # Cyan for kayak
        ]
        
        # Create a pipeline with hyperparameter search
        self.base_classifier = Pipeline([
            ('scaler', StandardScaler()),
            ('pca', PCA(n_components=pca_components, random_state=42)),
            ('svm', SVC(probability=True, random_state=42))
        ])
    
    def extract_features(self, image, feature_types=None):
        """
        Extract multiple types of features from an image
        
        Parameters:
        -----------
        image : numpy.ndarray
            Input image
        feature_types : list
            List of feature types to extract (default: ['hog', 'color'])
            
        Returns:
        --------
        numpy.ndarray
            Concatenated feature vector
        """
        if feature_types is None:
            feature_types = ['hog', 'color']
            
        # Resize to a fixed size
        resized = cv2.resize(image, (64, 64))
        features = []
        
        if 'hog' in feature_types:
            # Extract HOG features
            # Convert to grayscale if image is in color
            if len(resized.shape) == 3:
                gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
            else:
                gray = resized
                
            hog_features = hog(gray, 
                              orientations=9,
                              pixels_per_cell=(8, 8),
                              cells_per_block=(2, 2),
                              block_norm='L2-Hys',
                              visualize=False)
            features.append(hog_features)
            
        if 'color' in feature_types and len(resized.shape) == 3:
            # Extract color histogram features from each channel
            hsv = cv2.cvtColor(resized, cv2.COLOR_BGR2HSV)
            
            # Compute histograms for each channel
            h_hist = cv2.calcHist([hsv], [0], None, [32], [0, 180])
            s_hist = cv2.calcHist([hsv], [1], None, [32], [0, 256])
            v_hist = cv2.calcHist([hsv], [2], None, [32], [0, 256])
            
            # Normalize histograms
            h_hist = cv2.normalize(h_hist, h_hist).flatten()
            s_hist = cv2.normalize(s_hist, s_hist).flatten()
            v_hist = cv2.normalize(v_hist, v_hist).flatten()
            
            # Concatenate histograms
            color_features = np.concatenate((h_hist, s_hist, v_hist))
            features.append(color_features)
        
        # Return concatenated features
        return np.concatenate(features) if features else np.array([])
    
    def load_dataset(self, image_dir, annotation_dir, subset='train'):
        """
        Load dataset from images and annotations
        
        Parameters:
        -----------
        image_dir : str
            Directory containing image folders
        annotation_dir : str
            Directory containing annotation folders
        subset : str
            Dataset subset (train, test, val)
            
        Returns:
        --------
        tuple
            Features, labels, and bounding boxes
        """
        images_path = os.path.join(image_dir, subset)
        annotations_path = os.path.join(annotation_dir, subset)
        
        print(f'Loading {subset} data from:')
        print(f'Images path: {images_path}')
        print(f'Annotations path: {annotations_path}')
        
        X = []
        y = []
        bboxes = []
        img_paths = []
        
        print(f"Loading {subset} dataset...")
        image_files = [f for f in os.listdir(images_path) if f.endswith('.jpg')]
        
        def process_image(img_file):
            """Process a single image and its annotations"""
            local_X = []
            local_y = []
            local_bboxes = []
            local_img_paths = []
            
            base_name = os.path.splitext(img_file)[0]
            ann_file = os.path.join(annotations_path, base_name + '.txt')
            
            if not os.path.exists(ann_file):
                print(f"Warning: No annotation file for {img_file}")
                return local_X, local_y, local_bboxes, local_img_paths
            
            # Read image
            img_path = os.path.join(images_path, img_file)
            img = cv2.imread(img_path)
            
            if img is None:
                print(f"Warning: Could not read image {img_path}")
                return local_X, local_y, local_bboxes, local_img_paths
            
            height, width = img.shape[:2]
            
            # Read annotations
            with open(ann_file, 'r') as f:
                lines = f.readlines()
            
            for line in lines:
                parts = line.strip().split()
                if len(parts) >= 5:
                    try:
                        class_id = int(parts[0])
                        
                        # YOLO format is center_x, center_y, width, height (normalized)
                        center_x = float(parts[1])
                        center_y = float(parts[2])
                        bbox_width = float(parts[3])
                        bbox_height = float(parts[4])
                        
                        # Convert to pixel coordinates
                        x1 = int((center_x - bbox_width/2) * width)
                        y1 = int((center_y - bbox_height/2) * height)
                        x2 = int((center_x + bbox_width/2) * width)
                        y2 = int((center_y + bbox_height/2) * height)
                        
                        # Ensure coordinates are within image
                        x1 = max(0, x1)
                        y1 = max(0, y1)
                        x2 = min(width, x2)
                        y2 = min(height, y2)
                        
                        # Only process if bbox has area
                        if x2 > x1 and y2 > y1:
                            # Extract object patch
                            object_img = img[y1:y2, x1:x2]
                            
                            if object_img.size > 0:
                                # Extract features
                                features = self.extract_features(object_img)
                                
                                local_X.append(features)
                                local_y.append(class_id)
                                local_bboxes.append((x1, y1, x2-x1, y2-y1))  # x, y, width, height
                                local_img_paths.append(img_path)
                    except Exception as e:
                        print(f"Error processing annotation: {line.strip()} - {e}")
            
            return local_X, local_y, local_bboxes, local_img_paths
        
        # Process images in parallel
        results = Parallel(n_jobs=self.n_jobs)(
            delayed(process_image)(img_file) for img_file in tqdm(image_files)
        )
        
        # Combine results
        for local_X, local_y, local_bboxes, local_img_paths in results:
            X.extend(local_X)
            y.extend(local_y)
            bboxes.extend(local_bboxes)
            img_paths.extend(local_img_paths)
        
        print(f"Loaded {len(X)} samples from {subset} set")
        
        return np.array(X), np.array(y), bboxes, img_paths

    def train(self, image_dir, annotation_dir, model_save_path="improved_object_detector.pkl", tune_hyperparams=True):
        # Load training data
        X_train, y_train, _, _ = self.load_dataset(image_dir, annotation_dir, 'train')
        
        if X_train.shape[0] == 0:
            raise ValueError("No training samples found!")
        
        print(f"Training with {len(X_train)} samples...")
        
        if tune_hyperparams:
            print("Performing hyperparameter tuning...")
            # Define parameter grid
            param_grid = {
                'pca__n_components': [min(50, X_train.shape[1]), min(100, X_train.shape[1]), min(150, X_train.shape[1])],
                'svm__C': [0.1, 1, 10],
                'svm__gamma': ['scale', 'auto'],
                'svm__kernel': ['rbf', 'linear']
            }
            
            # Create grid search
            self.classifier = GridSearchCV(
                self.base_classifier,
                param_grid,
                cv=3,
                n_jobs=self.n_jobs,
                verbose=1
            )
            
            # Train with grid search
            self.classifier.fit(X_train, y_train)
            print(f"Best parameters: {self.classifier.best_params_}")
            print(f"Best CV score: {self.classifier.best_score_:.4f}")
        else:
            # Train with default parameters
            self.classifier = self.base_classifier
            self.classifier.fit(X_train, y_train)
        
        # Validate on validation set
        X_val, y_val, _, _ = self.load_dataset(image_dir, annotation_dir, 'val')
        
        if X_val.shape[0] > 0:
            accuracy = self.classifier.score(X_val, y_val)
            print(f"Validation accuracy: {accuracy:.4f}")
        else:
            print("No validation samples found!")
        
        # Save the model
        with open(model_save_path, 'wb') as f:
            pickle.dump(self.classifier, f)
        print(f"Model saved to {model_save_path}")
    
    def load_model(self, model_path):
        """
        Load a trained model
        
        Parameters:
        -----------
        model_path : str
            Path to the saved model
        """
        with open(model_path, 'rb') as f:
            self.classifier = pickle.load(f)
        print(f"Model loaded from {model_path}")
    
    def detect(self, image, confidence_threshold=0.5, nms_threshold=0.3, 
               multi_scale=True, scale_factors=None):
        """
        Detect objects in an image using sliding window approach
        
        Parameters:
        -----------
        image : numpy.ndarray
            Input image
        confidence_threshold : float
            Minimum confidence score to keep a detection
        nms_threshold : float
            Non-maximum suppression threshold
        multi_scale : bool
            Whether to use multi-scale detection
        scale_factors : list
            List of scale factors for multi-scale detection
            
        Returns:
        --------
        list
            List of (class_id, confidence, bbox) tuples
        """
        if scale_factors is None:
            scale_factors = [0.5, 0.75, 1.0, 1.25, 1.5]
        
        height, width = image.shape[:2]
        detections = []
        
        # Define window parameters
        base_window_size = (128, 128)
        step_size = 64
        
        # Process at different scales if multi_scale is True
        scales = scale_factors if multi_scale else [1.0]
        
        for scale in scales:
            # Scale the image
            if scale != 1.0:
                scaled_width = int(width * scale)
                scaled_height = int(height * scale)
                scaled_img = cv2.resize(image, (scaled_width, scaled_height))
            else:
                scaled_img = image
                scaled_width = width
                scaled_height = height
            
            # Adjust window and step size for this scale
            window_width = int(base_window_size[0])
            window_height = int(base_window_size[1])
            scaled_step = int(step_size)
            
            # Sliding window detection
            for y in range(0, scaled_height - window_height, scaled_step):
                for x in range(0, scaled_width - window_width, scaled_step):
                    # Extract window
                    window = scaled_img[y:y + window_height, x:x + window_width]
                    
                    # Skip if window is too small
                    if window.shape[0] < 16 or window.shape[1] < 16:
                        continue
                    
                    # Extract features
                    features = self.extract_features(window)
                    if features.size == 0:
                        continue
                    
                    features = features.reshape(1, -1)
                    
                    # Predict class and confidence
                    probabilities = self.classifier.predict_proba(features)[0]
                    class_id = np.argmax(probabilities)
                    confidence = probabilities[class_id]
                    
                    if confidence >= confidence_threshold:
                        # Convert back to original image coordinates
                        orig_x = int(x / scale)
                        orig_y = int(y / scale)
                        orig_width = int(window_width / scale)
                        orig_height = int(window_height / scale)
                        
                        detections.append((class_id, confidence, (orig_x, orig_y, orig_width, orig_height)))
        
        # Apply non-maximum suppression
        detections = self._non_max_suppression(detections, nms_threshold)
        
        return detections
    
    def _non_max_suppression(self, detections, threshold):
        """
        Apply non-maximum suppression to remove overlapping detections
        
        Parameters:
        -----------
        detections : list
            List of (class_id, confidence, bbox) tuples
        threshold : float
            IoU threshold for suppression
            
        Returns:
        --------
        list
            Filtered list of detections
        """
        if not detections:
            return []
        
        # Group detections by class
        class_detections = {}
        for detection in detections:
            class_id = detection[0]
            if class_id not in class_detections:
                class_detections[class_id] = []
            class_detections[class_id].append(detection)
        
        # Apply NMS for each class separately
        final_detections = []
        for class_id, class_dets in class_detections.items():
            # Sort by confidence (highest first)
            class_dets = sorted(class_dets, key=lambda x: x[1], reverse=True)
            
            # Initialize the list of picked detections
            picked = []
            
            # Extract coordinates
            confidences = [d[1] for d in class_dets]
            boxes = [d[2] for d in class_dets]
            
            # Extract coordinates
            x1 = np.array([box[0] for box in boxes])
            y1 = np.array([box[1] for box in boxes])
            x2 = np.array([box[0] + box[2] for box in boxes])
            y2 = np.array([box[1] + box[3] for box in boxes])
            
            # Calculate areas
            area = (x2 - x1 + 1) * (y2 - y1 + 1)
            idxs = np.argsort(confidences)[::-1]
            
            while len(idxs) > 0:
                last = len(idxs) - 1
                i = idxs[0]
                picked.append(i)
                
                # Find the largest (x, y) coordinates for the start of the bounding box
                xx1 = np.maximum(x1[i], x1[idxs[1:]])
                yy1 = np.maximum(y1[i], y1[idxs[1:]])
                
                # Find the smallest (x, y) coordinates for the end of the bounding box
                xx2 = np.minimum(x2[i], x2[idxs[1:]])
                yy2 = np.minimum(y2[i], y2[idxs[1:]])
                
                # Width and height of bounding box
                w = np.maximum(0, xx2 - xx1 + 1)
                h = np.maximum(0, yy2 - yy1 + 1)
                
                # Calculate the overlap ratio (IoU)
                overlap = (w * h) / area[idxs[1:]]
                
                # Delete all indexes from the index list that have overlap greater than the threshold
                idxs = np.delete(idxs, np.concatenate(([0], np.where(overlap > threshold)[0] + 1)))
            
            # Add picked detections to final list
            final_detections.extend([class_dets[i] for i in picked])
        
        return final_detections
    
    def evaluate(self, image_dir, annotation_dir, subset='test', iou_threshold=0.5):
        """
        Evaluate the detector on a dataset
        
        Parameters:
        -----------
        image_dir : str
            Directory containing image folders
        annotation_dir : str
            Directory containing annotation folders
        subset : str
            Dataset subset (train, test, val)
        iou_threshold : float
            IoU threshold for a true positive
            
        Returns:
        --------
        dict
            Evaluation metrics
        """
        # Load test data
        _, _, ground_truth_boxes, img_paths = self.load_dataset(image_dir, annotation_dir, subset)
        
        # Group ground truth boxes by image path
        gt_by_image = {}
        for i, img_path in enumerate(img_paths):
            if img_path not in gt_by_image:
                gt_by_image[img_path] = []
            gt_by_image[img_path].append(ground_truth_boxes[i])
        
        # Evaluate on each image
        total_tp = 0
        total_fp = 0
        total_gt = sum(len(boxes) for boxes in gt_by_image.values())
        
        for img_path, gt_boxes in tqdm(gt_by_image.items(), desc=f"Evaluating on {subset}"):
            # Read image
            img = cv2.imread(img_path)
            if img is None:
                print(f"Warning: Could not read image {img_path}")
                continue
            
            # Detect objects
            detections = self.detect(img, confidence_threshold=0.5)
            
            # Calculate IoU for each detection with each ground truth box
            tp = 0
            fp = 0
            
            # Create a copy of gt_boxes to mark matched boxes
            unmatched_gt = gt_boxes.copy()
            
            for _, _, det_bbox in detections:
                # Convert detection bbox to format [x1, y1, x2, y2]
                det_x1, det_y1, det_w, det_h = det_bbox
                det_x2, det_y2 = det_x1 + det_w, det_y1 + det_h
                
                best_iou = 0
                best_idx = -1
                
                for i, gt_bbox in enumerate(unmatched_gt):
                    # Convert gt bbox to format [x1, y1, x2, y2]
                    gt_x1, gt_y1, gt_w, gt_h = gt_bbox
                    gt_x2, gt_y2 = gt_x1 + gt_w, gt_y1 + gt_h
                    
                    # Calculate IoU
                    inter_x1 = max(det_x1, gt_x1)
                    inter_y1 = max(det_y1, gt_y1)
                    inter_x2 = min(det_x2, gt_x2)
                    inter_y2 = min(det_y2, gt_y2)
                    
                    inter_w = max(0, inter_x2 - inter_x1 + 1)
                    inter_h = max(0, inter_y2 - inter_y1 + 1)
                    
                    inter_area = inter_w * inter_h
                    
                    det_area = det_w * det_h
                    gt_area = gt_w * gt_h
                    
                    iou = inter_area / float(det_area + gt_area - inter_area)
                    
                    if iou > best_iou:
                        best_iou = iou
                        best_idx = i
                
                # Check if detection matches any ground truth box
                if best_iou >= iou_threshold:
                    tp += 1
                    # Remove matched ground truth box
                    unmatched_gt.pop(best_idx)
                else:
                    fp += 1
            
            total_tp += tp
            total_fp += fp
        
        # Calculate precision, recall, F1
        precision = total_tp / max(1, (total_tp + total_fp))
        recall = total_tp / max(1, total_gt)
        f1 = 2 * precision * recall / max(1e-8, precision + recall)
        
        metrics = {
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'true_positives': total_tp,
            'false_positives': total_fp,
            'total_ground_truth': total_gt
        }
        
        print(f"Evaluation results on {subset} set:")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")
        
        return metrics
    
    def visualize_detections(self, image, detections, output_path=None, show_confidence=True):
        """
        Visualize detections on the image
        
        Parameters:
        -----------
        image : numpy.ndarray
            Input image
        detections : list
            List of (class_id, confidence, bbox) tuples
        output_path : str, optional
            Path to save the output image
        show_confidence : bool
            Whether to display confidence scores
            
        Returns:
        --------
        numpy.ndarray
            Image with visualized detections
        """
        output_img = image.copy()
        
        for class_id, confidence, bbox in detections:
            x, y, w, h = bbox
            color = self.colors[class_id % len(self.colors)]
            class_name = self.class_names[class_id]
            
            # Draw bounding box
            cv2.rectangle(output_img, (x, y), (x + w, y + h), color, 2)
            
            # Prepare label text
            if show_confidence:
                label = f"{class_name}: {confidence:.2f}"
            else:
                label = class_name
                
            # Get text size
            text_size, _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
            text_width, text_height = text_size
            
            # Draw label background
            cv2.rectangle(output_img, (x, y - text_height - 5), (x + text_width, y), color, -1)
            
            # Draw label text
            cv2.putText(output_img, label, (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1)
        
        if output_path:
            cv2.imwrite(output_path, output_img)
        
        return output_img


In [8]:
# Demo function to test the detector on a single image
def demo_detection(image_path, model_path, class_names, output_path=None):
    """
    Demonstrate object detection on a single image
    
    Parameters:
    -----------
    image_path : str
        Path to the input image
    model_path : str
        Path to the saved model
    class_names : list
        List of class names
    output_path : str, optional
        Path to save the output image
    """
    # Load image
    image = cv2.imread(image_path)
    if image is None:
        print(f"Error: Could not read image {image_path}")
        return
    
    # Create detector
    detector = ImprovedObjectDetector(class_names=class_names)
    
    # Load model
    detector.load_model(model_path)
    
    # Detect objects
    detections = detector.detect(image, 
                               confidence_threshold=0.5, 
                               nms_threshold=0.3,
                               multi_scale=True)
    
    # Visualize detections
    output_img = detector.visualize_detections(image, detections, output_path)
    
    print(f"Detected {len(detections)} objects")
    for i, (class_id, confidence, bbox) in enumerate(detections):
        print(f"  {i+1}. {class_names[class_id]} with confidence {confidence:.2f} at position {bbox}")
    
    # Display result if running in a GUI environment
    try:
        cv2.imshow("Detections", output_img)
        cv2.waitKey(0)
        cv2.destroyAllWindows()
    except:
        pass
    
    return output_img

In [9]:
# Usage Example
if __name__ == "__main__":
    # Directories
    image_dir = 'dataset/working/images/'
    annotation_dir = 'dataset/working/labels/'
    
    
    class_names = ["human", "wind/sup-board", "boat", "buoy", "sailboat", "kayak"]
    
    # Create detector
    detector = ImprovedObjectDetector(class_names=class_names)
    
    # Train model (uncomment to train)
    detector.train(image_dir, annotation_dir, "improved_object_detector.pkl", tune_hyperparams=True)
    
    # Or load pre-trained model
    detector.load_model("improved_object_detector.pkl")
    
    # Evaluate model
    # metrics = detector.evaluate(image_dir, annotation_dir, subset='test')
    
    # Test on single image
    test_image_path = 'dataset/working/images/val/a_101.jpg'
    demo_detection(test_image_path, "improved_object_detector.pkl", class_names, "output.jpg")

Loading train data from:
Images path: dataset/working/images/train
Annotations path: dataset/working/labels/train
Loading train dataset...


100%|██████████| 2787/2787 [00:11<00:00, 239.17it/s]


Loaded 26960 samples from train set
Training with 26960 samples...
Performing hyperparameter tuning...
Fitting 3 folds for each of 36 candidates, totalling 108 fits


Python(71337) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


KeyboardInterrupt: 