# Weapon Detection with Random Forest

In [1]:
#importing libraries
import kagglehub
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from skimage.feature import hog
import os
import glob
from PIL import Image
import cv2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score,classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# One-time dataset download via kagglehub; uncomment the two lines below to fetch it and show the download path.
# data_path = kagglehub.dataset_download('snehilsanyal/weapon-detection-test')
# data_path

In [3]:
def _read_class_label(label_path, img_path):
    """Return the normalized class id from the first line of a label file.

    Only the first whitespace-separated token of the first line is used; it is
    stripped and lower-cased. Returns None (after printing a warning) when the
    file is empty or its first line contains no tokens.
    """
    with open(label_path, 'r', encoding='utf-8') as f:
        first_line = f.readline()

    if not first_line:
        print(f"Warning: Label file {label_path} for image {img_path} is empty. Skipping.")
        return None

    parts = first_line.strip().split()
    if not parts:
        print(f"Warning: Label file {label_path} (first line) for image {img_path} is empty after stripping or has no parts. Skipping.")
        return None

    return parts[0].strip().lower()


def _stratify_target(labels, min_samples_per_class, warning):
    """Return `labels` if they can be used for a stratified split, else None.

    Stratification is used only when there is more than one class and every
    class has at least `min_samples_per_class` samples. When there are several
    classes but some are too small, `warning` is printed.
    """
    _, class_counts = np.unique(labels, return_counts=True)
    if len(class_counts) <= 1:
        return None
    if np.all(class_counts >= min_samples_per_class):
        return labels
    print(warning)
    return None


def load_and_preprocess_data(image_dir, label_dir, image_size=(128, 128), feature_extraction_method='flatten', expected_classes=9):
    """
    Loads images and labels, preprocesses them, extracts features,
    and splits into training, validation, and test sets.

    Each image in `image_dir` is paired with the `.txt` file of the same base
    name in `label_dir`; the first token of that file's first line is taken as
    the class label. Images are converted to grayscale and resized to
    `image_size` before feature extraction.

    Args:
        image_dir: Directory searched (non-recursively) for image files.
        label_dir: Directory containing one `<image base name>.txt` per image.
        image_size: Size passed to PIL's `resize`.
        feature_extraction_method: 'flatten' (raw grayscale pixels) or 'hog'
            (histogram of oriented gradients).
        expected_classes: Number of classes the caller expects; a warning is
            printed when the encoded class count differs. Pass None to skip
            the check.

    Returns:
        (X_train, X_val, X_test, y_train, y_val, y_test, label_encoder).
        15% of the samples go to the test set and 0.15/0.85 of the remainder
        to the validation set. Seven Nones are returned when no images are
        found, no sample could be processed, or label encoding fails.

    Raises:
        ValueError: If `feature_extraction_method` is not 'flatten' or 'hog'.
    """
    # Fail fast on a bad argument. Raising inside the per-image try/except
    # would be swallowed and reported once per image instead of stopping.
    if feature_extraction_method not in ('flatten', 'hog'):
        raise ValueError("Unsupported feature_extraction_method. Choose 'flatten' or 'hog'.")

    failure = (None,) * 7

    print(f"Loading data from image_dir: {image_dir} and label_dir: {label_dir}")
    #formats
    image_extensions = ('*.jpg', '*.jpeg', '*.png', '*.bmp', '*.gif', '*.tif', '*.tiff')
    image_paths = []
    for ext in image_extensions:
        image_paths.extend(sorted(glob.glob(os.path.join(image_dir, ext))))

    if not image_paths:
        print(f"ERROR: No images found in {image_dir} with extensions {image_extensions}.")
        return failure

    print(f"Found {len(image_paths)} potential image files.")

    raw_labels = []
    features_list = []
    processed_image_count = 0

    for img_path in image_paths:
        base_filename = os.path.splitext(os.path.basename(img_path))[0]
        label_path = os.path.join(label_dir, base_filename + '.txt')

        if not os.path.exists(label_path):
            print(f"Warning: Label file not found for {img_path} (expected at {label_path}). Skipping this image.")
            continue

        try:
            label_normalized = _read_class_label(label_path, img_path)
            if label_normalized is None:
                continue

            # Load and process image
            try:
                img = Image.open(img_path)
            except Exception as e_img_open:
                print(f"Error opening image {img_path}: {e_img_open}. Skipping.")
                continue

            # The context manager closes the underlying file handle as soon as
            # the pixel data has been copied into the numpy array.
            with img:
                img_gray = img.convert('L') # Convert to grayscale
                img_array = np.array(img_gray.resize(image_size))

            # Feature Extraction (method already validated above)
            if feature_extraction_method == 'flatten':
                feature_vector = img_array.flatten()
            else:
                feature_vector = hog(img_array, pixels_per_cell=(8, 8),
                                     cells_per_block=(2, 2), visualize=False,
                                     feature_vector=True)
        except Exception as e:
            print(f"Error processing image {img_path} or its label {label_path}: {e}")
            continue

        # Label and feature are recorded together, only after both succeeded,
        # so the two lists can never get out of step.
        raw_labels.append(label_normalized)
        features_list.append(feature_vector)
        processed_image_count += 1

    if not features_list or not raw_labels:
        print("ERROR: No features extracted or no labels loaded. Cannot proceed.")
        print(f"Images processed that had labels: {processed_image_count}")
        print(f"Labels collected: {len(raw_labels)}")
        return failure

    print(f"\nSuccessfully processed {processed_image_count} images and collected {len(raw_labels)} corresponding labels.")

    unique_raw_labels, counts_raw_labels = np.unique(raw_labels, return_counts=True)
    print(f"\nFound {len(unique_raw_labels)} unique raw labels (after stripping, lowercasing, and taking first element) before encoding:")
    for lbl, count in zip(unique_raw_labels, counts_raw_labels):
        print(f"  Label: '{lbl}' \t Count: {count}")
    if len(unique_raw_labels) > 15:
        print("WARNING: A larger than expected number of unique labels were found. Please inspect the list above carefully!")

    label_encoder = LabelEncoder()
    try:
        numerical_labels = label_encoder.fit_transform(raw_labels)
    except Exception as e_le:
        print(f"Error during LabelEncoder fit_transform: {e_le}")
        print("Problematic raw labels might be in the list above.")
        return failure

    X = np.array(features_list)
    y = np.array(numerical_labels)

    print(f"\nShape of full feature matrix (X): {X.shape}")
    print(f"Shape of full label vector (y): {y.shape}")
    num_encoded_classes = len(label_encoder.classes_)
    print(f"Number of unique classes after encoding (le.classes_): {num_encoded_classes}")
    if num_encoded_classes <= 20:
        print(f"Encoded classes by LabelEncoder: {label_encoder.classes_}")

    if expected_classes is not None and num_encoded_classes != expected_classes:
        print(f"CRITICAL WARNING: LabelEncoder found {num_encoded_classes} classes, but you expect {expected_classes}. Please check your .txt label files for consistency and the 'unique raw labels' printed above.")

    # Stratification logic: a stratified split needs at least 2 samples per class.
    min_samples_per_class_for_stratify = 2

    stratify_option_y = _stratify_target(
        y, min_samples_per_class_for_stratify,
        "Warning: Initial dataset stratification disabled. Not all classes have enough samples (>=2) for stratification.",
    )

    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=0.15, random_state=42, stratify=stratify_option_y
    )

    # Re-check on the remaining data: the first split may have left a class
    # with fewer than 2 samples.
    stratify_option_ytv = _stratify_target(
        y_train_val, min_samples_per_class_for_stratify,
        "Warning: Train/Validation split stratification disabled. Not all classes in train_val set have enough samples (>=2).",
    )

    # 15% of the full dataset, expressed as a fraction of the remaining 85%.
    val_size_ratio = 0.15 / 0.85

    if X_train_val.shape[0] * val_size_ratio < 1 or X_train_val.shape[0] <= 1:
        print("Warning: train_val dataset too small for further validation split. Using all of train_val for training and creating an empty validation set.")
        X_train, y_train = X_train_val, y_train_val
        # Keep the feature dimension so the empty validation matrix has a consistent shape.
        n_features = X.shape[1] if X.ndim > 1 and X.shape[1] > 0 else features_list[0].shape[0]
        X_val, y_val = np.array([]).reshape(0, n_features), np.array([])
    else:
        X_train, X_val, y_train, y_val = train_test_split(
            X_train_val, y_train_val, test_size=val_size_ratio,
            random_state=42, stratify=stratify_option_ytv
        )

    print("\nData splitting results:")
    print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
    if X_val.size > 0:
        print(f"X_val shape: {X_val.shape}, y_val shape: {y_val.shape}")
    else:
        print("Validation set is empty.")
    print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

    return X_train, X_val, X_test, y_train, y_val, y_test, label_encoder
# Build the train/validation/test splits from raw flattened grayscale pixels.
X_train, X_val, X_test, y_train, y_val, y_test, le = load_and_preprocess_data(
    'data/images',
    'data/labels',
    image_size=(128, 128),
    feature_extraction_method='flatten',
)

Loading data from image_dir: data/images and label_dir: data/labels
Found 714 potential image files.





Successfully processed 714 images and collected 714 corresponding labels.

Found 6 unique raw labels (after stripping, lowercasing, and taking first element) before encoding:
  Label: '0' 	 Count: 532
  Label: '1' 	 Count: 2
  Label: '2' 	 Count: 89
  Label: '3' 	 Count: 2
  Label: '4' 	 Count: 1
  Label: '5' 	 Count: 88

Shape of full feature matrix (X): (714, 16384)
Shape of full label vector (y): (714,)
Number of unique classes after encoding (le.classes_): 6
Encoded classes by LabelEncoder: ['0' '1' '2' '3' '4' '5']

Data splitting results:
X_train shape: (499, 16384), y_train shape: (499,)
X_val shape: (107, 16384), y_val shape: (107,)
X_test shape: (108, 16384), y_test shape: (108,)


In [4]:
# Random forest with class weights inversely proportional to class frequency,
# using all available cores and a fixed seed for repeatable results.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
    class_weight='balanced',
)
# Fit on the training split only; validation and test data stay unseen.
rf_classifier.fit(X_train, y_train)

In [7]:
# Integer ids 0..n_classes-1 for every class known to the encoder, so the later
# classification reports list all classes even when a split lacks some of them.
all_numeric_labels_for_report = np.arange(len(le.classes_))

# Accuracy on the data the forest was fitted on.
y_train_pred = rf_classifier.predict(X_train)
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)
print(f"\nTraining Accuracy: {train_accuracy * 100:.2f}%")


Training Accuracy: 99.80%


In [8]:
# Evaluate on the held-out validation split: overall accuracy, then per-class metrics.
y_val_pred = rf_classifier.predict(X_val)
val_accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
print(f"\nValidation Accuracy: {val_accuracy * 100:.2f}%")
print(
    classification_report(
        y_val,
        y_val_pred,
        labels=all_numeric_labels_for_report,
        target_names=le.classes_,
        zero_division=0,  # report 0 instead of warning for classes with no samples/predictions
    )
)


Validation Accuracy: 85.05%
              precision    recall  f1-score   support

           0       0.83      1.00      0.91        80
           1       0.00      0.00      0.00         1
           2       1.00      0.50      0.67        14
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       1.00      0.33      0.50        12

    accuracy                           0.85       107
   macro avg       0.47      0.31      0.35       107
weighted avg       0.87      0.85      0.82       107



In [9]:
# Evaluate on the held-out test split: overall accuracy, then per-class metrics.
y_test_pred = rf_classifier.predict(X_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print(f"\nTest Accuracy: {test_accuracy * 100:.2f}%")
print(
    classification_report(
        y_test,
        y_test_pred,
        labels=all_numeric_labels_for_report,
        target_names=le.classes_,
        zero_division=0,  # report 0 instead of warning for classes with no samples/predictions
    )
)


Test Accuracy: 87.04%
              precision    recall  f1-score   support

           0       0.86      0.99      0.92        78
           1       0.00      0.00      0.00         0
           2       1.00      0.75      0.86         8
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         1
           5       0.92      0.52      0.67        21

    accuracy                           0.87       108
   macro avg       0.46      0.38      0.41       108
weighted avg       0.87      0.87      0.86       108



In [10]:
# Rebuild the splits using HOG descriptors instead of raw pixels.
# NOTE: this rebinds X_train/X_val/X_test (and le); rf_classifier was fitted on
# the flattened-pixel features, so it must be re-fitted before it can be
# evaluated on these arrays.
X_train, X_val, X_test, y_train, y_val, y_test, le = load_and_preprocess_data(
    'data/images',
    'data/labels',
    image_size=(128, 128),
    feature_extraction_method='hog',
)

Loading data from image_dir: data/images and label_dir: data/labels
Found 714 potential image files.





Successfully processed 714 images and collected 714 corresponding labels.

Found 6 unique raw labels (after stripping, lowercasing, and taking first element) before encoding:
  Label: '0' 	 Count: 532
  Label: '1' 	 Count: 2
  Label: '2' 	 Count: 89
  Label: '3' 	 Count: 2
  Label: '4' 	 Count: 1
  Label: '5' 	 Count: 88

Shape of full feature matrix (X): (714, 8100)
Shape of full label vector (y): (714,)
Number of unique classes after encoding (le.classes_): 6
Encoded classes by LabelEncoder: ['0' '1' '2' '3' '4' '5']

Data splitting results:
X_train shape: (499, 8100), y_train shape: (499,)
X_val shape: (107, 8100), y_val shape: (107,)
X_test shape: (108, 8100), y_test shape: (108,)


## Conclusion
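We still find only 6 classes (the loader expects 9) even after switching to a different feature-extraction method (HOG instead of flattened pixels). This is expected: the class list is read from the label files, so changing how image features are computed cannot change it. The label counts also show that the dataset is heavily imbalanced: class `0` accounts for 532 of the 714 images, while classes `1`, `3`, and `4` have at most 2 samples each. The accuracies below are for the Random Forest trained on flattened pixels (no model was trained on the HOG features). They look high overall, but they are dominated by class `0`; the per-class reports above show much lower recall for the other classes.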

- Training: 99.80%
- Validation: 85.05%
- Testing: 87.04%