<a href="https://colab.research.google.com/github/lorenzopaoria/Smoking-detection-and-distance-analysis/blob/main/distance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Find the distance between smoker and not with help of depth model

In [63]:
# Installazione delle dipendenze
!pip install torch torchvision
!pip install opencv-python
!git clone https://github.com/DepthAnything/Depth-Anything-V2.git
%cd Depth-Anything-V2
!pip install -r requirements.txt

Cloning into 'Depth-Anything-V2'...
remote: Enumerating objects: 142, done.[K
remote: Counting objects: 100% (75/75), done.[K
remote: Compressing objects: 100% (41/41), done.[K
remote: Total 142 (delta 45), reused 34 (delta 34), pack-reused 67 (from 2)[K
Receiving objects: 100% (142/142), 45.17 MiB | 8.83 MiB/s, done.
Resolving deltas: 100% (48/48), done.
/content/Depth-Anything-V2/Depth-Anything-V2/Depth-Anything-V2/Depth-Anything-V2/Depth-Anything-V2


In [64]:
# Import delle librerie necessarie
import torch
import cv2
import numpy as np
from depth_anything_v2.dpt import DepthAnythingV2
from dataclasses import dataclass
from typing import List, Tuple, Dict
import math
import os
import json

In [65]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [66]:
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
# configurazione del dispositivo (GPU T4 su Colab)
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Dispositivo in uso: {DEVICE}")

# configurazione del modello DepthAnythingV2
model_configs = {
    'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]}
}

encoder = 'vits'
model = DepthAnythingV2(**model_configs[encoder])

# caricamento dei pesi del modello
checkpoint_path = f'/content/drive/MyDrive/pth_depth_estimation_large/depth_anything_v2_{encoder}.pth'
model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))
model = model.to(DEVICE).eval()
print(f"Modello {encoder} caricato con successo su {DEVICE}")

Dispositivo in uso: cuda


  model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))


Modello vits caricato con successo su cuda


In [67]:
@dataclass
class Person:
    x1: int
    y1: int
    x2: int
    y2: int
    is_smoking: bool
    confidence: float
    class_id: int

In [68]:
def calculate_center_point(person: Person) -> Tuple[float, float]:
    """calcola il punto centrale di una bounding box"""
    center_x = (person.x1 + person.x2) / 2
    center_y = (person.y1 + person.y2) / 2
    return (center_x, center_y)

In [69]:
def resize_to_multiple_of_patch(image, patch_size=14):
    """ridimensioni le immagini per un multipplo della patch"""
    h, w = image.shape[:2]
    new_h = math.ceil(h / patch_size) * patch_size
    new_w = math.ceil(w / patch_size) * patch_size
    return cv2.resize(image, (new_w, new_h))

In [70]:
def preprocess_image_for_depth(image):
    """pre-processa le immagini per la depth estimation"""
    resized_image = resize_to_multiple_of_patch(image)
    image_tensor = torch.from_numpy(resized_image).permute(2, 0, 1).unsqueeze(0).float() / 255.0
    return resized_image, image_tensor

In [71]:
def calculate_depth_map(image_tensor, model):
    """genera una mappa di profondità utilizzando Depth-Anything-V2"""
    with torch.no_grad():
        depth_map = model(image_tensor.to(DEVICE))
    return depth_map.squeeze().cpu().numpy()

In [72]:
def scale_bounding_boxes(people, original_image, resized_image):
    """scala le coordinate delle bounding box in base al ridimensionamento dell'immagine"""
    orig_h, orig_w = original_image.shape[:2]
    new_h, new_w = resized_image.shape[:2]

    scale_x = new_w / orig_w
    scale_y = new_h / orig_h

    scaled_people = []
    for person in people:
        scaled_person = Person(
            x1=int(person.x1 * scale_x),
            y1=int(person.y1 * scale_y),
            x2=int(person.x2 * scale_x),
            y2=int(person.y2 * scale_y),
            is_smoking=person.is_smoking,
            confidence=person.confidence,
            class_id=person.class_id
        )
        scaled_people.append(scaled_person)

    return scaled_people

In [73]:
def calculate_3d_distance(p1: Person, p2: Person, depth_map, focal_length: float, sensor_width: float, image_width: float):
    """
    Calcola la distanza 3D tra due persone utilizzando la mappa di profondità
    e una lunghezza focale realistica (es. 50mm)

    Args:
        p1, p2: Oggetti Person
        depth_map: Mappa di profondità
        focal_length: Lunghezza focale in millimetri (es. 50mm)
        sensor_width: Larghezza del sensore in millimetri (es. 36mm per full frame)
        image_width: Larghezza dell'immagine in pixel

    Returns:
        Distanza in metri
    """
    # Calcola i punti centrali delle bounding box
    c1 = calculate_center_point(p1)
    c2 = calculate_center_point(p2)

    # Assicurati che i punti centrali siano all'interno della mappa di profondità
    c1_x, c1_y = max(0, min(int(c1[0]), depth_map.shape[1]-1)), max(0, min(int(c1[1]), depth_map.shape[0]-1))
    c2_x, c2_y = max(0, min(int(c2[0]), depth_map.shape[1]-1)), max(0, min(int(c2[1]), depth_map.shape[0]-1))

    # Utilizza un'area per migliorare la stima della profondità
    radius = 5
    p1_area = depth_map[max(0, c1_y-radius):min(depth_map.shape[0], c1_y+radius),
                    max(0, c1_x-radius):min(depth_map.shape[1], c1_x+radius)]
    p2_area = depth_map[max(0, c2_y-radius):min(depth_map.shape[0], c2_y+radius),
                    max(0, c2_x-radius):min(depth_map.shape[1], c2_x+radius)]

    # Usa la mediana per ridurre l'effetto di outlier
    depth1 = np.median(p1_area) if p1_area.size > 0 else depth_map[c1_y, c1_x]
    depth2 = np.median(p2_area) if p2_area.size > 0 else depth_map[c2_y, c2_x]

    # Calcola il fattore di conversione da pixel a metri
    # focal_length_pixel = focal_length * (image_width / sensor_width)
    focal_length_pixel = focal_length * (image_width / sensor_width)

    # Converte i valori di profondità in metri
    # I valori di depth_map sono normalizzati (0-1), convertiamoli in metri
    # Assumiamo che i valori massimi della depth_map rappresentino ~50 metri
    depth_scale = 50.0  # la profondità massima in metri
    depth1_meters = depth1 * depth_scale
    depth2_meters = depth2 * depth_scale

    # Calcola le coordinate spaziali 3D
    x1 = (c1_x - image_width / 2) * depth1_meters / focal_length_pixel
    y1 = (c1_y - depth_map.shape[0] / 2) * depth1_meters / focal_length_pixel
    z1 = depth1_meters

    x2 = (c2_x - image_width / 2) * depth2_meters / focal_length_pixel
    y2 = (c2_y - depth_map.shape[0] / 2) * depth2_meters / focal_length_pixel
    z2 = depth2_meters

    # Calcola la distanza euclidea 3D
    distance_3d = math.sqrt((x2 - x1)**2 + (y2 - y1)**2 + (z2 - z1)**2)

    return distance_3d

In [74]:
def find_smoker_nonsmoker_distances(people: List[Person], depth_map, focal_length: float, sensor_width: float, image_width: float) -> List[Dict]:
    """Trova tutte le distanze tra fumatori e non fumatori, le restituisce e le salva in json"""
    smokers = [p for p in people if p.class_id == 2]  # classe 2 per i fumatori
    non_smokers = [p for p in people if p.class_id == 1]  # classe 1 per i non fumatori
    distances_data = []

    for i, smoker in enumerate(smokers):
        for j, non_smoker in enumerate(non_smokers):
            distance = calculate_3d_distance(smoker, non_smoker, depth_map, focal_length, sensor_width, image_width)

            distance_info = {
                "smoker_id": i,
                "smoker_confidence": smoker.confidence,
                "smoker_bbox": [smoker.x1, smoker.y1, smoker.x2, smoker.y2],
                "non_smoker_id": j,
                "non_smoker_confidence": non_smoker.confidence,
                "non_smoker_bbox": [non_smoker.x1, non_smoker.y1, non_smoker.x2, non_smoker.y2],
                "distance_meters": float(distance)
            }
            distances_data.append(distance_info)

    return distances_data

In [75]:
def load_detections_from_json(json_path: str) -> List[Person]:
    """carica le detection dal file JSON e le converte in oggetti Person"""
    with open(json_path, 'r') as f:
        data = json.load(f)

    people = []
    if 'detections' in data:
        for detection in data['detections']:
            class_id = detection.get('class')
            # carica solo le classi 1 (non fumatori) e 2 (fumatori)
            if class_id in [1, 2]:
                is_smoking = class_id == 2
                bbox = detection.get('bbox', [0, 0, 0, 0])
                confidence = detection.get('confidence', 0.0)

                if len(bbox) == 4:
                    people.append(Person(
                        x1=int(bbox[0]),
                        y1=int(bbox[1]),
                        x2=int(bbox[2]),
                        y2=int(bbox[3]),
                        is_smoking=is_smoking,
                        confidence=confidence,
                        class_id=class_id
                    ))

    return people

In [76]:
def process_and_save_image(image_path: str, people: List[Person], output_dir: str, focal_length: float, sensor_width: float, model) -> bool:
    """Processa un'immagine disegnando le distanze 3D tra i centri delle bounding box"""
    os.makedirs(output_dir, exist_ok=True)

    original_image = cv2.imread(image_path)
    if original_image is None:
        print(f"Errore nel caricamento dell'immagine: {image_path}")
        return False

    try:
        torch.cuda.empty_cache()

        resized_image, image_tensor = preprocess_image_for_depth(original_image)
        scaled_people = scale_bounding_boxes(people, original_image, resized_image)
        depth_map = calculate_depth_map(image_tensor, model)
        depth_display = cv2.resize(depth_map, (original_image.shape[1], original_image.shape[0]))
        normalized_depth = cv2.normalize(depth_display, None, 0, 255, cv2.NORM_MINMAX, dtype=cv2.CV_8U)
        colored_depth = cv2.applyColorMap(normalized_depth, cv2.COLORMAP_INFERNO)

        distances_data = find_smoker_nonsmoker_distances(scaled_people, depth_map, focal_length, sensor_width, resized_image.shape[1])

        # salva i dati delle distanze in un file JSON
        filename = os.path.basename(image_path)
        json_output_path = os.path.join(output_dir, f"distances_{os.path.splitext(filename)[0]}.json")
        with open(json_output_path, 'w') as f:
            json.dump({
                "image_name": filename,
                "focal_length_mm": focal_length,
                "sensor_width_mm": sensor_width,
                "distances": distances_data
            }, f, indent=4)

        visualized_image = resized_image.copy()

        RED = (0, 0, 255)      #fumatori
        BLUE = (255, 0, 0)     #non fumatori
        YELLOW = (0, 255, 255) #punti centrali
        BROWN = (42, 42, 165)  #linee
        GREEN = (0, 255, 0)    #testo delle distanze

        # disegna le bounding box per le classi 1 e 2
        for person in scaled_people:
            if person.class_id in [1, 2]:
                color = RED if person.class_id == 2 else BLUE
                cv2.rectangle(visualized_image, (int(person.x1), int(person.y1)),
                            (int(person.x2), int(person.y2)), color, 2)
                center = calculate_center_point(person)
                cv2.circle(visualized_image, (int(center[0]), int(center[1])), 5, YELLOW, -1)

        # disegna le linee e le distanze
        smokers = [p for p in scaled_people if p.class_id == 2]
        non_smokers = [p for p in scaled_people if p.class_id == 1]

        # offset per separare le linee
        offset_increment = 15
        current_offset = 0

        for i, smoker in enumerate(smokers):
            for j, non_smoker in enumerate(non_smokers):
                s_center = calculate_center_point(smoker)
                ns_center = calculate_center_point(non_smoker)
                distance = calculate_3d_distance(smoker, non_smoker, depth_map, focal_length, sensor_width, resized_image.shape[1])

                offset_y = current_offset
                if i % 2 == 0:
                    offset_y = -offset_y

                mid_x = (s_center[0] + ns_center[0]) // 2
                mid_y = (s_center[1] + ns_center[1]) // 2 + offset_y

                cv2.line(visualized_image,
                        (int(s_center[0]), int(s_center[1])),
                        (int(mid_x), int(mid_y)),
                        BROWN, 2)
                cv2.line(visualized_image,
                        (int(mid_x), int(mid_y)),
                        (int(ns_center[0]), int(ns_center[1])),
                        BROWN, 2)

                text = f"S{i}-NS{j}: {distance:.2f}m"
                (text_w, text_h), _ = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)

                cv2.rectangle(visualized_image,
                            (int(mid_x - text_w/2 - 5), int(mid_y - text_h - 5)),
                            (int(mid_x + text_w/2 + 5), int(mid_y + 5)),
                            (255, 255, 255), -1)

                cv2.putText(visualized_image, text,
                          (int(mid_x - text_w/2), int(mid_y)),
                          cv2.FONT_HERSHEY_SIMPLEX, 0.6, GREEN, 2)

                current_offset += offset_increment

                if current_offset > 80:
                    current_offset = 15

        filename = os.path.basename(image_path)
        output_path = os.path.join(output_dir, f"distances_{filename}")
        cv2.imwrite(output_path, visualized_image)

        depth_output_path = os.path.join(output_dir, f"depth_{filename}")
        cv2.imwrite(depth_output_path, colored_depth)

        h, w = original_image.shape[:2]
        resized_original = cv2.resize(original_image, (w, h))
        resized_depth = cv2.resize(colored_depth, (w, h))
        resized_visualization = cv2.resize(visualized_image, (w, h))

        composite = np.hstack((resized_original, resized_depth, resized_visualization))
        composite_output_path = os.path.join(output_dir, f"composite_{filename}")
        cv2.imwrite(composite_output_path, composite)

        print(f"Elaborazione completata per {image_path}")
        return True

    except Exception as e:
        print(f"Errore durante l'elaborazione di {image_path}: {str(e)}")
        import traceback
        traceback.print_exc()
        return False

In [77]:
def main():
    base_dir = '/content/drive/MyDrive/trained_photos'
    output_dir = '/content/drive/MyDrive/distance_img_process'

    # Parametri della camera
    focal_length = 50.0  # lunghezza focale in mm (standard 50mm)
    sensor_width = 36.0  # larghezza sensore in mm (full frame)

    images_dir = os.path.join(base_dir, 'images')
    coordinates_dir = os.path.join(base_dir, 'coordinates')

    successful = 0
    failed = 0
    skipped = 0

    print(f"Elaborazione delle immagini in {images_dir}...")

    if not os.path.exists(images_dir):
        print(f"La directory delle immagini {images_dir} non esiste!")
        return
    if not os.path.exists(coordinates_dir):
        print(f"La directory delle coordinate {coordinates_dir} non esiste!")
        return

    os.makedirs(output_dir, exist_ok=True)

    image_files = [f for f in os.listdir(images_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
    print(f"Trovate {len(image_files)} immagini da elaborare.")

    for i, filename in enumerate(image_files):
        print(f"[{i+1}/{len(image_files)}] Elaborazione di {filename}...")

        image_path = os.path.join(images_dir, filename)
        json_name = f"{os.path.splitext(filename)[0]}.json"
        json_path = os.path.join(coordinates_dir, json_name)

        if not os.path.exists(json_path):
            print(f"File JSON non trovato per {filename}, saltato.")
            skipped += 1
            continue

        try:
            people = load_detections_from_json(json_path)
            if not people:
                print(f"Nessuna persona rilevata in {filename}, saltato.")
                skipped += 1
                continue

            success = process_and_save_image(image_path, people, output_dir, focal_length, sensor_width, model)
            if success:
                successful += 1
                print(f"Immagine {filename} elaborata con successo")
            else:
                failed += 1
                print(f"Errore nell'elaborazione dell'immagine {filename}")
        except Exception as e:
            print(f"Errore catastrofico nell'elaborazione di {filename}: {str(e)}")
            import traceback
            traceback.print_exc()
            failed += 1

    print(f"\n=== RIEPILOGO DELL'ELABORAZIONE ===")
    print(f"Immagini elaborate con successo: {successful}")
    print(f"Immagini non elaborate (errori): {failed}")
    print(f"Immagini saltate (file mancanti o nessuna persona): {skipped}")
    print(f"Totale immagini processate: {successful + failed} di {len(image_files)}")
    print(f"Risultati salvati in: {output_dir}")

    if __name__ == "__main__":
        main()