<a href="https://colab.research.google.com/github/lorenzopaoria/Smoking-detection-and-distance-analysis/blob/main/distance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Find the distance between smoker and not

In [1]:
!pip install torch torchvision
!pip install opencv-python
!git clone https://github.com/DepthAnything/Depth-Anything-V2.git
%cd Depth-Anything-V2
!pip install -r requirements.txt

fatal: destination path 'Depth-Anything-V2' already exists and is not an empty directory.
/content/Depth-Anything-V2


In [2]:
import torch
from depth_anything_v2.dpt import DepthAnythingV2
import cv2
import numpy as np
from dataclasses import dataclass
from typing import List, Tuple
import math
import os
import json



In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
@dataclass
class Person:
    x1: int
    y1: int
    x2: int
    y2: int
    is_smoking: bool
    confidence: float

In [5]:
def calculate_center_point(person: Person) -> Tuple[float, float]:
    """Calcola il punto centrale di una bounding box"""
    center_x = (person.x1 + person.x2) / 2
    center_y = (person.y1 + person.y2) / 2
    return (center_x, center_y)

In [6]:
def resize_to_multiple_of_patch(image, patch_size=14):
    """Ridimensiona l'immagine in modo che altezza e larghezza siano multipli di patch_size"""
    h, w = image.shape[:2]
    new_h = math.ceil(h / patch_size) * patch_size
    new_w = math.ceil(w / patch_size) * patch_size
    return cv2.resize(image, (new_w, new_h))

In [7]:
def preprocess_image_for_depth(image):
    """Prepara l'immagine per il modello di profondità"""
    # Ridimensiona l'immagine per il modello di profondità
    resized_image = resize_to_multiple_of_patch(image)
    # Converte l'immagine in tensore
    image_tensor = torch.from_numpy(resized_image).permute(2, 0, 1).unsqueeze(0).float() / 255.0
    return resized_image, image_tensor

In [8]:
def calculate_depth_map(image_tensor, model):
    """Genera una mappa di profondità utilizzando Depth-Anything-V2"""
    with torch.no_grad():
        depth_map = model(image_tensor)
    return depth_map.squeeze().cpu().numpy()

In [9]:
def scale_bounding_boxes(people, original_image, resized_image):
    """Scala le coordinate delle bounding box in base al ridimensionamento dell'immagine"""
    orig_h, orig_w = original_image.shape[:2]
    new_h, new_w = resized_image.shape[:2]

    scale_x = new_w / orig_w
    scale_y = new_h / orig_h

    scaled_people = []
    for person in people:
        scaled_person = Person(
            x1=int(person.x1 * scale_x),
            y1=int(person.y1 * scale_y),
            x2=int(person.x2 * scale_x),
            y2=int(person.y2 * scale_y),
            is_smoking=person.is_smoking,
            confidence=person.confidence
        )
        scaled_people.append(scaled_person)

    return scaled_people

In [10]:
def calculate_3d_distance(p1: Person, p2: Person, depth_map, focal_length: float, image_width: float):
    """Calcola la distanza 3D tra due persone utilizzando la mappa di profondità"""
    c1 = calculate_center_point(p1)
    c2 = calculate_center_point(p2)

    # Assicurati che le coordinate siano valide
    c1_x, c1_y = max(0, min(int(c1[0]), depth_map.shape[1]-1)), max(0, min(int(c1[1]), depth_map.shape[0]-1))
    c2_x, c2_y = max(0, min(int(c2[0]), depth_map.shape[1]-1)), max(0, min(int(c2[1]), depth_map.shape[0]-1))

    # Ottieni la profondità per ciascuna persona
    # Usa un'area piccola intorno al centro per una stima più robusta
    radius = 5
    p1_area = depth_map[max(0, c1_y-radius):min(depth_map.shape[0], c1_y+radius),
                        max(0, c1_x-radius):min(depth_map.shape[1], c1_x+radius)]
    p2_area = depth_map[max(0, c2_y-radius):min(depth_map.shape[0], c2_y+radius),
                        max(0, c2_x-radius):min(depth_map.shape[1], c2_x+radius)]

    # Usa la mediana per ridurre l'effetto di valori anomali
    depth1 = np.median(p1_area) if p1_area.size > 0 else depth_map[c1_y, c1_x]
    depth2 = np.median(p2_area) if p2_area.size > 0 else depth_map[c2_y, c2_x]

    # Calcola le coordinate 3D
    x1 = (c1_x - image_width / 2) * depth1 / focal_length
    y1 = depth1
    x2 = (c2_x - image_width / 2) * depth2 / focal_length
    y2 = depth2

    # Distanza euclidea 3D
    distance = math.sqrt((x2 - x1)**2 + (y2 - y1)**2)

    # Converti la distanza in metri (fattore di scala approssimativo)
    # Questo fattore dovrà essere calibrato in base alle tue specifiche esigenze
    scale_factor = 0.1
    distance_meters = distance * scale_factor

    return distance_meters

In [11]:
def find_smoker_nonsmoker_distances(people: List[Person], depth_map, focal_length: float, image_width: float) -> List[Tuple[Person, Person, float]]:
    """Trova tutte le distanze 3D tra fumatori e non fumatori"""
    smokers = [p for p in people if p.is_smoking]
    non_smokers = [p for p in people if not p.is_smoking]
    distances = []

    for smoker in smokers:
        for non_smoker in non_smokers:
            distance = calculate_3d_distance(smoker, non_smoker, depth_map, focal_length, image_width)
            distances.append((smoker, non_smoker, distance))

    return distances

In [12]:
def load_detections_from_json(json_path: str) -> List[Person]:
    """Carica le detection dal file JSON e le converte in oggetti Person"""
    with open(json_path, 'r') as f:
        data = json.load(f)

    people = []
    if 'detections' in data:
        for detection in data['detections']:
            # Classe 2 rappresenta il fumatore, 1 il non fumatore
            is_smoking = detection.get('class') == 2
            bbox = detection.get('bbox', [0, 0, 0, 0])
            confidence = detection.get('confidence', 0.0)

            if len(bbox) == 4:  # Verifica che bbox abbia le 4 coordinate necessarie
                people.append(Person(
                    x1=int(bbox[0]),
                    y1=int(bbox[1]),
                    x2=int(bbox[2]),
                    y2=int(bbox[3]),
                    is_smoking=is_smoking,
                    confidence=confidence
                ))

    return people

In [13]:
def process_and_save_image(image_path: str, people: List[Person], output_dir: str, focal_length: float, depth_model) -> bool:
    """Processa un'immagine disegnando le distanze 3D tra i centri delle bounding box"""
    os.makedirs(output_dir, exist_ok=True)

    # Carica l'immagine originale
    original_image = cv2.imread(image_path)
    if original_image is None:
        print(f"Errore nel caricamento dell'immagine: {image_path}")
        return False

    try:
        # Prepara l'immagine per il modello di profondità
        resized_image, image_tensor = preprocess_image_for_depth(original_image)

        # Scala le bounding box per corrispondere all'immagine ridimensionata
        scaled_people = scale_bounding_boxes(people, original_image, resized_image)

        # Genera la mappa di profondità
        depth_map = calculate_depth_map(image_tensor, depth_model)

        # Ridimensiona la mappa di profondità alle dimensioni dell'immagine originale per la visualizzazione
        depth_display = cv2.resize(depth_map, (original_image.shape[1], original_image.shape[0]))

        # Normalizza la mappa di profondità per la visualizzazione
        normalized_depth = cv2.normalize(depth_display, None, 0, 255, cv2.NORM_MINMAX, dtype=cv2.CV_8U)
        colored_depth = cv2.applyColorMap(normalized_depth, cv2.COLORMAP_INFERNO)

        # Calcola le distanze 3D usando l'immagine ridimensionata e la mappa di profondità
        distances = find_smoker_nonsmoker_distances(scaled_people, depth_map, focal_length, resized_image.shape[1])

        # Disegna sull'immagine originale ridimensionata
        visualized_image = resized_image.copy()

        # Definizione colori
        YELLOW = (0, 255, 255)  # BGR per giallo (centri)
        BROWN = (42, 42, 165)   # BGR per marrone (linee distanza)
        RED = (0, 0, 255)       # BGR per rosso (fumatori)
        BLUE = (255, 0, 0)      # BGR per blu (non fumatori)
        GREEN = (0, 255, 0)     # BGR per verde (testo)

        # Disegna le bounding box e i centri
        for person in scaled_people:
            color = RED if person.is_smoking else BLUE
            cv2.rectangle(visualized_image, (int(person.x1), int(person.y1)),
                         (int(person.x2), int(person.y2)), color, 2)
            center = calculate_center_point(person)
            cv2.circle(visualized_image, (int(center[0]), int(center[1])), 5, YELLOW, -1)

            # Aggiungi etichetta
            label = "Smoker" if person.is_smoking else "Non-smoker"
            conf_text = f"{person.confidence:.2f}"
            cv2.putText(visualized_image, f"{label} {conf_text}",
                       (int(person.x1), int(person.y1) - 10),
                       cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

        # Disegna le distanze
        for smoker, non_smoker, distance in distances:
            s_center = calculate_center_point(smoker)
            ns_center = calculate_center_point(non_smoker)

            cv2.line(visualized_image,
                    (int(s_center[0]), int(s_center[1])),
                    (int(ns_center[0]), int(ns_center[1])),
                    BROWN, 2)

            mid_point = ((s_center[0] + ns_center[0]) // 2, (s_center[1] + ns_center[1]) // 2)

            # Aggiungi una piccola area di sfondo per il testo della distanza
            text = f"{distance:.2f}m"
            (text_w, text_h), _ = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.7, 2)
            cv2.rectangle(visualized_image,
                         (int(mid_point[0] - text_w/2 - 5), int(mid_point[1] - text_h - 5)),
                         (int(mid_point[0] + text_w/2 + 5), int(mid_point[1] + 5)),
                         (255, 255, 255), -1)

            cv2.putText(visualized_image, text,
                       (int(mid_point[0] - text_w/2), int(mid_point[1])),
                       cv2.FONT_HERSHEY_SIMPLEX, 0.7, GREEN, 2)

        # Salva l'immagine processata
        filename = os.path.basename(image_path)
        output_path = os.path.join(output_dir, f"distances_{filename}")
        cv2.imwrite(output_path, visualized_image)

        # Salva anche la mappa di profondità colorata
        depth_output_path = os.path.join(output_dir, f"depth_{filename}")
        cv2.imwrite(depth_output_path, colored_depth)

        # Crea un'immagine composta (originale + profondità + visualizzazione)
        h, w = original_image.shape[:2]
        # Ridimensiona tutto alla stessa dimensione
        resized_original = cv2.resize(original_image, (w, h))
        resized_depth = cv2.resize(colored_depth, (w, h))
        resized_visualization = cv2.resize(visualized_image, (w, h))

        # Unisci orizzontalmente
        composite = np.hstack((resized_original, resized_depth, resized_visualization))
        composite_output_path = os.path.join(output_dir, f"composite_{filename}")
        cv2.imwrite(composite_output_path, composite)

        print(f"Elaborazione completata per {image_path}")
        return True

    except Exception as e:
        print(f"Errore durante l'elaborazione di {image_path}: {str(e)}")
        import traceback
        traceback.print_exc()
        return False

In [14]:
def main():
    # Configurazione
    base_dir = '/content/drive/MyDrive/test_trained_person'
    output_dir = '/content/drive/MyDrive/distance_img_process'
    focal_length = 1000  # Parametro da calibrare in base al focale della camera

    # Carica il modello Depth-Anything-V2
    print("Caricamento del modello Depth-Anything-V2...")
    depth_model = DepthAnythingV2()
    depth_model.load_state_dict(torch.load('/content/drive/MyDrive/pth_depth_estimation_large/depth_anything_v2_vitb.pth'))
    depth_model.eval()
    depth_model = depth_model.to('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Modello caricato con successo su {'GPU' if torch.cuda.is_available() else 'CPU'}")

    # Processa tutte le immagini nella cartella
    images_dir = os.path.join(base_dir, 'images')
    coordinates_dir = os.path.join(base_dir, 'coordinates')

    successful = 0
    failed = 0
    skipped = 0

    print(f"Elaborazione delle immagini in {images_dir}...")

    # Assicurati che le directory esistano
    if not os.path.exists(images_dir):
        print(f"La directory delle immagini {images_dir} non esiste!")
        return
    if not os.path.exists(coordinates_dir):
        print(f"La directory delle coordinate {coordinates_dir} non esiste!")
        return

    # Crea la directory di output se non esiste
    os.makedirs(output_dir, exist_ok=True)

    # Ottieni la lista di tutti i file immagine
    image_files = [f for f in os.listdir(images_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
    print(f"Trovate {len(image_files)} immagini da elaborare.")

    # Elabora ogni immagine
    for i, filename in enumerate(image_files):
        print(f"[{i+1}/{len(image_files)}] Elaborazione di {filename}...")

        image_path = os.path.join(images_dir, filename)
        json_name = f"{os.path.splitext(filename)[0]}.json"
        json_path = os.path.join(coordinates_dir, json_name)

        if not os.path.exists(json_path):
            print(f"File JSON non trovato per {filename}, saltato.")
            skipped += 1
            continue

        try:
            people = load_detections_from_json(json_path)
            if not people:
                print(f"Nessuna persona rilevata in {filename}, saltato.")
                skipped += 1
                continue

            success = process_and_save_image(image_path, people, output_dir, focal_length, depth_model)
            if success:
                successful += 1
                print(f"✅ Immagine {filename} elaborata con successo")
            else:
                failed += 1
                print(f"Errore nell'elaborazione dell'immagine {filename}")
        except Exception as e:
            print(f"Errore catastrofico nell'elaborazione di {filename}: {str(e)}")
            import traceback
            traceback.print_exc()
            failed += 1

    print(f"\n=== RIEPILOGO DELL'ELABORAZIONE ===")
    print(f"Immagini elaborate con successo: {successful}")
    print(f"Immagini non elaborate (errori): {failed}")
    print(f"Immagini saltate (file mancanti o nessuna persona): {skipped}")
    print(f"Totale immagini processate: {successful + failed} di {len(image_files)}")
    print(f"Risultati salvati in: {output_dir}")

if __name__ == "__main__":
    import traceback
    try:
        main()
    except Exception as e:
        print(f"Errore nell'esecuzione del programma: {str(e)}")
        traceback.print_exc()

Caricamento del modello Depth-Anything-V2...


  depth_model.load_state_dict(torch.load('/content/drive/MyDrive/pth_depth_estimation_large/depth_anything_v2_vitb.pth'))


Errore nell'esecuzione del programma: Error(s) in loading state_dict for DepthAnythingV2:
	Missing key(s) in state_dict: "pretrained.blocks.12.norm1.weight", "pretrained.blocks.12.norm1.bias", "pretrained.blocks.12.attn.qkv.weight", "pretrained.blocks.12.attn.qkv.bias", "pretrained.blocks.12.attn.proj.weight", "pretrained.blocks.12.attn.proj.bias", "pretrained.blocks.12.ls1.gamma", "pretrained.blocks.12.norm2.weight", "pretrained.blocks.12.norm2.bias", "pretrained.blocks.12.mlp.fc1.weight", "pretrained.blocks.12.mlp.fc1.bias", "pretrained.blocks.12.mlp.fc2.weight", "pretrained.blocks.12.mlp.fc2.bias", "pretrained.blocks.12.ls2.gamma", "pretrained.blocks.13.norm1.weight", "pretrained.blocks.13.norm1.bias", "pretrained.blocks.13.attn.qkv.weight", "pretrained.blocks.13.attn.qkv.bias", "pretrained.blocks.13.attn.proj.weight", "pretrained.blocks.13.attn.proj.bias", "pretrained.blocks.13.ls1.gamma", "pretrained.blocks.13.norm2.weight", "pretrained.blocks.13.norm2.bias", "pretrained.blocks.13

Traceback (most recent call last):
  File "<ipython-input-14-ca02b3eea748>", line 83, in <cell line: 0>
    main()
  File "<ipython-input-14-ca02b3eea748>", line 10, in main
    depth_model.load_state_dict(torch.load('/content/drive/MyDrive/pth_depth_estimation_large/depth_anything_v2_vitb.pth'))
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 2584, in load_state_dict
    raise RuntimeError(
RuntimeError: Error(s) in loading state_dict for DepthAnythingV2:
	Missing key(s) in state_dict: "pretrained.blocks.12.norm1.weight", "pretrained.blocks.12.norm1.bias", "pretrained.blocks.12.attn.qkv.weight", "pretrained.blocks.12.attn.qkv.bias", "pretrained.blocks.12.attn.proj.weight", "pretrained.blocks.12.attn.proj.bias", "pretrained.blocks.12.ls1.gamma", "pretrained.blocks.12.norm2.weight", "pretrained.blocks.12.norm2.bias", "pretrained.blocks.12.mlp.fc1.weight", "pretrained.blocks.12.mlp.fc1.bias", "pretrained.blocks.12.mlp.fc2.weight", "pretrained.blocks.12.m