# Clonar el proyecto
*Nota: Asegurarse de estar trabajando con la GPU.*

In [1]:
!git clone https://github.com/gangweix/Fast-ACVNet.git
!git clone https://github.com/nick3153/Depth-Estimation-with-StereoThermal-Images.git
%cd Fast-ACVNet

Cloning into 'Fast-ACVNet'...
remote: Enumerating objects: 213, done.[K
remote: Counting objects: 100% (61/61), done.[K
remote: Compressing objects: 100% (30/30), done.[K
remote: Total 213 (delta 46), reused 31 (delta 31), pack-reused 152 (from 1)[K
Receiving objects: 100% (213/213), 86.10 MiB | 17.52 MiB/s, done.
Resolving deltas: 100% (96/96), done.
Updating files: 100% (50/50), done.
Cloning into 'Depth-Estimation-with-StereoThermal-Images'...
remote: Enumerating objects: 6149, done.[K
remote: Counting objects: 100% (2/2), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 6149 (delta 1), reused 0 (delta 0), pack-reused 6147 (from 3)[K
Receiving objects: 100% (6149/6149), 1.68 GiB | 31.33 MiB/s, done.
Resolving deltas: 100% (60/60), done.
Updating files: 100% (152/152), done.
/content/Fast-ACVNet


# Instalar dependencias necesarias

In [2]:
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
!pip install opencv-python
!pip install scikit-image
!pip install tensorboard
!pip install matplotlib
!pip install tqdm
!pip install timm==0.5.4

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu113
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting

# Código

In [3]:
#@title Montar el drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
#@title Funciones necesarias
from PIL import Image
import numpy as np
import cv2
from PIL import Image

def normalize(img):
    image = ((img - img.min()) / (img.max() - img.min())) * 255.0
    return image.astype(np.uint8)

def intensity_binding(img, lower_percentile:int=1, upper_percentile:int=99):
    # Calcular los percentiles
    low = np.percentile(img, lower_percentile)
    high = np.percentile(img, upper_percentile)

    # Recortar al rango [low, high]
    img_clipped = np.clip(img, low, high)

    img_norm = normalize(img_clipped)

    return img_norm

def apply_CLAHE(img):
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    return clahe.apply(img)

def bilateral_filtering(img):
    return cv2.bilateralFilter(img, d=5, sigmaColor=15, sigmaSpace=3)

def align_contrast(imgL, imgR):
    meanL = np.mean(imgL)
    meanR = np.mean(imgR)

    if meanR < meanL:
      imageR = imgR * (meanL/meanR)
      return imgL, imageR
    elif meanL < meanR:
      imageL = imgL * (meanR/meanL)
      return imageL, imgR

    return imgL, imgR

def preprocess_thermal_image(img):
    if np.max(img) < 35000:
      image = normalize(img)
    else:
      # Eliminar outlayers
      image = intensity_binding(img)

    # Ecualizacion del histograma con clahe
    image = apply_CLAHE(image)

    # Eliminacion de ruido
    image = bilateral_filtering(image)

    return image.astype(np.uint8)

def preprocess_images(imgL, imgR):
    imageL = preprocess_thermal_image(imgL)
    imageR = preprocess_thermal_image(imgR)

    imageL, imageR = align_contrast(imageL, imageR)

    return Image.fromarray(imageL.astype(np.uint8)), Image.fromarray(imageR.astype(np.uint8))

def depth_to_disparity(depth, focal_length = 406.33233091474426, baseline = 0.2458492526627874):
    # Evitar división por cero
    depth = np.clip(depth, a_min=1e-6, a_max=None)
    disparity = (focal_length * baseline) / depth
    return disparity

In [5]:
import os
import torch
from torch.utils.data import Dataset
from PIL import Image
import numpy as np
import cv2
import random
import torchvision.transforms as transforms

class ThermalStereoDataset(Dataset):
    def __init__(self, root_dir, domain='frick_2', split='train', training=True, crop_size=(512, 256)):
        self.training = training
        self.crop_w, self.crop_h = crop_size
        self.left_dir = os.path.join(root_dir, domain, split, 'img_left')
        self.right_dir = os.path.join(root_dir, domain, split, 'img_right')
        self.disp_dir = os.path.join(root_dir, domain, split, 'depth_filtered')

        self.left_images = sorted(os.listdir(self.left_dir))
        self.right_images = sorted(os.listdir(self.right_dir))
        self.disp_images = sorted(os.listdir(self.disp_dir))

        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    def replicate_to_3_channels(self, img):
        img_np = np.array(img)
        if img_np.ndim == 2:
            img_np = np.stack([img_np]*3, axis=-1)
        return Image.fromarray(img_np.astype(np.uint8))

    def __len__(self):
        return len(self.left_images)

    def __getitem__(self, idx):
        # Cargar imágenes
        left_path = os.path.join(self.left_dir, self.left_images[idx])
        right_path = os.path.join(self.right_dir, self.right_images[idx])
        depth_path = os.path.join(self.disp_dir, self.disp_images[idx])

        left_img = cv2.imread(left_path, cv2.IMREAD_UNCHANGED)
        right_img = cv2.imread(right_path, cv2.IMREAD_UNCHANGED)
        depth = np.array(Image.open(depth_path)).astype(np.float32) / 1000.0

        # Aplicar procesado a las imagenes
        left_img, right_img = preprocess_images(left_img, right_img)

        # Calcular el mapa de disparidad
        disparity = depth_to_disparity(depth)

        # Gradiente
        left_np = np.array(left_img)
        dx = cv2.Sobel(left_np, cv2.CV_32F, 1, 0, ksize=3)
        dy = cv2.Sobel(left_np, cv2.CV_32F, 0, 1, ksize=3)
        dxy = np.sqrt(dx**2 + dy**2)
        dxy = dxy / (np.max(dxy) + 1e-5)

        # Replicar a 3 canales
        left_img = self.replicate_to_3_channels(left_img)
        right_img = self.replicate_to_3_channels(right_img)

        if self.training:
            w, h = left_img.size
            x1 = random.randint(0, w - self.crop_w)
            y1 = random.randint(0, h - self.crop_h)

            # Crop
            left_img = left_img.crop((x1, y1, x1 + self.crop_w, y1 + self.crop_h))
            right_img = right_img.crop((x1, y1, x1 + self.crop_w, y1 + self.crop_h))
            disparity = disparity[y1:y1 + self.crop_h, x1:x1 + self.crop_w]
            gradient_map = dxy[y1:y1 + self.crop_h, x1:x1 + self.crop_w]

            # Disparidad downsampled
            disparity_low = cv2.resize(disparity, (self.crop_w//4, self.crop_h//4), interpolation=cv2.INTER_NEAREST)
            disparity_low_r8 = cv2.resize(disparity, (self.crop_w//8, self.crop_h//8), interpolation=cv2.INTER_NEAREST)

            # Aplicar transformaciones
            left_img = self.transform(left_img)
            right_img = self.transform(right_img)
            disparity = torch.from_numpy(disparity).float()
            gradient_map = torch.from_numpy(gradient_map).float()
            disparity_low = torch.from_numpy(disparity_low).float()
            disparity_low_r8 = torch.from_numpy(disparity_low_r8).float()

            return {
                "left": left_img,
                "right": right_img,
                "disparity": disparity,
                "gradient_map": gradient_map,
                "disparity_low": disparity_low,
                "disparity_low_r8": disparity_low_r8}

        else:
          w, h = left_img.size
          crop_w, crop_h = 640, 512

          left_img = left_img.crop((w - crop_w, h - crop_h, w, h))
          right_img = right_img.crop((w - crop_w, h - crop_h, w, h))
          disparity = disparity[h - crop_h:h, w - crop_w: w]
          gradient_map = dxy[h - crop_h:h, w - crop_w: w]
          disparity_low = cv2.resize(disparity, (crop_w//4, crop_h//4), interpolation=cv2.INTER_NEAREST)

          left_img = self.transform(left_img)
          right_img = self.transform(right_img)
          disparity = torch.from_numpy(disparity).float()
          gradient_map = torch.from_numpy(gradient_map).float()
          disparity_low = torch.from_numpy(disparity_low).float()

          return {"left": left_img,
                  "right": right_img,
                  "disparity": disparity,
                  "top_pad": 0,
                  "right_pad": 0,
                  "gradient_map":gradient_map,
                  "disparity_low":disparity_low}

In [6]:
import argparse
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable
import torchvision.utils as vutils
import torch.nn.functional as F
import numpy as np
import time
from models import __models__, model_loss_train, model_loss_test
from utils import *
from torch.utils.data import DataLoader
import gc
import cv2

# Optimizzar el funcionamiento de la GPU
cudnn.benchmark = True

# Establecer una semilla para reproducibilidad
torch.manual_seed(1)
torch.cuda.manual_seed(1)

# model, optimizer
model = __models__['Fast_ACVNet'](192, False)
model = nn.DataParallel(model)
model.cuda()

# Comprobar si hay capas congeladas
for name, param in model.named_parameters():
    print(f"Layer: {name}, Requires Gradient: {param.requires_grad}")

Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv2_100_ra-b33bc2c4.pth" to /root/.cache/torch/hub/checkpoints/mobilenetv2_100_ra-b33bc2c4.pth


Layer: module.gamma, Requires Gradient: True
Layer: module.beta, Requires Gradient: True
Layer: module.feature.conv_stem.weight, Requires Gradient: True
Layer: module.feature.bn1.weight, Requires Gradient: True
Layer: module.feature.bn1.bias, Requires Gradient: True
Layer: module.feature.block0.0.0.conv_dw.weight, Requires Gradient: True
Layer: module.feature.block0.0.0.bn1.weight, Requires Gradient: True
Layer: module.feature.block0.0.0.bn1.bias, Requires Gradient: True
Layer: module.feature.block0.0.0.conv_pw.weight, Requires Gradient: True
Layer: module.feature.block0.0.0.bn2.weight, Requires Gradient: True
Layer: module.feature.block0.0.0.bn2.bias, Requires Gradient: True
Layer: module.feature.block1.0.0.conv_pw.weight, Requires Gradient: True
Layer: module.feature.block1.0.0.bn1.weight, Requires Gradient: True
Layer: module.feature.block1.0.0.bn1.bias, Requires Gradient: True
Layer: module.feature.block1.0.0.conv_dw.weight, Requires Gradient: True
Layer: module.feature.block1.0.0.

In [7]:
# Cargar un checkpoint para evaluar
ckpt_path = '/content/Depth-Estimation-with-StereoThermal-Images/pretrained_model/trained_without_freezing_f2h4_e24.ckpt' # Cambiar segun el ckpt que se desee probar
if ckpt_path is not None:
    print("loading model {}".format(ckpt_path))
    state_dict = torch.load(ckpt_path)
    model_dict = model.state_dict()
    pre_dict = {k: v for k, v in state_dict['model'].items() if k in model_dict}
    model_dict.update(pre_dict)
    model.load_state_dict(model_dict)

loading model /content/Depth-Estimation-with-StereoThermal-Images/pretrained_model/trained_without_freezing_f2h4_e24.ckpt


In [10]:
# Carga de datos
from torch.utils.data import ConcatDataset, DataLoader
# Dataset validacion
dataset_frick_test = ThermalStereoDataset(root_dir='/content/drive/MyDrive/Procesamiento de imagenes/Proyecto Final/dataset', domain='frick_2', split='test', training=False)
dataset_hawkins_test = ThermalStereoDataset(root_dir='/content/drive/MyDrive/Procesamiento de imagenes/Proyecto Final/dataset', domain='hawkins_4', split='test', training=False)
combined_dataset_test = ConcatDataset([dataset_frick_test, dataset_hawkins_test])
test_loader = DataLoader(combined_dataset_test, batch_size=4, shuffle=False, num_workers=4)

def test():
    # # testing
    avg_test_scalars = AverageMeterDict()
    for batch_idx, sample in enumerate(test_loader):
        start_time = time.time()
        loss, scalar_outputs = test_sample(sample)
        avg_test_scalars.update(scalar_outputs)

        del scalar_outputs
        print('Iter {}/{}, test loss = {:.3f}, time = {:3f}'.format(batch_idx,
                                                                    len(test_loader), loss,
                                                                    time.time() - start_time))
    avg_test_scalars = avg_test_scalars.mean()
    print("avg_test_scalars", avg_test_scalars)
    gc.collect()

# test one sample
@make_nograd_func
def test_sample(sample):
    model.eval()
    imgL, imgR, disp_gt = sample['left'], sample['right'], sample['disparity']
    imgL = imgL.cuda()
    imgR = imgR.cuda()
    disp_gt = disp_gt.cuda()
    mask = (disp_gt < 192) & (disp_gt > 0)
    disp_ests = model(imgL, imgR)
    disp_gts = [disp_gt]
    masks = [mask]
    loss = model_loss_test(disp_ests, disp_gts, masks)

    scalar_outputs = {"loss": loss}
    scalar_outputs["D1"] = [D1_metric(disp_est, disp_gt, mask) for disp_est in disp_ests]
    scalar_outputs["EPE"] = [EPE_metric(disp_est, disp_gt, mask) for disp_est in disp_ests]
    scalar_outputs["Thres1"] = [Thres_metric(disp_est, disp_gt, mask, 1.0) for disp_est in disp_ests]
    scalar_outputs["Thres2"] = [Thres_metric(disp_est, disp_gt, mask, 2.0) for disp_est in disp_ests]
    scalar_outputs["Thres3"] = [Thres_metric(disp_est, disp_gt, mask, 3.0) for disp_est in disp_ests]

    return tensor2float(loss), tensor2float(scalar_outputs)

if __name__ == '__main__':
    test()

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


Iter 0/281, test loss = 2.908, time = 8.252657
Iter 1/281, test loss = 3.026, time = 0.217919
Iter 2/281, test loss = 2.383, time = 0.221564
Iter 3/281, test loss = 2.389, time = 0.220053
Iter 4/281, test loss = 2.398, time = 0.207241
Iter 5/281, test loss = 2.571, time = 0.205611
Iter 6/281, test loss = 2.826, time = 0.210634
Iter 7/281, test loss = 3.320, time = 0.209616
Iter 8/281, test loss = 4.021, time = 0.276955
Iter 9/281, test loss = 4.360, time = 0.218147
Iter 10/281, test loss = 4.812, time = 0.203955
Iter 11/281, test loss = 5.512, time = 0.204116
Iter 12/281, test loss = 6.383, time = 0.293719
Iter 13/281, test loss = 7.995, time = 0.205002
Iter 14/281, test loss = 8.706, time = 0.210186
Iter 15/281, test loss = 9.891, time = 0.203000
Iter 16/281, test loss = 10.203, time = 0.290401
Iter 17/281, test loss = 10.321, time = 0.233169
Iter 18/281, test loss = 9.886, time = 0.231050
Iter 19/281, test loss = 10.853, time = 0.210377
Iter 20/281, test loss = 12.409, time = 0.28953