In [None]:
# Copyright (c) 2023 William Locke

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

This notebook is intended to be run in Google Colab with access to corresponding Google Drive files. If running locally or on another service, change import and install code accordingly.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/WilliamLockeIV/segment-anything/blob/main/notebooks/Preprocess_NEONTreeDataset_for_VectorDataset.ipynb)

The purpose of this notebook is to take the raw image and bounding box information saved in the NEONTreeDataset and encode it into vectors that can be saved in a VectorDataset. See the ReadMe for more information.

In [None]:
# Mount Google Drive at /content/drive so the zipped dataset stored under
# /content/drive/MyDrive can be read from this runtime.
from google.colab import drive
drive.mount('/content/drive')

TODO: Preprocess NEONTreeEvaluation training set.

Thus far we have only worked with the evaluation set, in which all images are cropped to a uniform size of 400 x 400 pixels. We still need to decide whether to try training our model on the full-size training images, which can range from 888 x 1153 pixels to 10,000 x 10,000 pixels, or also crop these into smaller images of 400 x 400 pixels. We will eventually need to make the same decisions for our own dataset.

In [None]:
%%capture
# Extract the dataset archives from Google Drive onto the local Colab disk.
# training.zip is extracted into /content/training, while evaluation.zip and
# annotations.zip are extracted directly into /content (presumably each already
# contains its own top-level folder, since later cells read /content/evaluation
# and /content/annotations -- TODO confirm against the archives).
!unzip '/content/drive/MyDrive/UAV/Data/NEONTreeEvaluation/training.zip' -d "/content/training"
!unzip '/content/drive/MyDrive/UAV/Data/NEONTreeEvaluation/evaluation.zip' -d "/content"
!unzip '/content/drive/MyDrive/UAV/Data/NEONTreeEvaluation/annotations.zip' -d "/content"

In [None]:
%%capture
# Install the third-party packages imported further down (rasterio, supervision).
# %%capture hides pip's output, so a failed install will only surface at import time.
!pip install rasterio
!pip install supervision

In [None]:
#@title Copy GroundingDINO from IDEA-Research github repository
%%capture

# Clone and pip-install GroundingDINO, then download its Swin-B checkpoint into
# /content/weights. All output (including errors) is hidden by %%capture.
%cd /content
import os
# Create the shared weights folder once; the SAM cell also downloads into it.
if not os.path.exists('/content/weights'):
  !mkdir /content/weights
!git clone https://github.com/IDEA-Research/GroundingDINO.git
%cd /content/GroundingDINO
!pip install -q .
%cd /content/weights
!wget -q https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha2/groundingdino_swinb_cogcoor.pth
# Return to /content so later cells start from a known working directory.
%cd /content

In [None]:
#@title Copy SAM from personal github repository
%%capture

%cd /content
import os
if os.path.exists('/content/segment-anything'):
  !rm -r /content/segment-anything
!git clone https://github.com/lu-liang-geo/UAV_Tree_Detection.git
%cd /content/segment-anything
!pip install -q .
%cd /content/weights
!wget -q https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth
%cd /content

In [None]:
import os
import cv2
import glob
import torch
import rasterio
import numpy as np
from PIL import Image
import supervision as sv
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
from segment_and_detect_anything.detr import box_ops
from GroundingDINO.groundingdino.util.inference import Model
from segment_and_detect_anything import NEONTreeDataset, sam_model_registry, SamPredictor

In [None]:
#@title Load GroundingDINO Model
# Locations of the GroundingDINO Swin-B config (inside the cloned repository)
# and of the checkpoint downloaded into /content/weights.
GROUNDING_DINO_CONFIG_PATH = "/content/GroundingDINO/groundingdino/config/GroundingDINO_SwinB_cfg.py"
GROUNDING_DINO_CHECKPOINT_PATH = "/content/weights/groundingdino_swinb_cogcoor.pth"

# Build the detector used below to propose prompt boxes.
gd_model = Model(
    model_config_path=GROUNDING_DINO_CONFIG_PATH,
    model_checkpoint_path=GROUNDING_DINO_CHECKPOINT_PATH,
)

In [None]:
#@title Load SAM Model
# Look up the ViT-H builder in the registry, build the model from the downloaded
# checkpoint, and wrap it in a predictor for image encoding.
build_sam = sam_model_registry["vit_h"]
sam_model = build_sam(checkpoint="/content/weights/sam_vit_h_4b8939.pth")
sam_predictor = SamPredictor(sam_model)

NOTE: when creating a dataset for the first time with a new set of images, set `check_values=True` to raise an error if any RGB, NIR, Red Edge, or CHM pixels are less than 0, which likely indicates invalid pixels are included in the image. These images may be further cropped to remove the invalid pixels or eliminated from the dataset altogether by adding them to the set `problem_files` inside the code for NEONTreeDataset.

ALSO NOTE: We have already preprocessed the val dataset; I include it here for instructional and archival purposes only.

In [None]:
#@title Initialize either Train or Val Dataset

# Which split to preprocess. Defaults to 'val' so the notebook runs end to end;
# change to 'train' to preprocess the training images instead.
mode = 'val'  # 'train' or 'val'

if mode == 'train':
  img_path = '/content/training'
  # No prompt boxes are configured for the training images at this point.
  prompt_path = None

elif mode == 'val':
  img_path = '/content/evaluation'
  prompt_path = '/content/drive/MyDrive/UAV/Data/NEONTreeEvaluation/Evaluation/Prompts'

else:
  # Fail loudly instead of leaving img_path / prompt_path undefined.
  raise ValueError(f"mode must be 'train' or 'val', got {mode!r}")

ann_path = '/content/annotations'

# Set check_values=True when building a dataset from a new set of images
# (see the note above this cell).
ds = NEONTreeDataset(image_path=img_path, ann_path=ann_path, prompt_path=prompt_path, check_values=False)

In [None]:
#@title Run Through Train and Val Datasets

# Iterate once over the dataset built above without using the samples.
# The first time running through the val dataset takes about 1 minute; subsequent runthroughs are much faster.
# I run through the dataset here so that they won't add to the time when encoding the data below.

for tree in ds:
  pass

## Encode RGB and Multi Images using SAM

CAUTION: After encoding and saving the RGB and Multi images using SAM, you should open a new notebook (or at least disconnect and delete the current runtime and start a new one) and run the SAM encoder on one or two of the same images to compare its outputs with the saved embeddings. For some reason the first time I encoded the RGB and Multi images with SAM, the saved embeddings were different from the ones output by SAM in a later notebook, and I had to rerun all the images through SAM and save the new embeddings. I still haven't figured out why this happened.

In [None]:
#@title Encode Images from Scratch

# Output folders for the per-image RGB and Multi embeddings (edit to the real destinations).
rgb_folder = '/content/path_to_rgb_vector_folder'
multi_folder = '/content/path_to_multi_vector_folder'

# torch.save does not create missing directories, so make sure both exist first.
os.makedirs(rgb_folder, exist_ok=True)
os.makedirs(multi_folder, exist_ok=True)

for i, tree in enumerate(ds):
  rgb_img = tree['rgb']
  multi_img = tree['multi']
  name = tree['basename']
  # Encode both images, then fetch the pair of embeddings from the predictor.
  sam_predictor.set_images(rgb_img, multi_img)
  rgb_embed, multi_embed = sam_predictor.get_image_embedding()
  # One file per image, named after the image's basename, in each folder.
  torch.save(rgb_embed, os.path.join(rgb_folder, f'{name}.pt'))
  torch.save(multi_embed, os.path.join(multi_folder, f'{name}.pt'))
  print(i+1, name)

print()

In [None]:
#@title If continuing from previous encoding session

# Output folders for the per-image RGB and Multi embeddings (edit to the real destinations).
rgb_folder = '/content/path_to_rgb_vector_folder'
multi_folder = '/content/path_to_multi_vector_folder'

# Make sure both folders exist so os.listdir / torch.save below cannot fail on a fresh runtime.
os.makedirs(rgb_folder, exist_ok=True)
os.makedirs(multi_folder, exist_ok=True)

# Embeddings are saved as '<basename>.pt', so strip the extension before comparing
# against tree['basename']; otherwise no image is ever recognised as already encoded.
rgb_encoded = {os.path.splitext(f)[0] for f in os.listdir(rgb_folder) if f.endswith('.pt')}
multi_encoded = {os.path.splitext(f)[0] for f in os.listdir(multi_folder) if f.endswith('.pt')}
# Only images with BOTH embeddings saved count as done.
imgs_encoded = rgb_encoded.intersection(multi_encoded)

print('Done:', len(imgs_encoded), 'To Do:', len(ds) - len(imgs_encoded))
print()

# i counts only the images encoded in this session.
i = 0
for tree in ds:
  name = tree['basename']
  if name in imgs_encoded:
    continue
  rgb_img = tree['rgb']
  multi_img = tree['multi']
  sam_predictor.set_images(rgb_img, multi_img)
  rgb_embed, multi_embed = sam_predictor.get_image_embedding()
  torch.save(rgb_embed, os.path.join(rgb_folder, f'{name}.pt'))
  torch.save(multi_embed, os.path.join(multi_folder, f'{name}.pt'))
  print(i+1, name)
  i += 1

print()

## Encoding Prompt Boxes using GroundingDINO and SAM

The code block below raises the following warnings, which I have not yet addressed in the code:

```
/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py:907: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:61: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
  warnings.warn(
```
The final warning is not an issue, but the deprecation of `device` might cause an error in the future, and the changing default value of `use_reentrant` might change some behavior, though I don't really understand this parameter. The relevant documentation is currently available at https://pytorch.org/docs/stable/checkpoint.html

In [None]:
#@title Generate Bounding Boxes with GroundingDINO (can skip if already done)

# Classes are the categories GroundingDINO will attempt to find in the image. Box threshold is the
# confidence threshold necessary to draw a box, and text_threshold is the confidence threshold necessary
# to attach a label to that box. See GroundingDINO documentation for more details.
#
# Early tests of GroundingDINO found a simple class of "tree" with box_threshold and text_threshold set
# to 0.2 worked best in conjunction with Custom Non-Max Suppression.

# Output folder for the per-image prompt boxes (edit to the real destination).
box_folder = '/content/path_to_box_folder'
# np.save does not create missing directories.
os.makedirs(box_folder, exist_ok=True)

classes = ['tree']
box_threshold = 0.2
text_threshold = 0.2

# enumerate(ds) already yields each sample, so there is no need to index ds[i]
# again (doing so loaded every sample twice).
for i, tree in enumerate(ds):
  name = tree['basename']
  # Reverse the channel axis (RGB -> BGR) and copy so the array is contiguous.
  bgr_img = tree['rgb'][:,:,::-1].copy()
  prompt_boxes = gd_model.predict_with_classes(
      image=bgr_img,
      classes=classes,
      box_threshold=box_threshold,
      text_threshold=text_threshold)
  # Filter the detections with the project's custom NMS and keep only the xyxy boxes.
  prompt_boxes = box_ops.custom_nms(prompt_boxes).xyxy
  np.save(os.path.join(box_folder, f'{name}.npy'), prompt_boxes)
  print(i, name)

In [None]:
#@title Add Bounding Boxes to Dataset (can skip if already done)

# Rebuild the dataset so that it reads prompt boxes from box_folder
# (box_folder is defined in the GroundingDINO cell above).
ds = NEONTreeDataset(
    image_path=img_path,
    ann_path=ann_path,
    prompt_path=box_folder,
    check_values=False,
)

# Iterate once over the rebuilt dataset without using the samples.
for tree in ds:
  continue

We calculate and save **sparse**, **dense**, and **positional** embeddings using the SAM prompt encoder.

**Sparse embeddings** are the embeddings of the bounding boxes provided by GroundingDINO. It would also include any point prompts if provided, but those do not apply to our dataset.

**Dense embeddings** encode mask prompts rather than box prompts and aren't used by our Box Decoder (nor can they be due to how the Box Decoder reshapes prompt embeddings). We save them anyway so that we can run the same embeddings through the Mask Decoder to check that it outputs reasonable masks, which is a good indicator that we haven't made any mistakes in the preprocessing and embedding.

**Positional embeddings** only depend on the size of the encoded image, so we save a single embedding per image size rather than per image.

In [None]:
#@title Encode Bounding Box Prompts as Sparse, Dense, and Positional Embeddings

# Output folders for the prompt embeddings (edit to the real destinations).
sparse_folder = '/content/path_to_sparse_folder'
dense_folder = '/content/path_to_dense_folder'
positional_folder = '/content/path_to_positional_folder'

# torch.save does not create missing directories, so make sure all outputs exist first.
for folder in (sparse_folder, dense_folder, positional_folder):
  os.makedirs(folder, exist_ok=True)

# Save Sparse and Dense embeddings
for i, tree in enumerate(ds):
  # The RGB image is stored under 'rgb' in every other cell of this notebook;
  # 'rgb_img' is not a key used anywhere else.
  rgb_img = tree['rgb']
  boxes = tree['prompt']
  name = tree['basename']
  # Transform the pixel-space boxes using the original image height and width.
  boxes_transform = sam_predictor.transform.apply_boxes(boxes, rgb_img.shape[:2])
  boxes_torch = torch.as_tensor(boxes_transform, dtype=torch.float)
  # NOTE(review): this call runs with autograd enabled, so the saved tensors may carry
  # requires_grad=True -- consider wrapping in torch.no_grad() if that is not wanted.
  sparse_embedding, dense_embedding = sam_model.prompt_encoder(
      points=None,
      boxes=boxes_torch,
      masks=None
  )
  torch.save(sparse_embedding, os.path.join(sparse_folder, f'{name}.pt'))
  torch.save(dense_embedding, os.path.join(dense_folder, f'{name}.pt'))
  print(i, name)

# Save positional embedding for encoded image size
# (should be the same for all images so long as they are encoded by the same SAM model).
image_embedding_size = sam_model.prompt_encoder.image_embedding_size
positional_embedding = sam_model.prompt_encoder.get_dense_pe()
# The file is named after the embedding size rather than after any single image.
torch.save(positional_embedding, os.path.join(positional_folder, f'{image_embedding_size}.pt'))
print()
print(image_embedding_size)

## Encoding Annotations and Class Labels

The NEONTreeDataset annotations are saved in PASCAL VOC format, which are XML documents saving (among other things) the bounding boxes as Xmin, Ymin, Xmax, Ymax coordinates in pixels. To train the Box Decoder, we need to normalize the pixel values by the original image size and convert them to Center_X, Center_Y, Width, Height format. We do this here.

We also create a "Class Label". For the NEONTreeDataset, there is only a single class ("Tree"), so the class label will be 0 for each object (later the model will add a non-object class of 1).

In [None]:
#@title Encode Annotation Bounding Boxes in CxCyWH format

# Output folder for the normalized annotation boxes (edit to the real destination).
annotation_folder = '/content/path_to_annotations'
# torch.save does not create missing directories.
os.makedirs(annotation_folder, exist_ok=True)

# Iterate over `ds`, the dataset built earlier in this notebook
# (`tree_ds` is never defined and raised a NameError on a fresh kernel).
for i, tree in enumerate(ds):
  name = tree['basename']
  # Image height and width in pixels (first two axes of the RGB array).
  h, w = tree['rgb'].shape[:2]
  orig_box = torch.from_numpy(tree['annotation'])
  # Convert xyxy -> cxcywh, then divide x-terms by width and y-terms by height
  # to normalize by the image size.
  resize_box = box_ops.box_xyxy_to_cxcywh(orig_box) / torch.Tensor([w,h,w,h])
  torch.save(resize_box, os.path.join(annotation_folder, f'{name}.pt'))
  print(i, name)

In [None]:
#@title Encode Class Labels

# Output folder for the per-image class labels (edit to the real destination).
label_folder = '/content/path_to_label_folder'
# torch.save does not create missing directories.
os.makedirs(label_folder, exist_ok=True)

# Iterate over `ds`, the dataset built earlier in this notebook
# (`tree_ds` is never defined and raised a NameError on a fresh kernel).
for i, tree in enumerate(ds):
  name = tree['basename']
  num_boxes = len(tree['annotation'])
  # Single-class dataset: every annotated box gets label 0.
  labels = torch.zeros(num_boxes, dtype=torch.int64)
  torch.save(labels, os.path.join(label_folder, f'{name}.pt'))
  print(i, name)