## Basic Set-up

In [1]:
#@title Connect to google drive

# Mount Google Drive at /content/drive; later cells read the KITTI images and
# model weights from it and write predictions back to it.
# force_remount=True asks Colab to remount even if a mount is already active.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
#@title Clone cseg into /content/ folder
# Shell command: clone the cseg repository into the current working directory.
# Later cells expect the checkout at /content/cseg.
!git clone https://github.com/noellelaw/cseg

Cloning into 'cseg'...
remote: Enumerating objects: 423, done.[K
remote: Counting objects: 100% (342/342), done.[K
remote: Compressing objects: 100% (212/212), done.[K
remote: Total 423 (delta 131), reused 316 (delta 120), pack-reused 81[K
Receiving objects: 100% (423/423), 6.66 MiB | 20.41 MiB/s, done.
Resolving deltas: 100% (147/147), done.


In [4]:
#@title Read in KITTI test data from drive (link share: https://drive.google.com/drive/folders/1LLKGeYnLXBY1lJXRKOUpTk4GKZaEoNYR?usp=drive_link)

# Imports
import os
from PIL import Image
# Google Drive folder holding the images; update to the path where the shared folder was saved
data_fldr = '/content/drive/MyDrive/training/image_2/' #@param {type:"string"}
# Parallel lists: images['image'][k] is the picture read from images['filename'][k]
images = {'filename':[],
          'image':[]}
# os.listdir returns entries in arbitrary order; sort so every run processes
# the files in the same order
for file in sorted(os.listdir(data_fldr)):
  full_file = os.path.join(data_fldr, file)
  # Skip sub-folders and other non-file entries that Image.open cannot read
  if not os.path.isfile(full_file):
    continue
  # Image.open is lazy and keeps the file handle open; copy the pixels into
  # memory and release the handle right away
  with Image.open(full_file) as img:
    images['image'].append(img.copy())
  images['filename'].append(file)

In [None]:
#@title Install and import OV-Seg requirments
%cd /content/cseg/
import multiprocessing as mp
!pip install -r requirements.txt
!pip install wandb
!pip install timm
!pip install ftfy

try:
    import detectron2
except:
    import os
    os.system('pip install git+https://github.com/facebookresearch/detectron2.git')

from detectron2.config import get_cfg

from detectron2.projects.deeplab import add_deeplab_config
from detectron2.data.detection_utils import read_image
from open_vocab_seg import add_ovseg_config
from open_vocab_seg.utils import VisualizationDemo

In [27]:
#@title Install vlseg_ensembling, needed for benchmark evaluation
# better way to do this in the future
import os
%cd /content/
!git clone https://github.com/noellelaw/vlseg_ensembling --recurse-submodules
%cd /content/vlseg_ensembling
os.mkdir('/content/vlseg_ensembling/kitti_benchmark_suite/devkit/devkit/evaluation/KITTI_RESULTS/results')
!pip install -r requirements.txt


/content
Cloning into 'vlseg_ensembling'...
remote: Enumerating objects: 1105, done.[K
remote: Counting objects: 100% (523/523), done.[K
remote: Compressing objects: 100% (444/444), done.[K
remote: Total 1105 (delta 74), reused 456 (delta 54), pack-reused 582[K
Receiving objects: 100% (1105/1105), 161.14 MiB | 17.66 MiB/s, done.
Resolving deltas: 100% (75/75), done.
Updating files: 100% (1143/1143), done.
/content/vlseg_ensembling
Collecting git+https://github.com/openai/CLIP.git (from -r requirements.txt (line 11))
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-cxqjlgi7
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-cxqjlgi7
  Resolved https://github.com/openai/CLIP.git to commit a1d071733d7111c9c014f024669f959182114e33
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [9]:
#@title Import common libraries
%cd /content/cseg
import torch
import requests
import sys
import numpy as np
import os, json, cv2, random
from google.colab.patches import cv2_imshow
import matplotlib.pyplot as plt
from PIL import Image
import math
from time import perf_counter

import matplotlib as mpl
import matplotlib.colors as mplc
import matplotlib.figure as mplfigure
from torchvision import transforms
import colorsys


/content/cseg


## Common Utils

In [11]:
#@title Color map class
# Black RGB triple; not referenced by the other cells shown in this notebook.
_OTHER = np.array([0, 0, 0])
# One RGB row per predicted class index. The prediction loop looks a row up
# with the class index, so rows follow the order of the prompt string defined
# in the Inference section. The two trailing black rows are spare entries.
_COLORS = np.array([
    [128,  64, 128],  # road
    [244,  35, 232],  # sidewalk
    [ 70,  70,  70],  # building
    [102, 102, 156],  # wall
    [190, 153, 153],  # fence
    [153, 153, 153],  # pole
    [250, 170,  30],  # traffic light
    [220, 220,   0],  # traffic sign
    [107, 142,  35],  # vegetation
    [152, 251, 152],  # terrain
    [ 70, 130, 180],  # sky
    [220,  20,  60],  # person
    [255,   0,   0],  # rider
    [  0,   0, 142],  # car
    [  0,   0,  70],  # truck
    [  0,  60, 100],  # bus
    [  0,  80, 100],  # train
    [  0,   0, 230],  # motorcycle
    [119,  11,  32],  # bicycle
    [  0,   0,   0],
    [  0,   0,   0],
])
# BASED OFF KITTI BENCHMARK SUITE LABEL MAPPING
# Gray value written to the single-channel prediction for each class index,
# stored as a (N, 1) column so that indexing a row yields a 1-element array.
_GRAYS = np.array([
    7, 8, 11, 12, 13, 17, 19, 20, 21, 22, 23,
    24, 25, 26, 27, 28, 31, 32, 33,
    0, 0,
])[:, np.newaxis]

In [12]:
#@title Overlay background mask on image
from typing import Tuple
# Overlay for 3 Channel RGB images
def overlay_rgb(
    image: np.ndarray,
    mask: np.ndarray,
    color: Tuple[int, int, int] = (255, 0, 0),
    alpha: float = 0.6,
    resize: Tuple[int, int] = (1242,375)
) -> np.ndarray:
    """Paint a mask onto a channel-first 3-channel image and blend the result.

    Params:
        image: Image with the three channels on axis 0.
        mask: Mask for one class; it is replicated across the three channels,
            and non-zero entries mark the pixels that receive ``color``.
        color: Three values, one per channel, written into the masked pixels.
        alpha: Weight of the painted image in the blend; the untouched image
            gets ``1 - alpha``.
        resize: Size handed to ``cv2.resize`` (the caller in this notebook
            passes ``(width, height)``). ``None`` skips resizing.

    Returns:
        The blended image. When ``resize`` is given, both inputs are moved to
        channel-last layout before blending, so the result is channel-last;
        with ``resize=None`` the layout of ``image`` is kept.

    """
    # (3, 1, 1) fill value so each channel receives its own colour component.
    fill = np.asarray(color).reshape(3, 1, 1)
    # Stack the mask three times along a new leading axis to line up with `image`.
    mask_per_channel = np.repeat(np.expand_dims(mask, 0), 3, axis=0)
    painted = np.ma.MaskedArray(
        image, mask=mask_per_channel, fill_value=fill
    ).filled()

    base = image
    if resize is not None:
        # Move channels to the last axis before handing the arrays to cv2.resize.
        base = cv2.resize(image.transpose(1, 2, 0), resize)
        painted = cv2.resize(painted.transpose(1, 2, 0), resize)

    return cv2.addWeighted(base, 1 - alpha, painted, alpha, 0)
# Overlay for gray scale images
def overlay(
    image: np.ndarray,
    mask: np.ndarray,
    color: int = 255,
    alpha: float = 0.6,
    resize: Tuple[int, int] = (1242,375)
) -> np.ndarray:
    """Paint a mask onto a single-channel image and blend the result.

    Params:
        image: Gray-scale image.
        mask: Mask for one class; non-zero entries mark the pixels that
            receive ``color``.
        color: Single gray value (a scalar or a one-element sequence) written
            into the masked pixels.
        alpha: Weight of the painted image in the blend; the untouched image
            gets ``1 - alpha``.
        resize: Size handed to ``cv2.resize`` (the caller in this notebook
            passes ``(width, height)``). ``None`` skips resizing.

    Returns:
        The blended image.

    """
    # Single fill value, shaped (1, 1, 1) like the leading-axis mask below.
    fill = np.asarray(color).reshape(1, 1, 1)
    # Same construction as the RGB variant, with one "channel" instead of three.
    mask_one_channel = np.repeat(np.expand_dims(mask, 0), 1, axis=0)
    painted = np.ma.MaskedArray(
        image, mask=mask_one_channel, fill_value=fill
    ).filled()

    base = image
    if resize is not None:
        base = cv2.resize(image, resize)
        painted = cv2.resize(painted, resize)

    return cv2.addWeighted(base, 1 - alpha, painted, alpha, 0)

## Build Model

In [18]:
#@title Utils
import torch
from time import perf_counter
def setup_cfg(config_file, model_weights):
    """Build the frozen config used to construct the demo.

    Params:
        config_file: Path of the config file merged into the defaults.
        model_weights: Value stored in ``MODEL.WEIGHTS``.

    Returns:
        The config object after ``freeze()`` has been called on it.
    """
    config = get_cfg()
    # Extend the default config with the DeepLab and OV-Seg options, in that
    # order, before the config file is merged in.
    for extend_config in (add_deeplab_config, add_ovseg_config):
        extend_config(config)
    config.merge_from_file(config_file)
    # Set after the merge so the explicit weights argument is the value used.
    config.MODEL.WEIGHTS = model_weights
    config.freeze()
    return config


def inference(class_names, input_img, demo):
    """Run the demo on one image and collect per-class masks and confidences.

    Params:
        class_names: Comma-separated class names (e.g. ``'road, sidewalk'``) or
            an already-split list/tuple of names. Whitespace around each name
            is stripped before the names are passed on.
        input_img: Image handed unchanged to ``demo.run_on_image``.
        demo: Object exposing ``run_on_image(image, class_names)``; the first
            element of its return value is used as the prediction dict.

    Returns:
        ``(output, tfinal)`` where ``output`` is a dict with the keys
        ``'pred_masks'``, ``'pred_classes'`` and ``'scores'`` and ``tfinal`` is
        the elapsed time in seconds. Each list holds one entry when the
        prediction dict contains ``"sem_seg"`` and stays empty otherwise.
    """
    print("Getting ovseg predictions ... ")
    t1_start = perf_counter()
    output = {
        'pred_masks': [],
        'pred_classes': [],
        'scores' : [],
        }
    if isinstance(class_names, str):
        class_names = class_names.split(',')
    # 'road, sidewalk'.split(',') yields ' sidewalk'; strip so the model is not
    # given class names with stray leading/trailing spaces.
    class_names = [name.strip() for name in class_names]
    preds = demo.run_on_image(input_img, class_names)[0]
    if "sem_seg" in preds:
        r = preds["sem_seg"]
        # Pixels whose first-channel score is exactly 0 are treated as 'dead'
        # area. The mask is moved to the CPU because it indexes pred_mask,
        # which is moved to the CPU below.
        blank_area = (r[0] == 0).to('cpu')
        pred_mask = r.argmax(dim=0).to('cpu')
        probs = r.softmax(dim=0).detach().cpu()
        confidence, _ = probs.max(dim=0)
        # 255 is >= len(class_names) for the prompt sets used here, so
        # get_sem_seg_masks drops these pixels.
        pred_mask[blank_area] = 255
        pred_mask = np.array(pred_mask, dtype=int)
        pred_classes, pred_masks = get_sem_seg_masks(
            sem_seg = pred_mask,
            class_names = class_names,
        )
        output['pred_classes'].append(pred_classes)
        output['pred_masks'].append(pred_masks)
        output['scores'].append(confidence.numpy())
    tfinal = perf_counter() - t1_start
    print("Returning clipseg predictions: {:.2f} seconds".format(tfinal))
    return output, tfinal

def get_sem_seg_masks(sem_seg, class_names, area_threshold=None):
    """Split a label map into one binary mask per known class.

    Params:
        sem_seg: Label map (NumPy array, or a torch tensor that is converted
            with ``.numpy()``) holding one class index per pixel.
        class_names: Known class names; only labels smaller than
            ``len(class_names)`` are kept.
        area_threshold: Accepted for interface compatibility; not used.

    Returns:
        ``(pred_classes, pred_masks)``: the kept labels, ordered from the
        largest pixel count to the smallest, and for each of them a ``uint8``
        mask that is 1 where ``sem_seg`` equals the label.
    """
    if isinstance(sem_seg, torch.Tensor):
        sem_seg = sem_seg.numpy()
    present_labels, pixel_counts = np.unique(sem_seg, return_counts=True)
    # Negating the counts makes argsort produce a largest-area-first ordering.
    largest_first = present_labels[np.argsort(-pixel_counts).tolist()]
    num_classes = len(class_names)
    # The class index is stored rather than its text name.
    pred_classes = [label for label in largest_first if label < num_classes]
    pred_masks = [(sem_seg == label).astype(np.uint8) for label in pred_classes]
    return pred_classes, pred_masks




In [19]:
#@title Build model
# Select the "spawn" start method for multiprocessing; force=True replaces a
# start method that may already have been set in this session.
mp.set_start_method("spawn", force=True)
# Config file and model weights: adjust both paths to your own environment.
# The #@param annotations expose them as Colab form text fields.
config_file = '/content/cseg/configs/ovseg_swinB_vitL_demo.yaml' #@param {type:"string"}
model_weights = '/content/drive/MyDrive/model_weights/output_2layers/model_0000999_hpc.pth' #@param {type:"string"}
# Build the frozen config and the demo object used by the inference cells.
cfg = setup_cfg(config_file, model_weights)
demo = VisualizationDemo(cfg)





## Inference

In [21]:
#@title Define prompt string
# Class names in class-index order; the colour and gray tables are indexed
# with the position of a name in this list.
prompt_list = [
    'road', 'sidewalk', 'building', 'wall', 'fence', 'pole',
    'traffic light', 'traffic sign', 'vegetation', 'terrain', 'sky',
    'person', 'rider', 'car', 'truck', 'bus', 'train',
    'motorcycle', 'bicycle',
]
# Comma-separated form of the same names, as passed to inference().
prompts = ', '.join(prompt_list)

In [None]:
#@title Get all cseg predictions
# Results collected per processed image (parallel lists).
# 'ovseg' is kept for compatibility but is not filled by this cell.
pred_data = {'filename': [],
             'image': [],
             'ovseg': [],
             'time': [],
             'final': [],
             'final_rgb': []}
# Folder to save for KITTI evaluation, don't change
eval_fldr = '/content/vlseg_ensembling/kitti_benchmark_suite/devkit/devkit/evaluation/KITTI_RESULTS/results/'
# Folders in your drive to save to
# grayscale predictions
pred_fldr = '/content/drive/MyDrive/zeroshot/cseg2/' #@param {type:"string"}
# RGB predictions
rgb_fldr = '/content/drive/MyDrive/zeroshot/cseg2_rgb/' #@param {type:"string"}
# Confidence predictions
conf_fldr = '/content/drive/MyDrive/zeroshot/cseg2_conf/' #@param {type:"string"}

# Image.save does not create folders, so make sure every destination exists
for out_fldr in (eval_fldr, pred_fldr, rgb_fldr, conf_fldr):
  os.makedirs(out_fldr, exist_ok=True)

for i, pil_image in enumerate(images['image']):
  filename = images['filename'][i]
  print(f'Processing image {i}.')
  np_img = np.array(pil_image).astype('uint8')
  # `elapsed` instead of `time`, so the stdlib module name is not shadowed
  preds, elapsed = inference(prompts, np_img, demo)
  # inference() leaves its lists empty when the model returns no "sem_seg"
  if not preds['pred_masks']:
    print(f'No segmentation returned for {filename}, skipping.')
    continue

  ovseg_classes = preds['pred_classes'][0]
  ovseg_masks = preds['pred_masks'][0]
  pred_data['filename'].append(filename)
  pred_data['image'].append(pil_image)
  pred_data['time'].append(elapsed)
  #---------------------------------------------------------------------------
  # Paint the predicted masks onto blank canvases of the input image's size
  height, width = np_img.shape[:2]
  # Get RGB Image -----------------------------------------------------
  image_rgb = np.zeros((height, width, 3))
  for class_idx, mask in zip(ovseg_classes, ovseg_masks):
    # overlay_rgb takes a channel-first image and returns a channel-last one
    image_rgb = overlay_rgb(
        image = image_rgb.transpose(2,0,1),
        mask = mask,
        color = tuple(_COLORS[class_idx]),
        resize = (width, height),
        alpha = 1
    )
  # Get Gray Image -----------------------------------------------------
  image_gray = np.zeros((height, width))
  for class_idx, mask in zip(ovseg_classes, ovseg_masks):
    image_gray = overlay(
        image = image_gray,
        mask = mask,
        color = tuple(_GRAYS[class_idx]),
        resize = (width, height),
        alpha = 1
    )
  image_gray = image_gray.astype('uint8')
  image_rgb = image_rgb.astype('uint8')
  pred_data['final'].append(image_gray)
  pred_data['final_rgb'].append(image_rgb)
  # Save images: the gray prediction goes to both the evaluation folder and
  # google drive; RGB and confidence images go to google drive only
  gray_png = Image.fromarray(image_gray)
  gray_png.save(os.path.join(eval_fldr, filename))
  gray_png.save(os.path.join(pred_fldr, filename))
  Image.fromarray(image_rgb).save(os.path.join(rgb_fldr, filename))
  # Confidence is scaled by 255 and stored as an 8-bit image
  image_conf = Image.fromarray(np.array(preds['scores'][0]*255).astype('uint8'))
  image_conf.save(os.path.join(conf_fldr, filename))


## KITTI Evaluation

In [29]:
#@title Perform KITTI Evaluation
# Runs the devkit's pixel-level semantic labeling evaluation script as a shell command.
# NOTE(review): the prediction cell writes its evaluation copies to
# .../evaluation/KITTI_RESULTS/results/, while this command passes /content/output/.
# Confirm which folder the script actually reads: the recorded output below
# reports a ground-truth file without a matching prediction.
! python /content/vlseg_ensembling/kitti_benchmark_suite/devkit/devkit/evaluation/evalPixelLevelSemanticLabeling.py /content/output/

ERROR: Found no prediction for ground truth /content/vlseg_ensembling/kitti_benchmark_suite/devkit/devkit/evaluation/KITTI_RESULTS/training/semantic/000125_10.png
