# U-Net Model Implementation
Environment Notebook was built in:
- Google Colab

Hardware needs:
- 10 GB CPU RAM
- NVidia T4 GPU -> 16 GB
- 30 GB disk

## Data Gathering and Augmentation

### Imports and Installs

#### Using segment_model_requirements.txt
- Please uncomment to use

In [None]:
# uncomment to create the notebook package requirements file called segment_model_requirements.txt
# !pip freeze > segment_model_requirements.txt

# use the code below to install all necessary packages from segment_model_requirements.txt
#!pip install -r segment_model_requirements.txt

#### Necessary installs on top of Google Colab environment
- Uncomment to run this cell if you aren't using segment_model_requirements.txt and are operating in the GPU google colab environment

In [None]:
#!pip install google-cloud-storage
#!pip install ipyplot

#### Imports

In [24]:
#Standard Library Imports
import os
import time
from io import BytesIO
import random

#Third party imports
from google.cloud import storage

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import data
from torch.cuda.amp import autocast, GradScaler
import torch.optim as optim
from torchvision.transforms.functional import pad
from torch.utils.data import random_split

import skimage
from skimage.io import imread
from skimage.transform import resize

from matplotlib import pyplot as plt
import ipyplot

import altair

from PIL import Image

#Local application imports
from transforms import (
    ComposeDouble,
    FunctionWrapperDouble,
    create_dense_target,
    normalize,
    gaussian_smoothing,
    image_histogram_equalization
)

import build_unet

from helper import (
    dice_loss,
    custom_collate,
    analyze_num_classes
)

### Grabbing data from Google bucket
First, we want to set up credentials and the client. The Waymo dataset isn't publicly open for use without permissions. Permissions were required for transfer of bucket information from Waymo google cloud services to our own.

In [4]:
# Where the parquet files live: 'GCS' for Google Cloud Storage, or 'LOCAL' for
# files stored in the same folder as this notebook.
file_store = 'GCS'

# Leave both as None if using LOCAL; otherwise point these at your own GCS
# API key JSON file and bucket name.
google_application_credentials = '/content/authentication.json' #replace this string with your own credentials
bucket_name = 'waymo_sample_bucket'

# The documented value is 'GCS', but this check used to compare against the
# transposed spelling 'GSC', so the client was never created with the default
# setting. Both spellings are accepted to stay compatible with older configs.
if file_store in ('GCS', 'GSC'):
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = google_application_credentials
    client = storage.Client()

#### Downloading from the bucket
For smaller numbers of parquets, a dataframe is created all at once. 

Otherwise, for the larger, overall training, we download the parquet files iteratively, one at a time. The functions below do these things. This keeps memory usage manageable.

- Camera Image Parquet Files
- Camera Segmentation Files

In [25]:
# Grab data for iterative training
def grab_data_image():
  """Yield one DataFrame per parquet blob under 'training/camera_image/'.

  Uses the notebook-level `client` and `bucket_name`. Each blob is downloaded
  to 'temp_file.parquet' in the working directory (overwritten on every
  iteration) and read with pandas before being yielded, so only one parquet
  is held by this generator at a time.

  Yields:
    pandas.DataFrame: the contents of the next camera-image parquet.
  """
  prefix = 'training/camera_image/'
  temp = "temp_file.parquet"
  bucket = client.get_bucket(bucket_name)

  for blob1 in bucket.list_blobs(prefix=prefix):
    # Skip zero-content "folder" placeholder objects, whose names end in '/';
    # they are not parquet files and would make read_parquet fail.
    if blob1.name.endswith('/'):
      continue

    print("Loading in ", blob1)

    # download the blob content to a temp file, then load it
    blob1.download_to_filename(temp)

    # Yield results for iterative training
    yield pd.read_parquet(temp)

# Grab data for iterative training
def grab_data_segment():
  """Yield one DataFrame per parquet blob under 'training/camera_segmentation/'.

  Uses the notebook-level `client` and `bucket_name`. Each blob is downloaded
  to 'temp_file.parquet' in the working directory (overwritten on every
  iteration) and read with pandas before being yielded, so only one parquet
  is held by this generator at a time.

  Yields:
    pandas.DataFrame: the contents of the next camera-segmentation parquet.
  """
  prefix = 'training/camera_segmentation/'
  temp = "temp_file.parquet"
  bucket = client.get_bucket(bucket_name)

  for blob2 in bucket.list_blobs(prefix=prefix):
    # Skip zero-content "folder" placeholder objects, whose names end in '/';
    # they are not parquet files and would make read_parquet fail.
    if blob2.name.endswith('/'):
      continue

    print("Loading in ", blob2)

    # download the blob content to a temp file, then load it
    blob2.download_to_filename(temp)

    # Yield results for iterative training
    yield pd.read_parquet(temp)

#--------This one will be important for hyperparameter tuning ---------
def grab_data(folder_name = 'training/camera_image/', max_parquet = 3):
  """Load the first few parquet blobs under `folder_name` into one DataFrame.

  Uses the notebook-level `client` and `bucket_name`. Blobs are downloaded one
  at a time to 'temp_file.parquet' and concatenated once at the end (rather
  than re-copying the accumulated frame on every iteration).

  Args:
    folder_name: bucket prefix to list blobs under.
    max_parquet: zero-based index of the last parquet to read.
      NOTE(review): reading stops *after* the file at this index, so up to
      max_parquet + 1 files are loaded. That count is preserved here so
      callers keep receiving the same amount of data - confirm whether an
      exclusive limit was intended.

  Returns:
    pandas.DataFrame: the row-wise concatenation of the non-empty parquets
    (original row indexes are kept), or an empty DataFrame when nothing was
    loaded.
  """
  bucket = client.get_bucket(bucket_name)
  temp = "temp_file.parquet"

  frames = []
  loaded = 0
  for blob in bucket.list_blobs(prefix=folder_name):
    # Skip zero-content "folder" placeholder objects, whose names end in '/'
    if blob.name.endswith('/'):
      continue

    # download the blob content to a temp file
    blob.download_to_filename(temp)
    df = pd.read_parquet(temp)
    loaded += 1

    # empty parquets contribute no rows, so leave them out of the concat
    if not df.empty:
      frames.append(df)

    if loaded == max_parquet + 1:
      break

  if not frames:
    return pd.DataFrame()
  return pd.concat(frames)

#### For Local use
- get_local_data is a one-and-done loader, used particularly for the hyperparameter tuning. This is best for the sample data.
- This example is best used for the sample data and fits the similar format.

- The other two functions are data generators for iterative training of the larger model.

In [None]:
def get_local_data():
    """Load and merge the local sample parquets for hyperparameter tuning.

    Reads every file in 'waymo_sample_data/training/camera_image' and
    'waymo_sample_data/training/camera_segmentation' (relative to the working
    directory), concatenates each folder into one frame, and inner-joins the
    two on segment name, frame timestamp and camera name.

    Returns:
        pandas.DataFrame: two columns, 'image' and 'seg_label', holding the
        '[CameraImageComponent].image' and
        '[CameraSegmentationLabelComponent].panoptic_label' values of the
        matched rows.
    """
    camera_image_file_path = 'waymo_sample_data/training/camera_image'
    segmented_image_file_path = 'waymo_sample_data/training/camera_segmentation'

    def _read_folder(folder):
        # Sorted so the row order does not depend on the filesystem's listing order
        frames = [pd.read_parquet(folder + "/" + each) for each in sorted(os.listdir(folder))]
        frames = [frame for frame in frames if not frame.empty]
        return pd.concat(frames) if frames else pd.DataFrame()

    # Get local camera image data
    df_temp_camera = _read_folder(camera_image_file_path)

    # Get local segmented image data
    df_temp_seg = _read_folder(segmented_image_file_path)

    # The merged dataframe
    df_tot = pd.merge(
        df_temp_camera,
        df_temp_seg,
        on=['key.segment_context_name', 'key.frame_timestamp_micros', 'key.camera_name'],
    )[['[CameraImageComponent].image', '[CameraSegmentationLabelComponent].panoptic_label']]

    print("There are" , len(df_tot) ,"images") #checking the results of the merge
    df_tot.columns = ['image','seg_label'] #rename columns for typing ease
    return df_tot


def camera_images():
    """Yield one DataFrame per file in the local camera-image sample folder.

    Files are visited in sorted name order. os.listdir returns entries in
    arbitrary order, and this generator is consumed in step with the
    segmentation generator, so a stable order is needed for the two streams
    to line up.

    Yields:
        pandas.DataFrame: the contents of the next camera-image parquet.
    """
    folder = 'waymo_sample_data/training/camera_image'
    # Get local camera image data
    for file_name in sorted(os.listdir(folder)):
        yield pd.read_parquet(folder + "/" + file_name)

def segmented_images():
    """Yield one DataFrame per file in the local camera-segmentation sample folder.

    Files are visited in sorted name order. os.listdir returns entries in
    arbitrary order, and this generator is consumed in step with the
    camera-image generator, so a stable order is needed for the two streams
    to line up.

    Yields:
        pandas.DataFrame: the contents of the next segmentation parquet.
    """
    folder = 'waymo_sample_data/training/camera_segmentation'
    # Get local segmented image data
    for file_name in sorted(os.listdir(folder)):
        yield pd.read_parquet(folder + "/" + file_name)
    
    

### Defining the Data
First, we want to define a class that allows us to transform our current data into a PyTorch-compatible format, configured with transformations.

In [26]:
class SegmentationDataSet(data.Dataset):
    """Torch dataset that decodes (image, segmentation label) byte pairs.

    Args:
        inputs: sequence of encoded image bytes (a list or a pandas Series).
        targets: sequence of encoded label-image bytes, parallel to `inputs`.
        transform: optional callable taking and returning an `(x, y)` pair of
            numpy arrays; applied after decoding.
    """

    def __init__(self,
                 inputs: list,
                 targets: list,
                 transform=None
                 ):
        self.inputs = inputs
        self.targets = targets
        self.transform = transform
        # dtypes the decoded arrays are cast to in __getitem__
        self.inputs_dtype = torch.float32
        self.targets_dtype = torch.long

    def __len__(self):
        """Return the number of samples."""
        return len(self.inputs)

    @staticmethod
    def _item_at(collection, index):
        """Return the element at *position* `index`.

        A pandas Series indexed with `[]` looks up by label, which fails or
        returns the wrong row once the index is not 0..n-1 (after filtering
        or concatenating, for instance). `.iloc` is used when available so a
        Series behaves like a list here.
        """
        positional = getattr(collection, "iloc", None)
        if positional is None:
            return collection[index]
        return positional[index]

    def __getitem__(self,
                    index: int):
        """Decode, transform and tensorise the sample at position `index`.

        Returns:
            tuple: `(x, y)` tensors cast to `inputs_dtype` and `targets_dtype`.
        """
        # Select the sample
        input_ID = self._item_at(self.inputs, index)
        target_ID = self._item_at(self.targets, index)

        # Load input and target
        with Image.open(BytesIO(input_ID)) as input_image:
            x = np.array(input_image)
        with Image.open(BytesIO(target_ID)) as target_image:
            y = np.array(target_image)

        # Preprocessing
        if self.transform is not None:
            x, y = self.transform(x, y)

        # Typecasting
        x = torch.from_numpy(x).type(self.inputs_dtype)
        y = torch.from_numpy(y).type(self.targets_dtype)

        return x, y
    

Next, we apply a pipeline of transformations to the training data. This is to help prevent data leakage, as well as streamline our process. We resize the images to be smaller in order to prevent any memory issues when developing our model. Due to the size of our dataset and the variety of the images we previewed, we decided against further augmentation of the data. 
Augmentation would be considered if the original dataset were smaller.

In [27]:
# Joint preprocessing pipeline applied to every (image, label) pair
transforms = ComposeDouble([
    # image only: collapse RGB to grayscale
    FunctionWrapperDouble(skimage.color.rgb2gray, input=True, target=False),

    # image only: resize to 640x960 with anti-aliasing, keeping the value range
    FunctionWrapperDouble(
        resize,
        input=True,
        target=False,
        output_shape=(640, 960),
        anti_aliasing=True,
        preserve_range=True,
    ),

    # label only: resize to the same shape with order=0 and no anti-aliasing
    FunctionWrapperDouble(
        resize,
        input=False,
        target=True,
        output_shape=(640, 960),
        order=0,
        anti_aliasing=False,
        preserve_range=True,
    ),

    # image only: normalize
    FunctionWrapperDouble(normalize, input=True, target=False),

    # image only: Gaussian smoothing
    FunctionWrapperDouble(gaussian_smoothing, input=True, target=False, sigma_val=1),

    # image only: contrast enhancement via histogram equalization
    FunctionWrapperDouble(image_histogram_equalization, input=True, target=False),

    # label only: make the segmentation target dense
    FunctionWrapperDouble(create_dense_target, input=False, target=True),
])

### The big Hyperparameter tuning loop!
This loop is configured to save the loss and validation outputs from each run so that they can be analyzed later. We are grabbing the data differently here than we are in the actual training. Because the hyperparameter tuning runs on a smaller amount of data, we load all of it up front. Here we are gathering about 200 images.

In [None]:
# 'GCS' is the documented setting; the transposed 'GSC' that this cell used to
# require is still accepted so existing configurations keep working.
if file_store in ("GCS", "GSC"):
    #--------- Get image data ------------
    df_image = grab_data(folder_name= 'training/camera_image/', max_parquet = 3)

    # --------- Get segmented data ----------
    df_seg = grab_data(folder_name= 'training/camera_segmentation/', max_parquet = 3)

    # The merged dataframe
    df_tot = pd.merge(
        df_image,
        df_seg,
        on=['key.segment_context_name', 'key.frame_timestamp_micros', 'key.camera_name'],
    )[['[CameraImageComponent].image', '[CameraSegmentationLabelComponent].panoptic_label']]

    # sample() raises when asked for more rows than exist, so cap the request
    df_tot = df_tot.sample(min(200, len(df_tot)), ignore_index=True)

    print("There are" , len(df_tot) ,"images") #checking the results of the merge
    df_tot.columns = ['image','seg_label'] #rename columns for typing ease

    # Help clear up some space
    del df_image
    del df_seg
elif file_store == "LOCAL":
    df_tot = get_local_data()
else:
    # Fail here rather than later with a NameError on df_tot
    raise ValueError(
        f"Unknown file_store {file_store!r}: it is CASE sensitive. Options are 'GCS' or 'LOCAL'"
    )

Define some loss functions and other hyperparameters to try

In [None]:
# Candidate loss functions, keyed by the name reported in the tuning results
loss_functions = dict(
    DiceLoss=dice_loss,
    CrossEntropyLoss=nn.CrossEntropyLoss(),
    BCEWithLogitsLoss=nn.BCEWithLogitsLoss(),
)

# Grid of settings tried for every loss function
hyperparameters = dict(
    lr=[0.0001, 0.001],
    batch_size=[2, 6, 12],
)

Then define the training and validation datasets.

In [None]:
# Wrap the merged frame in the torch dataset
dataset = SegmentationDataSet(
    inputs=df_tot['image'],
    targets=df_tot['seg_label'],
    transform=transforms,
)

# Hold out 10% (rounded down) for validation; the split is seeded with 0
n_val = int(len(dataset) * 0.1)
n_train = len(dataset) - n_val
train_set, val_set = random_split(
    dataset,
    [n_train, n_val],
    generator=torch.Generator().manual_seed(0),
)

#### Finally, we Tune! 

In [None]:
num_epochs = 5

# Hyperparameter tuning loop: a full grid over loss function x learning rate x
# batch size. The best configuration is judged on final-epoch training loss.
best_loss = float("inf")
best_loss_function = None
best_hyperparameters = None
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
results = []

# `import build_unet` binds the module, which cannot be called directly. Use
# the same-named attribute when the module exposes one, otherwise fall back to
# whatever `build_unet` is bound to.
# NOTE(review): assumes the model constructor is build_unet.build_unet - confirm.
unet_factory = getattr(build_unet, "build_unet", build_unet)

# Loop for Loss functions
for loss_name, loss_function in loss_functions.items():
    # Loop for Learning Rate
    for lr in hyperparameters["lr"]:
        # Loop for Batch Size
        for batch_size in hyperparameters["batch_size"]:
            # Now we're running!
            print("Running "+loss_name+" with loss rate of ",lr,"with batch size",batch_size)

            # put the training and validation data into each dataloader
            training_dataloader = data.DataLoader(dataset=train_set,
                                                  batch_size=batch_size,
                                                  shuffle=True,
                                                  collate_fn=custom_collate)
            validation_dataloader = data.DataLoader(dataset=val_set,
                                                    batch_size=batch_size,
                                                    shuffle=False,
                                                    collate_fn=custom_collate)

            # Create our model instance, optimizer, and loss function
            model = unet_factory(num_classes=1, device=device).to(device)
            optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=0.01)
            criterion = loss_function

            # Initialize the GradScaler for mixed-precision training
            scaler = GradScaler()

            # Both histories live outside the epoch loop so that they collect
            # one entry per epoch (the validation list used to be reset every
            # epoch and so only ever held the last value).
            loss_per_epoch = []
            eval_loss_list = []

            # Epoch Loop
            for epoch in range(num_epochs):
                model.train()
                total_loss = 0.0
                start_time = time.time()

                for batch_idx, (inputs, labels, mask) in enumerate(training_dataloader):

                    # Move the batch to the training device as float32 (done
                    # on CPU as well, so both devices see the same dtypes)
                    inputs = inputs.to(device, dtype=torch.float32)
                    labels = labels.to(device, dtype=torch.float32)
                    mask = mask.to(device, dtype=torch.float32)

                    optimizer.zero_grad()

                    # Analyze the number of classes in the current segmentation mask
                    num_classes = analyze_num_classes(labels)

                    # Update the model's output layer to have the correct number of
                    # channels (classes). The layer is moved to `device` so its
                    # weight sits with the bias created below.
                    # NOTE(review): this layer is rebuilt for every batch and is
                    # created after the optimizer, so the optimizer never updates
                    # its parameters - confirm this is intended.
                    model.output_layer = nn.Conv2d(64, num_classes, kernel_size=1, padding=0).to(device)

                    # Add nn.Parameter for bias with the desired precision
                    model.output_layer.bias = nn.Parameter(torch.zeros(num_classes, dtype=torch.float32, device=device))

                    # Use autocast to perform mixed-precision training
                    with autocast():
                        outputs = model(inputs)
                        loss = criterion(outputs*mask, labels*mask)

                    scaler.scale(loss).backward()
                    scaler.step(optimizer)
                    scaler.update()

                    total_loss += loss.item()

                # Calculate average loss for the epoch
                average_loss = total_loss / len(training_dataloader)
                loss_per_epoch.append(average_loss)
                # End time for the epoch
                end_time = time.time()
                epoch_time = end_time - start_time

                # Print progress with batch and epoch times
                print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {average_loss:.4f}, Epoch Time: {epoch_time:.2f} seconds")

                ## ------------ end of current epoch ---------------

                ## - ----------- eval ------------------------------
                model.eval() # set to eval mode
                total_val_loss = 0.0
                with torch.no_grad():
                    for val_batch_idx, (val_inputs, val_labels, val_mask) in enumerate(validation_dataloader):
                        # Move validation inputs and labels to the same device as the model
                        val_inputs = val_inputs.to(device)
                        val_labels = val_labels.to(device)
                        val_mask = val_mask.to(device)

                        # Forward pass
                        val_outputs = model(val_inputs)

                        # Compute the validation loss; cross-entropy is given
                        # class indices taken along dim 1 of the masked labels
                        if loss_name == 'CrossEntropyLoss':
                            val_labels_indices = (val_labels*val_mask).argmax(dim=1)
                            val_loss = criterion(val_outputs*val_mask, val_labels_indices)
                        else:
                            val_loss = criterion(val_outputs*val_mask, val_labels*val_mask)

                        total_val_loss += val_loss.item()

                # Calculate the average validation loss
                average_val_loss = total_val_loss / len(validation_dataloader)
                eval_loss_list.append(average_val_loss)
                print(f"Validation Loss: {average_val_loss:.4f}")

            # Update best hyperparameters if current loss is better
            if average_loss < best_loss and average_loss >= 0: #we don't want super big negatives
                best_loss = average_loss
                best_loss_function = loss_name
                best_hyperparameters = {"lr": lr, "batch_size": batch_size}

            # Save hyperparameter tuning information and loss
            result = {
                "Loss Function": loss_name,
                "Learning Rate": lr,
                "Batch Size": batch_size,
                "Loss per Epoch": loss_per_epoch,
                "Validation Loss per Epoch": eval_loss_list
            }
            results.append(result)

# Convert the results list to a DataFrame
results_df = pd.DataFrame(results) #Import for visualizations
print(f"Best loss function: {best_loss_function}")
print(f"Best hyperparameters: {best_hyperparameters}")

#### Do you want to save?
Uncomment the below portion to save the results

In [None]:
# print("Saving all of Information")
# results_df.to_csv("hyperparam_tuning.csv", index=False)

#### Then, we graph
Graphing with Altair involves configuring our results dataframe a little bit before going ahead and applying. 

In [None]:
# One list of epoch indexes per run, sized from that run's own loss history so
# the two columns always have matching lengths when exploded together
epoch_num = [list(range(len(losses))) for losses in results_df['Loss per Epoch']]

results_df['Epoch_Num'] = epoch_num

# One row per (run, epoch)
results_df = results_df.explode(['Loss per Epoch','Epoch_Num']).reset_index(drop=True)

# explode() leaves object-dtype columns; make them numeric for plotting
results_df['Loss per Epoch'] = results_df['Loss per Epoch'].astype(float)
results_df['Epoch_Num'] = results_df['Epoch_Num'].astype(int)

# The file imports the library as `altair` (there is no `alt` alias), so the
# full module name is used here.
altair.Chart(results_df).mark_line().encode(
    x = 'Epoch_Num:N',
    y = 'Loss per Epoch:Q',
    color = 'Batch Size:N',
).properties(
    width = 180,
    height = 180
).facet(
    column ='Loss Function:N',
    row ='Learning Rate:N'
).resolve_scale(
    y = 'independent',
    x = 'independent'
)

### Time to Train
- Training requires the input data to be in the format of a generator!
- The number of parquets defined is the number provided in the sample. 

Firstly, we want to define the use of GPU as well as our preferred hyperparameters. The GPU was set in our hyperparameter tuning loop, but it doesn't hurt to double check.

In [28]:
# Use the GPU whenever PyTorch can see one, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Settings for the full training run
hyperparameters = dict(
    lr=0.0001,
    batch_size=6,
    loss_function=dice_loss,
    parquets=2,
    num_epochs=10,
)


Next, we create the model instance and train as follows, iteratively going through each parquet file from the WAYMO perception bucket.
Checkpoints and print statements are built in so that, in case something goes wrong, the current model state is saved.

In [None]:
#initiate the model
# `import build_unet` binds the module, which cannot be called directly. Use
# the same-named attribute when the module exposes one, otherwise fall back to
# whatever `build_unet` is bound to.
# NOTE(review): assumes the model constructor is build_unet.build_unet - confirm.
unet_factory = getattr(build_unet, "build_unet", build_unet)
model = unet_factory(num_classes=1).to(device)
optimizer = optim.Adam(model.parameters(), lr=hyperparameters['lr'], weight_decay = 0.01)
criterion = hyperparameters['loss_function']

# Initialize the GradScaler for mixed-precision training
scaler = GradScaler()

# Initialize the data generators to iterate through. The setting is called
# `file_store` (this cell used to read an undefined `local_store`); both the
# documented 'GCS' and the transposed 'GSC' spelling are accepted.
if file_store in ('GCS', 'GSC'):
    df_image = grab_data_image()
    df_seg = grab_data_segment()
elif file_store == "LOCAL":
    df_image = camera_images()
    df_seg = segmented_images()
else:
    raise ValueError(
        f"Unknown file_store {file_store!r}: it is CASE sensitive. Options are 'GCS' or 'LOCAL'"
    )

# to hold our results
results = []

# Iterating through data. Putting the counter first in zip() stops the loop
# after the requested number of parquets without fetching an extra one, and
# ends it cleanly if either generator runs out early.
parquet_numbers = range(1, hyperparameters['parquets'] + 1)
for i, image_part, seg_part in zip(parquet_numbers, df_image, df_seg):

    print("Loaded parquet",i)
    # The merged dataframe
    df_tot = pd.merge(
        image_part,
        seg_part,
        on=['key.segment_context_name', 'key.frame_timestamp_micros', 'key.camera_name'],
    )[['[CameraImageComponent].image', '[CameraSegmentationLabelComponent].panoptic_label']]

    print("Iteration",i,"There are" , len(df_tot) ,"images") #checking the results of the merge
    df_tot.columns = ['image','seg_label'] #rename columns for typing ease

    training_dataset = SegmentationDataSet(inputs=df_tot['image'],
                                           targets=df_tot['seg_label'],
                                           transform=transforms)
    ## ----- Training loop -----------
    training_dataloader = data.DataLoader(dataset=training_dataset,
                                          batch_size=hyperparameters['batch_size'],
                                          shuffle=True,
                                          collate_fn=custom_collate)

    # Collected once per parquet; it used to be reset at the top of every
    # epoch, so only the last epoch's loss reached the results and checkpoint.
    loss_per_epoch = []
    for epoch in range(hyperparameters['num_epochs']):
        model.train()
        total_loss = 0.0
        start_time = time.time()
        for batch_idx, (inputs, labels, mask) in enumerate(training_dataloader):

            # Move the batch to the training device as float32 (done on CPU
            # as well, so both devices see the same dtypes)
            inputs = inputs.to(device, dtype=torch.float32)
            labels = labels.to(device, dtype=torch.float32)
            mask = mask.to(device, dtype=torch.float32)

            optimizer.zero_grad()
            # Analyze the number of classes in the current segmentation mask
            num_classes = analyze_num_classes(labels)
            # Update the model's output layer to have the correct number of
            # channels (classes). The layer is moved to `device` so its weight
            # sits with the bias created below.
            # NOTE(review): this layer is rebuilt for every batch and is
            # created after the optimizer, so the optimizer never updates its
            # parameters - confirm this is intended.
            model.output_layer = nn.Conv2d(64, num_classes, kernel_size=1, padding=0).to(device)
            # Add nn.Parameter for bias with the desired precision
            model.output_layer.bias = nn.Parameter(torch.zeros(num_classes, dtype=torch.float32, device=device))

            # Use autocast to perform mixed-precision training
            with autocast():
                outputs = model(inputs)
                labels = labels.to(outputs.dtype)
                loss = criterion(outputs*mask, labels*mask)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            total_loss += loss.item()
        # Calculate average loss for the epoch
        average_loss = total_loss / len(training_dataloader)
        loss_per_epoch.append(average_loss)
        # End time for the epoch
        end_time = time.time()
        epoch_time = end_time - start_time

        #Print progress with batch and epoch times
        print(f"Epoch [{epoch + 1}/{hyperparameters['num_epochs']}], Loss: {average_loss:.4f}, Epoch Time: {epoch_time:.2f} seconds")

    result = {
        "Loss per Epoch": loss_per_epoch
    }
    results.append(result)

    # Save the model and optimizer state after every parquet so a failure
    # later on does not lose the progress made so far
    checkpoint = {
        'Parquet':i,
        'model_state_dict':model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss' : loss_per_epoch
    }
    checkpoint_path = f'checkpoint_iteration_{i}.pth'
    torch.save(checkpoint, checkpoint_path)
    print(f'Checkpoint saved at iteration {i}')

if len(results) < hyperparameters['parquets']:
    print("Only", len(results), "of", hyperparameters['parquets'], "requested parquets were available")


Once it is all good and done, we go ahead and save our model.

In [None]:
# Persist the whole trained model object to disk
torch.save(obj=model, f='unet_model_waymo.pt')

### Visualizing our Loss

In [None]:
df = pd.DataFrame(results)
df = df.reset_index()

# Each row holds the list of per-epoch losses for one parquet; collapse it to
# a single average so it can be plotted as a quantitative value.
df['Loss per Epoch'] = df['Loss per Epoch'].apply(np.mean)

# The file imports the library as `altair` (there is no `alt` alias), so the
# full module name is used here.
altair.Chart(df).mark_line().encode(
    x = altair.X('index:N',title='Parquet Iteration'),
    y = altair.Y('Loss per Epoch:Q',title="Average Loss"),
).properties(
    title='Average Loss During U-Net Training'
)