# Explore the dataset


In this notebook, we will perform an EDA (Exploratory Data Analysis) on the processed Waymo dataset (data in the `processed` folder). In the first part, you will create a function to display an image together with its bounding boxes.

In [None]:
from utils import get_dataset
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
from PIL import Image, ImageOps


%matplotlib inline

In [None]:
# Glob pattern handed to get_dataset; it selects the .tfrecord files to explore.
TFRECORD_GLOB = "/app/project/data/waymo/training_and_validation/*.tfrecord"
dataset = get_dataset(TFRECORD_GLOB)


## Write a function to display an image and the bounding boxes

Implement the `display_instances` function below. This function takes a batch as input and displays an image with its corresponding bounding boxes. The only requirement is that the classes should be color coded (e.g., vehicles in red, pedestrians in blue, cyclists in green).

In [None]:
def display_instances(batch):
    """
    Display one dataset element's image with its bounding boxes overlaid.

    Args:
        batch: mapping with the keys 'image', 'groundtruth_boxes' and
            'groundtruth_classes'; every value must expose ``.numpy()``.
            Each box row is read as four normalized values that are scaled
            and drawn as (ymin, xmin, ymax, xmax) -- NOTE(review): confirm
            this order against the dataset reader.

    Boxes are color coded by class id (1 red, 2 blue, 4 green). Any other
    class id is drawn in yellow instead of raising a KeyError.
    """
    color_coding = {
        1: "red",    # Vehicles
        2: "blue",   # Pedestrians
        4: "green",  # Cyclists
    }
    fallback_color = "yellow"

    fig, ax = plt.subplots(1, figsize=(20, 15))

    img = batch['image'].numpy()
    ax.imshow(img)

    # Image arrays are indexed (rows, columns, channels), i.e. height first.
    img_height, img_width, _ = img.shape

    # Convert the tensors once instead of on every loop iteration.
    boxes = batch['groundtruth_boxes'].numpy()
    classes = batch['groundtruth_classes'].numpy()

    for box, class_id in zip(boxes, classes):
        ymin, xmin, ymax, xmax = box[:4]

        # Scale the normalized coordinates to pixels.
        left = xmin * img_width
        top = ymin * img_height
        box_width = (xmax - xmin) * img_width
        box_height = (ymax - ymin) * img_height

        color = color_coding.get(int(class_id), fallback_color)

        # Rectangle is anchored at its (x, y) corner, then takes width and height.
        rect = patches.Rectangle(
            (left, top), box_width, box_height,
            linewidth=1, edgecolor=color, facecolor="none",
        )
        ax.add_patch(rect)

    plt.show()

## Display 10 images 

Using the dataset created in the second cell and the function you just coded, display 10 random images with the associated bounding boxes. You can use the methods `take` and `shuffle` on the dataset.

In [None]:
%%javascript
// Replace the output area's _should_scroll hook with a function that always
// returns false, whatever line count it is given.
// NOTE(review): presumably this keeps long cell output fully expanded instead
// of placing it in a scroll box -- confirm for the Jupyter front end in use.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

In [None]:
# Shuffle through a fixed-size buffer (reshuffled on each iteration), then
# draw the first few elements and render each one with its bounding boxes.
SHUFFLE_BUFFER_SIZE = 100
NUM_IMAGES_TO_SHOW = 10

dataset = dataset.shuffle(SHUFFLE_BUFFER_SIZE, reshuffle_each_iteration=True)
subset = dataset.take(NUM_IMAGES_TO_SHOW)

for batch in subset:
    display_instances(batch)



## Additional EDA

In this last part, you are free to perform any additional analysis of the dataset. What else would you like to know about the data?
For example, think about data distribution. So far, you have only looked at a single file...

In [None]:
# Statistics for car, pedestrian, cyclist class
def add_count(count, statistics):
    """
    Fold one per-image object count into a running statistics record.

    Args:
        count: number of objects of one class found in a single image.
        statistics: list laid out as [min, max, mean, total]; it is updated
            in place. The mean slot (index 2) is not touched here.
    """
    statistics[0] = min(statistics[0], count)
    statistics[1] = max(statistics[1], count)
    # Only positive counts contribute to the running total.
    if count > 0:
        statistics[3] += count
    
def add_class_occurence(car, ped, cyc, co):
    """
    Append one image's per-class counts to the occurrence table.

    Args:
        car: car count for the image.
        ped: pedestrian count for the image.
        cyc: cyclist count for the image.
        co: list of rows; a new [car, ped, cyc] row is appended in place.
    """
    co.append([car, ped, cyc])

def calculate_mean(total, statistics):
    """
    Store the mean count per image in a statistics record.

    Args:
        total: number of images the record was accumulated over.
        statistics: list laid out as [min, max, mean, total]; the mean slot
            (index 2) is overwritten with statistics[3] / total.

    When ``total`` is 0 (no images were read) the mean is set to 0.0 instead
    of raising ZeroDivisionError.
    """
    statistics[2] = statistics[3] / total if total else 0.0

# Template record copied for every class. The minimum starts at a very large
# sentinel so that the first real count replaces it.
statistics = [1e20, 0, 0, 0] # min, max, mean, total

car_stat, pedestrian_stat, cyclists_stat = (statistics.copy() for _ in range(3))

classes_occurence = [] # one [car, pedestrian, cyclist] row per image
total_images = 0

for batch in dataset.take(10000):
    classes = batch['groundtruth_classes'].numpy()
    total_images += 1

    # Class ids counted here: 1 = car, 2 = pedestrian, 4 = cyclist.
    car_count, pedestrian_count, cyclists_count = (
        np.count_nonzero(classes == class_id) for class_id in (1, 2, 4)
    )

    for count, stat in (
        (car_count, car_stat),
        (pedestrian_count, pedestrian_stat),
        (cyclists_count, cyclists_stat),
    ):
        add_count(count, stat)
    add_class_occurence(car_count, pedestrian_count, cyclists_count, classes_occurence)

# The means can only be filled in once every image has been counted.
for stat in (car_stat, pedestrian_stat, cyclists_stat):
    calculate_mean(total_images, stat)

total_objects = sum(stat[3] for stat in (car_stat, pedestrian_stat, cyclists_stat))

# Printout statistics
print('Total images: ', total_images)
print('Total objects: ', total_objects)

for class_name, class_stat in (
    ('Car', car_stat),
    ('Pedestrian', pedestrian_stat),
    ('Cyclist', cyclists_stat),
):
    # With no objects at all there is no share to compute; report 0.0 rather
    # than dividing by zero.
    percentage = (class_stat[3] * 100) / total_objects if total_objects else 0.0
    print(class_name + ' class min/max/mean/total: ', class_stat, ' Percentage: ', percentage, "%")


# Printout histograms
classes_occurence = np.array(classes_occurence)

# One figure per class; the column index matches the [car, pedestrian, cyclist]
# order of the rows in classes_occurence.
for column, (title, class_stat) in enumerate((
    ('Car class histogram', car_stat),
    ('Pedestrian class histogram', pedestrian_stat),
    ('Cyclist class histogram', cyclists_stat),
)):
    # The counts are integers, so the bin edges go on half-integers: each bar
    # is exactly one unit wide and centred on the count it represents.
    bin_edges = np.arange(class_stat[0], class_stat[1] + 2) - 0.5

    fig, ax = plt.subplots(1, 1, figsize=(20, 15))
    ax.hist(classes_occurence[:, column], bins=bin_edges)
    ax.set_title(title)
    ax.set_xlabel('Objects per image')
    ax.set_ylabel('Number of images')

plt.show()

# Histogram to determine images brightness



In [None]:
dataset = dataset.shuffle(200, reshuffle_each_iteration=True)

# One flat array of gray levels per sampled image.
gray_images = []

for batch in dataset.take(200):
    img = ImageOps.grayscale(Image.fromarray(batch['image'].numpy()))
    # Keep each image as a single flat array. Extending the list pixel by
    # pixel would create one Python object per pixel.
    gray_images.append(np.asarray(img).ravel())

# Join all pixels once; fall back to an empty array when no image was read.
pixels = np.concatenate(gray_images) if gray_images else np.array([])

fig, ax = plt.subplots(1, 1, figsize=(20, 15))
ax.hist(pixels, bins=256)
ax.set_title('Brightness histogram')
ax.set_xlabel('Gray level')
ax.set_ylabel('Number of pixels')
plt.show()
