# EDA 3 PARTS

# PART 1: Recovering the Videos
In this notebook we will show how to view the videos contained in the [CVPR 2018 WAD Video Segmentation Challenge](https://www.kaggle.com/c/cvpr-2018-autonomous-driving) dataset. 
### Sources and Resources 
* http://tiao.io/posts/notebooks/embedding-matplotlib-animations-in-jupyter-as-interactive-javascript-widgets/
* https://matplotlib.org/gallery/animation/dynamic_image2.html

In [1]:
import pandas as pd
from skimage.io import imread 
import matplotlib.pyplot as plt
from matplotlib import animation
from IPython.display import HTML
import os

In [2]:
# List the label-mask files and split every file name on "_" so that each
# token becomes its own (positionally numbered) column; column 0 keeps the
# full file name.
training_img_names = os.listdir("/Users/quinn/Desktop/AdvPredictiveModeling/GroupProject/input/train_label_camera5")
label_name_rows = [[fname, *fname.split("_")] for fname in training_img_names]
# Column 3 is the constant "Camera" token of the file name, so it is dropped.
trainingImgNameDF = pd.DataFrame(label_name_rows).drop(columns=3)

In [3]:
# Preview the parsed label file names (column 0 = file name, remaining
# columns = the "_"-separated tokens, minus the dropped column 3).
trainingImgNameDF.head()

Unnamed: 0,0,1,2,4,5
0,170908_061536323_Camera_5_instanceIds.png,170908,61536323,5,instanceIds.png
1,170908_061534933_Camera_5_instanceIds.png,170908,61534933,5,instanceIds.png
2,170908_061518393_Camera_5_instanceIds.png,170908,61518393,5,instanceIds.png
3,170908_061519366_Camera_5_instanceIds.png,170908,61519366,5,instanceIds.png
4,170908_061531041_Camera_5_instanceIds.png,170908,61531041,5,instanceIds.png


In [4]:
# Same parsing as for the label masks, applied to the colour frames.
training_img_names2 = os.listdir("/Users/quinn/Desktop/AdvPredictiveModeling/GroupProject/input/train_color_camera5/")
color_name_rows = [[fname, *fname.split("_")] for fname in training_img_names2]
# Column 3 is the constant "Camera" token of the file name, so it is dropped.
trainingImgNameDF2 = pd.DataFrame(color_name_rows).drop(columns=3)

In [5]:
# Preview the parsed colour file names; the last token keeps its file
# extension because the name is only split on "_".
trainingImgNameDF2.head()

Unnamed: 0,0,1,2,4
0,170908_061504215_Camera_5.jpg,170908,61504215,5.jpg
1,170908_061523674_Camera_5.jpg,170908,61523674,5.jpg
2,170908_061533404_Camera_5.jpg,170908,61533404,5.jpg
3,170908_061527288_Camera_5.jpg,170908,61527288,5.jpg
4,170908_061526176_Camera_5.jpg,170908,61526176,5.jpg


In [6]:
# define a function to convert images to video
def visVid(trainingImgNameDF,sessionID,cameraID,startFrame=0,endFrame=10,
           dataRoot="/Users/quinn/Desktop/AdvPredictiveModeling/GroupProject/input/train_label_camera5/"):
    """Build a matplotlib animation from a slice of one session's frames.

    Parameters
    ----------
    trainingImgNameDF : pandas.DataFrame
        Parsed file-name table. Column 0 holds the file name, column 1 is
        compared with ``sessionID``, column 2 is the sort key that orders
        the frames, and column 4 is compared with ``cameraID``.
    sessionID, cameraID
        Values selecting the rows to animate (exact equality match).
    startFrame, endFrame : int
        Slice bounds applied to the ordered list of matching files.
    dataRoot : str
        Directory containing the image files. A trailing separator is
        optional because the path is assembled with ``os.path.join``.

    Returns
    -------
    matplotlib.animation.ArtistAnimation
        One frame per image, created on a figure that is already closed so
        it is not also rendered as a static plot.
    """
    selected = trainingImgNameDF[(trainingImgNameDF[1] == sessionID) &
                                 (trainingImgNameDF[4] == cameraID)].sort_values(by=2)
    imgURIs = list(selected[0])

    # Draw on an explicit Axes instead of pyplot's implicit "current axes",
    # so the frames cannot end up on an unrelated figure.
    fig, ax = plt.subplots(figsize=(10,10))
    frames = []
    for uri in imgURIs[startFrame:endFrame]:
        # os.path.join avoids a silently wrong path when dataRoot has no
        # trailing separator (plain string concatenation required one).
        im = ax.imshow(imread(os.path.join(dataRoot, uri)), animated=True)
        frames.append([im])

    plt.close(fig)
    return animation.ArtistAnimation(fig,frames,interval=500, blit=True,repeat_delay=100)
    

In [7]:
# define a function to convert images to video
def visVid2(trainingImgNameDF,sessionID,cameraID,startFrame=0,endFrame=10,
           dataRoot="/Users/quinn/Desktop/AdvPredictiveModeling/GroupProject/input/train_color_camera5/"):
    """Colour-frame variant of ``visVid``.

    The only difference from ``visVid`` is the default ``dataRoot``, which
    points at the colour-image folder. All arguments are forwarded
    unchanged, so the selection, ordering and animation settings are the
    ones implemented in ``visVid``.

    Returns
    -------
    matplotlib.animation.ArtistAnimation
    """
    # Delegate instead of keeping a second copy of the same body, so the two
    # functions cannot drift apart.
    return visVid(trainingImgNameDF, sessionID, cameraID,
                  startFrame=startFrame, endFrame=endFrame, dataRoot=dataRoot)

In [8]:
# create a video of label images
# Column 1 of the file-name table identifies the recording session; the
# first session found is animated with the default frame range of visVid.
uniqueVideoID = trainingImgNameDF[1].unique()
ani = visVid(trainingImgNameDF,uniqueVideoID[0],"5")
# Writes the animation to the current working directory.
ani.save('camera5label.mp4')

In [9]:
# create a video of colored images
uniqueVideoID2 = trainingImgNameDF2[1].unique()
# The camera token is "5.jpg" here because the colour file names are only
# split on "_", so the extension stays attached to the last token.
ani = visVid2(trainingImgNameDF2,uniqueVideoID2[0],"5.jpg")
# Writes the animation to the current working directory.
ani.save('camera5colored.mp4')

# PART 2: First Look

To begin with, we would like to see what the training images look like and what we are supposed to detect. Using the scikit-image segmentation module, dask.array.image, and matplotlib, we display the original image, the colored labels, and the cropped overlay.

In [None]:
%matplotlib inline
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from glob import glob
import matplotlib.pyplot as plt
# Switch between the two image readers. Either branch rebinds the name
# `imread`, replacing the skimage.io.imread imported at the top of Part 1.
USE_CV2 = True
if USE_CV2:
    from cv2 import imread # opencv is much faster, but less accurate
    # Threshold above which a label pixel counts as an object when read by cv2.
    MIN_OBJ_VAL = 0
else:
    from skimage.io import imread
    # Threshold above which a label pixel counts as an object when read by skimage.
    # NOTE(review): presumably label values encode class*1000 + instance — confirm.
    MIN_OBJ_VAL = 1000

from skimage.segmentation import mark_boundaries
# Data directory, expressed relative to the notebook's working directory.
DATA_DIR = os.path.join('..', 'input')

In [None]:
# Index every image file found one directory level below DATA_DIR.
all_paths = pd.DataFrame(dict(path = glob(os.path.join(DATA_DIR, '*', '*.*p*g'))))
# Name of the folder that directly contains each file (e.g. "train_color").
# os.path is used instead of splitting on '/' so this also works where glob
# returns paths with a different separator.
parent_dir = all_paths['path'].map(lambda x: os.path.basename(os.path.dirname(x)))
# First "_" token of the folder name -> split, last token -> group.
all_paths['split'] = parent_dir.map(lambda x: x.split('_')[0])
all_paths['group'] = parent_dir.map(lambda x: x.split('_')[-1])
# A folder named just "test" yields group "test"; treat those files as colour images.
all_paths['group'] = all_paths['group'].map(lambda x: 'color' if x == 'test' else x)
# Image id = first four "_" tokens of the file name without its extension,
# which is shared by a colour frame and its label mask.
all_paths['id'] = all_paths['path'].map(lambda x: '_'.join(os.path.splitext(os.path.basename(x))[0].split('_')[:4]))
all_paths.sample(5)

In [None]:
# One row per (id, split); one column per group holding the first path found
# for that group, so the files belonging to one image sit side by side.
group_df = all_paths.pivot_table(values = 'path', columns = 'group', aggfunc = 'first', index = ['id', 'split']).reset_index()

To begin with, we would like to see what the training images look like and what we are supposed to detect. Using the scikit-image segmentation module, dask.array.image, and matplotlib, we display the original image, the colored labels, and the cropped overlay.

In [None]:
# Show a random sample of training images as rows of
# (colour image, relabelled mask, overlay cropped to the labelled region).
train_df = group_df.query('split=="train"')
print(train_df.shape[0], 'rows')
sample_rows = 6
fig, m_axs = plt.subplots(sample_rows, 3, figsize = (20, 6*sample_rows))
for c_ax in m_axs.flatten():
    c_ax.axis('off')
for (ax1, ax2, ax3), (_, c_row) in zip(m_axs, train_df.sample(sample_rows).iterrows()):
    c_img = imread(c_row['color'])
    if USE_CV2 and c_img.ndim == 3:
        # cv2.imread returns channels in BGR order while matplotlib expects
        # RGB; without this flip the colours are displayed swapped.
        c_img = np.ascontiguousarray(c_img[:, :, ::-1])
    l_img = imread(c_row['label'])
    # Some readers return the mask with a channel axis; keep the first channel.
    if l_img.ndim==3: l_img = l_img[:,:,0]
    ax1.imshow(c_img)
    ax1.set_title('Color')
    # make the labels nicer: remap the object values to 1..k so the colormap
    # spreads them out (0 stays background)
    nice_limg = np.zeros(l_img.shape, dtype = np.uint8)
    for new_idx, old_idx in enumerate(np.unique(l_img[l_img>MIN_OBJ_VAL]), 1):
        nice_limg[l_img==old_idx]=new_idx
    ax2.imshow(nice_limg, cmap = 'nipy_spectral')
    ax2.set_title('Labels')
    obj_rows, obj_cols = np.where(l_img>MIN_OBJ_VAL)
    bound_img = mark_boundaries(image = c_img, label_img = l_img, color = (1,0,0), background_label = 255, mode = 'thick')
    if obj_rows.size:
        # Slice ends are exclusive, so +1 keeps the last labelled row/column
        # (and avoids an empty crop for a one-pixel-thick object).
        bound_img = bound_img[obj_rows.min():obj_rows.max()+1, obj_cols.min():obj_cols.max()+1,:]
    # With no labelled pixels there is nothing to crop to: show the full overlay
    # instead of failing on min() of an empty array.
    ax3.imshow(bound_img)
    ax3.set_title('Cropped Overlay')

# PART 3: Histograms

In [None]:
import os
import random
import numpy as np
import pandas as pd 
from skimage import io
from PIL import Image
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
from dask.array.image import imread
from dask import bag, threaded
from dask.diagnostics import ProgressBar

First, we look at the training image folders: their total size, the number of files, and a few sample file names.

In [None]:
def filecheck(dir):
    """Print a short summary of a directory.

    Prints the directory path, the total size of its entries in GB together
    with the number of entries, and the names at sorted positions 300-305
    as a sample (nothing is listed if the directory has fewer entries).

    Parameters
    ----------
    dir : str
        Path of the directory to summarise.
    """
    filelist = sorted(os.listdir(dir))
    print(dir)
    dir_size = sum(os.path.getsize(os.path.join(dir, name)) for name in filelist)
    # Report len(filelist): the last enumerate index is one less than the
    # number of files and is undefined for an empty directory.
    print("{:.1f} GB of {} files".format(dir_size/1024/1024/1024, len(filelist)))
    print("showing sample files")
    print("\n".join(filelist[300:306]) + "\n")

# Data folders used in this part; `dirs` is also read by a later cell.
dirs = ["../input/train_color","../input/train_label", "../input/test"]

# Only the first two entries (the training folders) are summarised.
for d in dirs[0:2]:
    filecheck(d)

In [None]:
# dask imread version - in progress
def divmil():
    """Placeholder helper; currently always returns 1."""
    return 1
# labels is the image array: all label PNGs matched by the glob pattern,
# loaded through dask's imread.
# NOTE(review): the name `labels` is rebound to a different object by a
# later cell in this part.
labels = imread("../input/train_label/*.png")
# Call divmil() so its value is printed rather than the function object's repr.
print(labels.shape, labels[0].shape, divmil())

In [None]:
filenames = os.listdir(dirs[1])
# Build the paths from the same directory that was listed, so the two
# cannot get out of sync if `dirs` changes.
fullpaths = [os.path.join(dirs[1], f) for f in filenames]

# set up a bag
def get_ims(impath):
    """Count the distinct label values per class in one label image.

    The image is read with the PIL plugin, its distinct pixel values are
    collected, and each value is integer-divided by 1000 to obtain a class
    key. The result maps each class key to the number of distinct pixel
    values that fall under it.
    NOTE(review): presumably a label value encodes class*1000 + instance
    index, which would make these per-class instance counts — confirm.

    Parameters
    ----------
    impath : str
        Path of a label image.

    Returns
    -------
    dict
        ``{class_key: number_of_distinct_values}``.
    """
    tlabel = io.imread(impath, plugin='pil')
    pixel_values = np.unique(tlabel)
    unique,counts = np.unique(pixel_values//1000, return_counts=True)
    return dict(zip(unique, counts))

# One dict per label image, in the same order as `filenames`.
labelbag = bag.from_sequence(fullpaths).map(get_ims)
with ProgressBar():
    labels = labelbag.compute()

We first create a dataframe called `labels_df`, indexed by label file name, which holds the number of instances of each class of interest in every training label image.

In [None]:
# One row per label image, one column per class key; a class that does not
# occur in an image is missing from its dict and becomes NaN, hence fillna(0).
# No integer dtype is forced in the constructor: the frame still contains
# NaN at that point, and uint8 could not hold counts above 255 anyway.
# NOTE(review): `classdict` (class key -> class name) is not defined in any
# cell visible in this notebook; it must exist before this cell is run.
labels_df = (
    pd.DataFrame(labels, index=filenames)
    .fillna(0)
    .astype(int)
    .rename(columns=classdict)
    .drop(columns=['others', 'rider', 'traffic_cone'])
)

labels_df.to_csv('train_labels.csv')
labels_df.head()

In [None]:

# Total count per class over all training images, shown as a bar chart.
# NOTE(review): the same code appears again in a later cell of this part.
classes_df = labels_df.melt()
groups = classes_df.groupby('variable')
sums = groups.sum()

sns.set(style='whitegrid')
ax = sns.barplot(x=sums.index, y=sums['value'], color='steelblue')
ax.set(xlabel='', ylabel='count')
sns.despine(left=True)

Using the dataframe above, we created a few histograms with the seaborn library.

Class Frequencies: We found that cars and persons are the two classes that appear most frequently in the images.

In [None]:
# Long format: one row per (image, class) cell, then sum the counts per class.
classes_df = labels_df.melt()
groups = classes_df.groupby('variable')
sums = groups.sum()

# Bar chart of the per-class totals.
sns.set(style='whitegrid')
ax = sns.barplot(x=sums.index, y=sums['value'], color='steelblue')
ax.set(xlabel='', ylabel='count')
sns.despine(left=True)

In [None]:
# Derive the per-image totals from the class columns only, so re-running this
# cell does not fold the previously derived 'objects'/'classes' columns back
# into the sums.
class_cols = [c for c in labels_df.columns if c not in ('objects', 'classes')]
# Total number of counted objects in each image.
labels_df['objects'] = labels_df[class_cols].sum(axis=1)
# Number of classes with at least one object; this is 0 for an image with no
# objects, so no "-1 then clip" correction is needed.
labels_df['classes'] = (labels_df[class_cols] > 0).sum(axis=1)
labels_df.head()

In [None]:
# Histogram of the total object count per image.
fig, ax = plt.subplots()
ax.set_title("Total Number of Objects")
labels_df['objects'].plot.hist(ax=ax)

Number of Distinct Classes per Image: The majority of images have two to four distinct classes. 

In [None]:
# How many images contain 0, 1, 2, ... distinct classes.
fig, ax = plt.subplots()
ax.set_title("Number of Distinct Classes")
class_count_freq = labels_df['classes'].value_counts().sort_index()
class_count_freq.plot.bar(ax=ax, color='steelblue')

Number of Cars per Image: The majority of images contain between 0 and 10 cars.

In [None]:
# How many images contain 0, 1, 2, ... cars.
fig, ax = plt.subplots()
ax.set_title("Number of Cars per Image")
car_count_freq = labels_df['car'].value_counts().sort_index()
car_count_freq.plot.bar(ax=ax, color='steelblue')

Number of People per Image: Most images contain between 0 and 5 people.

In [None]:
# How many images contain 0, 1, 2, ... people.
fig, ax = plt.subplots()
ax.set_title("Number of People per Image")
person_count_freq = labels_df['person'].value_counts().sort_index()
person_count_freq.plot.bar(ax=ax, color='steelblue')