In [3]:
!pip install pims av
!python --version

Collecting pims
[?25l  Downloading https://files.pythonhosted.org/packages/d5/47/82e0ac31e01a271e5a06362fbf03769e9081956f6772f91d98b32899d743/PIMS-0.5.tar.gz (85kB)
[K     |████████████████████████████████| 92kB 4.0MB/s 
[?25hCollecting av
[?25l  Downloading https://files.pythonhosted.org/packages/66/ff/bacde7314c646a2bd2f240034809a10cc3f8b096751284d0828640fff3dd/av-8.0.3-cp37-cp37m-manylinux2010_x86_64.whl (37.2MB)
[K     |████████████████████████████████| 37.2MB 77kB/s 
[?25hCollecting slicerator>=0.9.8
  Downloading https://files.pythonhosted.org/packages/75/ae/fe46f5371105508a209fe6162e7e7b11db531a79d2eabcd24566b8b1f534/slicerator-1.0.0-py3-none-any.whl
Building wheels for collected packages: pims
  Building wheel for pims (setup.py) ... [?25l[?25hdone
  Created wheel for pims: filename=PIMS-0.5-cp37-none-any.whl size=84328 sha256=01d772439c121bcb46d388bd50eff11e692407ada8a13bb8fea0932912701da0
  Stored in directory: /root/.cache/pip/wheels/0e/0a/14/4c33a4cc1b9158e57329a38e

In [4]:
import numpy as np
import cv2
import pims
from tqdm.notebook import trange
import xml.etree.ElementTree as ET

import matplotlib.pyplot as plt
from matplotlib.pyplot import plot

import torch
import torch.nn as nn
import torch.nn.functional as F

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

Imageio: 'ffmpeg-linux64-v3.3.1' was not found on your computer; downloading it now.
Try 1. Download from https://github.com/imageio/imageio-binaries/raw/master/ffmpeg/ffmpeg-linux64-v3.3.1 (43.8 MB)
Downloading: 8192/45929032 bytes (0.0%)1630208/45929032 bytes (3.5%)4415488/45929032 bytes (9.6%)7897088/45929032 bytes (17.2%)11272192/45929032 bytes (24.5%)14974976/45929032 bytes (32.6%)18505728/45929032 bytes (40.3%)22208512/45929032 bytes (48.4%)25862144/45929032 bytes (56.3%)29048832/45929032 bytes (63.2%)32702464/45929032 bytes (71.2%)36167680/45929032 bytes (78.7%)39731200/45929032 bytes (86.5%)4

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# CONSTANTS
# network input resolution
W = 320
H = 160

# annotations' resolution
annot_W = 480
annot_H = 320

In [7]:
# DATA FUNCTIONS
from os import listdir

# get polylines from file
def extract_polylines(filename):
  tree = ET.parse(filename)
  polylines = []
  for polyline in tree.iter(tag='polyline'):
    frame = polyline.get("frame")
    points = polyline.get("points").split(";")
    for i in range(len(points)):
      points[i] = points[i].split(",")
      for j in range(len(points[i])):
        points[i][j] = float(points[i][j])
    data = (int(frame), points)
    polylines.append(data)

  return sorted(polylines)

# get polylines from each frame
def extract_frame_lines(polylines):
  n_frames = polylines[-1][0]
  frames = []

  for i in range(n_frames+1):
    frame = []
    for polyline in polylines:
      if polyline[0] == i:
        frame.append(polyline[1])
    frames.append(sorted(frame))
    
  return frames

# convert annotations to new resolution
def convert_annotations(old_res, new_res, annotations):
  W, H = old_res
  new_W, new_H = new_res
  new_annotations = []
  for polylines in annotations:
    new_polylines = []
    for polyline in polylines:
      new_polyline = []
      for point in polyline:
        x, y = point
        new_x = (x*new_W) / W
        new_y = (y*new_H) / H
        new_polyline.append([new_x,new_y])
      new_polylines.append(new_polyline)
    new_annotations.append(new_polylines)
  return np.array(new_annotations)

# get training data from path
def get_data(video_path, annotations_path):
  # get video frames
  frames = pims.Video(video_path, format="mp4")
  
  # get road edges data
  annotations = extract_polylines(annotations_path)
  annotations = extract_frame_lines(annotations)
  annotations = convert_annotations((annot_W,annot_H), (W,H), annotations)

  return frames, annotations

# make pims video into actual numpy frames
def conv_frames(frames):
  imgs = []
  print("Getting frames into proper arrays")
  for frame in frames:
    imgs.append(cv2.resize(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), (W,H)))
  print("Frames converted to numpy arrays")
  return np.array(imgs)

base_dir = "/content/drive/MyDrive/OpenCRD_dataset/"
video_files = []
annot_files = []
for f in listdir(base_dir):
  if f.endswith(".mp4"):
    video_files.append(f)
  elif f.endswith(".xml"):
    annot_files.append(f)
video_files, annot_files = sorted(video_files), sorted(annot_files)

video_files = video_files[:3] # TODO: this is a temp hack, need to get all videos' annotations
print(video_files)
print(annot_files)

assert len(video_files) == len(annot_files), "Number of video files != number of annotation files"

['city_1.mp4', 'city_2.mp4', 'city_3.mp4']
['city_1_annotations.xml', 'city_2_annotations.xml', 'city_3_annotations.xml']


In [8]:
# PYTOCH MODEL
# model for road edge detection
class REDetector(nn.Module):
  def __init__(self):
    super(REDetector, self).__init__()

    # output polylines attributes
    n_coords = 2  # 2 coordinates: x,y
    n_points = 4  # number of points of each polyline
    max_n_lines = 6 # max number of polylines per frame

    # Convolutional Layers
    self.conv1 = nn.Conv2d(3, 16, 5)
    self.conv2d_bn1 = nn.BatchNorm2d(16)
    self.pool = nn.MaxPool2d(2, 2)
    self.conv2 = nn.Conv2d(16, 32, 5)
    self.conv2d_bn2 = nn.BatchNorm2d(32)
    self.conv3 = nn.Conv2d(32, 64, 5)
    self.conv2d_bn3 = nn.BatchNorm2d(64)

    # Fully Connected Layers
    self.fc1 = nn.Linear(64*16*36, 120) # for 320x160 image 64 channels
    self.bn1 = nn.BatchNorm1d(num_features=120)
    self.fc2 = nn.Linear(120, 84)
    self.bn2 = nn.BatchNorm1d(num_features=84)
    self.fc3 = nn.Linear(84, n_coords*n_points*max_n_lines)

  def forward(self, x):
    x = self.pool(F.relu(self.conv2_bn1(self.conv1(x))))
    x = self.pool(F.relu(self.conv2_bn2(self.conv2(x))))
    x = self.pool(F.relu(self.conv2_bn3(self.conv3(x))))
    #print(x.shape)
    x = x.view(-1, self.num_flat_features(x))
    x = F.relu(self.bn1(self.fc1(x)))
    x = F.relu(self.bn2(self.fc2(x)))
    x = self.fc3(x)

  def num_flat_features(self, x):
    size = x.size()[1:]
    num_features = 1
    for s in size:
      num_features *= s
    return num_features

In [26]:
# ANNOTATIONS TRANSFORMATIONS

# TODO: this algorithm has bad complexity (O(n^3)), refactor if possible
# convert polylines per frame to net output vector (flattens the array)
def serialize_polylines(polylines, n_coords, n_points, max_n_lines):
  # check if we have more than n_points
  for polyline in polylines:
    if len(polyline) != n_points:
      polylines.remove(polyline)
  assert len(polylines) <= max_n_lines, "More than max number of lines found"

  # fill the gaps with negative values (-1 == NULL => out of bounds)
  if len(polylines) < max_n_lines:
    for i in range(max_n_lines - len(polylines)):
      new_polyline = []
      for j in range(n_points):
        point = []
        for k in range(n_coords):
          point.append(-1.0)
        new_polyline.append(point)
      polylines.append(new_polyline)
      
  # flatten
  ret = []
  for i in range(max_n_lines):
    for j in range(n_points):
      for k in range(n_coords):
        ret.append(polylines[i][j][k])

  return np.array(ret)

# convert network output vector to polylines per frame
def deserialize_polylines(net_output, n_coords, n_points, max_n_lines):
  polylines = []
  point = []
  line = []
  for i in range(len(net_output)):
    point.append(net_output[i])
    if len(point) == 2:
      line.append(point)
      point = []
    if len(line) == 4:
      polylines.append(line)
      line = []

  # remove (-1, -1)/out-of-bounds points from lines
  for polyline in polylines:
    while [-1., -1.] in polyline:
      polyline.remove([-1., -1.])

  # remove empty lists
  while [] in polylines:
    polylines.remove([])

  return np.array(polylines)

In [27]:
# test the above functions (this will be later used on the training loop for Y_train)
n_coords = 2  # 2 coordinates: x,y
n_points = 4  # number of points of each polyline
max_n_lines = 6 # max number of polylines per frame

frames, annotations = get_data(base_dir+video_files[1], base_dir+annot_files[1])

idx = 0
for polylines in annotations:
  ret = serialize_polylines(polylines, n_coords, n_points, max_n_lines)
  print("++++++++++")
  print("Frame", idx)
  print(ret)
  print("==========")
  new_polylines = deserialize_polylines(ret, n_coords, n_points, max_n_lines)
  print(new_polylines)
  idx += 1



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  [2.11333333e+01 1.11100000e+02]
  [5.48666667e+01 1.06150000e+02]]

 [[1.93333333e+02 1.05750000e+02]
  [2.19533333e+02 1.15050000e+02]
  [2.73000000e+02 1.34500000e+02]
  [3.19000000e+02 1.51500000e+02]]]
++++++++++
Frame 701
[ 2.66666667e-01  1.14900000e+02  1.19333333e+01  1.12750000e+02
  2.06733333e+01  1.11185000e+02  5.32600000e+01  1.06275000e+02
  1.93746667e+02  1.05905000e+02  2.19946667e+02  1.15145000e+02
  2.73126667e+02  1.34470000e+02  3.19000000e+02  1.51375000e+02
 -1.00000000e+00 -1.00000000e+00 -1.00000000e+00 -1.00000000e+00
 -1.00000000e+00 -1.00000000e+00 -1.00000000e+00 -1.00000000e+00
 -1.00000000e+00 -1.00000000e+00 -1.00000000e+00 -1.00000000e+00
 -1.00000000e+00 -1.00000000e+00 -1.00000000e+00 -1.00000000e+00
 -1.00000000e+00 -1.00000000e+00 -1.00000000e+00 -1.00000000e+00
 -1.00000000e+00 -1.00000000e+00 -1.00000000e+00 -1.00000000e+00
 -1.00000000e+00 -1.00000000e+00 -1.00000000e+00 -1.0000

In [None]:
# TRAINING PROCESS
def train(frames, annotations, model):
  loss_function = nn.MSELoss()
  optim = torch.optim.Adam(model.parameters(), lr=0.001)  # TODO: try new learning rates

  losses, accuracies = [], []
  epochs = 11
  BS = 128

  for epoch in range(epochs):
    print("[+] Epoch", epoch)
    t = trange(0, len(frames)-BS, BS)
    for i in t: 
      # get data into network
      rng = np.random.default_rng()
      samp = rng.choice((frames), size=BS, replace=False)

      # TODO: get annotations to the output (use serialize/deserialize function)
      X_train = []
      Y_train = []
      for j in samp:
        frame = frames[j]
        frame = np.moveaxis(frame, -1, 0) # [batch_size, channels, height, width]
        X_train.append(frame)
        flat_annot = serialize_polylines(annotations[j], n_coords, n_points, max_n_lines)
        Y_train.append([flat_annot])
      samp = []
      X = torch.tensor(np.array(X_train)).float().to(device)
      Y = torch.tensor(np.array(Y_train)).float().to(device)

      # forward and back feed
      optim.zero_grad()
      out = model(X)
      accuracy = (out == Y).float().mean()  # TODO: this might be wrong
      loss = loss_function(out, Y)
      loss = loss.mean()
      loss.backward()
      optim.step()

      # print stats
      loss = loss.item()
      accuracy = accuracy.item()
      losses.append(loss/100) # /100 so that we can see it in the graph
      accuracies.append(accuracy)
      t.set_description("loss %.2f accuracy %.2f out %.2f" % (loss, accuracy, out.mean().item()))

  # plot losses and accuracies
  plt.ylim(-0.1, 1.1)
  plot(losses)
  plot(accuracies)

  return model

if device.type == "cuda":
  torch.cuda.empty_cache()  # to avoid running out of cuda memory
  print("[~] Cleared cuda cache")

model = REDetector().to(device).train()

for i in trange(1, len(video_files)):
  print("[~] Loading from files: %s , %s" % (base_dir+video_files[i], base_dir+annot_files[i]))
  frames, annotations = get_data(base_dir+video_files[i], base_dir+annot_files[i])
  frames = conv_frames(frames)
  #if i == 0:
  if i == 1:
    all_frames = frames
    all_annotations = annotations
  else:
    all_frames = np.concatenate((all_frames, frames), axis=0)
    all_annotations = np.concatenate((all_annotations, annotations), axis=0)

franes, labels = [], [] # free up memory
print("[+] Training mode ...")
model = train(all_frames, all_annotations, model)
print("[+] Trained model on all data files")

[~] Cleared cuda cache


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

[~] Loading from files: /content/drive/MyDrive/OpenCRD_dataset/city_2.mp4 , /content/drive/MyDrive/OpenCRD_dataset/city_2_annotations.xml




Getting frames into proper arrays
Frames converted to numpy arrays
[~] Loading from files: /content/drive/MyDrive/OpenCRD_dataset/city_3.mp4 , /content/drive/MyDrive/OpenCRD_dataset/city_3_annotations.xml
Getting frames into proper arrays
Frames converted to numpy arrays

[+] Training mode ...
[+] Epoch 0


HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))