In [2]:
!pip install pims av
!python --version

Collecting pims
[?25l  Downloading https://files.pythonhosted.org/packages/d5/47/82e0ac31e01a271e5a06362fbf03769e9081956f6772f91d98b32899d743/PIMS-0.5.tar.gz (85kB)
[K     |████████████████████████████████| 92kB 5.9MB/s 
[?25hCollecting av
[?25l  Downloading https://files.pythonhosted.org/packages/66/ff/bacde7314c646a2bd2f240034809a10cc3f8b096751284d0828640fff3dd/av-8.0.3-cp37-cp37m-manylinux2010_x86_64.whl (37.2MB)
[K     |████████████████████████████████| 37.2MB 80kB/s 
[?25hCollecting slicerator>=0.9.8
  Downloading https://files.pythonhosted.org/packages/75/ae/fe46f5371105508a209fe6162e7e7b11db531a79d2eabcd24566b8b1f534/slicerator-1.0.0-py3-none-any.whl
Building wheels for collected packages: pims
  Building wheel for pims (setup.py) ... [?25l[?25hdone
  Created wheel for pims: filename=PIMS-0.5-cp37-none-any.whl size=84328 sha256=e5115a8a0a6099eef2f930682737eccd2a8dd7087c10a9fa2b6c14824bfdd4ce
  Stored in directory: /root/.cache/pip/wheels/0e/0a/14/4c33a4cc1b9158e57329a38e

In [3]:
import numpy as np
import cv2
import pims
from tqdm.notebook import trange
import xml.etree.ElementTree as ET

import matplotlib.pyplot as plt
from matplotlib.pyplot import plot

import torch
import torch.nn as nn
import torch.nn.functional as F

# Use the first CUDA GPU when PyTorch reports one available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

Imageio: 'ffmpeg-linux64-v3.3.1' was not found on your computer; downloading it now.
Try 1. Download from https://github.com/imageio/imageio-binaries/raw/master/ffmpeg/ffmpeg-linux64-v3.3.1 (43.8 MB)
Downloading: 8192/45929032 bytes (0.0%)3055616/45929032 bytes (6.7%)7200768/45929032 bytes (15.7%)11354112/45929032 bytes (24.7%)13901824/45929032 bytes (30.3%)16957440/45929032 bytes (36.9%)20979712/45929032 bytes (45.7%)24330240/45929032 bytes (53.0%)28385280/45929032 bytes (61.8%)32088064/45929032 bytes (69.9%)36134912/45929032 bytes (78.7%)40124416/45929032 bytes (87.4%)44072960/45929032 bytes (96.0%)

In [4]:
# Mount Google Drive at /content/drive; the dataset directory read further
# down in this notebook is addressed by a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# CONSTANTS
# network input resolution in pixels: video frames are resized to (W, H) and
# annotation coordinates are rescaled to this size
W = 320
H = 160

# resolution in pixels that the annotation coordinates are expressed in;
# used as the source resolution when rescaling annotations to (W, H)
annot_W = 480
annot_H = 320

In [9]:
# DATA FUNCTIONS
from os import listdir

# get polylines from file
def extract_polylines(filename):
  """Read every <polyline> element of an annotation XML file.

  Args:
    filename: source accepted by ``ET.parse`` (a path or a file object).

  Returns:
    A sorted list of ``(frame, points)`` tuples. ``frame`` is the element's
    "frame" attribute converted to int; ``points`` is a list of float lists
    parsed from its "points" attribute, where points are separated by ";"
    and the coordinates of one point by ",".
  """
  document = ET.parse(filename)
  parsed = []
  for element in document.iter(tag='polyline'):
    frame_attr = element.get("frame")
    coords = [
      [float(value) for value in pair.split(",")]
      for pair in element.get("points").split(";")
    ]
    parsed.append((int(frame_attr), coords))

  # tuples order by frame number first, then by their point lists
  parsed.sort()
  return parsed

# get polylines from each frame
def extract_frame_lines(polylines):
  """Group ``(frame, points)`` tuples into one list of polylines per frame.

  Args:
    polylines: sequence of ``(frame, points)`` entries, e.g. the output of
      ``extract_polylines``. It does not need to be sorted.

  Returns:
    A list with one entry per frame index from 0 up to the highest frame
    number present. Each entry is the sorted list of point lists annotated
    on that frame (empty when the frame has no polyline). An empty input
    yields an empty list.
  """
  if not polylines:
    return []

  # use the true maximum so unsorted input does not drop trailing frames
  last_frame = max(entry[0] for entry in polylines)
  frames = [[] for _ in range(last_frame + 1)]

  # single pass instead of rescanning every polyline for every frame
  for entry in polylines:
    frame_idx = entry[0]
    if frame_idx >= 0:  # negative frame numbers never matched a frame slot
      frames[frame_idx].append(entry[1])

  return [sorted(frame) for frame in frames]

# convert annotations to new resolution
def convert_annotations(old_res, new_res, annotations):
  """Rescale annotation coordinates from one resolution to another.

  Args:
    old_res: ``(width, height)`` the coordinates are currently expressed in.
    new_res: ``(width, height)`` to convert the coordinates to.
    annotations: iterable of frames; each frame is an iterable of polylines
      and each polyline an iterable of ``(x, y)`` points.

  Returns:
    A 1-D numpy object array with one element per frame. Each element is a
    list of polylines, and each polyline a list of ``(x, y)`` tuples in the
    new resolution.
  """
  old_W, old_H = old_res
  new_W, new_H = new_res
  per_frame = list(annotations)

  # Fill a pre-sized 1-D object array element by element. np.array(...,
  # dtype=object) would return an N-D array whenever every frame happens to
  # have the same number of lines and points, so the result shape would
  # depend on the data and concatenating several videos could fail.
  scaled = np.empty(len(per_frame), dtype=object)
  for idx, polylines in enumerate(per_frame):
    scaled[idx] = [
      [((x * new_W) / old_W, (y * new_H) / old_H) for x, y in polyline]
      for polyline in polylines
    ]
  return scaled

# get training data from path
def get_data(video_path, annotations_path):
  """Load one video together with its road-edge annotations.

  Args:
    video_path: path of the mp4 file, opened with ``pims.Video``.
    annotations_path: path of the XML annotation file for that video.

  Returns:
    ``(frames, annotations)``: the ``pims.Video`` reader and the per-frame
    polylines rescaled from ``(annot_W, annot_H)`` to ``(W, H)``, with one
    annotation entry per video frame.
  """
  # get video frames
  frames = pims.Video(video_path, format="mp4")

  # get road edges data
  polylines = extract_polylines(annotations_path)
  per_frame = list(extract_frame_lines(polylines))

  # The annotation list only reaches the last annotated frame. Pad it with
  # empty frames (or trim it) to the video length so frame i and annotation i
  # stay paired, also after several videos are concatenated.
  # NOTE(review): relies on the pims reader supporting len() - confirm.
  n_video_frames = len(frames)
  per_frame = per_frame[:n_video_frames]
  per_frame.extend([] for _ in range(n_video_frames - len(per_frame)))

  annotations = convert_annotations((annot_W, annot_H), (W, H), per_frame)

  # the original fell off the end and returned None, breaking the caller's unpacking
  return frames, annotations

# make pims video into actual numpy frames
def conv_frames(frames):
  """Turn an iterable of video frames into one numpy array of resized images.

  Each frame goes through ``cv2.cvtColor(..., cv2.COLOR_BGR2RGB)`` and is
  then resized to ``(W, H)``.

  NOTE(review): the channel swap assumes the reader yields BGR frames; if it
  already yields RGB, the result is BGR instead - confirm.

  Args:
    frames: iterable of image arrays (e.g. a pims video).

  Returns:
    ``np.array`` built from the list of converted frames.
  """
  print("Getting frames into proper arrays")
  converted = [
    cv2.resize(cv2.cvtColor(raw, cv2.COLOR_BGR2RGB), (W, H))
    for raw in frames
  ]
  print("Frames converted to numpy arrays")
  return np.array(converted)

base_dir = "/content/drive/MyDrive/OpenCRD_dataset/"

# split the dataset directory listing into videos and annotation files,
# each in sorted order so index i of one list pairs with index i of the other
dir_entries = listdir(base_dir)
video_files = sorted(name for name in dir_entries if name.endswith(".mp4"))
annot_files = sorted(name for name in dir_entries if name.endswith(".xml"))

video_files = video_files[:2] # TODO: this is a temp hack, need to get all videos' annotations
print(video_files)
print(annot_files)

# every video kept above must have an annotation file
assert len(video_files) == len(annot_files)

['city_1.mp4', 'city_2.mp4']
['city_1_annotations.xml', 'city_2_annotations.xml']


In [10]:
# PYTORCH MODEL
# model for road edge detection
class REDetector(nn.Module):
  """Convolutional network that regresses road-edge polylines from a frame.

  Three conv -> batch-norm -> ReLU -> max-pool stages are followed by three
  fully connected layers. The output has ``n_coords * n_points * max_n_lines``
  values per sample (2 * 4 * 6 = 48).

  ``fc1`` is sized for a 3-channel input 160 pixels high and 320 wide, which
  leaves a 64 x 16 x 36 feature map after the third pooling stage.
  """

  def __init__(self):
    super(REDetector, self).__init__()

    # output polylines attributes (kept on the instance so callers can
    # build targets that match the output layout)
    self.n_coords = 2  # 2 coordinates: x,y
    self.n_points = 4  # number of points of each polyline
    self.max_n_lines = 6 # max number of polylines per frame

    # Convolutional Layers
    self.conv1 = nn.Conv2d(3, 16, 5)
    self.conv2d_bn1 = nn.BatchNorm2d(16)
    self.pool = nn.MaxPool2d(2, 2)
    self.conv2 = nn.Conv2d(16, 32, 5)
    self.conv2d_bn2 = nn.BatchNorm2d(32)
    self.conv3 = nn.Conv2d(32, 64, 5)
    self.conv2d_bn3 = nn.BatchNorm2d(64)

    # Fully Connected Layers
    self.fc1 = nn.Linear(64*16*36, 120) # for 320x160 image 64 channels
    self.bn1 = nn.BatchNorm1d(num_features=120)
    self.fc2 = nn.Linear(120, 84)
    self.bn2 = nn.BatchNorm1d(num_features=84)
    self.fc3 = nn.Linear(84, self.n_coords*self.n_points*self.max_n_lines)

  def forward(self, x):
    """Run the network on a batch shaped (batch, 3, 160, 320).

    Returns:
      Tensor of shape (batch, n_coords * n_points * max_n_lines).
    """
    # the batch-norm layers are registered as conv2d_bn*, not conv2_bn*
    x = self.pool(F.relu(self.conv2d_bn1(self.conv1(x))))
    x = self.pool(F.relu(self.conv2d_bn2(self.conv2(x))))
    x = self.pool(F.relu(self.conv2d_bn3(self.conv3(x))))
    x = x.view(-1, self.num_flat_features(x))
    x = F.relu(self.bn1(self.fc1(x)))
    x = F.relu(self.bn2(self.fc2(x)))
    x = self.fc3(x)
    # without this return the module produced None
    return x

  def num_flat_features(self, x):
    """Return the number of values per sample, i.e. the product of all dimensions of ``x`` except the first."""
    size = x.size()[1:]
    num_features = 1
    for s in size:
      num_features *= s
    return num_features

In [None]:
# TRAINING PROCESS
def _serialize_annotations(polylines, max_n_lines=6, n_points=4):
  """Flatten one frame's polylines into a fixed-length regression target.

  The defaults mirror the REDetector output layout (6 lines x 4 points x 2
  coordinates = 48 values).

  NOTE(review): this encoding is a proposal for the serialize step the
  original TODO asked for - confirm it matches the intended label format.

  Args:
    polylines: iterable of polylines, each an iterable of ``(x, y)`` points.
    max_n_lines: number of line slots; extra lines are dropped and unused
      slots stay zero.
    n_points: points stored per line; each polyline is resampled to this
      many points by picking evenly spaced indices (points repeat when the
      polyline is shorter).

  Returns:
    float32 numpy vector of length ``max_n_lines * n_points * 2``.
  """
  target = np.zeros((max_n_lines, n_points, 2), dtype=np.float32)
  for line_idx, polyline in enumerate(list(polylines)[:max_n_lines]):
    pts = np.asarray(polyline, dtype=np.float32).reshape(-1, 2)
    if len(pts) == 0:
      continue
    picks = np.linspace(0, len(pts) - 1, n_points).round().astype(int)
    target[line_idx] = pts[picks]
  return target.reshape(-1)

def train(frames, annotations, model):
  """Train ``model`` to regress serialized annotations from frames.

  Args:
    frames: array of images indexed by frame, channels last
      (frame, height, width, channels).
    annotations: per-frame polylines, same length as ``frames``.
    model: ``nn.Module`` mapping a (batch, channels, height, width) float
      tensor to a (batch, 48) tensor.

  Returns:
    The same ``model`` after training.

  Raises:
    ValueError: if ``frames`` and ``annotations`` differ in length.
  """
  if len(frames) != len(annotations):
    raise ValueError("frames and annotations must have the same length (%d != %d)"
                     % (len(frames), len(annotations)))
  frames = np.asarray(frames)

  loss_function = nn.MSELoss()
  optim = torch.optim.Adam(model.parameters(), lr=0.001)  # TODO: try new learning rates

  losses, accuracies = [], []
  epochs = 11
  BS = 128
  rng = np.random.default_rng()  # one generator for the whole run

  for epoch in range(epochs):
    print("[+] Epoch", epoch)
    t = trange(0, len(frames)-BS, BS)
    for _ in t:
      # sample frame *indices* (not frames) so images and labels stay paired
      samp = rng.choice(len(frames), size=BS, replace=False)

      # (batch, height, width, channels) -> (batch, channels, height, width)
      batch = np.ascontiguousarray(np.moveaxis(frames[samp], -1, 1))
      X = torch.tensor(batch).float().to(device)
      Y_train = np.stack([_serialize_annotations(annotations[j]) for j in samp])
      Y = torch.tensor(Y_train).float().to(device)

      # forward and back feed
      optim.zero_grad()
      out = model(X)
      # TODO: exact equality is a poor accuracy measure for regression outputs
      accuracy = (out == Y).float().mean()
      loss = loss_function(out, Y)  # MSELoss already averages to a scalar
      loss.backward()
      optim.step()

      # print stats
      loss = loss.item()
      accuracy = accuracy.item()
      losses.append(loss/100) # /100 so that we can see it in the graph
      accuracies.append(accuracy)
      t.set_description("loss %.2f accuracy %.2f out %.2f" % (loss, accuracy, out.mean().item()))

  # plot losses and accuracies
  plt.ylim(-0.1, 1.1)
  plot(losses)
  plot(accuracies)

  return model

if device.type == "cuda":
  torch.cuda.empty_cache()  # to avoid running out of cuda memory
  print("[~] Cleared cuda cache")

model = REDetector().to(device).train()

# Load every (video, annotation) pair. Per-file arrays are collected and
# joined once at the end, rather than re-concatenating (and recopying) all
# previously loaded data on every iteration.
frame_chunks, annotation_chunks = [], []
for i in trange(len(video_files)):
  video_path = base_dir+video_files[i]
  annot_path = base_dir+annot_files[i]
  print("[~] Loading from files: %s , %s" % (video_path, annot_path))
  frames, annotations = get_data(video_path, annot_path)
  frame_chunks.append(conv_frames(frames))
  annotation_chunks.append(annotations)

all_frames = np.concatenate(frame_chunks, axis=0)
all_annotations = np.concatenate(annotation_chunks, axis=0)

# free up memory: drop the per-file references now that everything is merged
# (the original assigned to the misspelled name `franes`, releasing nothing)
frames, annotations = [], []
del frame_chunks, annotation_chunks

print("[+] Training mode ...")
model = train(all_frames, all_annotations, model)
print("[+] Trained model on all data files")