# **MODEL C: YOLOv3 + SORT + Early Fused Skeleton + ST-DenseNet** 
## A unified framework for pedestrian intention prediction.
1. **YOLOv3** -> Object detector: responsible for identifying and detecting objects of interest in a given frame or image.

2. **SORT** -> Object tracker: responsible for tracking the detected objects.

3. **Early Fused Skeleton** -> Skeleton mapping: Skeletons are then mapped for each tracked pedestrian.

4. **Spatio-Temporal DenseNet** -> Classifier: responsible for classifying the intention of every identified and tracked pedestrian using the pedestrian's last 16 frames.

*The code for YOLOv3 was adapted from the GitHub repo: https://github.com/zzh8829/yolov3-tf2*

*The code for SORT was adapted from the GitHub repo: https://github.com/abewley/sort*

*The code for skeleton fitting (TF-PoseEstimator) was adapted from the GitHub repo: https://github.com/ildoonet/tf-pose-estimation*

*The code for ST-DenseNet was adapted from the GitHub repo: https://github.com/GalDude33/DenseNetFCN-3D*



## **INSTRUCTIONS TO RUN THE MODEL ON GOOGLE COLAB**

This project was completely developed on Google Colab.



### 1. Connect the runtime to a GPU for better/faster results.


### 2. Clone the repository to Colab.

In [0]:
# Run this to clone the Volvo-DataX repository into the current working
# directory; the later cells `%cd` into the resulting Volvo-DataX folder.
!git clone https://github.com/mjpramirez/Volvo-DataX

### 3. Run this to install the dependencies.

In [0]:

%cd Volvo-DataX/tf-pose-estimation
! pip3 install -r requirements.txt
%cd tf_pose/pafprocess
! sudo apt install swig
!swig -python -c++ pafprocess.i && python3 setup.py build_ext --inplace

### 4. Next, click this link to open the shared folder in your Google Drive: https://drive.google.com/open?id=1HxKtxBva3US2AJfohlKfjYSdhHvjt2Yc and add a shortcut to that folder in your main Drive folder.

Finally, run the cell below to mount your Google Drive.


In [0]:
from google.colab import drive
# Mount Google Drive at /content/drive; the 'weights' flag default defined
# later in this notebook points to a file under '/content/drive/My Drive/'.
drive.mount('/content/drive')

### 5. To run the remaining cells below, read the comments and run the cells accordingly. Some cells may print warnings when run; please ignore them.

In [0]:
# Run this.
# Work from the repository root so that the relative paths used below
# (e.g. 'data/coco.names') and the local package imports resolve.
%cd /content/Volvo-DataX
# filterpy is installed here before `sortn` is imported below
# (presumably a dependency of the SORT tracker -- TODO confirm).
!pip install filterpy
# Request TensorFlow 2.x through the Colab-only magic; any failure is ignored
# so the cell still continues when the magic cannot be applied.
try:
  %tensorflow_version 2.x
except Exception:
  pass
import glob
 
import sys #Run this
from absl import app, logging, flags
from absl.flags import FLAGS
import time
import cv2
import numpy as np
import tensorflow as tf
from yolov3_tf2.models import (
    YoloV3, YoloV3Tiny
)
from yolov3_tf2.dataset import transform_images, load_tfrecord_dataset
from yolov3_tf2.utils import draw_outputs
from sortn import *

# Turn off eager execution for everything that is built after this point.
tf.compat.v1.disable_eager_execution()

# Enable on-demand GPU memory growth instead of reserving all memory up front.
# Iterating over the device list (rather than indexing element 0) makes this a
# no-op on CPU-only runtimes, where the list is empty and indexing would raise
# IndexError, and applies the same setting to every visible GPU.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu_device in physical_devices:
  tf.config.experimental.set_memory_growth(gpu_device, True)

# Command-line style configuration, registered through absl flags so the
# values can be read from FLAGS in the cells below.
# Detector inputs: class-name file, trained weights (stored on the mounted
# Google Drive), model variant, input resolution and number of classes.
flags.DEFINE_string('classes', 'data/coco.names', 'path to classes file')
flags.DEFINE_string('weights', '/content/drive/My Drive/datax_volvo_additional_files/yolov3_train_5.tf','path to weights file')
flags.DEFINE_boolean('tiny', False, 'yolov3 or yolov3-tiny')
flags.DEFINE_integer('size', 416, 'resize images to')
flags.DEFINE_string('tfrecord', None, 'tfrecord instead of image')
flags.DEFINE_integer('num_classes', 1, 'number of classes in the model')
# Video input and output: source video, output file name and the FourCC codec
# passed to cv2.VideoWriter.
flags.DEFINE_string('video', 'data/JAAD_test_video_0339.mp4','path to video file or number for webcam)')
flags.DEFINE_string('output','Result_model_C.mp4', 'path to output video')
flags.DEFINE_string('output_format', 'mp4v', 'codec used in VideoWriter when saving video to file')

# Parse the flags with a synthetic argv (['yolov3']) because a notebook has no
# real command line. NOTE(review): `_run_init` is a private absl helper --
# presumably it may change between absl releases; confirm when upgrading.
app._run_init(['yolov3'], app.parse_flags_with_usage)


In [0]:
# Switch into the tf-pose-estimation checkout so that the `tf_pose` package
# can be imported from the working directory.
%cd /content/Volvo-DataX/tf-pose-estimation
from tf_pose.estimator import TfPoseEstimator
from tf_pose.networks import get_graph_path, model_wh
from tf_pose.estimator import Human
# Pose estimator used for skeleton fitting; it is built from the graph named
# 'egen_jaad_1_5' with a 100x100 target size.
model = TfPoseEstimator(get_graph_path('egen_jaad_1_5'), target_size=(100, 100))

# Back to the repository root: the classifier files below are given as
# relative paths.
%cd /content/Volvo-DataX
# Rebuild the classifier architecture from its JSON description ...
with open('densenet_model.json', 'r') as json_file:
    json_savedModel= json_file.read()

model_j = tf.keras.models.model_from_json(json_savedModel)
# ... and load its weights from the HDF5 file.
model_j.load_weights('densenet_2.hdf5')

def pred_func(X_test):
  """Classify the first sample of a batch with the module-level ``model_j``.

  Args:
    X_test: batch of input sequences; only the first entry (``X_test[0:1]``)
      is passed to the model.

  Returns:
    The argmax along axis 0 of the model output for that first sample.
  """
  first_sample = X_test[0:1]
  sample_scores = model_j.predict(first_sample, verbose=0)[0]
  return np.argmax(sample_scores, axis=0)

In [0]:
# Run this
# Thresholds read by the YOLO model code through FLAGS.
FLAGS.yolo_iou_threshold = 0.5
FLAGS.yolo_score_threshold = 0.5

# Default drawing settings (OpenCV colors are given in BGR order).
color = (255, 0, 0)
thickness = 2

# Build the detector and restore the trained weights. expect_partial() is
# called on the load status so that a checkpoint which is only partially
# consumed is accepted.
yolo = YoloV3(classes=FLAGS.num_classes)

yolo.load_weights(FLAGS.weights).expect_partial()
logging.info('weights loaded')

# One class name per line. The context manager closes the file handle, which
# a bare open(...).readlines() leaves to the garbage collector.
with open(FLAGS.classes) as classes_file:
  class_names = [c.strip() for c in classes_file]
logging.info('classes loaded')

# Value handed to the pose estimator as `upsample_size` in run_model().
resize_out_ratio = 4.0
fps_time = 0

def run_model():
  """Run detection, tracking, skeleton overlay and intent classification.

  Reads the video named by ``FLAGS.video`` (a numeric string is opened as a
  camera index, anything else as a file path) and, for every frame:

  1. runs the module-level ``yolo`` detector,
  2. passes the class-0 detections to a SORT tracker,
  3. draws the skeleton found by the module-level pose ``model`` onto each
     tracked box,
  4. classifies each track with ``pred_func`` from its stored crops and draws
     the box in red when the result is 1 and in green otherwise,
  5. stamps the frame number and, when ``FLAGS.output`` is set, writes the
     annotated frame to that file.

  The capture and the writer are always released on exit, including when an
  error interrupts processing, so that the output file is finalized.

  Returns:
    None.
  """
  seq_len = 16            # number of crops per track handed to the classifier
  crop_size = (100, 100)  # every stored crop is resized to this size
  thickness = 2

  print('Processing started.......')

  # int() raises ValueError for anything that is not a numeric camera index.
  try:
    vid = cv2.VideoCapture(int(FLAGS.video))
  except ValueError:
    vid = cv2.VideoCapture(FLAGS.video)

  out = None
  frame = 0

  try:
    if FLAGS.output:
      # by default VideoCapture returns float instead of int
      width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
      height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
      fps = int(vid.get(cv2.CAP_PROP_FPS))
      codec = cv2.VideoWriter_fourcc(*FLAGS.output_format)
      out = cv2.VideoWriter(FLAGS.output, codec, fps, (width, height))

    # create instance of SORT
    mot_tracker = Sort()
    # track id -> list of the most recent (at most seq_len) resized crops
    rolling_data = {}

    while True:
      _, img = vid.read()

      if img is None:
        break

      frame += 1
      img_h, img_w = img.shape[0:2]
      # (width, height): scales the tracker's box coordinates to pixels.
      wh = np.flip(img.shape[0:2])

      img_in = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
      img_in = tf.expand_dims(img_in, 0)
      img_in = transform_images(img_in, FLAGS.size)

      boxes, scores, classes, nums = yolo.predict(img_in, steps=1)  # yolo prediction
      dets = boxes[:, :nums[0], :].reshape(nums[0], 4)
      # keep only class 0 detections for tracking
      trackers = mot_tracker.update(dets[classes[0][:nums[0]] == 0])

      for d in trackers:
        track_id = int(d[4])
        x1y1 = tuple((np.array(d[0:2]) * wh).astype(np.int32))
        x2y2 = tuple((np.array(d[2:4]) * wh).astype(np.int32))

        # Clamp the crop window to the image. A negative start index would
        # otherwise be read as "count from the end" and give an empty crop.
        left = max(int(x1y1[0]), 0)
        top = max(int(x1y1[1]), 0)
        right = min(int(x2y2[0]), img_w)
        bottom = min(int(x2y2[1]), img_h)
        has_area = right > left and bottom > top

        # Skeleton overlay is best effort: a failure is logged and the frame
        # is kept without a skeleton for this track.
        if has_area:
          try:
            cropped = img[top:bottom, left:right]
            humans = model.inference(cropped, resize_to_default=True, upsample_size=resize_out_ratio)
            humans.sort(key=lambda human: human.score, reverse=True)
            img[top:bottom, left:right] = TfPoseEstimator.draw_humans(cropped, humans, imgcopy=True)
          except Exception as err:
            logging.warning('skeleton overlay failed for track %s in frame %s: %s', track_id, frame, err)

        # Classify from the stored history. Until seq_len crops are available
        # the latest crop is repeated to fill the sequence. Tracks without
        # any history keep intent 0.
        intent = 0
        history = rolling_data.get(track_id)
        if history:
          frames = history if len(history) == seq_len else [history[-1]] * seq_len
          seq = np.stack(np.array(frames), axis=2)
          seq = np.expand_dims(seq, axis=0)
          intent = pred_func(seq)  # classification output

        # risky pedestrian identification thru box color (BGR: red vs green)
        color = (0, 0, 255) if intent == 1 else (0, 255, 0)

        img = cv2.rectangle(img, x1y1, x2y2, color, thickness)
        img = cv2.putText(img, str(track_id), org=(x1y1[0], x1y1[1] - 5), fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1, color=color, thickness=thickness)

        # storing the data for last 16 frames
        # NOTE(review): the crop is taken after the box and id label have been
        # drawn, so the stored image contains them -- confirm this matches how
        # the classifier was trained.
        if has_area:
          try:
            cropped_img = cv2.resize(img[top:bottom, left:right], crop_size)
          except Exception as err:
            logging.warning('could not store crop for track %s in frame %s: %s', track_id, frame, err)
          else:
            history = rolling_data.setdefault(track_id, [])
            history.append(np.asarray(cropped_img))
            # Drop the oldest entries so that at most seq_len crops remain.
            del history[:-seq_len]

      # Stamp the frame number once per frame, after the tracker loop, so it
      # also appears on frames without any tracked pedestrian.
      img = cv2.putText(img, "Frame No: {}".format(frame), (0, 30), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (255, 0, 0), 2)

      if out is not None:
        out.write(img)

      if cv2.waitKey(1) == ord('q'):
        break
  finally:
    # Release the capture and flush/close the writer even when processing
    # stops with an error.
    vid.release()
    if out is not None:
      out.release()
    cv2.destroyAllWindows()

  print('\nProcessing completed.......!!!')
  print('Check video file in Volvo-DataX folder!')

  return


### 6. Run this to obtain the Model-C output as a video file named **'Result_model_C.mp4'** in the Volvo-DataX folder.
After calling run_model(), expect a run time of around 7 minutes on a GPU and around 30 minutes on a CPU.

In [0]:
# Process the video selected by FLAGS.video; the annotated result is written
# to the file named by FLAGS.output.
run_model()