Merge pull request #94 from luxonis/nn_on_left_right
Run NN on left/right mono cameras; add support for second stage NN
SzabolcsGergely committed Jul 20, 2020
2 parents a596142 + 84aa59c commit 99a60b3
Showing 13 changed files with 203 additions and 82 deletions.
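The flags this PR introduces compose with the existing CLI; a quick sketch of typical invocations (the model names are illustrative, not mandated by this commit):

    # run the default detector on both mono cameras (uses 2 NN engines)
    python3 depthai.py -cnn mobilenet-ssd -cam left_right

    # two-stage inference: a detector plus a second-stage network chained after it
    python3 depthai.py -cnn face-detection-retail-0004 -cnn2 emotions-recognition-retail-0003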
5 changes: 2 additions & 3 deletions calibrate.py
@@ -155,10 +155,9 @@ def __init__(self):
{
'mono':
{
- # 1280x720, 640x400 (binning enabled)
- # only 720/30 fps supported for now
+ # 1280x720, 1280x800, 640x400 (binning enabled)
  'resolution_h': 720,
- 'fps': 30,
+ 'fps': 30.0,
},
},
}
Binary file modified depthai.cmd
Binary file modified depthai.cpython-36m-x86_64-linux-gnu.so
Binary file modified depthai.cpython-37m-arm-linux-gnueabihf.so
144 changes: 109 additions & 35 deletions depthai.py
@@ -6,7 +6,7 @@
import os
import subprocess
from time import time, sleep, monotonic
-
+ from datetime import datetime
import cv2
import numpy as np
import depthai
@@ -17,7 +17,7 @@

from depthai_helpers.object_tracker_handler import show_tracklets

- global args
+ global args, cnn_model2
try:
args = vars(parse_args())
except:
@@ -83,6 +83,22 @@
suffix="_depth"
blob_file_config = cnn_model_path + suffix + ".json"

+ blob_file2 = ""
+ blob_file_config2 = ""
+ cnn_model2 = None
+ if args['cnn_model2']:
+     print("Using CNN2:", args['cnn_model2'])
+     cnn_model2 = args['cnn_model2']
+     cnn_model_path = consts.resource_paths.nn_resource_path + args['cnn_model2'] + "/" + args['cnn_model2']
+     blob_file2 = cnn_model_path + ".blob"
+     blob_file_config2 = cnn_model_path + ".json"
+     if not Path(blob_file2).exists():
+         cli_print("\nWARNING: NN2 blob not found in: " + blob_file2, PrintColors.WARNING)
+         os._exit(1)
+     if not Path(blob_file_config2).exists():
+         cli_print("\nWARNING: NN2 json not found in: " + blob_file_config2, PrintColors.WARNING)
+         os._exit(1)

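For reference, the second-stage model is resolved the same way as the first: a .blob and a .json living in a directory named after the model. With -cnn2 emotions-recognition-retail-0003 (an illustrative model name) the paths above become:

    <nn_resource_path>/emotions-recognition-retail-0003/emotions-recognition-retail-0003.blob
    <nn_resource_path>/emotions-recognition-retail-0003/emotions-recognition-retail-0003.json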
blob_file_path = Path(blob_file)
blob_file_config_path = Path(blob_file_config)
if not blob_file_path.exists():
@@ -116,6 +132,13 @@
"Disconnect/connect usb cable on host! \n", PrintColors.RED)
os._exit(1)

+ if args['cnn_camera'] == 'left_right':
+     if args['NN_engines'] is None:
+         args['NN_engines'] = 2
+         args['shaves'] = 6 if args['shaves'] is None else args['shaves'] - args['shaves'] % 2
+         args['cmx_slices'] = 6 if args['cmx_slices'] is None else args['cmx_slices'] - args['cmx_slices'] % 2
+         compile_model = True
+         cli_print('Running NN on both cams requires 2 NN engines!', PrintColors.RED)

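The rounding above keeps the SHAVE and CMX budgets evenly divisible between the two NN engines; a minimal sketch of the arithmetic, assuming the user passed -sh 7 -cmx 7:

    shaves = 7 - 7 % 2      # -> 6, i.e. 3 SHAVEs per NN engine
    cmx_slices = 7 - 7 % 2  # -> 6, i.e. 3 CMX slices per NN engine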
default_blob=True
if compile_model:
@@ -124,8 +147,6 @@
cmx_slices = args['cmx_slices']
NCE_nr = args['NN_engines']

- outblob_file = blob_file + ".sh" + str(shave_nr) + "cmx" + str(cmx_slices) + "NCE" + str(NCE_nr)
-
if NCE_nr == 2:
if shave_nr % 2 == 1 or cmx_slices % 2 == 1:
cli_print("shave_nr and cmx_slices config must be even number when NCE is 2!", PrintColors.RED)
@@ -135,7 +156,9 @@
else:
shave_nr_opt = int(shave_nr)
cmx_slices_opt = int(cmx_slices)

+
+ outblob_file = blob_file + ".sh" + str(shave_nr) + "cmx" + str(cmx_slices) + "NCE" + str(NCE_nr)

if(not Path(outblob_file).exists()):
cli_print("Compiling model for {0} shaves, {1} cmx_slices and {2} NN_engines ".format(str(shave_nr), str(cmx_slices), str(NCE_nr)), PrintColors.RED)
ret = depthai.download_blob(args['cnn_model'], shave_nr_opt, cmx_slices_opt, NCE_nr, outblob_file)
@@ -150,6 +173,22 @@
cli_print("Compiled mode found: compiled for {0} shaves, {1} cmx_slices and {2} NN_engines ".format(str(shave_nr), str(cmx_slices), str(NCE_nr)), PrintColors.GREEN)
blob_file = outblob_file

+ if args['cnn_model2']:
+     outblob_file = blob_file2 + ".sh" + str(shave_nr) + "cmx" + str(cmx_slices) + "NCE" + str(NCE_nr)
+     if(not Path(outblob_file).exists()):
+         cli_print("Compiling model2 for {0} shaves, {1} cmx_slices and {2} NN_engines ".format(str(shave_nr), str(cmx_slices), str(NCE_nr)), PrintColors.RED)
+         ret = depthai.download_blob(args['cnn_model2'], shave_nr_opt, cmx_slices_opt, NCE_nr, outblob_file)
+         # ret = subprocess.call(['model_compiler/download_and_compile.sh', args['cnn_model'], shave_nr_opt, cmx_slices_opt, NCE_nr])
+         print(str(ret))
+         if(ret != 0):
+             cli_print("Model compile failed. Falling back to default.", PrintColors.WARNING)
+             default_blob=True
+         else:
+             blob_file2 = outblob_file
+     else:
+         cli_print("Compiled mode found: compiled for {0} shaves, {1} cmx_slices and {2} NN_engines ".format(str(shave_nr), str(cmx_slices), str(NCE_nr)), PrintColors.GREEN)
+         blob_file2 = outblob_file

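As with the first stage, the compiled second-stage blob is cached under a name that encodes the compile options, so later runs with the same settings skip recompilation; for example (hypothetical values):

    emotions-recognition-retail-0003.blob.sh6cmx6NCE2  # 6 shaves, 6 CMX slices, 2 NN engines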
if default_blob:
#default
shave_nr = 7
@@ -169,14 +208,17 @@
'calibration_file': consts.resource_paths.calib_fpath,
'padding_factor': 0.3,
'depth_limit_m': 10.0, # In meters, for filtering purpose during x,y,z calc
- 'confidence_threshold' : 0.5, #Depth is calculated for bounding boxes with confidence higher than this number
+ 'confidence_threshold' : 0.5, #Depth is calculated for bounding boxes with confidence higher than this number
},
'ai':
{
'blob_file': blob_file,
'blob_file_config': blob_file_config,
+ 'blob_file2': blob_file2,
+ 'blob_file_config2': blob_file_config2,
'calc_dist_to_bb': calc_dist_to_bb,
'keep_aspect_ratio': not args['full_fov_nn'],
+ 'camera_input': args['cnn_camera'],
'shaves' : shave_nr,
'cmx_slices' : cmx_slices,
'NN_engines' : NCE_nr,
@@ -209,8 +251,7 @@
},
'mono':
{
- # 1280x720, 640x400 (binning enabled)
- # only 720/30 fps supported for now
+ # 1280x720, 1280x800, 640x400 (binning enabled)
'resolution_h': args['mono_resolution'],
'fps': args['mono_fps'],
},
@@ -285,13 +326,25 @@
t_start = time()
frame_count = {}
frame_count_prev = {}
- for s in stream_names:
-     frame_count[s] = 0
-     frame_count_prev[s] = 0

nnet_prev = {}
- nnet_prev["entries_prev"] = []
- nnet_prev["nnet_source"] = []
+ nnet_prev["entries_prev"] = {}
+ nnet_prev["nnet_source"] = {}
+ frame_count['nn'] = {}
+ frame_count_prev['nn'] = {}
+ for s in stream_names:
+     stream_windows = []
+     if s == 'previewout':
+         for cam in {'rgb', 'left', 'right'}:
+             nnet_prev["entries_prev"][cam] = []
+             nnet_prev["nnet_source"][cam] = []
+             frame_count['nn'][cam] = 0
+             frame_count_prev['nn'][cam] = 0
+             stream_windows.append(s + '-' + cam)
+     else:
+         stream_windows.append(s)
+     for w in stream_windows:
+         frame_count[w] = 0
+         frame_count_prev[w] = 0

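After this initialization, NN results and frame counters are keyed per camera instead of being single values; with previewout enabled the state looks roughly like this sketch (exact keys depend on the active streams):

    nnet_prev["entries_prev"]  # {'rgb': [], 'left': [], 'right': []}
    frame_count['nn']          # {'rgb': 0, 'left': 0, 'right': 0}
    frame_count                # {'previewout-rgb': 0, 'previewout-left': 0, ...}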
tracklets = None

@@ -332,60 +385,73 @@ def on_trackbar_change(value):
os._exit(10)

for _, nnet_packet in enumerate(nnet_packets):
- frame_count["metaout"] += 1
-
- nnet_prev["nnet_source"] = nnet_packet
- nnet_prev["entries_prev"] = decode_nn(nnet_packet)
+ camera = nnet_packet.getMetadata().getCameraName()
+ nnet_prev["nnet_source"][camera] = nnet_packet
+ nnet_prev["entries_prev"][camera] = decode_nn(nnet_packet, config=config)
+ frame_count['metaout'] += 1
+ frame_count['nn'][camera] += 1

for packet in data_packets:
+ window_name = packet.stream_name
if packet.stream_name not in stream_names:
continue # skip streams that were automatically added
packetData = packet.getData()
if packetData is None:
print('Invalid packet data!')
continue
elif packet.stream_name == 'previewout':

+ camera = packet.getMetadata().getCameraName()
+ window_name = 'previewout-' + camera
# the format of previewout image is CHW (Channel, Height, Width), but OpenCV needs HWC, so we
# change shape (3, 300, 300) -> (300, 300, 3)
data0 = packetData[0,:,:]
data1 = packetData[1,:,:]
data2 = packetData[2,:,:]
frame = cv2.merge([data0, data1, data2])

- nn_frame = show_nn(nnet_prev["entries_prev"], frame, labels=labels, config=config)
+ nn_frame = show_nn(nnet_prev["entries_prev"][camera], frame, labels=labels, config=config)
if enable_object_tracker and tracklets is not None:
nn_frame = show_tracklets(tracklets, nn_frame, labels)
- cv2.putText(nn_frame, "fps: " + str(frame_count_prev[packet.stream_name]), (25, 50), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 0, 0))
- cv2.imshow('previewout', nn_frame)
+ cv2.putText(nn_frame, "fps: " + str(frame_count_prev[window_name]), (25, 50), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 0, 0))
+ cv2.putText(nn_frame, "NN fps: " + str(frame_count_prev['nn'][camera]), (2, frame.shape[0]-4), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0, 255, 0))
+ cv2.imshow(window_name, nn_frame)
elif packet.stream_name == 'left' or packet.stream_name == 'right' or packet.stream_name == 'disparity':
frame_bgr = packetData
cv2.putText(frame_bgr, packet.stream_name, (25, 25), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 0, 0))
- cv2.putText(frame_bgr, "fps: " + str(frame_count_prev[packet.stream_name]), (25, 50), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 0, 0))
+ cv2.putText(frame_bgr, "fps: " + str(frame_count_prev[window_name]), (25, 50), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 0, 0))
if args['draw_bb_depth']:
-     show_nn(nnet_prev["entries_prev"], frame_bgr, labels=labels, config=config, nn2depth=nn2depth)
- cv2.imshow(packet.stream_name, frame_bgr)
+     camera = args['cnn_camera']
+     if packet.stream_name == 'disparity':
+         if camera == 'left_right':
+             camera = 'right'
+     elif camera != 'rgb':
+         camera = packet.getMetadata().getCameraName()
+     show_nn(nnet_prev["entries_prev"][camera], frame_bgr, labels=labels, config=config, nn2depth=nn2depth)
+ cv2.imshow(window_name, frame_bgr)
elif packet.stream_name.startswith('depth'):
frame = packetData

if len(frame.shape) == 2:
if frame.dtype == np.uint8: # grayscale
cv2.putText(frame, packet.stream_name, (25, 25), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 0, 255))
- cv2.putText(frame, "fps: " + str(frame_count_prev[packet.stream_name]), (25, 50), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 0, 255))
+ cv2.putText(frame, "fps: " + str(frame_count_prev[window_name]), (25, 50), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 0, 255))
else: # uint16
frame = (65535 // frame).astype(np.uint8)
#colorize depth map, comment out code below to obtain grayscale
frame = cv2.applyColorMap(frame, cv2.COLORMAP_HOT)
# frame = cv2.applyColorMap(frame, cv2.COLORMAP_JET)
cv2.putText(frame, packet.stream_name, (25, 25), cv2.FONT_HERSHEY_SIMPLEX, 1.0, 255)
- cv2.putText(frame, "fps: " + str(frame_count_prev[packet.stream_name]), (25, 50), cv2.FONT_HERSHEY_SIMPLEX, 1.0, 255)
+ cv2.putText(frame, "fps: " + str(frame_count_prev[window_name]), (25, 50), cv2.FONT_HERSHEY_SIMPLEX, 1.0, 255)
else: # bgr
cv2.putText(frame, packet.stream_name, (25, 25), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255))
- cv2.putText(frame, "fps: " + str(frame_count_prev[packet.stream_name]), (25, 50), cv2.FONT_HERSHEY_SIMPLEX, 1.0, 255)
+ cv2.putText(frame, "fps: " + str(frame_count_prev[window_name]), (25, 50), cv2.FONT_HERSHEY_SIMPLEX, 1.0, 255)

if args['draw_bb_depth']:
-     show_nn(nnet_prev["entries_prev"], frame, labels=labels, config=config, nn2depth=nn2depth)
- cv2.imshow(packet.stream_name, frame)
+     camera = args['cnn_camera']
+     if camera == 'left_right':
+         camera = 'right'
+     show_nn(nnet_prev["entries_prev"][camera], frame, labels=labels, config=config, nn2depth=nn2depth)
+ cv2.imshow(window_name, frame)

elif packet.stream_name == 'jpegout':
jpg = packetData
@@ -404,21 +470,29 @@ def on_trackbar_change(value):
' CSS:' + '{:6.2f}'.format(dict_['sensors']['temperature']['css']),
' MSS:' + '{:6.2f}'.format(dict_['sensors']['temperature']['mss']),
' UPA:' + '{:6.2f}'.format(dict_['sensors']['temperature']['upa0']),
- ' DSS:' + '{:6.2f}'.format(dict_['sensors']['temperature']['upa1']))
+ ' DSS:' + '{:6.2f}'.format(dict_['sensors']['temperature']['upa1']))
elif packet.stream_name == 'object_tracker':
tracklets = packet.getObjectTracker()

- frame_count[packet.stream_name] += 1
+ frame_count[window_name] += 1

t_curr = time()
if t_start + 1.0 < t_curr:
t_start = t_curr
# print("metaout fps: " + str(frame_count_prev["metaout"]))

for s in stream_names:
-     frame_count_prev[s] = frame_count[s]
-     frame_count[s] = 0

+     stream_windows = []
+     if s == 'previewout':
+         for cam in {'rgb', 'left', 'right'}:
+             stream_windows.append(s + '-' + cam)
+             frame_count_prev['nn'][cam] = frame_count['nn'][cam]
+             frame_count['nn'][cam] = 0
+     else:
+         stream_windows.append(s)
+     for w in stream_windows:
+         frame_count_prev[w] = frame_count[w]
+         frame_count[w] = 0

key = cv2.waitKey(1)
if key == ord('c'):
2 changes: 1 addition & 1 deletion depthai_helpers/age_gender_recognition_handler.py
@@ -2,7 +2,7 @@
import numpy as np


- def decode_age_gender_recognition(nnet_packet):
+ def decode_age_gender_recognition(nnet_packet, **kwargs):
detections = []
for _, e in enumerate(nnet_packet.entries()):
if e[1]["female"] > 0.8 or e[1]["male"] > 0.8:
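The decoder signatures gain **kwargs because the main loop now calls decode_nn(nnet_packet, config=config); handlers that ignore the extra keyword keep working unchanged. A minimal sketch of a decoder that does consume it (decode_example and its config lookup are hypothetical, not part of this commit):

    def decode_example(nnet_packet, **kwargs):
        config = kwargs.get('config', {})  # hypothetical: read decoder options from the new keyword
        return [e for _, e in enumerate(nnet_packet.entries())]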
16 changes: 10 additions & 6 deletions depthai_helpers/cli_utils.py
@@ -51,13 +51,13 @@ def parse_args():
parser.add_argument("-nce", "--NN_engines", default=None, type=int,
help="Number of NN_engines used by NN.")
parser.add_argument("-rgbr", "--rgb_resolution", default=1080, type=int,
-     help="RGB cam res config: 1080 or 2160 are supported.")
- parser.add_argument("-rgbf", "--rgb_fps", default=30, type=int,
-     help="RGB cam fps config: 30 fps is supported.")
+     help="RGB cam res height: (1920x)1080, (3840x)2160 or (4056x)3040. Default: %(default)s")
+ parser.add_argument("-rgbf", "--rgb_fps", default=30.0, type=float,
+     help="RGB cam fps: max 118.0 for H:1080, max 42.0 for H:2160. Default: %(default)s")
parser.add_argument("-monor", "--mono_resolution", default=720, type=int,
-     help="Mono cam res config: 720 or 480 are supported.")
- parser.add_argument("-monof", "--mono_fps", default=30, type=int,
-     help="Mono cam fps config: 30 fps is supported.")
+     help="Mono cam res height: (1280x)720, (1280x)800 or (640x)400 - binning. Default: %(default)s")
+ parser.add_argument("-monof", "--mono_fps", default=30.0, type=float,
+     help="Mono cam fps: max 60.0 for H:720 or H:800, max 120.0 for H:400. Default: %(default)s")
parser.add_argument("-dct", "--disparity_confidence_threshold", default=200, type=disparity_ct_type,
help="Disparity_confidence_threshold.")
parser.add_argument("-fv", "--field-of-view", default=None, type=float,
Expand Down Expand Up @@ -85,6 +85,10 @@ def parse_args():
help="Force usb2 connection")
parser.add_argument("-cnn", "--cnn_model", default="mobilenet-ssd", type=str,
help="Cnn model to run on DepthAI")
+ parser.add_argument("-cnn2", "--cnn_model2", default="", type=str,
+     help="Cnn model to run on DepthAI for second-stage inference")
+ parser.add_argument('-cam', "--cnn_camera", default='rgb', choices=['rgb', 'left', 'right', 'left_right'],
+     help='Choose camera input for CNN (default: %(default)s)')
parser.add_argument("-dd", "--disable_depth", default=False, action="store_true",
help="Disable depth calculation on CNN models with bounding box output")
parser.add_argument("-bb", "--draw-bb-depth", default=False, action="store_true",
2 changes: 1 addition & 1 deletion depthai_helpers/emotion_recognition_handler.py
@@ -2,7 +2,7 @@
import numpy as np


- def decode_emotion_recognition(nnet_packet):
+ def decode_emotion_recognition(nnet_packet, **kwargs):
detections = []
for i in range(len(nnet_packet.entries()[0][0])):
detections.append(nnet_packet.entries()[0][0][i])
2 changes: 1 addition & 1 deletion depthai_helpers/landmarks_recognition_handler.py
@@ -2,7 +2,7 @@
import numpy as np


- def decode_landmarks_recognition(nnet_packet):
+ def decode_landmarks_recognition(nnet_packet, **kwargs):
landmarks = []
for i in range(len(nnet_packet.entries()[0][0])):
landmarks.append(nnet_packet.entries()[0][0][i])
