# Imports

### Libraries

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import random
from random import randint
import os
import glob
import cv2
import pickle
from zipfile import ZipFile
import imageio
import time
import statistics
import editdistance
import traceback
import cProfile
import LM.clean
import LM.segment

%matplotlib inline
figure_size = 15
plt.rcParams['figure.figsize'] = (figure_size, figure_size)
plt.rcParams['image.interpolation'] = 'nearest'

from SegLink import datasets
from SegLink.ssd_utils import load_weights
from SegLink.ssd_data import InputGenerator, preprocess
from SegLink.sl_model import SL512, SL512v2
from SegLink.sl_utils import PriorUtil, rbox_to_polygon, polygon_to_rbox
from SegLink.sl_metric import evaluate_results, fscore
from SegLink.ssd_viz import plot_box, escape_latex
from SegLink.ssd_training import Logger

import keras
import keras.backend as K
from keras.backend.tensorflow_backend import set_session
from keras.callbacks import Callback
from keras.optimizers import SGD, Adam
import tensorflow as tf

from CRNN.crnn_model import CRNN, CRNNv2
from CRNN.crnn_data import InputGenerator, crop_words 
from CRNN.crnn_utils import alphabet87 as alphabet
from CRNN.crnn_utils import decode

In [None]:
# dsodmodel = DSODSL512()
# dsodmodel.summary()

### Datasets

In [None]:
# Load the pickled SynthText ground-truth utility and carve it into
# train / validation / test subsets.
# GTUtility is not referenced by name below; presumably it has to be importable
# so that pickle can reconstruct the stored object -- TODO confirm.
from data_synthtext import GTUtility
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open('gt_util_synthtext_seglink.pkl', 'rb') as f:
    gt_util = pickle.load(f)
# Every split() call returns two utilities; presumably `split` is the fraction
# assigned to the first one -- verify against GTUtility.split.
gt_util_train, gt_util_other = gt_util.split(gt_util, split=0.5)
# `foo` is the remainder of the validation split; it is split once more below.
gt_util_val, foo = gt_util.split(gt_util_other, split=0.03)
# `bar` (the final remainder) is not used by any cell visible in this section.
gt_util_test, bar = gt_util.split(foo, split=0.03)

# Show whatever textual summary each utility provides.
print(gt_util_train)
print(gt_util_val)
print(gt_util_test)

In [None]:
# from data_svt import GTUtility
# gt_util_test = GTUtility('data/SVT/', test=True, polygon=True)

# from data_icdar2015fst import GTUtility
# gt_util_test = GTUtility('data/ICDAR2015_FST/', test=True, polygon=False)

# gt_util_test = GTUtility.merge(gt_util_test1, gt_util_test2)

# Initialize Models

### SegLink

In [None]:
# Build the SegLink detector and load trained weights into it.
# model = SL512(input_shape=(384,512,3))
model = SL512v2()
# prior_util is used further down to decode raw network output into boxes.
prior_util = PriorUtil(model)
# image_size is unpacked as (height, width) by the cells below.
image_size = model.image_size
# Alternative checkpoints kept for reference; only one weights_path is active.
# weights_path = './checkpoints/201807282000_sl_pretrain_fixed_dataset_continutation/weights.010.h5'
# weights_path = './cps/sl/original_sl_2013/weights.999.h5'
weights_path = './checkpoints/weights.004.h5'
# weights_path = './checkpointsw/weights.002.h5'
load_weights(model, weights_path)
# Directory that holds the active checkpoint file.
checkdir = os.path.split(weights_path)[0]
model.summary()

### CRNN

In [None]:
# Build the CRNN recogniser in prediction-only mode and load fine-tuned weights.
# input_width / input_height are also passed to crop_words() in later cells, so
# the word crops match the network input.
input_width = 256 #256 
input_height = 32 #32
# The input shape is given as (width, height, 1); later cells transpose each
# cropped word with transpose(1,0,2) before calling predict().
crnn_model = CRNNv2((input_width, input_height, 1), len(alphabet), prediction_only=True, gru=False)
load_weights(crnn_model, './checkpointsw/201809190735_crnn_fine_tune/weights.080000.h5')
# load_weights(crnn_model, './checkpointsw/PRETRAINED WEIGHTS/CRNN_with_LSTM/201806162129_crnn_lstm_synthtext/weights.400000.h5')
# crnn_model.summary()
# model, model_pred = CRNNv2(input_shape, len(alphabet), gru=False)

# freeze = ['conv1_1', 'conv2_1', 'conv3_1', 'conv3_2']

# names = [weight.name for layer in crnn_model.layers for weight in layer.weights]
# weights = crnn_model.get_weights()

# for name, weight in zip(names, weights):
#     print(name, weights)
#     print("========================================================")

# for layer in crnn_model.layers:
#     layer.trainable = False
    
# unfreeze = ['conv4_11111', 
#             'batchnorm111', 
#             'act111', 
#             'conv4_22222', 
#             'batchnorm222', 
#             'act222',
#             'conv5_11111', 
#             'batchnorm333',
#             'act333',
#             'conv5_22222',
#             'batchnorm444',
#             'act444'
#            ]
# for layer in crnn_model.layers:
#     if layer.name in unfreeze:
#         layer.trainable = True

# for layer in crnn_model.layers:
#     print(layer.name, "\t\t\t\t\t\t", layer.trainable)

# Predictions

### Detection

In [None]:
# Run the SegLink detector on a random sample of test images.
#
# Globals produced for the following cells:
#   inputs       -- preprocessed network inputs (stacked into one array)
#   images       -- images resized to the model input size, RGB, float32 in [0, 1]
#   images_orig  -- untouched copies of the images as read by cv2.imread
#   data         -- ground-truth boxes; left empty here (see commented lines)
#   preds        -- raw model output, one entry per sampled image
from skimage import io, color, exposure, transform

inputs = []
images = []
images_orig = []
data = []

gtu = gt_util_test

# np.random.seed(1337)  # uncomment to draw the same sample on every run
for i in np.random.randint(0, gtu.num_samples, 10):

    img_path = os.path.join(gtu.image_path, gtu.image_names[i])
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread signals an unreadable/missing file by returning None.
        raise FileNotFoundError('could not read image: %s' % img_path)

#     hsv = color.rgb2hsv(img) # convert to hsv color space
#     hsv[:, :, 2] = exposure.equalize_hist(hsv[:, :, 2]) #
#     img = color.hsv2rgb(hsv)

    images_orig.append(np.copy(img))
    inputs.append(preprocess(img, image_size))
    h, w = image_size
    # The interpolation flag must be passed by keyword: the third positional
    # parameter of cv2.resize is `dst`, not `interpolation`.
    img = cv2.resize(img, (w, h), interpolation=cv2.INTER_LINEAR).astype('float32')
    img = img[:, :, (2,1,0)] # BGR to RGB
    img /= 255
    images.append(img)
#     boxes = gtu.data[i]
#     data.append(boxes)

inputs = np.asarray(inputs)
preds = model.predict(inputs, batch_size=1, verbose=1)

### Grid Search to Find Optimal Parameters

In [None]:
# steps_seg = np.arange(0.1, 1, 0.1)
# steps_lnk = np.arange(0.1, 1, 0.1)

# fmes_grid = np.zeros((len(steps_seg),len(steps_lnk)))

# '''

# def dfs(node, group_id):
#     if ids[node] == None:
#         ids[node] = group_id
#         for a in adjacency[node]:
#             dfs(a, group_id)
            
# '''
# ordered = []
# for i, segment_threshold in enumerate(steps_seg):
#     for j, link_threshold in enumerate(steps_lnk):
#         results = [prior_util.decode(p, segment_threshold=segment_threshold, link_threshold=link_threshold) for p in preds]
#         TP, FP, FN = evaluate_results(data, results)
#         recall = TP / (TP+FN)
#         precision = TP / (TP+FP)
#         fmes = fscore(precision, recall)
#         fmes_grid[i,j] = fmes
#         ordered.append([fmes, segment_threshold, link_threshold])
# ordered.sort(reverse=True)
# for i in range(5):
#     print('Segment Threshold: %.2f   Link Threshold: %.2f   F-Score: %.2f' % (ordered[i][1], ordered[i][2], ordered[i][0]))

### Recognition

In [None]:
# for k in range(len(preds)):
#     plt.figure(figsize=[12]*2)
#     plt.imshow(images[k])
#     res = prior_util.decode(preds[k], segment_threshold=0.35, link_threshold=0.25)
#     rboxes = res
#     if len(rboxes) == 0:
#         plt.show()
#         continue
       
#     # ADD A SMALL PADDING TO THE DETECTED BOXES BEFORE RECOGNITION
#     bh = rboxes[:,3]
#     rboxes[:,2] += bh * 0.2
#     rboxes[:,3] += bh * 0.2
    
#     boxes = []
#     for r in rboxes:
#         ds = r[:5]
#         r2p = rbox_to_polygon(ds)
#         boxes.append(r2p)
#     boxes = np.asarray(boxes)
#     boxes = np.flip(boxes, axis=1) 
#     boxes = np.reshape(boxes, (-1, 8))
    
#     # if height is greater than width?
#     boxes_mask_a = np.array([b[2] > b[3] for b in rboxes]) 
#     # if box falls outside frame?
#     boxes_mask_b = np.array([not (np.any(b < 0) or np.any(b > image_size[0])) for b in boxes]) 
#     # merge masks
#     boxes_mask = np.logical_and(boxes_mask_a, boxes_mask_b)
    
#     boxes = boxes[boxes_mask]
#     rboxes = rboxes[boxes_mask]
#     if len(boxes) == 0:
#         boxes = np.empty((0,8))
    
#     # PLOT BOUNDING BOXES
#     for box in boxes:
#         plot_box(box, 'polygon', linewidth=5)
    
#     words = crop_words(img, boxes/image_size[0], input_height, width=input_width, grayscale=True)
#     words = np.asarray([w.transpose(1,0,2) for w in words])

#     if len(words) > 0:
#         res_crnn = crnn_model.predict(words)
#     else:
#         res_crnn = []

#     for i in range(len(res_crnn)):
#         chars = [alphabet[c] for c in np.argmax(res_crnn[i], axis=1)]
        
#         if False:
#             img = words[i][:,:,0].T
#             plt.figure(figsize=[30,0.5])
#             plt.imshow(img, cmap='gray')
#             ax = plt.gca()
#             ax.get_xaxis().set_visible(False)
#             ax.get_yaxis().set_visible(False)
#             plt.show()
        
#         #gt_str = texts[i]
#         res_str = decode(chars)
        
#         #ed = editdistance.eval(gt_str, res_str)
#         #ed = levenshtein(gt_str, res_str)
#         #ed_norm = ed / len(gt_str)
#         #mean_ed += ed
#         #mean_ed_norm += ed_norm
        
#         print('%-20s %s' % (res_str, ''.join(chars)))
#         x, y, w, h, theta = rboxes[i][:5]
#         plt.text(x+h*np.sin(theta)/2, 
#                  y+h*np.cos(theta)/2, 
#                  res_str, 
#                  rotation=theta/np.pi*180, 
#                  horizontalalignment='center',
#                  verticalalignment='center',
#                  fontsize=40, 
#                  color='r',
#                  alpha=1) 
        
#     plt.show()

In [None]:
# NOTE(review): crop_words, decode, rbox_to_polygon, polygon_to_rbox, plot_box
# and escape_latex were already imported from the CRNN.* / SegLink.* packages in
# the first cell. These lines rebind them from top-level modules of the same
# name -- confirm those modules exist on the path, otherwise drop these imports.
from crnn_data import crop_words
from crnn_utils import decode
from sl_utils import rbox_to_polygon, polygon_to_rbox
from ssd_viz import plot_box, escape_latex
# Prefix used when building the output file name for saved plots.
plot_name = 'sl512_crnn_sythtext'
import re



def most_common(lst):
    """Return the element that occurs most often in ``lst``.

    Elements must be hashable. When several elements share the highest count,
    the one that appears first in ``lst`` is returned, so the result is
    deterministic. Any iterable is accepted, not only lists.

    Raises:
        ValueError: if ``lst`` is empty.
    """
    from collections import Counter  # local import keeps the cell self-contained

    counts = Counter(lst)  # single pass instead of one lst.count() per element
    if not counts:
        raise ValueError('most_common() arg is an empty sequence')
    return counts.most_common(1)[0][0]

# End-to-end demo: for every sampled image decode the SegLink output into
# rotated boxes, crop the words from the original image, read them with the
# CRNN and draw the recognised strings on top of the resized image.
for k in range(len(preds)):
    plt.figure(figsize=[12]*2)
    plt.imshow(images[k])
    res = prior_util.decode(preds[k], segment_threshold=0.8, link_threshold=0.5)

    img = images_orig[k]
    rboxes = res[:,:5]
    if len(rboxes) == 0:
        plt.axis('off')
        plt.show()
        continue

    # ADD A SMALL PADDING TO THE DETECTED BOXES BEFORE RECOGNITION
    bh = rboxes[:,3]
    rboxes[:,2] += bh * 0.1
    rboxes[:,3] += bh * 0.2

    boxes = np.asarray([rbox_to_polygon(r) for r in rboxes])
    boxes = np.flip(boxes, axis=1) # TODO: fix order of points, why?
    boxes = np.reshape(boxes, (-1, 8))

    # Boxes live in model-input pixel coordinates, so bounds and normalisation
    # come from image_size (height, width) rather than a hard-coded 512.
    frame_h, frame_w = image_size[0], image_size[1]
    xs, ys = boxes[:, 0::2], boxes[:, 1::2]  # flattened corners are x0, y0, x1, y1, ...

    boxes_mask_a = rboxes[:, 2] > rboxes[:, 3] # width > height, in square world
    outside = (xs < 0) | (xs > frame_w) | (ys < 0) | (ys > frame_h)
    boxes_mask_b = ~outside.any(axis=1) # box inside image
    boxes_mask = np.logical_and(boxes_mask_a, boxes_mask_b)

    boxes = boxes[boxes_mask]
    rboxes = rboxes[boxes_mask]
    if len(boxes) == 0:
        boxes = np.empty((0,8))

    # crop_words receives corner coordinates relative to the image size:
    # x is divided by the width and y by the height.
    boxes = boxes / np.tile([frame_w, frame_h], 4)
    words = crop_words(img, boxes, input_height, width=input_width, grayscale=True)
    words = np.asarray([w.transpose(1,0,2) for w in words])
    if len(words) > 0:
        res_crnn = crnn_model.predict(words)
    else:
        res_crnn = []

    for i in range(len(res_crnn)):
        # Greedy decoding: best class per time step, then collapse via decode().
        chars = [alphabet[c] for c in np.argmax(res_crnn[i], axis=1)]
        res_str = decode(chars)

        # TODO: optional language-model post-correction of res_str (candidate
        # generation with LM.clean plus a most_common() vote) was prototyped
        # here and is currently disabled.

        x, y, w, h, theta = rboxes[i]
        plt.text(x+h*np.sin(theta)/2, y+h*np.cos(theta)/2, escape_latex(res_str), rotation=theta/np.pi*180, 
                 horizontalalignment='center', color='yellow', fontsize=30) # magenta, lime

    plt.axis('off')

    file_name = 'plots/%s_endtoend_realworld_%03i.pgf' % (plot_name, k)
    #plt.savefig(file_name, bbox_inches='tight')

    plt.show()

# In Video Context

### Load Videos

In [None]:
def load_video(rootpath, filetype):
    """Open every ``*.<filetype>`` video whose path starts with ``rootpath``.

    Args:
        rootpath: path prefix that is concatenated with ``"*." + filetype`` to
            form the glob pattern (pass a directory with a trailing separator).
        filetype: file extension without the dot, e.g. ``"mp4"``.

    Returns:
        ``(fnames, vids)``: two tuples of equal length sorted by file name.
        ``fnames`` holds the file names without directory and extension,
        ``vids`` the matching imageio ffmpeg readers. Both are empty tuples
        when nothing matches.
    """
    entries = []
    for filename in glob.glob(rootpath + "*." + filetype):
        # basename/splitext work with any path separator and keep dots that
        # are part of the name itself.
        stem = os.path.splitext(os.path.basename(filename))[0]
        entries.append((stem, imageio.get_reader(filename,  'ffmpeg')))
    if not entries:
        return (), ()
    # Sort on the name only so reader objects are never compared.
    entries.sort(key=lambda entry: entry[0])
    fnames, vids = zip(*entries)
    return fnames, vids

def load_video_gt(pattern="./data/fixed_video_gt/Fixed_*.txt"):
    """Parse per-frame ground-truth labels from the annotation text files.

    Each file is read line by line. A line equal to ``FRAME`` closes the
    current frame; every other non-blank line is a label whose first
    whitespace-separated token is split on commas, with the first eight fields
    converted to float (remaining fields stay strings).

    Args:
        pattern: glob pattern of the annotation files; files are processed in
            sorted path order.

    Returns:
        A list with one entry per file; each entry is a list of frames and each
        frame is a list of labels.
    """
    gt = []
    for path in sorted(glob.glob(pattern)):
        with open(path) as f:
            content = f.read().splitlines()
        video_gt = []
        frame_gt = []
        for line in content:
            if line.strip() == "FRAME":
                video_gt.append(frame_gt)
                frame_gt = []
                continue
            fields = line.split()
            if not fields:
                continue  # blank line: nothing to parse
            lbl = fields[0].split(",")
            for j in range(8):
                lbl[j] = float(lbl[j])
            frame_gt.append(lbl)
        if frame_gt:
            # Labels after the last FRAME marker would otherwise be lost.
            video_gt.append(frame_gt)
        gt.append(video_gt)
    return gt

# Open the training videos and parse their ground truth. Both loaders order
# their results by sorted path/name; NOTE(review): index i of `gt` is presumably
# meant to describe videos[i] -- confirm the two directories sort consistently.
video_names, videos = load_video("./data/icdar_2015_text_in_video/training/", "mp4")
gt = load_video_gt()

### Play Videos

In [None]:
# def show_video_with_gt(video_path, labels, show_text=False):
#     camera = cv2.VideoCapture(video_path)
#     fps = camera.get(cv2.CAP_PROP_FPS)
#     counter = 0
    
#     while True:
#         (grabbed, frame) = camera.read()
        
#         if not grabbed:
#             break

#         frame_labels = labels[counter]
#         for label in frame_labels:
#             if label[-1] == "##DONT#CARE##":
#                 continue 
#             vrx = np.array(label[:8], np.int32)
#             vrx = vrx.reshape((-1,1,2))
#             frame = cv2.polylines(frame, [vrx], True, (0,255,255),2)
            
#             if show_text:
#                 avg_x = int(round((label[0] + label[2] + label[4] + label[6]) / 4))
#                 half_width = int(round((max(label[0], label[2], label[4], label[6]) - min(label[0], label[2], label[4], label[6]))/2))
#                 avg_y = int(round((label[1] + label[3] + label[5] + label[7]) / 4))
#                 cv2.putText(frame, label[-1], (avg_x-half_width, avg_y+30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,0,255), 2)

#         cv2.imshow("Frame", frame)
#         cv2.waitKey(int(round(1000/fps)))
#         counter += 1
        
#     camera.release()
#     cv2.waitKey(0)
#     cv2.destroyAllWindows()
#     cv2.waitKey(1)
    
# video_index = 1
# video_base_path = "./data/icdar_2015_text_in_video/training/"
# video_format = ".mp4"
# video_path = video_base_path + video_names[video_index] + video_format
# show_video_with_gt(video_path, gt[video_index], True)

In [None]:
'''
    So instead of tracking the 4 coordinates of the bounding box, I should be tracking 'interesting' points on the text
    such as the corners. This should yield much better results.
'''

In [None]:
# Keyword arguments forwarded to cv2.calcOpticalFlowPyrLK by the tracking helpers.
lk_params = {
    'winSize': (10, 10),
    'maxLevel': 8,
    # termination criteria: stop on either the epsilon or the count condition
    'criteria': (cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, 100, 0.1),
}

def track_point(old_frame, new_frame, old_points):
    """Track ``old_points`` from ``old_frame`` into ``new_frame``.

    Runs cv2.calcOpticalFlowPyrLK with the module-level ``lk_params`` and
    returns ``(new_points, status)``; the error output is discarded.
    """
    tracked_points, tracked_status, _ = cv2.calcOpticalFlowPyrLK(
        old_frame, new_frame, old_points, None, **lk_params)
    return tracked_points, tracked_status

def draw_bounding_box(frame, box, color):
    """Draw ``box`` as a closed polygon outline on ``frame`` and return the frame.

    ``box`` is a flat coordinate sequence; it is cast to int32 and regrouped
    into coordinate pairs before being passed to cv2.polylines with line
    thickness 2.
    """
    polygon = np.array(box, np.int32).reshape((-1,1,2))
    return cv2.polylines(frame, [polygon], True, color, 2)

def draw_text(frame, text, box):
    """Write ``text`` on ``frame`` next to the 8-value corner list ``box``.

    The text origin is placed at the left edge of the box's horizontal extent
    (centre x minus half the extent) and 30 pixels below its centre y.
    """
    x0, y0, x1, y1, x2, y2, x3, y3 = (box[idx] for idx in range(8))
    center_x = int(round((x0 + x1 + x2 + x3) / 4))
    center_y = int(round((y0 + y1 + y2 + y3) / 4))
    half_extent = int(round((max(x0, x1, x2, x3) - min(x0, x1, x2, x3))/2))
    origin = (center_x-half_extent, center_y+30)
    cv2.putText(frame, text, origin, cv2.FONT_HERSHEY_SIMPLEX, 1, (0,0,255), 2)
    
def postprocess_detections(preds, segment_threshold=0.4, link_threshold=0.3):
    """Decode the first prediction into padded, filtered polygon boxes.

    Args:
        preds: batch of raw model outputs; only ``preds[0]`` is decoded.
        segment_threshold, link_threshold: forwarded to ``prior_util.decode``.

    Returns:
        Float array of shape ``(n, 8)`` holding four corners per box, flattened
        as x, y pairs. An array of shape ``(0, 8)`` is returned when nothing is
        detected or every box is filtered out, so callers always receive an
        ndarray.
    """
    res = prior_util.decode(preds[0], segment_threshold=segment_threshold, link_threshold=link_threshold, debug=False)
    rboxes = res
    if len(rboxes) == 0:
        return np.empty((0,8))

    # Pad width and height by 20% of the box height before recognition.
    bh = rboxes[:,3]
    rboxes[:,2] += bh * 0.2
    rboxes[:,3] += bh * 0.2

    boxes = np.asarray([rbox_to_polygon(r[:5]) for r in rboxes])
    boxes = np.flip(boxes, axis=1)
    boxes = np.reshape(boxes, (-1, 8))

    # image_size is (height, width): x is bounded by the width and y by the
    # height, the same convention shift_box uses.
    frame_h, frame_w = image_size[0], image_size[1]
    xs, ys = boxes[:, 0::2], boxes[:, 1::2]
    outside = (xs < 0) | (xs > frame_w) | (ys < 0) | (ys > frame_h)

    wider_than_tall = rboxes[:, 2] > rboxes[:, 3]
    keep = np.logical_and(wider_than_tall, ~outside.any(axis=1))
    return boxes[keep]

def recognize_detected_text(img, boxes):
    """Crop every box from ``img`` and return the CRNN reading for each crop.

    Args:
        img: image the words are cropped from.
        boxes: ``(n, 8)`` corner coordinates in model-input pixels (x, y pairs);
            a plain list, including an empty one, is accepted.

    Returns:
        List of decoded strings, one per cropped word; empty when there is
        nothing to recognise.
    """
    boxes = np.asarray(boxes, dtype=float).reshape(-1, 8)
    if len(boxes) == 0:
        return []

    # crop_words receives relative coordinates: x / width, y / height.
    scale = np.tile([image_size[1], image_size[0]], 4)
    words = crop_words(img, boxes/scale, input_height, width=input_width, grayscale=True)
    words = np.asarray([w.transpose(1,0,2) for w in words])
    if len(words) == 0:
        return []

    res_crnn = crnn_model.predict(words)
    recognized_text = []
    for word_pred in res_crnn:
        # Best class per time step; decode() turns the sequence into text.
        chars = [alphabet[c] for c in np.argmax(word_pred, axis=1)]
        recognized_text.append(decode(chars))
    return recognized_text

def shift_box(initial_points, new_points, status, box, frame_size=None):
    """Translate ``box`` by the robust mean displacement of tracked points.

    Args:
        initial_points: point positions in the previous frame, ``(n, 1, 2)``.
        new_points: the same points in the current frame, ``(n, 1, 2)``.
        status: per-point flag as returned by ``track_point``; 0 marks a point
            that could not be tracked.
        box: eight values ``x0, y0, ..., x3, y3``.
        frame_size: ``(height, width)`` used for the bounds check; defaults to
            the notebook-level ``image_size``.

    Returns:
        ``(shifted_box, broken)``. ``shifted_box`` is a list of eight values.
        ``broken`` is True when a shifted coordinate leaves the frame, or when
        no point was tracked at all (the box is then returned unshifted).
    """
    THRES = 1.1
    unshifted = [box[i] for i in range(8)]
    if new_points is None or status is None:
        return unshifted, True

    old = np.asarray(initial_points, dtype=np.float64).reshape(-1, 2)
    new = np.asarray(new_points, dtype=np.float64).reshape(-1, 2)
    tracked = np.asarray(status).reshape(-1) != 0
    count = min(len(old), len(new), len(tracked))

    # Per-point displacement of the successfully tracked points only.
    deltas = (new[:count] - old[:count])[tracked[:count]]
    if len(deltas) == 0:
        return unshifted, True

    # Points that moved more than THRES times the median displacement (per
    # axis) are treated as outliers and excluded from the mean.
    median_delta_x, median_delta_y = np.median(deltas, axis=0)
    inliers = ((np.abs(deltas[:, 0]) < abs(THRES * median_delta_x)) &
               (np.abs(deltas[:, 1]) < abs(THRES * median_delta_y)))
    if inliers.any():
        mean_delta_x, mean_delta_y = (float(v) for v in deltas[inliers].mean(axis=0))
    else:
        mean_delta_x = mean_delta_y = 0.0

    if frame_size is None:
        frame_size = image_size
    frame_h, frame_w = frame_size[0], frame_size[1]

    # Even indices are x coordinates, odd indices are y coordinates.
    shifted_box = [box[i] + (mean_delta_x if i % 2 == 0 else mean_delta_y) for i in range(8)]
    broken = any(value < 0 or value > (frame_w if i % 2 == 0 else frame_h)
                 for i, value in enumerate(shifted_box))
    return shifted_box, broken

def create_text_mask(frame, box):
    """Return a grayscale copy of ``frame`` that is black outside ``box``.

    The quadrilateral given by the eight values in ``box`` is filled with 255
    in every channel of a zero mask, the mask is AND-ed with ``frame``, and the
    result is converted from BGR to grayscale.
    """
    corner_pairs = [(box[2 * idx], box[2 * idx + 1]) for idx in range(4)]
    polygon = np.array([corner_pairs], dtype=np.int32)
    fill_value = (255,)*frame.shape[2]  # one 255 per channel

    keep_region = np.zeros(frame.shape, dtype=np.uint8)
    cv2.fillPoly(keep_region, polygon, fill_value)
    return cv2.cvtColor(cv2.bitwise_and(frame, keep_region), cv2.COLOR_BGR2GRAY)

def create_padded_text_mask(frame, box, padding=10):
    """Return a grayscale copy of ``frame`` that is black outside a padded rectangle.

    The rectangle is the axis-aligned bounding rectangle of ``box`` grown by
    ``padding`` on every side and clamped to ``[0, image_size[1]]`` in x and
    ``[0, image_size[0]]`` in y.
    """
    xs = [box[0], box[2], box[4], box[6]]
    ys = [box[1], box[3], box[5], box[7]]
    left, right = min(xs) - padding, max(xs) + padding
    top, bottom = min(ys) - padding, max(ys) + padding

    # Clamp the padded rectangle to the frame limits taken from image_size.
    left = 0 if left < 0 else left
    right = image_size[1] if right > image_size[1] else right
    top = 0 if top < 0 else top
    bottom = image_size[0] if bottom > image_size[0] else bottom

    rectangle = np.array([[(left, top), (right, top), (right, bottom), (left, bottom)]], dtype=np.int32)
    fill_value = (255,)*frame.shape[2]  # one 255 per channel

    keep_region = np.zeros(frame.shape, dtype=np.uint8)
    cv2.fillPoly(keep_region, rectangle, fill_value)
    return cv2.cvtColor(cv2.bitwise_and(frame, keep_region), cv2.COLOR_BGR2GRAY)

def generate_initial_tracking_points(frame, box):
    """Pick corner features inside ``box`` to seed optical-flow tracking.

    Up to 100 corners are searched in the grayscale image produced by
    ``create_text_mask``. Coordinates are truncated to whole pixels.

    Returns:
        float32 array of shape ``(n, 1, 2)``; an empty array of shape
        ``(1, 0)`` when fewer than 10 corners are found.
    """
    gray_masked_image = create_text_mask(frame, box)
    corners = cv2.goodFeaturesToTrack(gray_masked_image, 100, minDistance=1, qualityLevel=0.05)
    # goodFeaturesToTrack yields None when it finds no corner at all.
    if corners is None or len(corners) < 10:
        return np.array([[]], dtype=np.float32)
    # astype(np.intp) replaces np.int0, which was removed in NumPy 2.0.
    whole_pixels = corners.astype(np.intp)
    return whole_pixels.reshape((-1, 1, 2)).astype(np.float32)

def show_gt(frame, labels, counter, show_text=False):
    """Draw the ground-truth boxes of frame number ``counter`` onto ``frame``.

    Args:
        frame: image to draw on.
        labels: per-frame label lists; each label holds eight corner
            coordinates followed by further fields, the last being the text.
        counter: index of the frame within ``labels``. Nothing is drawn when
            it lies past the end of ``labels``.
        show_text: also write each label's text next to its box.
    """
    if counter >= len(labels):
        return  # no ground truth recorded for this frame

    for label in labels[counter]:
        if label[-1] == "##DONT#CARE##":
            continue

        # Reuse the shared drawing helpers instead of repeating their bodies.
        frame = draw_bounding_box(frame, label[:8], (0,255,255))
        if show_text:
            draw_text(frame, label[-1], label)

def recognize_and_display(img, boxes, frame, show_text=False):
    """Recognise the text inside ``boxes`` and optionally draw it on ``frame``.

    Recognition on ``img`` always runs; the strings are only drawn when
    ``show_text`` is true.
    """
    texts = recognize_detected_text(img, boxes)
    if not show_text:
        return
    for idx, text in enumerate(texts):
        draw_text(frame, text, boxes[idx])

def get_orb_features(frame, box, old_frame, new_frame):
    """Match ORB features of the text region between two frames.

    Keypoints are detected in ``new_frame`` under the padded box mask and in
    ``old_frame`` under the tight box mask (both masks are built from
    ``frame``). Descriptors are matched with a cross-checked Hamming
    brute-force matcher and the 10 matches with the smallest distance are kept.

    Returns:
        ``(new_points, old_points)``: two float32 arrays of shape
        ``(10, 1, 2)``. ``([], [])`` is returned when either image yields no
        descriptors or fewer than 10 matches exist.
    """
    orb = cv2.ORB_create()
    old_mask = create_text_mask(frame, box)
    new_mask = create_padded_text_mask(frame, box, padding=10)

    kp1, des1 = orb.detectAndCompute(new_frame, new_mask)
    kp2, des2 = orb.detectAndCompute(old_frame, old_mask)
    # Without keypoints the descriptors are None and BFMatcher.match would raise.
    if des1 is None or des2 is None:
        return [], []

    bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
    matches = bf.match(des1,des2)
    if len(matches) < 10:
        return [], []
    best_matches = sorted(matches, key=lambda match: match.distance)[:10]

    # queryIdx indexes kp1 (new frame); trainIdx indexes kp2 (old frame).
    list_kp1 = np.array([kp1[match.queryIdx].pt for match in best_matches], dtype=np.float32)
    list_kp2 = np.array([kp2[match.trainIdx].pt for match in best_matches], dtype=np.float32)
    return list_kp1.reshape((len(list_kp1),1,2)), list_kp2.reshape((len(list_kp2),1,2))

def track_quadrant(previous_color_frame, previous_gray_frame, current_color_frame, current_gray_frame, quadx, quady, quad_size=50):
    """Estimate the mean motion of one square patch between two gray frames.

    Corners are detected inside the ``quad_size`` x ``quad_size`` patch whose
    top-left corner is ``(quadx, quady)`` in ``previous_gray_frame`` and are
    tracked into ``current_gray_frame`` with pyramidal Lucas-Kanade flow.
    Points moving more than 1.1 times the median displacement (per axis) are
    ignored when averaging.

    The two colour-frame parameters are accepted but not used.

    Returns:
        ``(mean_delta_x, mean_delta_y)``, or the sentinel ``(999, 999)`` when
        fewer than 10 corners are found, no point can be tracked, or any step
        raises.
    """
    FAILED = (999, 999)
    THRES = 1.1
    try:
        patch = previous_gray_frame[quady:quady+quad_size, quadx:quadx+quad_size]
        corners = cv2.goodFeaturesToTrack(patch, 100, minDistance=1, qualityLevel=0.05)
        # goodFeaturesToTrack yields None when it finds no corner at all.
        if corners is None or len(corners) < 10:
            return FAILED

        # Whole-pixel coordinates (astype(np.intp) replaces np.int0, which was
        # removed in NumPy 2.0), back to float32 for the tracker.
        initial_points = corners.astype(np.intp).reshape((-1, 1, 2)).astype(np.float32)
        # Corners are relative to the patch; the flow runs on the full frames,
        # so move them into full-frame coordinates first.
        initial_points += np.array([quadx, quady], dtype=np.float32)

        d_new_points, d_status, d_error = cv2.calcOpticalFlowPyrLK(
            previous_gray_frame, current_gray_frame, initial_points, None, **lk_params)
        if d_new_points is None or d_status is None:
            return FAILED

        tracked = np.asarray(d_status).reshape(-1) != 0
        deltas = (d_new_points.reshape(-1, 2).astype(np.float64)
                  - initial_points.reshape(-1, 2).astype(np.float64))[tracked]
        if len(deltas) == 0:
            return FAILED

        # Median of the per-point displacements, then mean over the inliers.
        median_delta_x, median_delta_y = np.median(deltas, axis=0)
        inliers = ((np.abs(deltas[:, 0]) < abs(THRES * median_delta_x)) &
                   (np.abs(deltas[:, 1]) < abs(THRES * median_delta_y)))
        if not inliers.any():
            return 0.0, 0.0
        mean_delta_x, mean_delta_y = deltas[inliers].mean(axis=0)
        return float(mean_delta_x), float(mean_delta_y)
    except Exception:
        # Best effort: a failing patch reports the sentinel instead of
        # aborting the video loop; KeyboardInterrupt still propagates.
        return FAILED
        
# Value that track_quadrant (presumably the function defined just above) hands
# back for both components when it cannot track a quadrant.
_QUADRANT_TRACK_FAILED = 999


def _axis_motion(deltas, threshold):
    """Classify per-quadrant displacements along a single axis.

    Returns a ``(sign, strong, mean)`` tuple:
      * ``sign`` is +1 when every quadrant moved in the positive direction,
        -1 when every quadrant moved in the negative direction, 0 otherwise.
      * ``strong`` is True when the quadrants agree on the direction and the
        magnitude of the mean displacement exceeds ``threshold``.
      * ``mean`` is the arithmetic mean of ``deltas``.
    """
    mean = sum(deltas) / len(deltas)
    if all(delta > 0 for delta in deltas):
        return 1, mean > threshold, mean
    if all(delta < 0 for delta in deltas):
        return -1, mean < -threshold, mean
    return 0, False, mean


def _dominant_motion(dxs, dys, threshold=10):
    """Return ``(direction, mean_shift)`` for a clear single-axis motion, else None.

    ``direction`` is one of "RIGHT", "LEFT", "DOWN", "UP". None is returned when
    any quadrant reported the failure sentinel, when the motion is not strong
    on exactly one axis, or when the quadrants disagree.
    """
    if any(value == _QUADRANT_TRACK_FAILED for value in list(dxs) + list(dys)):
        return None
    x_sign, x_strong, mean_x = _axis_motion(dxs, threshold)
    y_sign, y_strong, mean_y = _axis_motion(dys, threshold)
    if x_strong and not y_strong:
        return ("RIGHT" if x_sign > 0 else "LEFT"), mean_x
    if y_strong and not x_strong:
        return ("DOWN" if y_sign > 0 else "UP"), mean_y
    return None


def _crop_for_motion(frame, direction, shift, flow_size=150, step=128):
    """Crop ``frame`` according to a motion direction and its mean shift.

    ``shift`` is measured on the ``flow_size``-pixel frames used for the
    quadrant flow; it is scaled to the size of ``frame`` and snapped to the
    nearest multiple of ``step`` before slicing.
    """
    height, width = frame.shape[:2]
    extent = width if direction in ("RIGHT", "LEFT") else height
    section = int(step * round(abs(float(shift)) / flow_size * extent / step))
    if direction == "RIGHT":
        return frame[:, section:]
    if direction == "LEFT":
        return frame[:, :extent - section]
    if direction == "DOWN":
        return frame[section:, :]
    return frame[:extent - section, :]


def show_video_with_preds(video_path, labels, show_text=False):
    """Run detect-then-track text spotting over every frame of a video.

    For each frame a coarse 3x3 quadrant optical flow is computed on 150x150
    copies of the previous and current frame. The text detector (``model``) is
    run whenever more than three frames have passed since the last frame with
    detections; on the frames in between, the previously found boxes are
    shifted with point tracking instead.

    Relies on notebook-level names: ``image_size``, ``model``, ``preprocess``,
    ``postprocess_detections``, ``draw_bounding_box``, ``recognize_and_display``,
    ``generate_initial_tracking_points``, ``track_point``, ``shift_box``,
    ``track_quadrant`` and ``img``.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        labels: Not used by this function; kept for call compatibility.
        show_text: Forwarded to ``recognize_and_display``.

    Raises:
        IOError: If not even the first frame can be read from ``video_path``.
    """
    camera = cv2.VideoCapture(video_path)
    try:
        grabbed, frame = camera.read()
        if not grabbed:
            raise IOError("could not read a frame from video: " + str(video_path))

        # Same dsize for every frame so consecutive frames always match.
        frame_dsize = (image_size[1], image_size[0])
        flow_dsize = (150, 150)
        old_color_frame = cv2.resize(frame, frame_dsize)
        old_gray_frame = cv2.cvtColor(old_color_frame, cv2.COLOR_BGR2GRAY)
        frame_number = 1
        # Start far enough in the past that the first frame triggers a detection.
        last_detection_frame = -4
        old_boxes = []

        while True:
            grabbed, frame = camera.read()
            if not grabbed:
                break

            current_color_frame = cv2.resize(frame, frame_dsize)
            current_color_frame_copy = current_color_frame.copy()
            current_gray_frame = cv2.cvtColor(current_color_frame, cv2.COLOR_BGR2GRAY)
            frames_since_detection = frame_number - last_detection_frame

            # --- coarse global motion estimate on down-scaled frames ---
            start = time.time()
            prev_gray_small = cv2.resize(old_gray_frame, flow_dsize)
            cur_gray_small = cv2.resize(current_gray_frame, flow_dsize)
            prev_color_small = cv2.resize(old_color_frame, flow_dsize)
            cur_color_small = cv2.resize(current_color_frame, flow_dsize)

            # Nine 50x50 quadrants, visited row by row.
            quadrant_deltas = [
                track_quadrant(prev_color_small, prev_gray_small,
                               cur_color_small, cur_gray_small, offset_a, offset_b)
                for offset_b in (0, 50, 100)
                for offset_a in (0, 50, 100)
            ]
            dxs = [delta[0] for delta in quadrant_deltas]
            dys = [delta[1] for delta in quadrant_deltas]

            # TODO (from the original notes): accumulate the total change over time.
            motion = _dominant_motion(dxs, dys, threshold=10)
            if motion is not None:
                direction, mean_shift = motion
                print(direction)
                # NOTE: the cropped region is not consumed anywhere yet.
                cropped_image = _crop_for_motion(current_color_frame, direction, mean_shift)

            # Ideas from the original notes, not implemented:
            #   all quadrants moving inwards  -> no need to detect anything
            #   all quadrants moving outwards -> no need to detect the centre
            #   no motion                     -> no need to detect

            print("took:", time.time()-start)

            if frames_since_detection > 3:
                # --- full detection pass ---
                inputs = np.asarray([preprocess(current_color_frame, image_size)])
                preds = model.predict(inputs, batch_size=1, verbose=1)
                boxes = postprocess_detections(preds, segment_threshold=0.6, link_threshold=0.2)
                if len(boxes) > 0:
                    last_detection_frame = frame_number
                    for box in boxes:
                        current_color_frame = draw_bounding_box(current_color_frame, box, (0,0,255))
                        current_color_frame_copy = draw_bounding_box(current_color_frame_copy, box, (255,0,255))
                    old_boxes = boxes
                    # NOTE(review): `img` is not defined in this function, so it is
                    # resolved as a notebook global -- confirm it is the intended image.
                    recognize_and_display(img, boxes, current_color_frame, show_text=show_text)

            elif len(old_boxes) > 0:
                # --- tracking pass: shift the previous boxes ---
                tracked_boxes = []
                for box in old_boxes:
                    try:
                        initial_points = generate_initial_tracking_points(old_color_frame, box)
                        if len(initial_points) == 0:
                            continue
                        for pt in initial_points:
                            # cv2.circle needs integer pixel coordinates.
                            cv2.circle(old_color_frame, (int(pt[0][0]), int(pt[0][1])), 3, (255,255,0), -1)
                        new_points, status = track_point(old_gray_frame, current_gray_frame, initial_points)
                        for i in range(len(new_points)):
                            if status[i] == 1:
                                cv2.circle(current_color_frame,
                                           (int(new_points[i][0][0]), int(new_points[i][0][1])),
                                           3, (0,255,255), -1)
                        shifted_box, broken = shift_box(initial_points, new_points, status, box)
                        if broken:
                            continue
                        current_color_frame = draw_bounding_box(current_color_frame, shifted_box, (0,255,0))
                        tracked_boxes.append(shifted_box)
                        # Rebinding the name does not disturb the running iteration.
                        old_boxes = np.array(tracked_boxes)
                        # NOTE(review): `boxes` still holds the boxes of the last detection
                        # pass, not the shifted ones, and `img` is a notebook global -- confirm.
                        recognize_and_display(img, boxes, current_color_frame, show_text=show_text)
                    except Exception as exc:
                        # Best effort: give up tracking and force a detection on the next frame.
                        print("[warn] tracking failed, forcing re-detection:", exc)
                        last_detection_frame = frame_number - 5
                        break

#             cv2.imshow("Current Frame", current_color_frame)
#             cv2.imshow("Previous Frame", old_color_frame)
            old_color_frame = current_color_frame_copy.copy()
            old_gray_frame = current_gray_frame.copy()
            cv2.waitKey(10)
            frame_number += 1
    finally:
        # Always give the capture handle back, even when a frame fails.
        camera.release()

    cv2.waitKey(0)
    cv2.destroyAllWindows()
    cv2.waitKey(1)

# Demo run: play one training clip of the ICDAR 2015 "Text in Video" set
# through the detect-then-track pipeline, without drawing recognised text.
video_index = 14
video_path = os.path.join(
    "./data/icdar_2015_text_in_video/training/",
    video_names[video_index] + ".mp4",
)
show_video_with_preds(video_path, gt[video_index], show_text=False)

In [None]:
#             if len(old_boxes1) > 0:
#                 all_boxes1 = []
#                 for box in old_boxes1:
#                     start = time.time()
#                     try:
#                         list_kp1, list_kp2 = get_orb_features(frame, box, old_frame, new_frame)
#                         if len(list_kp1) == 0:
#                             LAST_FRAME_NUMBER_WITH_DETECTION = -5
#                             continue
#                         dummy_status = [1] * len(list_kp1)
#     #                     initial_points = generate_initial_tracking_points(frame, box)
#     #                     new_points, status = track_point(old_frame, new_frame, initial_points)
#     #                     shifted_box = shift_box(initial_points, new_points, status, box)
#     #                     frame = draw_bounding_box(frame, shifted_box, True)
#                         shifted_box1, broken = shift_box(list_kp2, list_kp1, dummy_status, box)
#                         if broken:
#                             continue
#                         frame = draw_bounding_box(frame, shifted_box1, (255,0,0))
#                         all_boxes1.append(shifted_box1)
#                         old_boxes1 = np.array(all_boxes1)
#                         recognize_and_display(img, boxes, frame, show_text=show_text)
#                     except:
#                         print("HIT ERROR")
#                         LAST_FRAME_NUMBER_WITH_DETECTION = COUNTER -5
#                         break
#                     end = time.time()
#                     print("[info] orb took:", end-start)

In [None]:
#                     for p in range(len(new_points)):
#                         x, y = new_points[p].ravel()
#                         a, b = initial_points[p].ravel()
#                         if status[p][0] == 0:
#                             continue
#                         cv2.circle(frame, (x, y), 3, (255, 255, 0), -1)
#                         cv2.circle(frame, (a, b), 3, (0, 255, 255), -1)

In [None]:
# # Create old frame
# _, frame = camera.read()
# old_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
 
# # Lucas kanade params
# lk_params = dict(winSize = (15, 15), maxLevel = 4, criteria = (cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, 10, 0.03))
 
# # Mouse function
# def select_point(event, x, y, flags, params):
#     global point, point_selected, old_points
#     if event == cv2.EVENT_LBUTTONDOWN:
#         point = (x, y)
#         point_selected = True
#         old_points = np.array([[x, y]], dtype=np.float32)
 
# cv2.namedWindow("Frame")
# cv2.setMouseCallback("Frame", select_point)
 
# point_selected = False
# point = ()
# old_points = np.array([[]])
# while True:
#     _, frame = camera.read()
#     gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
 
#     if point_selected is True:
#         cv2.circle(frame, point, 5, (0, 0, 255), 2)
 
#         new_points, status, error = cv2.calcOpticalFlowPyrLK(old_frame, gray_frame, old_points, None, **lk_params)
#         old_frame = gray_frame.copy()
#         old_points = new_points
 
#         x, y = new_points.ravel()
#         cv2.circle(frame, (x, y), 5, (0, 255, 0), -1)
 
 
 
#     cv2.imshow("Frame", frame)
 
#     key = cv2.waitKey(1)
#     if key == 27:
#         break
 
# cap.release()
# cv2.destroyAllWindows()

In [None]:
# lk_params = dict(winSize = (15, 15), 
#                  maxLevel = 2, 
#                  criteria = (cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, 10, 0.03))

# def track_point(old_frame, new_frame, old_points):
#     new_points, status, error = cv2.calcOpticalFlowPyrLK(old_frame, new_frame, old_points, None, **lk_params)
#     old_frame = new_frame.copy()
#     old_points = new_points
#     x, y = new_points.ravel()
#     return x, y, old_frame

# def draw_bounding_box(frame, box):
#     vrx = np.array(box, np.int32)
#     vrx = vrx.reshape((-1,1,2))
#     frame = cv2.polylines(frame, [vrx], True, (0,255,255),2)
#     return frame

# def draw_text(frame, text, box):
#     avg_x = int(round((box[0] + box[2] + box[4] + box[6]) / 4))
#     half_width = int(round((max(box[0], box[2], box[4], box[6]) - min(box[0], box[2], box[4], box[6]))/2))
#     avg_y = int(round((box[1] + box[3] + box[5] + box[7]) / 4))
#     cv2.putText(frame, text, (avg_x-half_width, avg_y+30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0,0,255), 2)
    
# def postprocess_detections(preds, segment_threshold=0.5, link_threshold=0.5):
#     res = prior_util.decode(preds[0], segment_threshold=0.7, link_threshold=0.6, debug=False)
#     rboxes = res
#     if len(rboxes) == 0:
#         return []

#     bh = rboxes[:,3]
#     rboxes[:,2] += bh * 0.2
#     rboxes[:,3] += bh * 0.2

#     boxes = []
#     for r in rboxes:
#         ds = r[:5]
#         r2p = rbox_to_polygon(ds)
#         boxes.append(r2p)

#     boxes = np.asarray(boxes)
#     boxes = np.flip(boxes, axis=1) 
#     boxes = np.reshape(boxes, (-1, 8))

#     boxes_mask_a = np.array([b[2] > b[3] for b in rboxes]) 
#     boxes_mask_b = np.array([not (np.any(b < 0) or np.any(b > 512)) for b in boxes]) 
#     boxes_mask = np.logical_and(boxes_mask_a, boxes_mask_b)

#     boxes = boxes[boxes_mask]
#     rboxes = rboxes[boxes_mask]
#     if len(boxes) == 0:
#         boxes = np.empty((0,8))
#     return boxes



# def show_video_with_preds(video_path, labels, show_text=False, track=False):
    
#     camera = cv2.VideoCapture(video_path)
#     old_points = np.array([[]])
#     _, frame = camera.read()
#     frame = cv2.resize(frame, (300, 300))
#     old_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
#     fps = camera.get(cv2.CAP_PROP_FPS)
#     counter = 0
#     detected_on_frame = 0
#     tracked_on_frame = 0
    
#     while True:
#         (grabbed, frame) = camera.read()
        
#         if not grabbed:
#             break

#         frame = cv2.resize(frame, (300, 300))
#         new_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        
#         inputs = []
#         inputs.append(preprocess(frame, image_size))
#         inputs = np.asarray(inputs)
#         preds = model.predict(inputs, batch_size=1, verbose=1)

#         boxes = postprocess_detections(preds, segment_threshold=0.5, link_threshold=0.5)
#         for box in boxes:
#             print(box)
#         print("-"*100)
        
#         for i in range(len(preds)):
#             res = prior_util.decode(preds[i], segment_threshold=0.7, link_threshold=0.6, debug=False)
#             rboxes = res
#             if len(rboxes) == 0:
#                 continue

#             bh = rboxes[:,3]
#             rboxes[:,2] += bh * 0.2
#             rboxes[:,3] += bh * 0.2

#             boxes = []
#             for r in rboxes:
#                 ds = r[:5]
#                 r2p = rbox_to_polygon(ds)
#                 boxes.append(r2p)
                
#             boxes = np.asarray(boxes)
#             boxes = np.flip(boxes, axis=1) 
#             boxes = np.reshape(boxes, (-1, 8))

#             boxes_mask_a = np.array([b[2] > b[3] for b in rboxes]) 
#             boxes_mask_b = np.array([not (np.any(b < 0) or np.any(b > 512)) for b in boxes]) 
#             boxes_mask = np.logical_and(boxes_mask_a, boxes_mask_b)

#             boxes = boxes[boxes_mask]
#             rboxes = rboxes[boxes_mask]
#             if len(boxes) == 0:
#                 boxes = np.empty((0,8))

#             for box in boxes:
#                 if track:
#                     center_x = round((box[0] + box[2] + box[4] + box[6]) / 4.0)
#                     center_y = round((box[1] + box[3] + box[5] + box[7]) / 4.0)
#                     old_points = np.array([[center_x, center_y]], dtype=np.float32)
#                     x, y, old_frame = track_point(old_frame, new_frame, old_points)
#                     cv2.circle(frame, (x, y), 5, (0, 255, 0), -1)

#                 frame = draw_bounding_box(frame, box)
                
#             words = crop_words(img, boxes/300, input_height, width=input_width, grayscale=True)
#             words = np.asarray([w.transpose(1,0,2) for w in words])

#             if len(words) > 0:
#                 res_crnn = crnn_model.predict(words)
#             else:
#                 res_crnn = []

#             if show_text:
#                 for i in range(len(res_crnn)):
#                     chars = [alphabet[c] for c in np.argmax(res_crnn[i], axis=1)]
#                     res_str = decode(chars)
#                     draw_text(frame, res_str, boxes[i])
                    
#         cv2.imshow("Frame", frame)
#         cv2.waitKey(1)
#         counter += 1
        
#     camera.release()
#     cv2.waitKey(0)
#     cv2.destroyAllWindows()
#     cv2.waitKey(1)

# video_index = 14
# video_base_path = "./data/icdar_2015_text_in_video/training/"
# video_format = ".mp4"
# video_path = video_base_path + video_names[video_index] + video_format
# show_video_with_preds(video_path, gt[video_index], True, False)