In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [2]:
"""
Main script for FCNT tracker. 
"""

# Import custom class and functions
from inputproducer import LiveInput
from tracker import TrackerContour
from vgg16 import Vgg16
from sgnet import GNet, SNet
from utils import img_with_bbox, IOU_eval, select_fms

import cv2
import numpy as np 
import tensorflow as tf
import matplotlib.pylab as plt

from scipy.misc import imresize
from subprocess import call
import sys
import os
import time

tf.app.flags.DEFINE_integer('iter_epoch_sg', 5,
                          """Number of epoches for trainning"""
                          """SGnet works""")
tf.app.flags.DEFINE_integer('batch_size', 20,
                          """Batch size for SGNet trainning"""
                          """SGnet works""")
tf.app.flags.DEFINE_integer('n_samples_per_batch', 5000,
                          """Number of samples per batch for trainning"""
                          """SGnet works""")
tf.app.flags.DEFINE_integer('iter_max', 1349,
							"""Max iter times through imgs""")
tf.app.flags.DEFINE_integer('sel_num', 354,
                          """Number of feature maps selected.""")
tf.app.flags.DEFINE_string('model_name', 'first_LiveFaceXL',
						"""true for train, false for eval""")
FLAGS = tf.app.flags.FLAGS

## Define varies pathes
DATA_ROOT = 'data/Car1'
PRE_ROOT = os.path.join(DATA_ROOT, 'img_loc')
IMG_PATH = os.path.join(DATA_ROOT, 'img')
GT_PATH = os.path.join(DATA_ROOT, 'groundtruth_rect.txt')
VGG_WEIGHTS_PATH = 'vgg16_weights.npz'


#if not os.path.isdir(PRE_ROOT):
#    os.mkdir(PRE_ROOT)


TB_SUMMARY = os.path.join('tb_summary', FLAGS.model_name)
if not os.path.isdir('tb_summary'):
    os.mkdir('tb_summary')
if not os.path.isdir(TB_SUMMARY):
    os.mkdir(TB_SUMMARY)

CKPT_PATH = 'checkpoint'
if not os.path.isdir(CKPT_PATH):
    os.mkdir(CKPT_PATH)

model_name = FLAGS.model_name+'.ckpt'
CKPT_MODEL = os.path.join(CKPT_PATH, model_name)


def init_vgg(sess, roi_t0=None, predict=True):
    """
    Initialize a tf.Session and a vgg16 graph. Followed
    by forwarding the vgg net once to predict top5 class labels
    for image generated in the first frame.

    Args:
        roi_t0: np.ndarray with shape (28x28x3), extracted roi in the first frame.
    Returns:
        sess: tf.Session object.
        vgg: Vgg16 class instance.
    """
    print('Classify it with a pre-trained Vgg16 model.')
    t_start = time.time()

    #sess.run(tf.initialize_all_variables())
    vgg = Vgg16(VGG_WEIGHTS_PATH, sess)
    if predict and roi_t0 is not None:
        vgg.print_prob(roi_t0, sess)
    print('Forwarding the vgg net cost : %.2f s'%(time.time() - t_start))
    return vgg

def gen_sel_maps(sess, roi, vgg, idx_c4, idx_c5):
    """Returns selected c4 and c5 maps"""
    if len(roi.shape) == 3: roi = [roi]
    fd = {vgg.imgs : roi}
    c4_arr, c5_arr = sess.run([vgg.conv4_3_norm, vgg.conv5_3_norm], feed_dict=fd)
    c4_maps = c4_arr[...,idx_c4]
    c5_maps = c5_arr[...,idx_c5]
    return c4_maps, c5_maps


def train_SGNets(sess, img, gt, vgg, snet, gnet, inputProducer, idx_c4, idx_c5):
    """
    Train SGnets' variables by minimizing a composite L2 regression losses.

    Args:
        sess: tf.Session object.
        vgg: Vgg16 class instance.
        snet: SNet class instance.
        gnet:  GNet class instance.
        inputProducer: InputProducer class instance.
    """
    gnet.params['wd'] = 0.5
    loss = gnet.loss + snet.loss
    tf.scalar_summary('loss', loss)
    writer = tf.train.SummaryWriter(TB_SUMMARY, sess.graph)
    
    vars_train = gnet.variables + snet.variables

    # Backprop using SGD and updates vgg variables and sgnets variables
    global_step = tf.Variable(0, trainable=False)
    lr_exp = tf.train.exponential_decay(
            0.25, # Initial learning rate 
            global_step, 
            1000, # Decay steps 
            0.8, # Decay rate 
            name='sg_lr')

    tf.scalar_summary('Learning rate', lr_exp)
    optimizer = tf.train.GradientDescentOptimizer(lr_exp)
    train_op = optimizer.minimize(loss, var_list= vars_train, global_step=global_step)
    merged = tf.merge_all_summaries()
    
    print('Generating batches from img size:%s  for trainning.'%str(img.shape))
    sample_batches, target_batches = inputProducer.gen_batches(img, gt, n_samples=FLAGS.n_samples_per_batch, batch_sz=FLAGS.batch_size, pos_ratio=0.5, scale_factors=np.arange(0.5, 5., 0.2)) #np.array([1]))#
    print('Start training the SGNets........ for %s epoches'%FLAGS.iter_epoch_sg)
    saver = tf.train.Saver()
    step = 1
    loss_list = []
    for ep in range(FLAGS.iter_epoch_sg):
        print('Total batches in each epoch: ', len(sample_batches))
        for roi, target in zip(sample_batches, target_batches):
            #roi[roi>0] = 1 # neglect gaussian..set to 1 for target arear
            
            t = time.time()
            c4_maps, c5_maps = gen_sel_maps(sess, roi, vgg, idx_c4, idx_c5)
            
            fd = {gnet.input_maps: c5_maps, gnet.gt_M: target, 
                  snet.input_maps: c4_maps, snet.gt_M: target}
            
            # Initialization 
            if step == 1:
                loss_g = 10
                init_s = 0
                while loss_g > 1.2:
                    init_s += 1
                    sess.run(tf.initialize_variables(gnet.variables))
                    loss_g = sess.run(gnet.loss, feed_dict=fd)
                    print('Initial Gnet Loss: ', loss_g, 'In steps: ', init_s)
                sess.run(tf.initialize_variables(snet.variables + [global_step]))
                
            
            pre_M_g, l, _, lr = sess.run([gnet.pre_M, loss, train_op, lr_exp], feed_dict=fd)
            
            loss_list += [l]
            if l <= 0.1:
                print('break learning!')
                break
            if step % 20 == 0:
                
                loss_ac = np.diff(np.diff(loss_list[-19:]))
                loss_ac_summary = tf.scalar_summary('Loss acceleration', loss_ac.mean())
                
                
                summary_img_g = tf.image_summary('pre_M', 
                                                 np.repeat(pre_M_g[...,np.newaxis], 3, axis=-1), name='GMap')

                summary, img_summary_g, ac_loss_summary = sess.run([merged, summary_img_g, loss_ac_summary], feed_dict=fd)

                writer.add_summary(summary, global_step=step)
                writer.add_summary(img_summary_g, global_step=step)
                writer.add_summary(ac_loss_summary, global_step=step)
                
                loss_std = np.std(loss_list[-19:])
                if loss_std <= 0.007:
                    
                    print('Stop learning??! Last 10 batches Loss Std: ', loss_std)
                    #break

            #if step % 20 == 0:
                print('Epoch: ', ep+1, 'Step: ', (ep+1)*step, 'Loss : %.2f'%l, \
                    'Speed: %.2f second/batch'%(time.time()-t), 'Lr: ', lr)
                #saver.save(sess, CKPT_MODEL)
            step += 1



print('Reading the first image...')
t_start = time.time()
## Instantiate inputProducer and retrive the first img
# with associated ground truth. 
inputProducer = LiveInput()




Reading the first image...


In [3]:
# import the necessary packages
#import argparse
import cv2

# initialize the list of reference points and boolean indicating
# whether cropping is being performed or not
refPt = []
cropping = False

def click_and_crop(event, x, y, flags, param):
    # grab references to the global variables
    global refPt, cropping

    # if the left mouse button was clicked, record the starting
    # (x, y) coordinates and indicate that cropping is being
    # performed
    if event == cv2.EVENT_LBUTTONDOWN:
        refPt = [(x, y)]
        #cropping = True

    # check to see if the left mouse button was released
    elif event == cv2.EVENT_LBUTTONUP:
        # record the ending (x, y) coordinates and indicate that
        # the cropping operation is finished
        refPt.append((x, y))
        #cropping = False

        # draw a rectangle around the region of interest
        #cv2.rectangle(image, refPt[0], refPt[1], (0, 255, 0), 2)
        #cv2.imshow("image", image)


In [4]:
def refPt_2_gt(refPt):
    p1, p2 = refPt
    x1, y1 = p1
    x2, y2 = p2
    w = x2 - x1
    h = y2 - y1
    return (x1, y1, w, h)

In [5]:
tracker = TrackerContour()
inputProducer.roi_params['roi_scale'] = 2.5


cap = cv2.VideoCapture(0)
cv2.namedWindow("image")
cv2.setMouseCallback("image", click_and_crop)


sess = tf.Session()

snet = SNet('SNet', FLAGS.sel_num)
gnet = GNet('GNet', FLAGS.sel_num)
saver = tf.train.Saver()
saved_ckpt = os.path.join('checkpoint', FLAGS.model_name.split('_')[-1]+'.ckpt')
if os.path.exists(saved_ckpt):
    print('Found saved model %s, restoring! '%saved_ckpt)
    saver.restore(sess, saved_ckpt)
    TrackReady = True
else: 
    TrackReady = False
    
PosReady = False

while(cap.isOpened()):
    # Capture frame-by-frame
    ret, image = cap.read()

    # load the image, clone it, and setup the mouse callback function
    clone = image.copy()
    
    key = cv2.waitKey(1) & 0xFF
    # keep looping until the 'q' key is pressed

    

    # if there are two reference points, then crop the region of interest
    # from teh image and display it
    if len(refPt) == 2 and key==ord("c"):      
        roi = clone[refPt[0][1]:refPt[1][1], refPt[0][0]:refPt[1][0]]
        cv2.imshow("CroppedROI", roi)

        gt = refPt_2_gt(refPt)
        img = image
        print(gt, 'gt in first!')
        
        inputProducer.save_fist_roi_mean(img, gt)


    # train
    
    if key == ord('t'):
        roi_t0, _, rz_factor = inputProducer.extract_roi(img, gt)
        
        # Predicts the first img.
        vgg = init_vgg(sess, roi_t0)
        fd = {vgg.imgs: [roi_t0]}
        gt_M = inputProducer.gen_mask((28,28)) # rank2 array


        ## At t=0. Train S and G Nets 
        # Instainate SGNets with conv tensors and training.
        # 1. feature maps selection
        # 2. Train G and S networks.
        idx_c4 = select_fms(sess, vgg.conv4_3_norm, gt, rz_factor, fd, FLAGS.sel_num)
        idx_c5 = select_fms(sess, vgg.conv5_3_norm, gt, rz_factor, fd, FLAGS.sel_num)
        
        if not TrackReady:
            train_SGNets(sess, img, gt, vgg, snet, gnet, inputProducer, idx_c4, idx_c5)
            saver.save(sess, saved_ckpt)
            TrackReady = True
    
    
    # Records the first position
    if key == ord('s'):
        gt_last = refPt_2_gt(refPt)
        print(gt_last, 'gt in start~!')
        PosReady = True
        
        
    # Start tracking
    if PosReady and TrackReady:
        img = image.copy()
        #img = inputProducer.Ajust_brighteness(img, gt_last)
        roi, _, rz_factor = inputProducer.extract_roi(img, gt_last)

        # @inputproducer, remove low level pixel
        noise_value = 10#np.argmax(hist)*0.1
        roi[roi<noise_value] = roi.mean()
        
        ## Perform Target localiation predicted by GNet
        # Get heat map predicted by GNet
        c4_maps, c5_maps = gen_sel_maps(sess, roi, vgg, idx_c4, idx_c5)
        fd = {gnet.input_maps: c5_maps, snet.input_maps: c4_maps}
        pre_M_g, pre_M_s = sess.run([gnet.pre_M, snet.pre_M], feed_dict=fd)

        pre_M = tracker.preporcess_heatmaps(pre_M_g, pre_M_s, resize=(224,224))
        pre_loc = tracker.predict_location(pre_M,gt_last,rz_factor,threshold=np.arange(0.3, 0.9, 0.05))
        
        gt_last = pre_loc
        x,y,w,h = pre_loc
        print('pre_loc', pre_loc)
        cv2.rectangle(image,(x,y),(x+w,y+h),(225,0,0),2)
        font = cv2.FONT_HERSHEY_SIMPLEX
        cv2.putText(image, 'conf score: %s'%1,(5,20), font, 0.6,(255,0,0),1,cv2.LINE_AA)
        
        cv2.imshow("pre_M_g", pre_M_g)
        cv2.imshow("pre_M_s", pre_M_s)
        cv2.imshow("pre_M_g", pre_M_g)
        cv2.imshow("ROI", roi)
        # Finetune SNet
        
        print('Tracking done in step: %s'%tracker.step)
        
        
    cv2.imshow("image", image)

# close all open windows
cv2.destroyAllWindows()

Found saved model checkpoint/LiveFaceXL.ckpt, restoring! 
(259, 168, 98, 136) gt in first!


  roi = convas[cy-half:cy+half, cx-half:cx+half, :]


Classify it with a pre-trained Vgg16 model.
cardigan 0.707232
fur coat 0.0585385
stole 0.0426091
wool, woolen, woollen 0.0249354
trench coat 0.0167647
Forwarding the vgg net cost : 4.90 s
Classify it with a pre-trained Vgg16 model.


  conf_i = roi[c-h_half:c+h_half, c-w_half:c+w_half].sum()


cardigan 0.707232
fur coat 0.0585385
stole 0.0426091
wool, woolen, woollen 0.0249354
trench coat 0.0167647
Forwarding the vgg net cost : 11.15 s
(259, 168, 98, 136) gt in start~!
pre_loc [271, 173, 81, 104]
Tracking done in step: 1
pre_loc [275, 192, 91, 117]
Tracking done in step: 2
pre_loc [272, 182, 109, 123]
Tracking done in step: 3
pre_loc [267, 176, 113, 128]
Tracking done in step: 4
pre_loc [268, 172, 125, 132]
Tracking done in step: 5


  scores = scores_dis / arear_list


pre_loc [262, 172, 131, 135]
Tracking done in step: 6
pre_loc [263, 170, 141, 138]
Tracking done in step: 7
pre_loc [255, 166, 139, 141]
Tracking done in step: 8
pre_loc [257, 167, 136, 132]
Tracking done in step: 9
pre_loc [257, 172, 135, 132]
Tracking done in step: 10
pre_loc [257, 170, 132, 129]
Tracking done in step: 11
pre_loc [257, 175, 129, 131]
Tracking done in step: 12
pre_loc [259, 171, 128, 132]
Tracking done in step: 13
pre_loc [259, 171, 128, 131]
Tracking done in step: 14
pre_loc [259, 173, 129, 129]
Tracking done in step: 15
pre_loc [259, 173, 126, 130]
Tracking done in step: 16
pre_loc [259, 173, 125, 133]
Tracking done in step: 17
pre_loc [263, 171, 127, 130]
Tracking done in step: 18
pre_loc [260, 173, 124, 133]
Tracking done in step: 19
pre_loc [262, 170, 127, 128]
Tracking done in step: 20
pre_loc [259, 175, 115, 125]
Tracking done in step: 21
pre_loc [263, 178, 107, 126]
Tracking done in step: 22
pre_loc [269, 178, 107, 121]
Tracking done in step: 23
pre_loc [266, 

ValueError: max() arg is an empty sequence

In [None]:
gt

In [None]:
img.shape

In [None]:
inputProducer.extract_roi(img, gt)

In [None]:
cap = cv2.VideoCapture(0)




In [None]:
ret, img = cap.read()

ret

In [None]:
cv2.imshow("pre_M", pre_M)

In [None]:
str(img.shape)

In [None]:
refPt_2_gt(refPt)

In [None]:
image.shape