# Sequence Annotation

In [1]:
from __future__ import print_function

In [2]:
import torch

import cv2

import torch.optim as optim
import torch.nn as nn

import sys

import torchvision.transforms as transforms
from torch.autograd import Variable
import random

In [3]:
!jupyter nbconvert --to script ConvRNN.ipynb
import ConvRNN

!jupyter nbconvert --to script dataset/videoDataset.ipynb
from dataset import videoDataset

# To avoid to restart the kernel if the .ipynb is modified
# should be suppressed if you're not modifying ConvRNN.ipynb and videoDataset.ipynb
reload(ConvRNN)
reload(videoDataset)

[NbConvertApp] Converting notebook ConvRNN.ipynb to script
[NbConvertApp] Writing 19825 bytes to ConvRNN.py
[NbConvertApp] Converting notebook dataset/videoDataset.ipynb to script
[NbConvertApp] Writing 3717 bytes to dataset/videoDataset.py


<module 'dataset.videoDataset' from 'dataset/videoDataset.pyc'>

## Datasets

In [4]:
def readDatasets():
    # Read video dataset
    trainDataset = videoDataset.VideoDataset("/video/Gesture")
    #print("Training data : " ,trainDataset)
    print("Training data size : ", len(trainDataset))
    #print('')
    valDataset   = videoDataset.VideoDataset("/video/Gesture/val")
    #print("Validation data : ", valDataset)
    print("Validation data size : ", len(valDataset))
    #print('')
    testDataset   = videoDataset.VideoDataset("/video/Gesture/test")
    #print("Test data : ", testDataset)
    return trainDataset, valDataset, testDataset

In [5]:
class videoFrameError(Exception):
    """
        Raise when reading a non-existing frame in a video
        Attribute : 
            videoName --- video name or path
            frame     --- frame number
    """
    def __init__(self, videoName, frame):
        self.videoName = videoName
        self.frame = frame
    def __str__(self):
        return repr(self.videoName) + repr(self.frame)

In [6]:
def test(model, testData):
    t = transforms.Compose(
            (transforms.ToPILImage(),
            transforms.Resize(225),
            transforms.RandomCrop(225),
            transforms.ToTensor())
            )
    success = 0
    tot = 0
    batchSize=1
    
    for videoName, annotation in testData:
        print("Test on video ", videoName)
        videoCap = cv2.VideoCapture(videoName)
        cframe = 0 #current frame
        
        #read each sequence
        for seq, gesture in annotation:
            if gesture == 5 :
                continue
            #print("The first sequence is : ", seq, " with gesture : ", gesture)
            #reach the start frame
            while cframe != seq[0]: 
                ret, frame = videoCap.read()
                if not ret :
                    raise videoFrameError(videoName, cframe)
                cframe += 1
                
            #fill a tensor with the sequence images
            seqLen = seq[1] - seq[0] + 1
            
            inputs = torch.Tensor(seqLen, batchSize, 3,225,225).cuda()
            #inputs = torch.Tensor(seqLen, batchSize, 3,225,225)
            
            for i in range(seqLen):
                ret, frame = videoCap.read()
                cframe += 1
                if ret : 
                    inputs[i] = t(frame)
                else:
                    raise videoFrameError(videoName, cframe)
            
            #forward pass
            outputs = model(Variable(inputs))
            _, pred = torch.max(outputs.data, 1)
            tot += 1
            #print("The response is : ", pred)
            if pred[0] == gesture:
                #print("\tAnd that's correct")
                success += 1
    return success,tot

In [7]:
def testTest():
    test( ConvRNN.convRNN_1_layer("lstm", copyParameters=True).cuda(), testDataset)

## Train

In [8]:
def trainSeq(model, seqList, opti, with_cuda=True):
    """
        Receive a list of sequence, of the same length
        A sequence is a list of couple:
            [ ([im1, im2, ... ] , label) , ... ]
        All sequence will be put in one batch (i.e. batchSize determined by len(seqList))
    """
    if with_cuda:
        model.cuda()
        
    model.train()
    batchSize = len(seqList)
    seqSize = len(seqList[0][0])
    
    nbChannel = len(seqList[0][0][0])
    height    = len(seqList[0][0][0][0])
    width    = len(seqList[0][0][0][0][0])
    
    criterion = nn.CrossEntropyLoss()
    
    #fill input
    if with_cuda:
        inputs = torch.Tensor(seqSize, batchSize, nbChannel, height, width).cuda()
    else:
        inputs = torch.Tensor(seqSize, batchSize, nbChannel, height, width)
    if with_cuda:
        labels = torch.LongTensor(batchSize).cuda()
    else:
        labels = torch.LongTensor(batchSize)
        
    for b in range(batchSize):
        seq, labels[b] = seqList[b]
        for i,s in enumerate(seq):
            inputs[i][b] = s
    
    #forward pass
    outputs = model(Variable(inputs))
    pred = outputs[0]
        
    #label = torch.LongTensor([gesture])
    loss = criterion(outputs, Variable(labels))
    loss.backward() #retain graph?
    optimizer.step()
    return loss.data[0]

In [9]:
def trainOnVideos(model, optimizer, trainDataset, valDataset, nbepoch=5, batchSize=16, with_cuda=True):
    
    trans = transforms.Compose(
                (transforms.ToPILImage(),
                transforms.Resize(225),
                transforms.RandomCrop(225),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225]),)
                )
    
    for epoch in range(nbepoch):
        model.train()
        if with_cuda:
            model.cuda()
        trainDataset.shuffle()
        
        nbBatches = len(trainDataset) // batchSize #we don't use the end of the list
        
        for batchNumber in range(nbBatches) :
            print("Batch :", batchNumber)
            seqList = []
            #create batchSize sequences on 10 frames
            for b in range(batchSize):
                seq = []
                videoName, annotation = trainDataset[batchNumber+b]
                print("Video", videoName)
                #randomly choose a sequence in this video
                (seqStart, seqEnd), gesture = random.choice(annotation)
                while seqEnd - seqStart < 10:
                    print("Sequence too short")
                    (seqStart, seqEnd), gesture = random.choice(annotation)
                
                #fill up seq with images from the video
                videoCap = cv2.VideoCapture(videoName)
                cframe = 0
                
                newStart = random.randint(seqStart, seqEnd-10) #select a random 10 frame window
                while cframe != newStart: #go to the start frame
                    ret, _ = videoCap.read()
                    if ret:
                        cframe += 1
                    else:
                        raise videoFrameError(videoName, cframe)
                for i in range(10):
                    ret, frame = videoCap.read()
                    if not ret:
                        raise videoFrameError(videoName, cframe)
                    seq.append(trans(frame))
                seqList.append( (seq, gesture) )
            loss = trainSeq(model, seqList, optimizer, with_cuda)
            print('[%d, %5d] loss: %.3f' % (epoch+1, batchNumber+1, loss))
            
            """
             if k >= 50:
                    model.eval()
                    nc, t = test(model, valDataset)
                    model.train()
                    print("Correctness : ", nc/float(t))
                    k = 0
                    if nc > c:
                        c = nc
                        torch.save(model,"convRNN-"+str(r)+"-"+str(c)+".model")
            """

## MAIN ##

In [11]:
trD, valD, teD = readDatasets()
#model = torch.load("./convRNN-0-33.model").cuda()
model = ConvRNN.ResNet_lstm(ConvRNN.BasicBlock, [2, 2, 2, 2]).cuda()

optimizer = optim.SGD( model.parameters(),lr=0.01, momentum=0.9, weight_decay=0.0005)

model = trainOnVideos(model, optimizer, trD, valD, with_cuda=True)

Training data size :  145
Validation data size :  4
Batch : 0
Video /video/Gesture/gR27.mp4
Sequence too short
Sequence too short
Video /video/Gesture/u28.mp4
Video /video/Gesture/jR14.mp4
Video /video/Gesture/fR04.mp4
Video /video/Gesture/v31.mp4
Video /video/Gesture/jR03.mp4
Video /video/Gesture/v21.mp4
Video /video/Gesture/v11.mp4
Video /video/Gesture/gR09.mp4
Video /video/Gesture/fR12.mp4
Video /video/Gesture/u35.mp4
Video /video/Gesture/gR06.mp4
Video /video/Gesture/jR07.mp4
Video /video/Gesture/fR03.mp4
Video /video/Gesture/fR25.mp4
Video /video/Gesture/jR11.mp4
[1,     1] loss: 1.800
Batch : 1
Video /video/Gesture/u28.mp4
Video /video/Gesture/jR14.mp4
Video /video/Gesture/fR04.mp4
Video /video/Gesture/v31.mp4
Video /video/Gesture/jR03.mp4
Video /video/Gesture/v21.mp4
Video /video/Gesture/v11.mp4
Video /video/Gesture/gR09.mp4
Video /video/Gesture/fR12.mp4
Video /video/Gesture/u35.mp4
Video /video/Gesture/gR06.mp4
Video /video/Gesture/jR07.mp4
Video /video/Gesture/fR03.mp4
Video /

In [63]:
x = torch.Tensor(1,1,2,225,225).cuda()

In [68]:
model.isCuda

True