# Sequence Annotation

In [17]:
from __future__ import print_function

In [61]:
import cv2
import torch
import torch.optim as optim

import torchvision.transforms as transforms
from torch.autograd import Variable

In [57]:
!jupyter nbconvert --to script ConvRNN.ipynb
import ConvRNN

!jupyter nbconvert --to script dataset/videoDataset.ipynb
from dataset import videoDataset

# To avoid to restart the kernel if the .ipynb is modified
# should be suppressed if you're not modifying ConvRNN.ipynb and videoDataset.ipynb
reload(ConvRNN)
reload(videoDataset)

[NbConvertApp] Converting notebook ConvRNN.ipynb to script
[NbConvertApp] Writing 8999 bytes to ConvRNN.py
[NbConvertApp] Converting notebook dataset/videoDataset.ipynb to script
[NbConvertApp] Writing 3447 bytes to dataset/videoDataset.py


<module 'dataset.videoDataset' from 'dataset/videoDataset.py'>

## Datasets

In [58]:
# Load the gesture video datasets. The training set is read from the root
# directory itself; validation and test sets live in sub-directories of it.
datasetRoot = "/video/Gesture"

trainDataset = videoDataset.VideoDataset(datasetRoot)
print("Training data size : ", len(trainDataset))

valDataset = videoDataset.VideoDataset(datasetRoot + "/val")
print("Validation data size : ", len(valDataset))

# The test set is only loaded here; its size is not reported.
testDataset = videoDataset.VideoDataset(datasetRoot + "/test")

Training data size :  146
Validation data size :  3


## Train

In [65]:
# LSTM model with 1 recurrent layer
model = ConvRNN.convRNN_1_layer("lstm", copyParameters=True)

# Training parameters
nbEpoch = 5
batchSize = 1

# Per-frame preprocessing: ndarray -> PIL image -> resize -> random 225x225
# crop -> tensor.
t = transforms.Compose(
            (transforms.ToPILImage(),
            transforms.Resize(225),
            transforms.RandomCrop(225),
            transforms.ToTensor())
            )

# We first learn only the RNN and classifier parts, with a "high" learning
# rate.
lr = 0.05

# One parameter group per sub-module. Both groups must be separate dicts:
# a single dict literal with two 'params' keys keeps only the last one,
# which would silently leave model.convRNN out of the optimizer.
optimizer = optim.SGD(
    [{'params': model.convRNN.parameters()},
     {'params': model.classifier.parameters()}],
    lr=lr, momentum=0.9, weight_decay=0.0005)

# Iterate over the training videos and run a forward pass on every annotated
# sequence.
# NOTE(review): as shown, this loop only performs the forward pass; no loss,
# backward() or optimizer.step() is computed yet, and `gesture` is unused.
for epoch in range(nbEpoch):
    model.train()

    for videoName, annotation in trainDataset:
        print("Read video ", videoName)
        videoCap = cv2.VideoCapture(videoName)
        try:
            cframe = 0  # index of the next frame that will be read

            # Read each annotated sequence; seq is a (firstFrame, lastFrame)
            # pair with both bounds included.
            for seq, gesture in annotation:
                print("The first sequence is : ", seq)

                # The capture is only read forward, so a sequence starting
                # before the current position can never be reached. Fail
                # right away instead of decoding the rest of the video.
                if seq[0] < cframe:
                    raise ValueError(
                        "Sequence %s of %s starts before current frame %d"
                        % (seq, videoName, cframe))

                # Skip frames until the start frame is reached
                while cframe != seq[0]:
                    ret, frame = videoCap.read()
                    if not ret:
                        raise RuntimeError(
                            "Error : No Frame %d in %s" % (cframe, videoName))
                    cframe += 1

                # Fill a tensor with the images of the sequence
                seqLen = seq[1] - seq[0] + 1
                inputs = torch.Tensor(seqLen, batchSize, 3, 225, 225)
                for i in range(seqLen):
                    ret, frame = videoCap.read()
                    if not ret:
                        # cframe is incremented only after a successful read,
                        # so it is the index of the missing frame.
                        raise RuntimeError(
                            "Error : No Frame %d in %s" % (cframe, videoName))
                    # NOTE(review): OpenCV decodes frames as BGR and the
                    # channel order is passed through unchanged -- confirm
                    # this is what the model expects.
                    inputs[i] = t(frame)
                    cframe += 1

                # Forward pass
                outputs = model(Variable(inputs))
                print(outputs.size())
        finally:
            # Always free the decoder, even on error or manual interruption
            videoCap.release()

Read video  /video/Gesture/fR01.mp4
The first sequence is :  (0, 47)
torch.Size([1, 128, 6, 6])
The first sequence is :  (54, 89)
torch.Size([1, 128, 6, 6])
The first sequence is :  (93, 141)


KeyboardInterrupt: 