Train network for Gesture Recognition from Video

In [3]:
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.utils.data

import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models

from torch.autograd import Variable

import numpy as np
import random
from PIL import Image
from ipywidgets import FloatProgress
from IPython.display import display
from __future__ import print_function

from model import ModelDefinition
from dataset import ReadImages, collection
import os
import os.path as path
import glob
import random

import cv2
#os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [4]:
def readFrameAnnotation(annotationFile):
    """
        read annotation file
        return the list of annotation ([start, end], gesture)
    """
    anno = []
    for l in open(annotationFile).read().splitlines():
        s = l.split(' ')
        anno += [ ([int(s[1]), int(s[2])], int(s[0])-1)]
    return anno

In [5]:
def findGestureFrame(frameNumber, annotationFile):
    """
        from Frame Number and the list of annotation
        return the Gesture or None if not in annation
    """
    for seq, gest in annotationFile:
        if frameNumber >= seq[0] and frameNumber <= seq[1]:
            return gest
    return None

In [44]:
def copyParameters(net, netBase):
    for i, f in enumerate(net.features):
        if type(f) is torch.nn.modules.conv.Conv2d:
            if i < len(netBase.features._modules):
                if f.weight.size() == netBase.features[i].weight.size():
                    f.weight.data = netBase.features[i].weight.data
                    f.bias.data = netBase.features[i].bias.data
    for i, c in enumerate(net.classifier):
        if type(c) is torch.nn.modules.linear.Linear:
            if c.weight.size() == netBase.classifier[i].weight.size():
                c.weight.data = netBase.classifier[i].weight.data
                c.bias.data = netBase.classifier[i].bias.data

In [7]:
def fillInput(nframe, video, with_cuda=False):
    t = transforms.Compose(
                (transforms.ToPILImage(),
                transforms.Scale(225),
                transforms.RandomCrop(225),
                transforms.ToTensor())
                )
    if with_cuda:
        inputs = torch.Tensor(nframe,3,225,225).cuda()
    else:
        inputs = torch.Tensor(nframe,3,225,225)
    for j in range(nframe):
        ret, frame = video.read()
        if frame is None:
            print("Error : None Frame")
            exit(0)
        frame = t(frame)
        inputs[j] = frame
    return inputs

In [8]:
#TODO : test if difference between learning only gesture per batch

def learnSequence(sequence, gesture, video, model, criterion, optimize, batchSize=32):
    numberFrame = seq[1] - seq[0]
    running_loss = 0
    while numberFrame > 0:
        if numberFrame >= batchSize:
            inputs = fillInput(batchSize, video, True)
            numberFrame -= batchSize
            
            labels = torch.LongTensor([gesture]*batchSize).cuda()
        else:
            inputs = fillInput(numberFrame, video, True)
            labels = torch.LongTensor([gesture]*numberFrame).cuda()
            numberFrame = 0
            
        inputs = Variable(inputs)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, Variable(labels))
        loss.backward()
        optimizer.step()
        running_loss += loss.data[0]
    return running_loss

In [9]:
def testSequence(seq, gesture, video, model, batchSize=32):
    numberFrame = seq[1] - seq[0]
    correct = 0
    while numberFrame > 0:
        if numberFrame >= batchSize:
            inputs = fillInput(batchSize, video, True)
            numberFrame -= batchSize
        else:
            inputs = fillInput(numberFrame, video, True)
            numberFrame = 0
            
        inputs = Variable(inputs)
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        predicted = predicted.tolist()
        for i in range(len(predicted)):
            correct = (predicted[i][0] == gesture) or correct
    return correct

In [10]:
def testNet(model, testDir):
    model.eval()
    t = 0
    c = 0
    for video in glob.glob(testDir+'*.mp4'):
        print("Test video ", video)
        fName = path.splitext(path.basename(video))[0] #basename
        annotation = readFrameAnnotation(testDir+fName)
        videoCap = cv2.VideoCapture(video)
        for seq, gesture in annotation:
            #print("Frame : ", seq[0], '-', seq[1])
            #t += seq[1] - seq[0]
            t += 1
            c += testSequence(seq, gesture, videoCap, model)
            #print("Correct :", c)
    print("Correctness : ", c, '/', t)
    return c

In [11]:
def testNFrame(model, testDir, frame_num=5):
    """
        Test the model with a window of frame_num frames
    """
    print("TestNFrame")
    model.eval()
    c = 0
    t = 0
    for video in glob.glob(testDir+'*.mp4'):
        print("Test video ", video)
        fName = path.splitext(path.basename(video))[0] #basename
        annotation = readFrameAnnotation(testDir+fName)
        videoCap = cv2.VideoCapture(video)
        for seq, gesture in annotation:
            for i in range( (seq[1]-seq[0])/frame_num):
                inputs = fillInput(frame_num, videoCap, True)
                inputs = Variable(inputs, volatile=True)
                outputs = model(inputs)
                t += 1
                c += (int(outputs.data.sum(0).max(1)[1].cpu()[0][0]) == gesture)
    print("Correctness : ", c, '/', t)
    return c

In [47]:
def testImages(model, testDir, transf=transforms.ToTensor(), batch_size=32):
    """
        Test model on images organized : class/imName
    """
    model.eval()
    dtest = datasets.ImageFolder(testDir, transform=transf)
    l = torch.utils.data.DataLoader(dtest, batch_size=batch_size, num_workers=6, drop_last=False, pin_memory=False)
    c = 0
    for batch_idx, (data,target) in enumerate(l):
        data = Variable(data.cuda(), volatile=True)
        output = model(data)
        pred = output.data.max(1)[1].cpu()
        c += pred.eq(target).sum()
    print("Correctness on Images : ", c,"/", len(l.dataset), ':', float(c)/len(l.dataset)*100, '%' )
    return c

In [65]:
class AlexNetS(nn.Module):

    def __init__(self, num_classes=1000):
        super(AlexNetS, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            #nn.MaxPool2d(kernel_size=3, stride=1),
            #nn.Conv2d(256, , kernel_size=13, padding=1),
            #nn.Conv2d(256, 256, kernel_size=13, padding=1),
            #nn.ReLU(inplace=True),
            nn.AvgPool2d(13,13),
        )
        self.classifier = nn.Sequential(
            nn.Dropout(),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes),
        )
        self.newClassifier = nn.Sequential(
            nn.Linear(256,6)
        )

    def forward(self, x):
        x = self.features(x)
        #x = x.view(x.size(0), 256 * 6 * 6)
        x = x.view(x.size(0), 256)
        x = self.newClassifier(x)
        return x


In [66]:
m = AlexNetS()
t = Variable(torch.Tensor(32,3,225,225))
m(t)

Variable containing:
1.00000e-02 *
 -3.9006 -4.9893  1.0369 -0.3465  3.7769  3.7546
 -3.9006 -4.9893  1.0369 -0.3465  3.7769  3.7546
 -3.9006 -4.9893  1.0369 -0.3465  3.7769  3.7546
 -3.9006 -4.9893  1.0369 -0.3465  3.7769  3.7546
 -3.9006 -4.9893  1.0369 -0.3465  3.7769  3.7546
 -3.9006 -4.9893  1.0369 -0.3465  3.7769  3.7546
 -3.9006 -4.9893  1.0369 -0.3465  3.7769  3.7546
 -3.9006 -4.9893  1.0369 -0.3465  3.7769  3.7546
 -3.9006 -4.9893  1.0369 -0.3465  3.7769  3.7546
 -3.9006 -4.9893  1.0369 -0.3465  3.7769  3.7546
 -3.9006 -4.9893  1.0369 -0.3465  3.7769  3.7546
 -3.9006 -4.9893  1.0369 -0.3465  3.7769  3.7546
 -3.9006 -4.9893  1.0369 -0.3465  3.7769  3.7546
 -3.9006 -4.9893  1.0369 -0.3465  3.7769  3.7546
 -3.9006 -4.9893  1.0369 -0.3465  3.7769  3.7546
 -3.9006 -4.9893  1.0369 -0.3465  3.7769  3.7546
 -3.9006 -4.9893  1.0369 -0.3465  3.7769  3.7546
 -3.9006 -4.9893  1.0369 -0.3465  3.7769  3.7546
 -3.9006 -4.9893  1.0369 -0.3465  3.7769  3.7546
 -3.9006 -4.9893  1.0369 -0.3465  

In [29]:
def trainOnVideos():
    rootDir = '/video/Gesture/'
    model = AlexNetS()
    #model = models.VGG(models.vgg.make_layers(models.vgg.cfg['B'], batch_norm=True), num_classes=6)
    copyParameters(model, models.alexnet(pretrained=True))
    #model = torch.load('best-model.ckpt')
    criterion = nn.CrossEntropyLoss()
    lr = 0.01

    model.cuda()
    best = testNet(model, rootDir+'test/')
    rl = 0
    videos = glob.glob(rootDir+'*.mp4')
    j = 0
    for epoch in range(5):
        random.shuffle(videos)
        for video in videos:
            model.train()
            optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=0.0005)
            #testNet(model, rootDir+'test/')
            print("Video ", video)
            fName = path.splitext(path.basename(video))[0] #basename
            annotation = readFrameAnnotation(rootDir+'annotation/'+fName) #read annotation

            videoCap = cv2.VideoCapture(video)
            i = 0
            for seq, gesture in annotation:
                #print("Sequence ", seq, " Gesture : ", gesture)
                rl += learnSequence(seq, gesture, videoCap, model, criterion, optimizer)
                i += 1
                if i%5 == 4:
                    print("[epoch %d] loss : %.3f" % (epoch, rl/5) )
                    #i = 0
                    rl = 0.0
            
            if j%5 == 4:
                r = testNet(model, rootDir+'test/')
                if r > best:
                    best = r
                    print("Saving best model")
                    torch.save(model, 'best-model.ckpt')
            #torch.save(model, path.join('model-'+str(epoch)+".ckpt"))
            #lr = 0.001
            j += 1
            videoCap.release()
        lr = 0.001
        

In [74]:
def trainOnImages(model, rootDir='/video/GestureImages/trainBGIMAG3_All/', batch_size=32, trainTrans=transforms.ToTensor(), testTrans=transforms.ToTensor(), lr=0.001, epoch=50):
    d = datasets.ImageFolder(rootDir, transform=trainTrans)
    l = torch.utils.data.DataLoader(d, batch_size=32, shuffle=True, num_workers=6, drop_last=True, pin_memory=False)
    model.train()
    model.cuda()
    best = testImages(model, '/video/GestureImages/trainBGOffice_All/')
    #testNet(model=model)
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9, weight_decay=0.0005)
    for ep in range(epoch):
        model.train()
        for batch_idx, (data,target) in enumerate(l):
            #data, target = data.cuda(device=0), target.cuda(device=0)
            data, target = Variable(data.cuda()), Variable(target.cuda())
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            if batch_idx % 50 == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    ep, batch_idx * len(data), len(l.dataset),
                    100. * batch_idx / len(l), loss.data[0]))
        #r = testNFrame(model, '/video/Gesture/test/')
        r = testImages(model, '/video/GestureImages/trainBGOffice_All/')
        if r > best :
            best = r
            print("Saving best model")
            torch.save(model, 'best-model.ckpt')
        if ep%2 == 0:
            testNet(model=model, testDir='/video/Gesture/test/')
            testNFrame(model, '/video/Gesture/test/')

In [None]:
#model = AlexNetS()
#copyParameters(model, models.alexnet(pretrained=True))
t = transforms.Compose(
                (transforms.ToPILImage(),
                transforms.Scale(225),
                transforms.RandomCrop(225),
                transforms.ToTensor())
                )
model=torch.load('best-model.ckpt')
trainOnImages(model, epoch=10)

trainOnImages(model, '/video/GestureImages/trainBGOffice_All/')

Correctness on Images :  5144 / 7958 : 64.6393566223 %
Correctness on Images :  5681 / 7958 : 71.387283237 %
Saving best model
Test video  /video/Gesture/test/v43.mp4
Test video  /video/Gesture/test/u19.mp4
Correctness :  8 / 42
Test video  /video/Gesture/test/v43.mp4
Test video  /video/Gesture/test/u19.mp4
Correctness :  14 / 165
Correctness on Images :  5477 / 7958 : 68.8238250817 %
Correctness on Images :  6123 / 7958 : 76.9414425735 %
Saving best model
Test video  /video/Gesture/test/v43.mp4
Test video  /video/Gesture/test/u19.mp4
Correctness :  9 / 42
Test video  /video/Gesture/test/v43.mp4
Test video  /video/Gesture/test/u19.mp4
Correctness :  15 / 165
Correctness on Images :  5906 / 7958 : 74.2146267907 %
Correctness on Images :  5282 / 7958 : 66.3734606685 %
Test video  /video/Gesture/test/v43.mp4
Test video  /video/Gesture/test/u19.mp4
Correctness :  9 / 42
Test video  /video/Gesture/test/v43.mp4
Test video  /video/Gesture/test/u19.mp4
Correctness :  18 / 165
Correctness on Im