In [1]:
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import net_sphere
import torch.optim as optim
from torch.nn.utils import clip_grad_norm
from skimage import io
import numpy as np
import datetime,sys
from numpy.random import randint
import torchvision.models as models
from calculateEvaluationCCC import calculateCCC
import glob
import os
# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   
# os.environ["CUDA_VISIBLE_DEVICES"]="0"

# torch.cuda.CUDA_VISIBLE_DEVICES = 2


# Define parameters
use_cuda = torch.cuda.is_available()
# use_cuda = 0
# lr = 0.01
# bs = 32
# n_epoch = 20
# lr_steps = [7,14]
lr = 0.001
bs = 6
n_epoch = 30
lr_steps = [7,14,21,28]
gpu_id = [0,1]

gd = 20 # clip gradient
eval_freq = 3
print_freq = 100
num_worker = 4
num_seg = 16
num_stft = 4
flag_biLSTM = True


train_list_path = './data/OMG_Aligned/train_list_lstm.txt'
val_list_path = './data/OMG_Aligned/val_list_lstm.txt'
Vmodel_path = './new_model/model_lstm_73.pth'
Amodel_path = './model/426_best.pth'

# Define Video Network (VNet)

In [2]:
sphereface = getattr(net_sphere,'sphere20a')()
# sphereface.load_state_dict(torch.load(model_path))
sphereface.feature = True # remove the last fc layer because we need to use LSTM first

class VNet(torch.nn.Module):
    def __init__(self, sphereface, feature=True):
        super(VNet, self).__init__()
        self.sphereface = sphereface
        self.linear = torch.nn.Linear(512,2)
        self.tanh = torch.nn.Tanh()
        self.feature = feature
        self.avgPool = torch.nn.AvgPool2d((num_seg,1), stride=1)
        self.LSTM = torch.nn.LSTM(512, 512, 1, batch_first = True, dropout=0.2, bidirectional=flag_biLSTM)  # Input dim, hidden dim, num_layer
        for name, param in self.LSTM.named_parameters():
            if 'bias' in name:
                torch.nn.init.constant(param, 0.0)
            elif 'weight' in name:
                torch.nn.init.orthogonal(param)
        
    def sequentialLSTM(self, input, hidden=None):

        input_lstm = input.view([-1,num_seg, input.shape[1]])
        batch_size = input_lstm.shape[0]
        feature_size = input_lstm.shape[2]

        self.LSTM.flatten_parameters()
            
        output_lstm, hidden = self.LSTM(input_lstm)
        if flag_biLSTM:
             output_lstm = output_lstm.contiguous().view(batch_size, output_lstm.size(1), 2, -1).sum(2).view(batch_size, output_lstm.size(1), -1) 

        output_lstm = output_lstm.view(batch_size,1,num_seg,-1)
        out = self.avgPool(output_lstm)
        out = out.view(batch_size,-1)
        return out
    
    def forward(self, x):
        x = self.sphereface(x)
        x = self.sequentialLSTM(x)
        if self.feature == True: return x

        x = self.linear(x)
        x = self.tanh(x)

        return x

model_v = VNet(sphereface)

model_v.load_state_dict(torch.load(Vmodel_path))
# if use_cuda:
#     model.cuda()



In [3]:
# model_v.feature = True
# # print model_v

# t = torch.autograd.Variable(torch.randn(16,3,112,96))

# o = model_v(t)
# print o.shape

# Define Audio Network (ANet)

In [4]:
vgg = models.vgg16(pretrained=False).features

removed = list(vgg.children())[1:]
vgg = torch.nn.Sequential(*removed)

# We modified the first layer of vgg16
vgg_modified = torch.nn.Sequential(torch.nn.Conv2d(2,64,3),vgg)

class ANet(torch.nn.Module):
    def __init__(self, vgg,feature=True):
        super(ANet, self).__init__()
        self.vgg = vgg
        self.fc1 = torch.nn.Linear(512*7*9,4096)
        self.relu = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout()
        self.fc2 = torch.nn.Linear(4096,512)
        self.fc3 = torch.nn.Linear(512,2)
        self.tanh = torch.nn.Tanh()
        self.feature = feature
    def forward(self, x):
        x = self.vgg(x)
        x = x.view([-1,512*7*9])
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.dropout(self.relu(self.fc2(x)))
        if self.feature == True: return x 
        
        x = self.tanh(self.fc3(x))
        return x

model_a = ANet(vgg_modified)


# t = torch.autograd.Variable(torch.randn(1,2,257,300))
# o = model_a(t)
# print o.shape
model_a.load_state_dict(torch.load(Amodel_path))
# for p in model.parameters():
#     p.requires_grad = True

# if use_cuda:
#     model.cuda()

# Joint Audio and Video Network

In [5]:
# bs = 2
class AVNet(torch.nn.Module):
    def __init__(self, vnet,anet):
        super(AVNet, self).__init__()
        self.vnet = vnet
        self.anet = anet
        self.avgPool = torch.nn.AvgPool2d((num_stft,1), stride=1)
        self.fc = torch.nn.Linear(1024,2)
        self.tanh = torch.nn.Tanh()
        
    def forward(self, xi,xs):
        xi = self.vnet(xi) 
        xs = self.anet(xs)
        xs = xs.view((-1,1,num_stft,512))
        xs = self.avgPool(xs)
        xs = xs.view(-1,512)
        
        
        x = torch.cat((xi, xs), 1)
        x = self.tanh(self.fc(x))
        return x

xi = torch.autograd.Variable(torch.randn(32,3,96,112))
xs = torch.autograd.Variable(torch.randn(8,2,257,300))

model = AVNet(model_v, model_a)


if use_cuda:
#     model = torch.nn.DataParallel(model, device_ids=gpu_id).cuda()
    model.cuda()

# o = model(xi,xs)
# print o.shape


In [6]:
class OMGDataset(Dataset):
    """OMG dataset."""

    def __init__(self, txt_file, base_path_v, base_path_a, transform=None):
        self.base_path_v = base_path_v
        self.base_path_a = base_path_a
        self.data = pd.read_csv(txt_file, sep=" ", header=None)
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        vid = self.data.iloc[idx,0]
        utter = self.data.iloc[idx,1]
        img_list = self.data.iloc[idx,-1]
        img_list = img_list.split(',')[:-1]
        img_list = map(int, img_list)
        
        num_frames = len(img_list)
        # inspired by TSN's pytorch code
        average_duration = num_frames // num_seg
        if num_frames>num_seg:
            offsets = np.multiply(list(range(num_seg)), average_duration) + randint(average_duration, size=num_seg)
        else:
            tick = num_frames / float(num_seg)
            offsets = np.array([int(tick / 2.0 + tick * x) for x in range(num_seg)])

        final_list = [img_list[i] for i in offsets]
        
        # stack images within a video in the depth dimension
        for i,ind in enumerate(final_list):
            image = io.imread(self.base_path_v+'%s/%s/%d.jpg'%(vid,utter,ind)).astype(np.float32)
            image = torch.from_numpy(((image - 127.5)/128).transpose(2,0,1))
            if i==0:
                images = image
            else:
                images = torch.cat((images,image), 0)
        
        
        # stft data acquisition
        stft_path = self.base_path_a+vid+'/'+utter
        stfts_count = len(glob.glob1(stft_path,"*.npy"))
        stft_list_all = range(stfts_count)
        
        average_duration = stfts_count // num_stft
        if stfts_count>num_stft:
            offsets = np.multiply(list(range(num_stft)), average_duration) + randint(average_duration, size=num_stft)
        else:
            tick = stfts_count / float(num_stft)
            offsets = np.array([int(tick / 2.0 + tick * x) for x in range(num_stft)])
        
        stft_list = [stft_list_all[i] for i in offsets]
        
        for i,ind in enumerate(stft_list):
            
            stft = np.load(stft_path+'/%d.npy'%ind).astype(np.float32)
            max_val = max(np.abs(np.max(stft)),np.abs(np.min(stft)))
            mean_val = np.mean(stft)
            stft = torch.from_numpy(((stft - mean_val)/max_val).transpose(2,0,1))
            if i==0:
                stfts = stft
            else:
                stfts = torch.cat((stfts,stft), 0)
                
        
        
        label = torch.from_numpy(np.array([self.data.iloc[idx,2], self.data.iloc[idx,3]]).astype(np.float32))

        if self.transform:
            image = self.transform(image)
        return (images, stfts, label, (vid,utter))
    
train_loader = DataLoader(OMGDataset(train_list_path,'./data/OMG_Aligned/Train/', './data/STFT/Train/'), 
                          batch_size=bs, shuffle=True, num_workers=num_worker)
val_loader = DataLoader(OMGDataset(val_list_path,'./data/OMG_Aligned/Val/', './data/STFT/Val/'), 
                        batch_size=bs, shuffle=False, num_workers=num_worker)



In [7]:
def printoneline(*argv):
    s = ''
    for arg in argv: s += str(arg) + ' '
    s = s[:-1]
    sys.stdout.write('\r'+s)
    sys.stdout.flush()
    
def dt():
    return datetime.datetime.now().strftime('%H:%M:%S')

def save_model(model,filename):
    state = model.state_dict()
#     for key in state: state[key] = state[key].clone().cpu()
    torch.save(state, filename)


In [8]:
def pearsonr(outputs, targets):
    vx = outputs - torch.mean(outputs)
    vy = targets - torch.mean(targets)
    rho = torch.sum(vx * vy) / (torch.sqrt(torch.sum(vx ** 2)) * torch.sqrt(torch.sum(vy ** 2)))  # use Pearson correlation
    return rho

def calCCC(out, tar, rho):
    true_mean = torch.mean(tar)
    true_variance = torch.var(tar)
    pred_mean = torch.mean(out)
    pred_variance = torch.var(out)
    std_predictions = torch.std(out)
    std_gt = torch.std(tar)
    
    ccc = 2 * rho * std_gt * std_predictions / (
        std_predictions ** 2 + std_gt ** 2 +
        (pred_mean - true_mean) ** 2)
    
    return ccc

def calLoss(outputs, targets):
    out_a = outputs[:,0]
    out_v = outputs[:,1]
    tar_a = targets[:,0]
    tar_v = targets[:,1]
    
    rho_a = pearsonr(out_a, tar_a)
    rho_v = pearsonr(out_v, tar_v)
    
    ccc_a = calCCC(out_a,tar_a,rho_a)
    ccc_v = calCCC(out_v,tar_v,rho_v)
    
    ccc_all = -(ccc_a+ccc_v)
    return ccc_all


# Train 

In [9]:
criterion = torch.nn.MSELoss()
def train(train_loader, model, criterion, optimizer, epoch):
    model.train()
    
    train_loss = 0
    correct = 0
    total = 0
    batch_idx = 0
    
    for i, (inputs_v, inputs_a, targets, _) in enumerate(train_loader):
        
        optimizer.zero_grad()
        
        if use_cuda:
            inputs_v, inputs_a, targets = inputs_v.cuda(), inputs_a.cuda(), targets.cuda(async=True)

        inputs_v = torch.autograd.Variable(inputs_v)
        inputs_a = torch.autograd.Variable(inputs_a)
        targets = torch.autograd.Variable(targets)
        
        inputs_v = inputs_v.view((-1,3)+inputs_v.size()[-2:])
        inputs_a = inputs_a.view((-1,2)+inputs_a.size()[-2:])
        
        outputs = model(inputs_v, inputs_a)
#         print 'here',outputs.shape, targets.shape
        
        loss = calLoss(outputs, targets)
#         loss = criterion(outputs,targets)
        
#         lossd = loss.data[0]        
        loss.backward()
        optimizer.step()
        
        #tsn uses clipping gradient
        if gd is not None:
            total_norm = clip_grad_norm(model.parameters(),gd)
            if total_norm > gd:
                print('clippling gradient: {} with coef {}'.format(total_norm, gd/total_norm))
                
        train_loss += loss.data[0]
        
        if i % print_freq == 0:
            printoneline(dt(),'Epoch=%d Loss=%.4f\n'
                % (epoch,train_loss/(batch_idx+1)))
        batch_idx += 1

In [10]:
def validate(val_loader, model, epoch):
    model.eval()
    
    err_arou = 0.0
    err_vale = 0.0
    
    out_name = 'results/joint_ccc2_%d.csv'%epoch
    txt_result = open(out_name, 'w')
    txt_result.write('video,utterance,arousal,valence\n')
    for (inputs_v, inputs_a, targets,(vid, utter)) in val_loader:
#         print inputs.shape
        if use_cuda:
            inputs_v, inputs_a, targets = inputs_v.cuda(), inputs_a.cuda(), targets.cuda()
        
        inputs_v = torch.autograd.Variable(inputs_v)
        inputs_a = torch.autograd.Variable(inputs_a)
        targets = torch.autograd.Variable(targets)
        
        
        inputs_v = inputs_v.view((-1,3)+inputs_v.size()[-2:])
        inputs_a = inputs_a.view((-1,2)+inputs_a.size()[-2:])
        
#         try:
        outputs = model(inputs_v, inputs_a)
#         except:
#             print 'here',inputs_v.shape, inputs_a.shape
        
        for i in range(len(vid)):
#             name = img_name[i].replace('/home/m2a03/Work/sphereface_pytorch/data/OMG_Aligned/', '')
            out = outputs
#             print vid[0], utter[0]
            txt_result.write('%s,%s.mp4,%f,%f\n'%(vid[i], utter[i],out[i][0],out[i][1]))
    
#     print('MSE of arousal: %f' % (err_arou / len(val_loader.dataset)))
#     print('MSE of valence: %f' % (err_vale / len(val_loader.dataset)))
#     print('MSE of total: %f' % ((err_arou + err_vale) / len(val_loader.dataset)))
    txt_result.close()
    
    arouCCC, valeCCC = calculateCCC('./results/omg_ValidationVideos.csv',out_name)
    return (arouCCC,valeCCC)

In [11]:
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)    
best_arou_ccc, best_vale_ccc = validate(val_loader, model, 0)
for epoch in range(n_epoch):
    if epoch in lr_steps:
        lr *= 0.1
        optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)    

    train(train_loader, model, criterion, optimizer, epoch)
#     save_model(model, './results/epoch_lstm_{}.pth'.format(epoch))
    
    # evaluate on validation set
    if (epoch+1)%eval_freq == 0 or epoch == n_epoch-1:
        arou_ccc, vale_ccc = validate(val_loader, model, epoch)
        
        if (arou_ccc+vale_ccc) > (best_arou_ccc + best_vale_ccc):
            best_arou_ccc = arou_ccc
            best_vale_ccc = vale_ccc
            save_model(model,'./pth/joint_ccc2_{}_{}_{}.pth'.format(epoch, round(arou_ccc,4), round(vale_ccc,4)))
            

Arousal CCC:  0.000729248005929
Arousal Cor:  0.0640982519015
Arousal MSE:  0.251539628394
Valence CCC:  0.0127693192139
Valence Cor:  0.251068486037
Valence MSE:  0.177341878108
Total CCC:    0.0134985672198
09:36:04 Epoch=0 Loss=-0.0314
clippling gradient: 30.4845379443 with coef 0.656070301493
09:37:48 Epoch=0 Loss=-1.1690
09:39:33 Epoch=0 Loss=-1.3460
09:41:18 Epoch=0 Loss=-1.4052
09:43:03 Epoch=0 Loss=-1.4359
09:43:13 Epoch=1 Loss=-1.3830
09:44:57 Epoch=1 Loss=-1.6308
09:46:42 Epoch=1 Loss=-1.6201
09:48:27 Epoch=1 Loss=-1.5857
09:50:13 Epoch=1 Loss=-1.5784
09:50:21 Epoch=2 Loss=-1.6663
09:52:06 Epoch=2 Loss=-1.6135
09:53:51 Epoch=2 Loss=-1.6102
09:55:37 Epoch=2 Loss=-1.5948
clippling gradient: 23.6218730917 with coef 0.84667290872
09:57:22 Epoch=2 Loss=-1.5883
Arousal CCC:  0.214644260994
Arousal Cor:  0.230688168067
Arousal MSE:  0.0623285554297
Valence CCC:  0.447472218107
Valence Cor:  0.449795844191
Valence MSE:  0.113793266274
Total CCC:    0.662116479101
09:58:40 Epoch=3 Los

12:11:43 Epoch=18 Loss=-1.8460
12:11:54 Epoch=19 Loss=-1.8292
clippling gradient: 24.7488057294 with coef 0.808119802575
12:13:38 Epoch=19 Loss=-1.8651
clippling gradient: 21.5657073996 with coef 0.927398282348
clippling gradient: 47.8229759397 with coef 0.418209022065
12:15:24 Epoch=19 Loss=-1.8569
clippling gradient: 35.807898257 with coef 0.558535992715
12:17:09 Epoch=19 Loss=-1.8567
clippling gradient: 20.4691240439 with coef 0.977081381553
clippling gradient: 21.5626914418 with coef 0.927527996866
clippling gradient: 20.0374105155 with coef 0.998132966558
clippling gradient: 24.7778407161 with coef 0.807172837583
12:18:54 Epoch=19 Loss=-1.8542
clippling gradient: 62.9356708426 with coef 0.317784806807
12:19:04 Epoch=20 Loss=-1.9261
12:20:49 Epoch=20 Loss=-1.8584
clippling gradient: 26.8401693642 with coef 0.745151780848
12:22:34 Epoch=20 Loss=-1.8535
clippling gradient: 27.6023867927 with coef 0.724575021364
12:24:19 Epoch=20 Loss=-1.8541
clippling gradient: 25.1199949887 with coe