In [1]:
import sys
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable


import matplotlib.pyplot as plt
import torch 
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from tqdm import tqdm_notebook as tqdm
from torch.utils.data import Dataset, DataLoader
import h5py  
import numpy as np
import os 
from scipy.misc import imresize
import cv2
import random
import soundfile as sf

In [2]:
class AudioDataset(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, train, frames_len=40, transform=None, h5_file='/media/jeff/Backup/CS598PS/data_2682.h5', transform_label=None):
        """
        Args:
            train (bool): Whether or not to use training data
            frames (int): Number of video frames per video sample
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.train = train
        self.transform = transform
        self.frames_len = frames_len
        
        dataset = h5py.File(h5_file)
        if self.train:
            self.videos_train = np.array(dataset['videos_train'])
            self.sounds_train = np.array(dataset['sounds_train'])
        else:
            self.videos_test = np.array(dataset['videos_test'])
            self.sounds_test = np.array(dataset['sounds_test'])
        dataset.close()
        
    def __len__(self):
        if self.train:
            return len(self.videos_train)
        return len(self.videos_test)

    def __getitem__(self, idx):
        if self.train:
            image = self.videos_train[idx]
            audio = self.sounds_train[idx]
        else:
            image = self.videos_test[idx]
            audio = self.sounds_test[idx]

        # Randomly sample 4 seconds from 10 second clip
        if random.random() < 0.5:
            start = random.randint(0,10) # Start frame
        else:
            start = random.randint(50,60)
        new_image = np.zeros((self.frames_len,256,256,1), dtype=np.uint8)
        for i in range(self.frames_len):
            new_image[i] = np.expand_dims(image[start+i],2)
        
        # Randomly align or misalign audio sample
        if random.random() < 0.5: # align
            audio = audio[int(start*220500/100.0):int(start*220500/100.0)+88200]
            label = 0
        else: # misalign
            if start < 30: # Add shift
                shift = random.randint(20, 60-start) # frame shift amount
#                 start = np.clip(start+shift, 0, 100-self.frames_len)
                start = start+shift
            else: # Subtract shift
                shift = random.randint(20, start) # frame shift amount
#                 start = np.clip(start-shift, 0, 100-self.frames_len)
                start = start-shift
            audio = audio[int(start*220500/100.0):int(start*220500/100.0)+88200]
            label = 1
            
        transform_image = np.zeros((self.frames_len,1,224,224))
        if self.transform:
            for i in range(self.frames_len):
                transform_image[i] = self.transform(new_image[i]) # Transform image frames
        
        return (transform_image, audio, label)

In [3]:
import sys
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

class Block2(nn.Module):
    expansion = 1

    def __init__(self, in_channels, out_channels, kernel_size, stride, downsample=None):
        super(Block2, self).__init__()
        self.out_channels = out_channels
        self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size, stride, padding=0, dilation=1, groups=1, bias=True)
        self.bn1 = nn.BatchNorm1d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv1d(out_channels, out_channels, kernel_size=1, stride=1)
        self.bn2 = nn.BatchNorm1d(out_channels)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out

class Block3(nn.Module):
    expansion = 1

    def __init__(self, in_channels, out_channels, kernel_size=(1,1,1), stride=1, downsample=None, padding=0):
        super(Block3, self).__init__()
        self.conv1 = nn.Conv3d(in_channels, out_channels, kernel_size, stride, padding=padding, dilation=1, groups=1, bias=True)
        self.bn1 = nn.BatchNorm3d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv3d(out_channels, out_channels, kernel_size=(1,1,1), stride=1)
        self.bn2 = nn.BatchNorm3d(out_channels)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        
        if self.downsample is not None:
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)

        return out

def Linear(in_features, out_features, dropout=0.):
    m = nn.Linear(in_features, out_features)
    m.weight.data.normal_(mean=0, std=math.sqrt((1 - dropout) / in_features))
    m.bias.data.zero_()
    return nn.utils.weight_norm(m)

class alignment(nn.Module):
    def __init__(self):
        super(alignment, self).__init__()
        """Sound Features"""
        self.conv1_1 = nn.Conv1d(2, 64, 65, stride=4, padding=0, dilation=1, groups=1, bias=True)
        self.pool1_1 = nn.MaxPool1d(4, stride=4)

        self.s_net_1 = self._make_layer(Block2, 64, 128, 15, 4, 1)
        self.s_net_2 = self._make_layer(Block2, 128, 128, 15, 4, 1)
        self.s_net_3 = self._make_layer(Block2, 128, 256, 15, 4, 1)
        
        self.pool1_2 = nn.MaxPool1d(3, stride=3)
        self.conv1_2 = nn.Conv1d(256, 128, 3, stride=1, padding=0, dilation=1, groups=1, bias=True)
        
        """Image Features"""
        self.conv3_1 = nn.Conv3d(1, 64, (5,7,7), (2,2,2), padding=(2,3,3), dilation=1, groups=1, bias=True)
        self.pool3_1 = nn.MaxPool3d((1,3,3), (1,2,2), padding=(0,1,1))
        self.im_net_1 = self._make_layer(Block3, 64, 64, (3,3,3), (2,2,2), 2)

        """Fuse Features"""
        self.fractional_maxpool = nn.FractionalMaxPool2d((3,1), output_size=(10, 1))
        self.conv3_2 = nn.Conv3d(192, 512, (1, 1, 1))
        self.conv3_3 = nn.Conv3d(512, 128, (1, 1, 1))
        self.joint_net_1 = self._make_layer(Block3, 128, 128, (3,3,3), (2,2,2), 2)
        self.joint_net_2 = self._make_layer(Block3, 128, 256, (3,3,3), (1,2,2), 2)
        self.joint_net_3 = self._make_layer(Block3, 256, 512, (3,3,3), (1,2,2), 2)

        #TODO: Global avg pooling, fc and sigmoid
        self.fc = Linear(512,2)

    def _make_layer(self, block, in_channels, out_channels, kernel_size, stride, blocks):
        downsample = None
        if stride != 1 or in_channels != out_channels * block.expansion:
            if isinstance(kernel_size, int):
                downsample = nn.Sequential(
                    nn.Conv1d(in_channels, out_channels * block.expansion, kernel_size, stride),
                    nn.BatchNorm1d(out_channels * block.expansion),
                )
                layers = []
                layers.append(block(in_channels, out_channels, kernel_size, stride, downsample))
            else:
                downsample = nn.Sequential(
                    nn.Conv3d(in_channels, out_channels * block.expansion, kernel_size, stride, padding=1),
                    nn.BatchNorm3d(out_channels * block.expansion),
                )
                layers = []
                layers.append(block(in_channels, out_channels, kernel_size, stride, downsample, padding=1))

        
        for _ in range(1, blocks):
            layers.append(block(out_channels, out_channels))

        return nn.Sequential(*layers)

    def forward(self, sounds, images):
        batchsize = sounds.shape[0]
        sounds = sounds.view(batchsize, 2, -1)
        _, num, _, xd, yd, = images.shape
        images = images.view(batchsize, 1, num, xd, yd)
        
        out_s = self.conv1_1(sounds)
        out_s = self.pool1_1(out_s)

        out_s = self.s_net_1(out_s)
        out_s = self.s_net_2(out_s)
        out_s = self.s_net_3(out_s)

        out_s = self.pool1_2(out_s)
        out_s = self.conv1_2(out_s)
        
        out_im = self.conv3_1(images)
        out_im = self.pool3_1(out_im)
        out_im = self.im_net_1(out_im)

        #tile audio, concatenate channel wise
        out_s = self.fractional_maxpool(out_s.unsqueeze(3)) # Reduce dimension from 25 to 8
        out_s = out_s.squeeze(3).view(-1, 1, 1).repeat(1, 28, 28).view(-1,128,10,28,28) # Tile
        out_joint = torch.cat((out_s, out_im),1)
        out_joint = self.conv3_2(out_joint)
        out_joint = self.conv3_3(out_joint)
        out_joint = self.joint_net_1(out_joint)
        out_joint = self.joint_net_2(out_joint)
        out_joint = self.joint_net_3(out_joint)
        feature_maps = out_joint
        """Global Average Pooling"""
        out_joint = F.avg_pool3d(out_joint, kernel_size=out_joint.size()[2:]).view(batchsize,-1)
#         out_joint = out_joint.view(batchsize, 512, -1).mean(2)
        out_joint = self.fc(out_joint)
        out_joint = torch.sigmoid(out_joint)
        return out_joint, feature_maps

In [4]:
os.environ["CUDA_VISIBLE_DEVICES"] = "4"

transform = transforms.Compose([
transforms.ToPILImage(),
# transforms.RandomHorizontalFlip(),
transforms.RandomCrop(224),
transforms.ToTensor()])

train_dataset = AudioDataset(train=True,transform=transform)
test_dataset = AudioDataset(train=False,transform=transform)

train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=16, shuffle=True, num_workers=4)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=16, shuffle=False, num_workers=4)

model_align = alignment().cuda()
checkpoint = torch.load("fixed_500.pth")
model_align.load_state_dict(checkpoint.state_dict())

# Training

In [5]:
loss_fn = nn.CrossEntropyLoss()
optimizer_align = optim.Adam(model_align.parameters(), lr = 1e-5)
for epoch in range(150):
    accs = []
    losses = []
    model_align.train()
    for batch_idx, (images, sounds, labels) in enumerate(train_loader):
        images_v = Variable(images.type(torch.FloatTensor)).cuda()
        sounds_v = Variable(sounds.type(torch.FloatTensor)).cuda()
        labels_v = Variable(labels).cuda()
        
        optimizer_align.zero_grad()
        aligned_res, _ = model_align(sounds_v, images_v)
        loss = loss_fn(aligned_res, labels_v)
        loss.backward()
        optimizer_align.step()
        losses.append(loss.item())
        accs.append(np.mean((torch.argmax(aligned_res,1) == labels_v).detach().cpu().numpy()))
    print("Epoch :", epoch, np.mean(losses), np.mean(accs))
    if (epoch + 1)%25 == 0:
        accs = []
        losses = []
        model_align.eval()
        for batch_idx, (images, sounds, labels) in enumerate(test_loader):
            with torch.no_grad():
                images_v = Variable(images.type(torch.FloatTensor)).cuda()
                sounds_v = Variable(sounds.type(torch.FloatTensor)).cuda()
                labels_v = Variable(labels).cuda()
                aligned_res, _ = model_align(sounds_v, images_v)
                loss = loss_fn(aligned_res, labels_v)
                losses.append(loss.item())
                accs.append(np.mean((torch.argmax(aligned_res,1) == labels_v).detach().cpu().numpy()))
        print("Validation :", epoch, np.mean(losses), np.mean(accs))
torch.save(model_align, 'larger_150.pth')

('Epoch :', 0, 0.7115155496904927, 0.5221774193548387)
('Epoch :', 1, 0.7018449652579523, 0.48185483870967744)
('Epoch :', 2, 0.6940048048573155, 0.49798387096774194)
('Epoch :', 3, 0.6957579332013284, 0.4737903225806452)
('Epoch :', 4, 0.6932148395046112, 0.5181451612903226)
('Epoch :', 5, 0.6967367337596032, 0.5)
('Epoch :', 6, 0.6955373383337452, 0.5141129032258065)
('Epoch :', 7, 0.6927853841935435, 0.48185483870967744)
('Epoch :', 8, 0.6944674080418002, 0.5020161290322581)
('Epoch :', 9, 0.6935403904607219, 0.4879032258064516)
('Epoch :', 10, 0.6943451569926354, 0.5120967741935484)
('Epoch :', 11, 0.6919757877626727, 0.5342741935483871)
('Epoch :', 12, 0.6982282303994701, 0.42338709677419356)
('Epoch :', 13, 0.6933796328883017, 0.5020161290322581)
('Epoch :', 14, 0.6932195559624703, 0.4838709677419355)
('Epoch :', 15, 0.6938723575684332, 0.5342741935483871)
('Epoch :', 16, 0.6937995591471272, 0.4899193548387097)
('Epoch :', 17, 0.6955377920981376, 0.4637096774193548)
('Epoch :', 1

('Epoch :', 143, 0.6946594388254227, 0.5100806451612904)
('Epoch :', 144, 0.6923031326263182, 0.4939516129032258)
('Epoch :', 145, 0.6958170686998675, 0.4576612903225806)
('Epoch :', 146, 0.6938704540652614, 0.4737903225806452)
('Epoch :', 147, 0.693901740735577, 0.5141129032258065)
('Epoch :', 148, 0.6934036939374862, 0.5020161290322581)
('Epoch :', 149, 0.6939269438866646, 0.5262096774193549)
('Validation :', 149, 0.6925579538712134, 0.532051282051282)
('Epoch :', 150, 0.6922417763740786, 0.5221774193548387)
('Epoch :', 151, 0.6913459031812607, 0.4939516129032258)
('Epoch :', 152, 0.6943096307016188, 0.5161290322580645)
('Epoch :', 153, 0.6915599799925282, 0.4879032258064516)
('Epoch :', 154, 0.6941882045038285, 0.5100806451612904)
('Epoch :', 155, 0.6941225663308175, 0.5100806451612904)
('Epoch :', 156, 0.6928735317722443, 0.5080645161290323)
('Epoch :', 157, 0.6932165007437429, 0.5040322580645161)
('Epoch :', 158, 0.6928886367428687, 0.4637096774193548)
('Epoch :', 159, 0.694068895

# Feature Map Visualization

In [7]:
def activation(feature_map, weights, label):
    output = np.zeros((224,224))
    for i in range(512):
        output += imresize(feature_map[i], (224,224))*weights[label,i]
    return output

In [8]:
for name, param in model_align.state_dict().items():
    if name =='fc.weight_v':
        weight = param

In [9]:
images, sounds, labels = train_dataset[37]
images_v = Variable(torch.tensor(images)).type(torch.FloatTensor).cuda().unsqueeze(0)
sounds_v = Variable(torch.tensor(sounds)).type(torch.FloatTensor).cuda().unsqueeze(0)
labels_v = Variable(torch.tensor(labels)).cuda().unsqueeze(0)
aligned_res, feature_maps = model_align(sounds_v, images_v)

In [None]:
output = activation(feature_maps[0,:,0].detach().cpu().numpy(), weight.detach().cpu().numpy(),0)

In [None]:
plt.imshow(output, cmap='gray')
plt.show()
plt.imshow(images[0,0], cmap='gray')

In [71]:
np.unique(output)

array([-367.36499567, -367.35861125, -367.14408086, ...,    8.636785  ,
          8.67113671,    8.9101118 ])