In [10]:
import sys
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

import matplotlib.pyplot as plt
import torch 
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from tqdm import tqdm_notebook as tqdm
from torch.utils.data import Dataset, DataLoader
import h5py  
import numpy as np
import os, sys
from scipy.misc import imresize
import cv2
import random
import soundfile as sf
import category_getter

In [2]:
class AudioDataset(Dataset):
    """Video/audio clips paired with a random aligned-vs-misaligned label.

    Every item is a window of ``frames_len`` frames plus an 88200-sample
    audio slice.  With probability 0.5 the audio slice starts at the same
    position as the frame window (label 0); otherwise the audio start is
    shifted by at least 20 frames relative to the window (label 1).
    Sampling uses the global ``random`` module, so repeated indexing of the
    same ``idx`` yields different windows and labels.
    """

    def __init__(self, train, frames_len=40, transform=None, h5_file='/media/jeff/Backup/CS598PS/data_2682.h5', transform_label=None):
        """
        Args:
            train (bool): Whether or not to use training data
            frames_len (int): Number of video frames per video sample
            transform (callable, optional): Optional transform applied to
                each (256, 256, 1) uint8 frame; its result is stored in a
                (1, 224, 224) slot of the returned clip.
            h5_file (str): HDF5 file providing ``videos_train``/``sounds_train``
                and ``videos_test``/``sounds_test``.
            transform_label: Unused; kept so existing callers keep working.
        """
        self.train = train
        self.transform = transform
        self.frames_len = frames_len

        split = 'train' if self.train else 'test'
        # The context manager closes the file even when a dataset key is
        # missing; np.array() copies the data into memory before that.
        with h5py.File(h5_file, 'r') as dataset:
            videos = np.array(dataset['videos_' + split])
            sounds = np.array(dataset['sounds_' + split])

        # Attribute names are kept per split for backward compatibility.
        if self.train:
            self.videos_train = videos
            self.sounds_train = sounds
        else:
            self.videos_test = videos
            self.sounds_test = sounds

    def __len__(self):
        """Return the number of clips in the selected split."""
        if self.train:
            return len(self.videos_train)
        return len(self.videos_test)

    def __getitem__(self, idx):
        """Sample one (frames, audio, label) triple from clip ``idx``.

        Returns:
            tuple: ``(transform_image, audio, label)`` where
            ``transform_image`` is a float array of shape
            ``(frames_len, 1, 224, 224)``, ``audio`` is an 88200-sample slice
            of the clip's sound, and ``label`` is 0 (aligned) or
            1 (misaligned).
        """
        if self.train:
            image = self.videos_train[idx]
            audio = self.sounds_train[idx]
        else:
            image = self.videos_test[idx]
            audio = self.sounds_test[idx]

        # Pick the first frame of the window from either end of the clip.
        if random.random() < 0.5:
            start = random.randint(0,10) # Start frame
        else:
            start = random.randint(50,60)
        new_image = np.zeros((self.frames_len,256,256,1), dtype=np.uint8)
        for i in range(self.frames_len):
            new_image[i] = np.expand_dims(image[start+i],2)

        # Randomly align or misalign the audio with the frame window.
        if random.random() < 0.5: # align
            audio_start = start
            label = 0
        else: # misalign
            if start < 30: # Add shift
                shift = random.randint(20, 60-start) # frame shift amount
                audio_start = start+shift
            else: # Subtract shift
                shift = random.randint(20, start) # frame shift amount
                audio_start = start-shift
            label = 1

        # Convert the frame index to a sample offset (220500 / 100 samples
        # per frame index) and cut 88200 samples from there.
        offset = int(audio_start*220500/100.0)
        audio = audio[offset:offset+88200]

        # NOTE(review): when no transform is given the frames are never
        # copied, so the returned clip stays all zeros -- confirm whether a
        # transform should be mandatory.
        transform_image = np.zeros((self.frames_len,1,224,224))
        if self.transform:
            for i in range(self.frames_len):
                transform_image[i] = self.transform(new_image[i]) # Transform image frames

        return (transform_image, audio, label)

In [3]:
import sys
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

class Block2(nn.Module):
    """1-D residual unit: strided conv -> BN -> ReLU -> 1x1 conv -> BN, plus shortcut.

    The shortcut is the input itself, or ``downsample(input)`` when a
    ``downsample`` module is supplied; the sum goes through a final ReLU.
    """
    expansion = 1

    def __init__(self, in_channels, out_channels, kernel_size, stride, downsample=None):
        """
        Args:
            in_channels (int): Channels of the incoming signal.
            out_channels (int): Channels produced by both convolutions.
            kernel_size (int): Kernel width of the first (unpadded) convolution.
            stride (int): Stride of the first convolution.
            downsample (nn.Module, optional): Maps the input to the shape of
                the main path so the two can be added.
        """
        super(Block2, self).__init__()
        self.out_channels = out_channels
        # Sub-modules are created in a fixed order so parameter names and
        # initialisation order stay stable for saved checkpoints.
        self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size, stride=stride)
        self.bn1 = nn.BatchNorm1d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv1d(out_channels, out_channels, 1)
        self.bn2 = nn.BatchNorm1d(out_channels)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        shortcut = x if self.downsample is None else self.downsample(x)

        y = self.relu(self.bn1(self.conv1(x)))
        y = self.bn2(self.conv2(y))

        y += shortcut
        return self.relu(y)

class Block3(nn.Module):
    """3-D residual unit: conv -> BN -> ReLU -> 1x1x1 conv -> BN, plus shortcut.

    The shortcut is the input itself, or ``downsample(input)`` when a
    ``downsample`` module is supplied; the sum goes through a final ReLU.
    """
    expansion = 1

    def __init__(self, in_channels, out_channels, kernel_size=(1,1,1), stride=1, downsample=None, padding=0):
        """
        Args:
            in_channels (int): Channels of the incoming volume.
            out_channels (int): Channels produced by both convolutions.
            kernel_size (tuple): Kernel of the first convolution.
            stride (int or tuple): Stride of the first convolution.
            downsample (nn.Module, optional): Maps the input to the shape of
                the main path so the two can be added.
            padding (int or tuple): Padding of the first convolution.
        """
        super(Block3, self).__init__()
        # Creation order is kept fixed so parameter names and initialisation
        # order stay stable for saved checkpoints.
        self.conv1 = nn.Conv3d(in_channels, out_channels, kernel_size, stride=stride, padding=padding)
        self.bn1 = nn.BatchNorm3d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv3d(out_channels, out_channels, (1,1,1))
        self.bn2 = nn.BatchNorm3d(out_channels)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        shortcut = x if self.downsample is None else self.downsample(x)

        y = self.relu(self.bn1(self.conv1(x)))
        y = self.bn2(self.conv2(y))

        y += shortcut
        return self.relu(y)

def Linear(in_features, out_features, dropout=0.):
    """Build a weight-normalised ``nn.Linear`` with a dropout-aware init.

    Weights are drawn from a normal distribution with standard deviation
    ``sqrt((1 - dropout) / in_features)`` and the bias starts at zero; the
    layer is then wrapped with ``nn.utils.weight_norm``.

    Args:
        in_features (int): Size of each input sample.
        out_features (int): Size of each output sample.
        dropout (float): Dropout rate used only to scale the init std.

    Returns:
        nn.Linear: The initialised, weight-normalised layer.
    """
    layer = nn.Linear(in_features, out_features)
    init_std = math.sqrt((1 - dropout) / in_features)
    layer.weight.data.normal_(mean=0, std=init_std)
    layer.bias.data.zero_()
    return nn.utils.weight_norm(layer)

class alignment(nn.Module):
    """Two-stream audio/video network scoring whether sound and frames are aligned.

    A 1-D convolutional branch encodes the 2-channel waveform and a 3-D
    convolutional branch encodes the frame stack.  The audio features are
    pooled to 10 time steps, tiled over a 28x28 grid, concatenated with the
    image features and passed through three residual 3-D stages, global
    average pooling and a 2-way linear layer.
    """

    def __init__(self):
        super(alignment, self).__init__()
        # Layer creation order and attribute names are kept fixed so that
        # existing checkpoints keep loading by name.

        # Sound features
        self.conv1_1 = nn.Conv1d(2, 64, 65, stride=4, padding=0, dilation=1, groups=1, bias=True)
        self.pool1_1 = nn.MaxPool1d(4, stride=4)

        self.s_net_1 = self._make_layer(Block2, 64, 128, 15, 4, 1)
        self.s_net_2 = self._make_layer(Block2, 128, 128, 15, 4, 1)
        self.s_net_3 = self._make_layer(Block2, 128, 256, 15, 4, 1)

        self.pool1_2 = nn.MaxPool1d(3, stride=3)
        self.conv1_2 = nn.Conv1d(256, 128, 3, stride=1, padding=0, dilation=1, groups=1, bias=True)

        # Image features
        self.conv3_1 = nn.Conv3d(1, 64, (5,7,7), (2,2,2), padding=(2,3,3), dilation=1, groups=1, bias=True)
        self.pool3_1 = nn.MaxPool3d((1,3,3), (1,2,2), padding=(0,1,1))
        self.im_net_1 = self._make_layer(Block3, 64, 64, (3,3,3), (2,2,2), 2)

        # Fuse features: 128 audio + 64 image channels = 192 input channels.
        # NOTE(review): per the PyTorch docs FractionalMaxPool2d picks its
        # pooling regions pseudo-randomly -- confirm this is wanted in eval.
        self.fractional_maxpool = nn.FractionalMaxPool2d((3,1), output_size=(10, 1))
        self.conv3_2 = nn.Conv3d(192, 512, (1, 1, 1))
        self.conv3_3 = nn.Conv3d(512, 128, (1, 1, 1))
        self.joint_net_1 = self._make_layer(Block3, 128, 128, (3,3,3), (2,2,2), 2)
        self.joint_net_2 = self._make_layer(Block3, 128, 256, (3,3,3), (1,2,2), 2)
        self.joint_net_3 = self._make_layer(Block3, 256, 512, (3,3,3), (1,2,2), 2)

        # Classifier head; global average pooling and the sigmoid are
        # applied in forward().
        self.fc = Linear(512,2)

    def _make_layer(self, block, in_channels, out_channels, kernel_size, stride, blocks):
        """Stack ``blocks`` residual units into one ``nn.Sequential``.

        The first unit changes channels/stride; a conv + batch-norm
        projection is attached as its shortcut whenever ``stride != 1`` or
        the channel count changes.  An int ``kernel_size`` selects the 1-D
        layers, anything else the 3-D layers (built with ``padding=1``).
        The remaining ``blocks - 1`` units are ``block(out_channels,
        out_channels)`` with the block's default arguments.

        When no projection is needed the first unit is built with an
        identity shortcut; the caller must then pick a kernel/padding that
        preserves the input shape.

        Args:
            block: Residual unit class exposing an ``expansion`` attribute.
            in_channels (int): Channels entering the stage.
            out_channels (int): Channels leaving the stage.
            kernel_size (int or tuple): Kernel of the first unit.
            stride (int or tuple): Stride of the first unit.
            blocks (int): Total number of units in the stage.

        Returns:
            nn.Sequential: The assembled stage.
        """
        needs_projection = stride != 1 or in_channels != out_channels * block.expansion
        downsample = None
        # Always bind `layers`; it used to be defined only on the projection
        # path, which raised NameError for stride 1 with equal channels.
        layers = []
        if isinstance(kernel_size, int):
            if needs_projection:
                downsample = nn.Sequential(
                    nn.Conv1d(in_channels, out_channels * block.expansion, kernel_size, stride),
                    nn.BatchNorm1d(out_channels * block.expansion),
                )
            layers.append(block(in_channels, out_channels, kernel_size, stride, downsample))
        else:
            if needs_projection:
                downsample = nn.Sequential(
                    nn.Conv3d(in_channels, out_channels * block.expansion, kernel_size, stride, padding=1),
                    nn.BatchNorm3d(out_channels * block.expansion),
                )
            layers.append(block(in_channels, out_channels, kernel_size, stride, downsample, padding=1))

        for _ in range(1, blocks):
            layers.append(block(out_channels, out_channels))

        return nn.Sequential(*layers)

    def forward(self, sounds, images):
        """Score one batch of (audio, frames) pairs.

        Args:
            sounds: Audio tensor whose first dimension is the batch; it is
                viewed as ``(batch, 2, -1)``.
            images: 5-D frame tensor unpacked as
                ``(batch, num_frames, 1, height, width)``.

        Returns:
            tuple: ``(scores, feature_maps)`` -- ``scores`` are the sigmoid
            outputs of the 2-way linear layer, ``feature_maps`` is the output
            of ``joint_net_3`` before global pooling.
        """
        batchsize = sounds.shape[0]
        # NOTE(review): view() reinterprets memory rather than transposing;
        # if the audio arrives as (batch, samples, 2) the two channels end
        # up interleaved -- confirm against the data layout.
        sounds = sounds.view(batchsize, 2, -1)
        _, num, _, xd, yd, = images.shape
        # Moves the singleton channel axis in front of the frame axis.
        images = images.view(batchsize, 1, num, xd, yd)

        out_s = self.conv1_1(sounds)
        out_s = self.pool1_1(out_s)

        out_s = self.s_net_1(out_s)
        out_s = self.s_net_2(out_s)
        out_s = self.s_net_3(out_s)

        out_s = self.pool1_2(out_s)
        out_s = self.conv1_2(out_s)

        out_im = self.conv3_1(images)
        out_im = self.pool3_1(out_im)
        out_im = self.im_net_1(out_im)

        # Pool the audio time axis down to 10 steps (output_size=(10, 1)).
        out_s = self.fractional_maxpool(out_s.unsqueeze(3))
        # Tile every audio value over a 28x28 grid; the hard-coded
        # (128, 10, 28, 28) must match the image branch output, which holds
        # for 40 frames of 224x224 input.
        out_s = out_s.squeeze(3).view(-1, 1, 1).repeat(1, 28, 28).view(-1,128,10,28,28) # Tile
        # Concatenate channel-wise: 128 audio + 64 image channels.
        out_joint = torch.cat((out_s, out_im),1)
        out_joint = self.conv3_2(out_joint)
        out_joint = self.conv3_3(out_joint)
        out_joint = self.joint_net_1(out_joint)
        out_joint = self.joint_net_2(out_joint)
        out_joint = self.joint_net_3(out_joint)
        feature_maps = out_joint
        # Global average pooling over the remaining (time, height, width).
        out_joint = F.avg_pool3d(out_joint, kernel_size=out_joint.size()[2:]).view(batchsize,-1)
        out_joint = self.fc(out_joint)
        # NOTE(review): the notebook feeds these sigmoid outputs to
        # nn.CrossEntropyLoss, which applies log-softmax itself; left as is
        # because the saved checkpoint was presumably trained this way.
        out_joint = torch.sigmoid(out_joint)
        return out_joint, feature_maps

In [4]:
# Expose only GPU 0 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Per-frame preprocessing: array -> PIL image -> random 224x224 crop -> tensor.
# (Random horizontal flipping is intentionally left out.)
frame_steps = [
    transforms.ToPILImage(),
    transforms.RandomCrop(224),
    transforms.ToTensor(),
]
transform = transforms.Compose(frame_steps)

# Only the test split is loaded here.  The training counterparts were:
# train_dataset = AudioDataset(train=True,transform=transform,h5_file='/media/jeff/Backup/CS598PS/data_nice_2597.h5')
# train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=32, shuffle=True, num_workers=4)
test_dataset = AudioDataset(
    train=False,
    transform=transform,
    h5_file='/media/jeff/Backup/CS598PS/data_nice_2597.h5',
)

# One clip per batch, in dataset order.
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=1,
    shuffle=False,
    num_workers=1,
)

In [5]:
# Number of clips in the test split (length of the loaded test videos array).
len(test_dataset)

807

In [6]:
# Build the network, move it to the GPU, then restore the trained weights.
# NOTE(review): torch.load unpickles the file -- only load checkpoints that
# come from a trusted source.
model_align = alignment()
model_align = model_align.cuda()

# The file holds an object exposing state_dict(), not a bare state dict.
checkpoint = torch.load("models/silas_model.pth")
trained_weights = checkpoint.state_dict()
model_align.load_state_dict(trained_weights)

# Evaluation

In [17]:
# The category helper lives in data/process/.  Extend the path first and
# (re-)import here so this cell does not depend on the module having been
# importable when the notebook's top import cell ran.
sys.path.append('data/process/')
import category_getter

cg = category_getter.CategoryGetter("data/process/eval_segments.csv")

loss_fn = nn.CrossEntropyLoss()

model_align.eval()
labels_lst = ['Animal','Channel, environment and background','Human sounds','Music',
             'Natural sounds','Sounds of things','Source-ambiguous sounds']
# Per-category list of per-clip accuracies, accumulated over all passes.
accs_dict = {name: [] for name in labels_lst}

filenames = np.load('/media/jeff/Backup/CS598PS/filenames_nice_test.npy')

# filenames[batch_idx] is only the right clip when the loader yields one
# clip per batch in dataset order; fail fast if that stops being true.
assert test_loader.batch_size == 1, "category lookup assumes batch_size == 1"
assert len(filenames) >= len(test_dataset), "fewer filenames than test clips"

# The dataset draws a new random window and alignment label on every access,
# so each of the 20 passes evaluates a different sample of the test clips.
for i in range(20):
    accs = []
    losses = []
    for batch_idx, (images, sounds, labels) in enumerate(test_loader):
        with torch.no_grad():
            images_v = Variable(images.type(torch.FloatTensor)).cuda()
            sounds_v = Variable(sounds.type(torch.FloatTensor)).cuda()
            labels_v = Variable(labels).cuda()
            aligned_res, _ = model_align(sounds_v, images_v)
            loss = loss_fn(aligned_res, labels_v)
            losses.append(loss.item())

            # Compute the batch accuracy once and reuse it per category.
            batch_acc = np.mean((torch.argmax(aligned_res,1) == labels_v).detach().cpu().numpy())
            accs.append(batch_acc)

            # Drop the last two "_"-separated fields of the file name to
            # obtain the id used for the category lookup.
            filename = str(filenames[batch_idx])
            youtube_id = '_'.join(filename.split("_")[:-2])
            for category_id in cg.get_general_categories_for_video(youtube_id):
                category_name = cg.ontology.get_record_for_id(category_id)["name"]
                accs_dict[category_name].append(batch_acc)
    print("Validation :", np.mean(losses), np.mean(accs))

('Validation :', 0.75242279578909699, 0.49566294919454773)
('Validation :', 0.74918198762773136, 0.49442379182156132)
('Validation :', 0.76109657367365957, 0.48327137546468402)
('Validation :', 0.74426339025686516, 0.50681536555142503)
('Validation :', 0.75176194192013035, 0.5018587360594795)
('Validation :', 0.76335253680062565, 0.50557620817843862)


Process Process-9:
  File "/usr/local/lib/python2.7/dist-packages/torch/multiprocessing/queue.py", line 17, in send
Traceback (most recent call last):
  File "/usr/lib/python2.7/multiprocessing/process.py", line 267, in _bootstrap
    self.run()
  File "/usr/lib/python2.7/multiprocessing/process.py", line 114, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python2.7/dist-packages/torch/utils/data/dataloader.py", line 110, in _worker_loop
    data_queue.put((idx, samples))
  File "/usr/lib/python2.7/multiprocessing/queues.py", line 390, in put
    return send(obj)
    ForkingPickler(buf, pickle.HIGHEST_PROTOCOL).dump(obj)
  File "/usr/lib/python2.7/pickle.py", line 224, in dump
    self.save(obj)
  File "/usr/lib/python2.7/pickle.py", line 286, in save
    f(self, obj) # Call unbound method with explicit self
  File "/usr/lib/python2.7/pickle.py", line 554, in save_tuple
    save(element)


KeyboardInterrupt: 

  File "/usr/lib/python2.7/pickle.py", line 286, in save
    f(self, obj) # Call unbound method with explicit self
  File "/usr/lib/python2.7/pickle.py", line 606, in save_list
    self._batch_appends(iter(obj))
  File "/usr/lib/python2.7/pickle.py", line 639, in _batch_appends
    save(x)
  File "/usr/lib/python2.7/pickle.py", line 286, in save
  File "/usr/lib/python2.7/multiprocessing/forking.py", line 67, in dispatcher
    f(self, obj) # Call unbound method with explicit self
    self.save_reduce(obj=obj, *rv)
  File "/usr/lib/python2.7/pickle.py", line 401, in save_reduce
    save(args)
  File "/usr/lib/python2.7/pickle.py", line 286, in save
    f(self, obj) # Call unbound method with explicit self
  File "/usr/lib/python2.7/pickle.py", line 554, in save_tuple
    save(element)
  File "/usr/lib/python2.7/pickle.py", line 286, in save
    f(self, obj) # Call unbound method with explicit self
  File "/usr/lib/python2.7/multiprocessing/forking.py", line 66, in dispatcher
    rv = re

In [18]:
# Per-category sample count and mean accuracy.  Categories without samples
# report nan explicitly instead of triggering numpy's empty-slice
# RuntimeWarning.
for label in labels_lst:
    category_accs = accs_dict[label]
    mean_acc = np.mean(category_accs) if len(category_accs) > 0 else float('nan')
    print(label, len(category_accs), mean_acc)

('Animal', 25, 0.56000000000000005)
('Channel, environment and background', 342, 0.50877192982456143)
('Human sounds', 778, 0.53084832904884316)
('Music', 4153, 0.49554538887551169)
('Natural sounds', 0, nan)
('Sounds of things', 1628, 0.50368550368550369)
('Source-ambiguous sounds', 167, 0.47904191616766467)


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


# Training Category splits

In [None]:
cg = category_getter.CategoryGetter("unbalanced_train_segments.csv")

# The training loader is not created anywhere else in the notebook (its
# definition is commented out), so build it here.  One clip per batch in
# dataset order is required for filenames[batch_idx] to name the right clip.
# NOTE(review): the HDF5 path is the one from the commented-out training
# setup -- confirm it matches data_nice_filenames.npy.
train_dataset = AudioDataset(train=True, transform=transform,
                             h5_file='/media/jeff/Backup/CS598PS/data_nice_2597.h5')
train_loader = DataLoader(dataset=train_dataset, batch_size=1, shuffle=False, num_workers=1)

loss_fn = nn.CrossEntropyLoss()
model_align.eval()
labels_lst = ['Animal','Channel, environment and background','Human sounds','Music',
             'Natural sounds','Sounds of things','Source-ambiguous sounds']
# Per-category list of per-clip accuracies, accumulated over all passes.
accs_dict = {name: [] for name in labels_lst}

# Load the file names once (this used to sit inside the label loop and was
# re-read for every label).
filenames = np.load('/media/jeff/Backup/CS598PS/data_nice_filenames.npy')
assert len(filenames) >= len(train_dataset), "fewer filenames than training clips"

# The dataset draws a new random window and alignment label on every access,
# so each pass evaluates a different sample of the training clips.
for i in range(5):
    accs = []
    losses = []
    for batch_idx, (images, sounds, labels) in enumerate(train_loader):
        with torch.no_grad():
            images_v = Variable(images.type(torch.FloatTensor)).cuda()
            sounds_v = Variable(sounds.type(torch.FloatTensor)).cuda()
            labels_v = Variable(labels).cuda()
            aligned_res, _ = model_align(sounds_v, images_v)
            loss = loss_fn(aligned_res, labels_v)
            losses.append(loss.item())

            # Compute the batch accuracy once and reuse it per category.
            batch_acc = np.mean((torch.argmax(aligned_res,1) == labels_v).detach().cpu().numpy())
            accs.append(batch_acc)

            # Drop the last two "_"-separated fields of the file name to
            # obtain the id used for the category lookup.
            filename = str(filenames[batch_idx])
            youtube_id = '_'.join(filename.split("_")[:-2])
            for category_id in cg.get_general_categories_for_video(youtube_id):
                category_name = cg.ontology.get_record_for_id(category_id)["name"]
                accs_dict[category_name].append(batch_acc)
    # These numbers are measured on the training split.
    print("Training :", np.mean(losses), np.mean(accs))

In [None]:
# Per-category sample count and mean accuracy.  Categories without samples
# report nan explicitly instead of triggering numpy's empty-slice
# RuntimeWarning.
for label in labels_lst:
    category_accs = accs_dict[label]
    mean_acc = np.mean(category_accs) if len(category_accs) > 0 else float('nan')
    print(label, len(category_accs), mean_acc)