In [1]:
import os
from glob import glob

import numpy as np
import torch
import torchvision
from torch import nn
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
from torchvision.utils import save_image

In [170]:
class voice_face(Dataset):
    """Dataset of voice spectrograms labelled with the data ID parsed from each filename.

    Each item is a pair ``(spectrogram, face_id)`` where ``spectrogram`` is a
    float32 tensor of shape ``(1, D, M)`` (a channel dimension is inserted in
    front of the ``D x M`` matrix read from the csv file) and ``face_id`` is an
    int64 scalar tensor.
    """

    def __init__(self, voice_filenames, standardize=False):
        """
        Preconditions: csv files must contain matrices of the same dimension
        Args:
            voice_filenames (string or list): list of filenames/pathnames of csv files with spectrogram matrices
                                              assumes format voice_{n}_{m}.csv,
                                              where n is the data ID and m is the spectrogram number for that speaker
            standardize (boolean):            whether to standardize the spectrograms
        Raises:
            ValueError: if no filenames are given, if n is not an integer, or
                        if the loaded matrices are not 2-D with identical shapes
        """
        # ensure inputs are lists
        if isinstance(voice_filenames, str):
            voice_filenames = [voice_filenames]
        assert isinstance(voice_filenames, list)
        if not voice_filenames:
            raise ValueError("voice_filenames is empty: no spectrograms to load")

        # load voice spectrograms one by one
        face_IDs = []  # the face IDs associated with each spectrogram
        matrices = []  # the spectrograms
        for v_file in voice_filenames:
            # get n, the data ID; get_n_m returns strings, and a tensor cannot
            # be built from strings, so convert to int here
            n, _ = get_n_m(v_file)
            face_IDs.append(int(n))

            # get spectrogram
            matrix = np.loadtxt(v_file, delimiter=',', dtype=np.float32)
            if standardize:
                centered = matrix - np.mean(matrix)
                spread = np.std(matrix)
                # a constant matrix has zero spread; dividing would yield NaNs
                matrix = centered / spread if spread > 0 else centered
            matrices.append(matrix)

        # construct spectrograms tensor; np.stack copies once (building a
        # tensor from a list of arrays is slow) and raises ValueError if the
        # matrices do not all share one shape
        stacked = torch.from_numpy(np.stack(matrices))
        if stacked.dim() != 3:
            raise ValueError(
                "expected 2-D spectrogram matrices, got per-file shape {}".format(
                    tuple(stacked.shape[1:])))
        self.X = stacked.unsqueeze(1)  # insert channel dimension -> (N, 1, D, M)

        # construct face_IDs tensor
        self.y = torch.tensor(face_IDs)

        assert self.X.shape[0] == self.y.shape[0]

    def __len__(self):
        """Number of spectrograms in the dataset."""
        return self.y.shape[0]

    def __getitem__(self, idx):
        """Return the ``(spectrogram, face_id)`` pair at position ``idx``."""
        return self.X[idx], self.y[idx]

In [171]:
def get_n_m(v_file):
    """Extract the data ID and spectrogram number from a spectrogram filename.

    The filename (with or without a leading directory) is expected to end in
    ``_{n}_{m}`` before its extension, e.g. ``data/voice_12_3.csv``.

    Args:
        v_file (str): filename or pathname of a spectrogram file.

    Returns:
        tuple of (str, str): ``(n, m)`` exactly as they appear in the filename.

    Raises:
        ValueError: if the filename does not contain a prefix, n and m
                    separated by underscores.
    """
    # strip the pathname and the file extension
    stem, _ = os.path.splitext(os.path.basename(v_file))
    # split from the right so underscores inside the prefix are tolerated
    parts = stem.rsplit('_', 2)
    if len(parts) != 3:
        raise ValueError(
            "filename {!r} does not match the pattern <prefix>_<n>_<m>.<ext>".format(v_file))
    _, n, m = parts
    return n, m

In [52]:
# import dataset
# sorted() gives a deterministic file order; glob's order is otherwise arbitrary
voice_filenames = sorted(glob("data/voice_*.csv"))
face_filenames = sorted(glob("data/face_*.csv"))
# the dataset class defined in this notebook is voice_face
dataset = voice_face(voice_filenames, standardize=True)

In [53]:
# split dataset
VALID_SIZE = 2000
TEST_SIZE = 2000
# the training split takes whatever remains, so the three sizes always sum to
# len(dataset) as random_split requires (9099 for a 13099-sample dataset)
TRAIN_SIZE = len(dataset) - VALID_SIZE - TEST_SIZE
assert TRAIN_SIZE > 0, "dataset is too small for the requested validation/test sizes"
train_dataset, validation_dataset, test_dataset = random_split(
    dataset, [TRAIN_SIZE, VALID_SIZE, TEST_SIZE]
)

In [54]:
# batch the training split; shuffle=True reshuffles the samples on every pass
BATCH_SIZE = 20
dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

In [136]:
class autoencoder(nn.Module):
    """Convolutional voice encoder with a linear face-construction head.

    The encoder applies three conv + ReLU + 2x2 max-pool stages followed by a
    final convolution down to one channel, so an input of shape (N, 1, H, W)
    becomes (N, 1, H // 8, W // 8). That map is averaged over its last
    dimension to give the bottleneck vector ``w``, which the bias-free linear
    layer ``B`` maps to a flattened face.

    A transposed-convolution decoder is also constructed, but ``forward``
    does not apply it; only the face branch is active.
    """

    def __init__(self, w_length, face_length):
        """
        w_length: the length of the bottleneck vector i.e. # of basis faces used;
                  must equal the encoder output height (input height // 8)
        face_length: the height * width of the face images
        """
        super(autoencoder, self).__init__()
        self.encoder = nn.ModuleList(
            [
                nn.Conv2d(1, 8, kernel_size=3, stride=1, padding=1),
                nn.ReLU(True),
                nn.MaxPool2d(2, stride=2),
                nn.Conv2d(8, 16, kernel_size=3, stride=1, padding=1),
                nn.ReLU(True),
                nn.MaxPool2d(2, stride=2),
                nn.Conv2d(16, 64, kernel_size=3, stride=1, padding=1),
                nn.ReLU(True),
                nn.MaxPool2d(2, stride=2),
                nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1)
            ]
        )

        # Not used by forward(); kept so the module's parameters and
        # state_dict layout stay unchanged.
        self.decoder = nn.ModuleList(
            [
                nn.ConvTranspose2d(1, 64, kernel_size=3, stride=1, padding=1),
                nn.ReLU(True),
                nn.ConvTranspose2d(64, 16, kernel_size=3, stride=2, padding=(1,0),output_padding = (1,0)),
                nn.ReLU(True),
                nn.ConvTranspose2d(16, 8, kernel_size=3, stride=2, padding=(1,1),output_padding = (1,1)),
                nn.ReLU(True),
                nn.ConvTranspose2d(8, 1, kernel_size=3, stride=2, padding=(1,0), output_padding = (1,0)),
                nn.Tanh()
            ]
        )

        self.w_length = w_length
        self.face_length = face_length
        # columns of B's weight act as the basis faces combined by w
        self.B = nn.Linear(self.w_length, self.face_length, bias=False)

    def forward(self, x):
        """Encode a batch of spectrograms and construct the corresponding faces.

        Args:
            x: tensor of shape (N, 1, H, W).

        Returns:
            (f, w): ``f`` has shape (N, face_length) and ``w`` has shape
            (N, H // 8). ``B`` raises a shape error if H // 8 != w_length.
        """
        # start encoder; call each layer directly (not .forward) so that
        # registered hooks run
        for layer in self.encoder:
            x = layer(x)

        # collapse final feature map into a vector by taking average across time
        N, _, H, _ = x.shape
        w = x.mean(dim=3)
        w = w.view(N, H)

        # face construction
        f = self.B(w)

        return f, w

In [None]:
ALPHA = 1  # weight of the voice reconstruction term relative to the face term


def combined_loss(model_output, labels):
    """Face loss plus ALPHA-weighted voice reconstruction loss.

    model_output: pair (voice_output, face_output) produced by the model
    labels: pair (voice_data, face_id) giving the input voice batch and face IDs

    NOTE(review): `fid` and `face_struct` are not defined in the visible
    cells; this presumably expects `face_struct` to be indexable by face ID
    and `fid` to be a face-similarity function -- confirm before use.
    """
    voice_output, face_output = model_output
    voice_data, face_id = labels

    face_term = fid(face_output, face_struct[face_id])
    voice_term = MSE(voice_output, voice_data)
    return face_term + ALPHA * voice_term

def face_loss(face_outputs, labels):
    """Squared L2 distance between constructed faces and their true faces.

    face_outputs: faces produced by the model for a batch
    labels: face IDs for the batch, used to index the global `face_struct`

    Returns a scalar tensor: the squared 2-norm of the difference taken over
    the whole batch (not averaged).

    NOTE(review): `face_struct` is not defined in the visible cells --
    confirm it is a tensor whose rows are the true faces.
    """
    true_faces = face_struct[labels]
    # use the face_outputs argument; the name `face_output` only ever resolved
    # to a leaked notebook global
    return torch.dist(face_outputs, true_faces, p=2) ** 2

In [137]:
NUM_EPOCHS = 10
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-5

# autoencoder requires both sizes; calling it with no arguments is a TypeError.
# The encoder halves the spectrogram height three times (three 2x2 max-pools)
# and ends with one channel, so the bottleneck length is height // 8.
W_LENGTH = dataset[0][0].shape[1] // 8
# The losses compare model faces against face_struct[face_id], so the face
# length is taken from face_struct's last dimension.
# NOTE(review): face_struct is not defined in the visible cells -- confirm it
# is a (num_faces, face_length) tensor built before this cell runs.
FACE_LENGTH = face_struct.shape[-1]

model = autoencoder(W_LENGTH, FACE_LENGTH)
MSE = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

In [110]:
# for data in dataloader:
#     temp = data
#     break

In [111]:
# temp.shape

torch.Size([20, 1, 40, 173])

In [None]:
for epoch in range(NUM_EPOCHS):
    for voice, face_id in dataloader:
        # ===================forward=====================
        # the model returns (face, bottleneck); the voice decoder is not run,
        # so only the face loss applies
        face_output, _ = model(voice)
        loss = face_loss(face_output, face_id)
        # ===================backward====================
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # ===================log========================
    # reports the loss of the last batch of every 10th epoch
    if (epoch + 1) % 10 == 0:
        print('epoch [{}/{}], loss:{:.4f}'
              .format(epoch + 1, NUM_EPOCHS, loss.item()))

torch.save(model.state_dict(), './model_state.pth')

In [None]:
# TODO: check that the network's output has the same size as its input
# TODO: use FID to measure similarity between faces