In [1]:
import numpy as np
import torch
import torchvision
from torch import nn
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
from torchvision.utils import save_image
from glob import glob

In [60]:
class VoiceMFCC(Dataset):
    """Voice MFCC spectra dataset."""

    def __init__(self, csv_files, standardize=False):
        """
        Preconditions: csv files must contain matrices of the same dimension
        Args:
            csv_files (string or list): list of filenames/pathnames of csv files 
                                        with MFCC spectrogram matrices
            transform (callable, optional): Optional transform to be applied
                                            on a sample.
        """
        # ensure csv_files is a list
        if type(csv_files) == str:
            csv_files = [csv_files]
        
        # load csv files with the MFCC spectra into a 3D tensor
        matrices = []
        for f in csv_files:
            matrix = np.loadtxt(f, delimiter=',', dtype=np.float32)
            if standardize:
                matrix = (matrix - np.mean(matrix)) / np.std(matrix)
            matrices.append(matrix)
        self.X = torch.Tensor(matrices)
        N, D, M = self.X.shape
        self.X = self.X.view(N, 1, D, M) # THIS LINE DIDNT SEEM TO FIX OUR PROBLEM...
        
    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        sample = self.X[idx]

        return sample
#     def __iter__(self, idx):
#         sample = self.X[idx]

#         return sample

In [3]:
# import dataset
csv_files = glob("../train_mfcc/*.csv")
dataset = VoiceMFCC(csv_files, standardize=True)

In [59]:
dataset = VoiceMFCC(csv_files[:10], standardize=True)

In [57]:
# split dataset
TRAIN_SIZE = 9099
VALID_SIZE = 2000
TEST_SIZE = 2000
# TRAIN_SIZE = 100
# VALID_SIZE = 10999
# TEST_SIZE = 2000
train_dataset, validation_dataset, test_dataset = torch.utils.data.random_split(dataset, 
                                                                                [TRAIN_SIZE, 
                                                                                 VALID_SIZE, 
                                                                                 TEST_SIZE])

ValueError: Sum of input lengths does not equal the length of the input dataset!

In [46]:
# pipe data through a dataloader for batching
BATCH_SIZE = 20
dataloader = DataLoader(train_dataset, shuffle=True) # batch_size=BATCH_SIZE

In [12]:
# class autoencoder(nn.Module):
#     def __init__(self):
#         super(autoencoder, self).__init__()
#         self.encoder = nn.Sequential(
#             nn.Conv2d(1, 8, 3, stride=3, padding=1), 
#             nn.ReLU(True),
#             nn.MaxPool2d(2, stride=2), 
#             nn.Conv2d(8, 16, 3, stride=2, padding=1),
#             nn.ReLU(True),
#             nn.MaxPool2d(2, stride=2) ,
#             nn.Conv2d(16, 64, 3, stride=2, padding=1),
#             nn.ReLU(True),
#             nn.MaxPool2d(2, stride=2),
#             nn.Conv2d(64, 128, 3, stride=2, padding=1) 
#             #nn.ReLU(True)
#             #nn.MaxPool2d(2, stride=2)
#         )
        
#         self.decoder = nn.Sequential(
#             nn.ConvTranspose2d(128, 64, 3, stride=2), 
#             nn.ReLU(True),
#             nn.ConvTranspose2d(64, 16, 3, stride=2, padding=1), 
#             nn.ReLU(True),
#             nn.ConvTranspose2d(16, 8, 3, stride=2, padding=1),
#             nn.ReLU(True),
#             nn.ConvTranspose2d(8, 1, 3, stride=3, padding=1),
#             nn.Tanh()
#         )

#     def forward(self, x):
#         for i in self.decoder:
#             i(x)
#         x = self.encoder(x)
#         x = self.decoder(x)
#         return x

In [13]:
class autoencoder(nn.Module):
    def __init__(self):
        super(autoencoder, self).__init__()
        self.encoder = nn.ModuleList(
            [
                nn.Conv2d(1, 8, kernel_size=3, stride=1, padding=1), 
                nn.ReLU(True),
                nn.MaxPool2d(2, stride=2), 
                nn.Conv2d(8, 16, kernel_size=3, stride=1, padding=1),
                nn.ReLU(True),
                nn.MaxPool2d(2, stride=2),
                nn.Conv2d(16, 64, kernel_size=3, stride=1, padding=1),
                nn.ReLU(True),
                nn.MaxPool2d(2, stride=2),
                nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1)
            ]
        )
        
        self.decoder = nn.ModuleList(
            [
                nn.ConvTranspose2d(1, 64, kernel_size=3, stride=1, padding=1), 
                nn.ReLU(True),
                nn.ConvTranspose2d(64, 16, kernel_size=3, stride=2, padding=(1,0),output_padding = (1,0)), 
                nn.ReLU(True),
                nn.ConvTranspose2d(16, 8, kernel_size=3, stride=2, padding=(1,1),output_padding = (1,1)),
                nn.ReLU(True),
                nn.ConvTranspose2d(8, 1, kernel_size=3, stride=2, padding=(1,0), output_padding = (1,0)),
                nn.Tanh()
            ]
        )

    def forward(self, x):
        #print("shape start: {}".format(x.shape))
        
        #print("start encoder")
        for layer in self.encoder:
            x = layer.forward(x)
            #print(x.shape)
        
        # flatten bottleneck into vectors
        N, _, H, W = x.shape
        w = x.view(N, H * W)
        
        #print("start decoder")     
        for layer in self.decoder:
            x = layer.forward(x)
            #print(x.shape)
        
        return x, w

In [14]:
ALPHA = 1
def my_loss(model_output,labels):
    voice_output, face_output = model_output
    
    voice_data, face_id = labels
    face_data = mapper[face_id]
    
    loss = fid(face_output, face_data) + ALPHA * criterion(voice_output, voice_data)
    return loss

In [15]:
# model = autoencoder().cuda()
NUM_EPOCHS = 250
LEARNING_RATE = 1e-3
model = autoencoder()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-5)

In [19]:
for epoch in range(NUM_EPOCHS):
    for batch in dataloader:
        # ===================forward=====================
        voice, face_id = batch
        voice_output, face_output = model(voice)
        #loss = criterion(output, batch)
        loss = my_loss((voice_output,face_output), (voice,face_id))
        # ===================backward====================
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # ===================log========================
    if((epoch+1)%10 == 0):
        print('epoch [{}/{}], loss:{:.4f}'
              .format(epoch+1, NUM_EPOCHS, loss.data.item())) # EDIT: changed [0] to .item()

torch.save(model.state_dict(), './model_state.pth')

torch.Size([1, 1, 40, 173])


ValueError: not enough values to unpack (expected 2, got 1)

In [None]:
# Check that NN outputs the same size file as input
# FID for similarity between faces

In [47]:
for data in dataloader:
    temp = data
    break

In [48]:
temp.shape

torch.Size([1, 1, 40, 173])

In [61]:
nn.MSELoss(torch.Tensor([1,4,5]), torch.Tensor([1,5,5]))

RuntimeError: bool value of Tensor with more than one value is ambiguous

In [63]:
criterion(torch.Tensor([1,4,5]), torch.Tensor([1,6,5]))

tensor(1.3333)