In [1]:
import numpy as np
import torch
import torchvision
from torch import nn
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
from torchvision.utils import save_image
from glob import glob
from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
class voice_face(Dataset):
    def __init__(self, voice_filenames, standardize=False):
        """
        Preconditions: csv files must contain matrices of the same dimension
        Args:
            voice_filenames (string or list): list of filenames/pathnames of csv files with spectrogram matrices
                                              assumes format voice_{n}_{m}.csv, 
                                              where n is the data ID and m is the spectrogram number for that speaker
            standardise (boolean):            whether to standardize the spectrograms
        """
        # ensure inputs are lists
        if type(voice_filenames) == str:
            voice_filenames = [voice_filenames]
        assert(type(voice_filenames) == list)
                
        # load voice spectrograms one by one
        face_IDs = [] # the face IDs associated with each spectrogram
        matrices = [] # the spectrograms
        for v_file in voice_filenames:
            # get n, the data ID 
            n, _ = get_n_m(v_file)
            face_IDs.append(n)
            
            # get spectrogram
            matrix = np.loadtxt(v_file, delimiter=',', dtype=np.float32)
            if standardize:
                matrix = (matrix - np.mean(matrix)) / np.std(matrix)
            matrices.append(matrix)
        
        # construct spectrograms tensor
        self.X = torch.Tensor(matrices)
        N, D, M = self.X.shape
        self.X = self.X.view(N, 1, D, M) # insert channel dimension
        
        # construct face_IDs tensor
        self.y = torch.tensor(face_IDs)
        
        assert(self.X.shape[0] == self.y.shape[0])
        
    def __len__(self):
        return self.y.shape[0]

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [3]:
def get_n_m(v_file):
    v_file = v_file.split('/')[-1] # strip the pathname if it exists
    #v_file, _ = v_file.split('.') # strip the file extension
    _, n, m = v_file.split('_') # get n and m from the filename
    return int(n), int(m)

In [4]:
# import dataset
voice_filenames = glob("voice_to_face_net/toy_dataset/voicespecs/voice_*")
face_filenames = glob("voice_to_face_net/toy_dataset/facespecs/face_*")
dataset = voice_face(voice_filenames, standardize=True)

In [5]:
# split dataset
# TRAIN_SIZE = 9099
# VALID_SIZE = 2000
# TEST_SIZE = 2000
TRAIN_SIZE = 200
VALID_SIZE = 10
TEST_SIZE = 15
train_dataset, validation_dataset, test_dataset = torch.utils.data.random_split(dataset, 
                                                                                [TRAIN_SIZE, 
                                                                                 VALID_SIZE, 
                                                                                 TEST_SIZE])

In [6]:
# pipe data through a dataloader for batching
BATCH_SIZE = 10
dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [7]:
def conv_shape(L, K, S, P):
    return (L + 2*P - K) // S + 1

In [8]:
def deconv_shape(L, K, S, P):
    return ((L-1)*S - 2*P + K)

In [None]:
conv_shape(63, 3, 1, 1)

In [None]:
deconv_shape(31, 3, 2, 1)

In [9]:
class autoencoder(nn.Module):
    def __init__(self, w_length, face_length):
        """
        w_length: the length of the bottleneck vector i.e. # of basis faces used
        face_length: the height * width of the face images
        """
        super(autoencoder, self).__init__()
        # 1,025 x 251
#                 nn.Conv2d(1, 8, kernel_size=5, stride=1, padding=2),        # 1,025 x 251
#                 nn.ReLU(True),
# #                 nn.MaxPool2d(2, stride=2), 
#                 nn.Conv2d(8, 16, kernel_size=5, stride=2, padding=1),       # 512 x 125
#                 nn.ReLU(True),
# #                 nn.MaxPool2d(2, stride=2),
#                 nn.Conv2d(16, 64, kernel_size=5, stride=2, padding=1),      # 256 x 63
#                 nn.ReLU(True),
#                 nn.MaxPool2d(2, stride=2),                                  # 127 x 31
#                 nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1)        # 127 x 31
        self.encoder = nn.ModuleList(
            [
                nn.Conv2d(1, 8, kernel_size=3, stride=1, padding=1), #1025 x 251
                nn.ReLU(True),
                #nn.MaxPool2d(2, stride=2), 
                nn.Conv2d(8, 16, kernel_size=3, stride=2, padding=0), #512 x 125
                nn.ReLU(True),
                #nn.MaxPool2d(2, stride=2),
                nn.Conv2d(16, 32, kernel_size=3, stride=2, padding=1), #256 x 63
                nn.ReLU(True),
                nn.MaxPool2d(2, stride=2),                             #128 x 31
                nn.Conv2d(32, 1, kernel_size=3, stride=1, padding=1)   #128 x 31
            ]
        )
        
        self.decoder = nn.ModuleList(
            [
                nn.ConvTranspose2d(1, 32, kernel_size=3, stride=1, padding=1), # 128 x 31
                nn.ReLU(True),
                nn.ConvTranspose2d(32, 16, kernel_size=3, stride=2, padding=(1,0), output_padding = (1,0)), # 256 x 63
                nn.ReLU(True),
                nn.ConvTranspose2d(16, 8, kernel_size=3, stride=2, padding=(1,1), output_padding = (1,0)), # 512 x 125
                nn.ReLU(True),
                nn.ConvTranspose2d(8, 1, kernel_size=3, stride=2, padding=0, output_padding = 0), # 1025 x 251
                nn.Tanh()
            ]
        )
        
        self.w_length = w_length
        self.face_length = face_length
        self.B = nn.Linear(self.w_length, self.face_length, bias=False)

    def forward(self, x):
        # start encoder
        for layer in self.encoder:
            x = layer.forward(x)
            #print(x.shape)
        
        # collapse final feature map into a vector by taking average across time
        N, _, H, _ = x.shape
        w = x.mean(dim=3)
        w = w.view(N, H)
        
        # start decoder
        for layer in self.decoder:
            x = layer.forward(x)
            #print(x.shape)
            
        # face construction
        f = self.B(w)
        
        return x, f, w

In [50]:
def save_state(path, model, optimizer): # epoch, loss
    torch.save({
            'model': str(model),
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            #'epoch': epoch,
            #'loss': loss,
            }, path)

In [58]:
def load_state(path, model,optimizer,print_model=True): # epoch, loss
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    if(print_model == True):
        model_state = checkpoint['model']
        print(model_state)
    #epoch = checkpoint['epoch']
    #loss = checkpoint['loss']

In [59]:
def view_state(model, optimizer, state_size = 0):
    if(state_size == 1):
        print("Model -",model)
        return
    print("Model's state_dict:")
    for param_tensor in model.state_dict():
        print(param_tensor, "\t", model.state_dict()[param_tensor])
    print("Optimizer's state_dict:")
    for var_name in optimizer.state_dict():
        print(var_name, "\t", optimizer.state_dict()[var_name])

In [13]:
# model = autoencoder().cuda()
NUM_EPOCHS = 3
LEARNING_RATE = 1e-3
model = autoencoder(w_length = 128, face_length = 128*128)
MSE = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-5)

In [22]:
for epoch in range(NUM_EPOCHS):
    for batch in dataloader:
        # ===================forward=====================
        voice, face_id = batch
        voice_output, face_output, w = model(voice)
        #loss = combined_loss((voice_output,face_output), (voice,face_id))
        loss = MSE(voice_output, voice)
        # ===================backward====================
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # ===================log========================
    #if((epoch+1)%10 == 0):
    print('epoch [{}/{}], loss:{:.4f}'
          .format(epoch+1, NUM_EPOCHS, loss.data.item()))
save_state("torch_state_1.pt",model,optimizer)
#torch.save(model.state_dict(), './model_state.pth')

epoch [1/3], loss:0.8437
epoch [2/3], loss:0.8520
epoch [3/3], loss:0.8194


TypeError: save_state() takes 3 positional arguments but 5 were given

In [53]:
save_state("torch_state_1.pt",model,optimizer)

In [61]:
view_state(model, optimizer)

Model's state_dict:
encoder.0.weight 	 tensor([[[[-0.2159, -0.2860,  0.3342],
          [ 0.2040, -0.2874,  0.3222],
          [-0.0075, -0.2922,  0.2345]]],


        [[[-0.3203, -0.3149,  0.0315],
          [-0.2452, -0.2904, -0.2895],
          [ 0.1776, -0.1676,  0.3312]]],


        [[[-0.2264, -0.2802, -0.2176],
          [-0.1547,  0.2153, -0.1277],
          [ 0.0980, -0.2403,  0.0309]]],


        [[[ 0.1302, -0.1906,  0.3084],
          [-0.1496, -0.0554, -0.0517],
          [ 0.0397, -0.0918,  0.3572]]],


        [[[-0.2141, -0.1886, -0.2179],
          [-0.1418,  0.2255, -0.1046],
          [ 0.1196,  0.2945, -0.2399]]],


        [[[ 0.1031, -0.0458, -0.2377],
          [-0.0142,  0.2522,  0.2282],
          [ 0.0832, -0.2175, -0.1329]]],


        [[[-0.1332, -0.1313, -0.1449],
          [ 0.2849, -0.0028,  0.0050],
          [-0.1772,  0.2655,  0.3650]]],


        [[[ 0.0962,  0.3400, -0.0317],
          [-0.2156,  0.0027, -0.0057],
          [-0.2155, -0.2639,  0.2402

In [60]:
view_state(model, optimizer, 1)

Model - autoencoder(
  (encoder): ModuleList(
    (0): Conv2d(1, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2))
    (3): ReLU(inplace=True)
    (4): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (5): ReLU(inplace=True)
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (7): Conv2d(32, 1, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  )
  (decoder): ModuleList(
    (0): ConvTranspose2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): ConvTranspose2d(32, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 0), output_padding=(1, 0))
    (3): ReLU(inplace=True)
    (4): ConvTranspose2d(16, 8, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), output_padding=(1, 0))
    (5): ReLU(inplace=True)
    (6): ConvTranspose2d(8, 1, kernel_size=(3, 3), stride=(2, 2))
    (7): Tanh()
  )
  (B): Line

In [62]:
load_state("torch_state_1.pt",model,optimizer, False)

In [None]:
# for data in dataloader:
#     temp = data
#     break

In [None]:
# temp.shape

In [None]:
class voice_face_predictor(nn.Module):
    def __init__(self, w_length, face_length):
        """
        w_length: the length of the bottleneck vector i.e. # of basis faces used
        face_length: the height * width of the face images
        """
        super(voice_face_predictor, self).__init__()
        self.encoder = nn.ModuleList(
            [
                                                                            # 1,025 x 251
                nn.Conv2d(1, 8, kernel_size=5, stride=1, padding=2),        # 1,025 x 251
                nn.ReLU(True),
#                 nn.MaxPool2d(2, stride=2), 
                nn.Conv2d(8, 16, kernel_size=5, stride=2, padding=1),       # 512 x 125
                nn.ReLU(True),
#                 nn.MaxPool2d(2, stride=2),
                nn.Conv2d(16, 64, kernel_size=5, stride=2, padding=1),      # 255 x 62
                nn.ReLU(True),
                nn.MaxPool2d(2, stride=2),                                  # 127 x 31
                nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1)        # 127 x 31
            ]
        )
        
#         self.decoder = nn.ModuleList(
#             [
#                 nn.ConvTranspose2d(1, 64, kernel_size=3, stride=1, padding=1), 
#                 nn.ReLU(True),
#                 nn.ConvTranspose2d(64, 16, kernel_size=3, stride=2, padding=(1,0),output_padding = (1,0)), 
#                 nn.ReLU(True),
#                 nn.ConvTranspose2d(16, 8, kernel_size=3, stride=2, padding=(1,1),output_padding = (1,1)),
#                 nn.ReLU(True),
#                 nn.ConvTranspose2d(8, 1, kernel_size=3, stride=2, padding=(1,0), output_padding = (1,0)),
#                 nn.Tanh()
#             ]
#         )
        
        self.w_length = 127
        self.face_length = face_length
        self.B = nn.Linear(self.w_length, self.face_length, bias=False)

    def forward(self, x):
        # start encoder
        for layer in self.encoder:
            x = layer.forward(x)
#             print(x.shape)
        
        # collapse final feature map into a vector by taking average across time
        N, _, H, _ = x.shape
        w = x.mean(dim=3)
        w = w.view(N, H)
        
#         # start decoder
#         for layer in self.decoder:
#             v = layer.forward(x)
# #             print(v.shape)
            
        # face construction
        f = self.B(w)
        
#         return v, f, w
        return f, w

In [None]:
ALPHA = 1
def combined_loss(model_output,labels):
    voice_output, face_output = model_output
    
    voice_data, face_id = labels
    face_data = face_struct[face_id]
    
    loss = fid(face_output, face_data) + ALPHA * MSE(voice_output, voice_data)
    return loss

# def face_loss(face_outputs, labels):
#     true_faces = face_struct[labels]
# #     loss = fid(face_output, true_faces)
# #     loss = torch.dist(face_output, true_faces, p=2) ** 2 # squared L2 norm
#     loss = MSE(face_output, true_faces)
#     return loss

def face_loss(face_outputs, labels):
    true_faces = []
    for i in labels:
        face = np.loadtxt("data/toy_dataset/facespecs/face_"+str(i.item()),delimiter=',')
        face = face.reshape(face.shape[0]*face.shape[1])
        true_faces.append(face)
    true_faces = torch.Tensor(true_faces)

#    true_faces = face_struct[labels]
#     loss = fid(face_output, true_faces)
#     loss = torch.dist(face_output, true_faces, p=2) ** 2 # squared L2 norm
    loss = MSE(face_outputs, true_faces)
    return loss

In [None]:
# model = autoencoder().cuda()
NUM_EPOCHS = 70
LEARNING_RATE = 1e-3
model = voice_face_predictor(w_length = 128, face_length = 128*128)
MSE = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-5)

In [None]:
# train just for faces
for epoch in range(NUM_EPOCHS):
    for batch in dataloader:
        # ===================forward=====================
        voice, face_ID = batch
        face_output, w = model(voice)
        loss = face_loss(face_output, face_ID)
        # ===================backward====================
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # ===================log========================
#     if((epoch+1)%10 == 0):
    print('epoch [{}/{}], loss:{:.4f}'
          .format(epoch+1, NUM_EPOCHS, loss.data.item()))

torch.save(model.state_dict(), './model_state.pth')

In [None]:
test_voice, test_face_ID = test_dataset[5]

In [None]:
test_face_ID = test_face_ID.item()
test_face = np.loadtxt("data/toy_dataset/facespecs/face_"+str(test_face_ID), delimiter=',')

In [None]:
plt.imshow(test_face, cmap="gray")

In [None]:
temp_f, temp_w = model(test_voice.view([1, 1, 1025, 251])) # run through model

In [None]:
temp_f_arr = temp_f.detach().numpy() # convert to numpy array

In [None]:
temp_f_arr = temp_f_arr.reshape((128,128))

In [None]:
plt.imshow(temp_f_arr, cmap="gray")

In [None]:
plt.imshow(temp_f_arr, cmap="gray")

In [None]:
plt.imshow(np.loadtxt("data/toy_dataset/facespecs/face_0", delimiter=','), cmap="gray")

In [None]:
plt.imshow(np.loadtxt("data/toy_dataset/facespecs/face_1", delimiter=','), cmap="gray")

In [None]:
plt.imshow(np.loadtxt("data/toy_dataset/facespecs/face_2", delimiter=','), cmap="gray")

In [None]:
plt.imshow(np.loadtxt("data/toy_dataset/facespecs/face_3", delimiter=','), cmap="gray")

In [None]:
plt.imshow(np.loadtxt("data/toy_dataset/facespecs/face_4", delimiter=','), cmap="gray")

In [None]:
# Check that NN outputs the same size file as input
# FID for similarity between faces

In [None]:
# import dataset
voice_filenames = glob("data/Voice_to_face/voicespecs/voice_*")
face_filenames = glob("data/Voice_to_face/facespecs/face_*")
dataset = voice_face(voice_filenames, standardize=True)

In [None]:
# split dataset
# TRAIN_SIZE = 9099
# VALID_SIZE = 2000
# TEST_SIZE = 2000
TRAIN_SIZE = 3000
VALID_SIZE = 300
TEST_SIZE = 300
train_dataset, validation_dataset, test_dataset = torch.utils.data.random_split(dataset, 
                                                                                [TRAIN_SIZE, 
                                                                                 VALID_SIZE, 
                                                                                 TEST_SIZE])

In [None]:
# pipe data through a dataloader for batching
BATCH_SIZE = 10
dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [None]:
class voice_face_predictor(nn.Module):
    def __init__(self, w_length, face_length):
        """
        w_length: the length of the bottleneck vector i.e. # of basis faces used
        face_length: the height * width of the face images
        """
        super(voice_face_predictor, self).__init__()
        self.encoder = nn.ModuleList(
            [
                                                                            # 1,025 x 251
                nn.Conv2d(1, 8, kernel_size=5, stride=1, padding=2),        # 1,025 x 251
                nn.ReLU(True),
#                 nn.MaxPool2d(2, stride=2), 
                nn.Conv2d(8, 16, kernel_size=5, stride=2, padding=1),       # 512 x 125
                nn.ReLU(True),
#                 nn.MaxPool2d(2, stride=2),
                nn.Conv2d(16, 64, kernel_size=5, stride=2, padding=1),      # 255 x 62
                nn.ReLU(True),
                nn.MaxPool2d(2, stride=2),                                  # 127 x 31
                nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1)        # 127 x 31
            ]
        )
        
#         self.decoder = nn.ModuleList(
#             [
#                 nn.ConvTranspose2d(1, 64, kernel_size=3, stride=1, padding=1), 
#                 nn.ReLU(True),
#                 nn.ConvTranspose2d(64, 16, kernel_size=3, stride=2, padding=(1,0),output_padding = (1,0)), 
#                 nn.ReLU(True),
#                 nn.ConvTranspose2d(16, 8, kernel_size=3, stride=2, padding=(1,1),output_padding = (1,1)),
#                 nn.ReLU(True),
#                 nn.ConvTranspose2d(8, 1, kernel_size=3, stride=2, padding=(1,0), output_padding = (1,0)),
#                 nn.Tanh()
#             ]
#         )
        
        self.w_length = 127
        self.face_length = face_length
        self.B = nn.Linear(self.w_length, self.face_length, bias=False)

    def forward(self, x):
        # start encoder
        for layer in self.encoder:
            x = layer.forward(x)
#             print(x.shape)
        
        # collapse final feature map into a vector by taking average across time
        N, _, H, _ = x.shape
        w = x.mean(dim=3)
        w = w.view(N, H)
        
#         # start decoder
#         for layer in self.decoder:
#             v = layer.forward(x)
# #             print(v.shape)
            
        # face construction
        f = self.B(w)
        
#         return v, f, w
        return f, w

In [None]:
ALPHA = 1
def combined_loss(model_output,labels):
    voice_output, face_output = model_output
    
    voice_data, face_id = labels
    face_data = face_struct[face_id]
    
    loss = fid(face_output, face_data) + ALPHA * MSE(voice_output, voice_data)
    return loss

# def face_loss(face_outputs, labels):
#     true_faces = face_struct[labels]
# #     loss = fid(face_output, true_faces)
# #     loss = torch.dist(face_output, true_faces, p=2) ** 2 # squared L2 norm
#     loss = MSE(face_output, true_faces)
#     return loss

def face_loss(face_outputs, labels):
    true_faces = []
    for i in labels:
        face = np.loadtxt("data/Voice_to_face/facespecs/face_{}.csv".format(i.item()),delimiter=',')
        face = face.reshape(face.shape[0]*face.shape[1])
        true_faces.append(face)
    true_faces = torch.Tensor(true_faces)

#    true_faces = face_struct[labels]
#     loss = fid(face_output, true_faces)
#     loss = torch.dist(face_output, true_faces, p=2) ** 2 # squared L2 norm
    loss = MSE(face_outputs, true_faces)
    return loss

In [None]:
# model = autoencoder().cuda()
NUM_EPOCHS = 30
LEARNING_RATE = 1e-3
model = voice_face_predictor(w_length = 128, face_length = 128*128)
MSE = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-5)

In [None]:
# train just for faces
for epoch in range(NUM_EPOCHS):
    for batch in dataloader:
        # ===================forward=====================
        voice, face_ID = batch
        face_output, w = model(voice)
        loss = face_loss(face_output, face_ID)
        # bases 
        # loss_bases_val = loss_bases(bases, target_bases)
        # total_loss = loss+loss_bases_val 
        # total_loss.backward()
        # ===================backward====================
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    # ===================log========================
#     if((epoch+1)%10 == 0):
    print('epoch [{}/{}], loss:{:.4f}, completed at {}'
          .format(epoch+1, NUM_EPOCHS, loss.data.item(), datetime.now()))

torch.save(model.state_dict(), './model_state.pth')

In [None]:
test_voice, test_face_ID = test_dataset[2]

In [None]:
test_face = np.loadtxt("data/Voice_to_face/facespecs/face_{}.csv".format(test_face_ID), delimiter=',')

In [None]:
plt.imshow(test_face, cmap="gray")

In [None]:
temp_f, temp_w = model(test_voice.view([1, 1, 1025, 251])) # run through model
temp_f_arr = temp_f.detach().numpy() # convert to numpy array
temp_f_arr = temp_f_arr.reshape((128,128))

In [None]:
plt.imshow(temp_f_arr, cmap="gray")