In [1]:
import os
import numpy as np
import plotly.graph_objs as go
import plotly
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, transforms
from torchvision.utils import make_grid, save_image
from torch.nn import functional as F
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
from scipy.stats import norm
import scipy.io
import os
from math import ceil
import joblib

In [2]:
# Hyperparameters
batch_size = 128
device = device = ( 'cuda' if torch.cuda.is_available() else 'cpu' )

img_size = 650    # 28x28x1
hidden_dim = 64  # hidden layer dim
#z_dim = 20        # latent space dimension (encoder)
z_dim = 2        # latent space dimension (encoder)

epochs = 10

In [3]:
# model architecture
class VAE(nn.Module):
  def __init__(self):
    super(VAE, self).__init__()

    # encoder
    self.fc1 = nn.Linear(img_size, hidden_dim)
    self.fc2_mean = nn.Linear(hidden_dim, z_dim)
    self.fc2_logvar = nn.Linear(hidden_dim, z_dim)
    #decoder
    self.fc3 = nn.Linear(z_dim, hidden_dim)
    self.fc4 = nn.Linear(hidden_dim, img_size)

  def encode(self, x):
    h = F.relu(self.fc1(x))
    mu = self.fc2_mean(h)         # compute mean of latent
    logvar = self.fc2_logvar(h)   # compute logvar of latent
    return mu, logvar

  def reparameterize(self, mu, logvar):
    # p(z|x) = mu + std * eps
    std = torch.exp(logvar/2)
    eps = torch.randn_like(std)
    return (mu + eps * std)

  def decode(self, z):
    h = F.relu(self.fc3(z))
    out = torch.sigmoid(self.fc4(h))
    return out

  def forward(self, x):
    # batch_size x 1 x 28 x 28 -> batch_size x 784
    #print(x.view(-1, img_size).shape)
    mu, logvar = self.encode(x.view(-1, img_size))
    z = self.reparameterize(mu, logvar)
    reconstructed = self.decode(z)
    return reconstructed, mu, logvar

# Initialize model, optimizer
model = VAE().to(device)
optim = torch.optim.Adam(model.parameters(), lr=1e-3)

In [5]:
print(model)

VAE(
  (fc1): Linear(in_features=650, out_features=64, bias=True)
  (fc2_mean): Linear(in_features=64, out_features=2, bias=True)
  (fc2_logvar): Linear(in_features=64, out_features=2, bias=True)
  (fc3): Linear(in_features=2, out_features=64, bias=True)
  (fc4): Linear(in_features=64, out_features=650, bias=True)
)


In [4]:
# Define loss function
def loss_function(rec_img, org_img, mu, logvar):
  # Reconstruction loss
  rec_loss = F.binary_cross_entropy(rec_img, org_img.view(-1, img_size), reduction='sum')
  # KL-div loss
  kl_div = 0.5 * torch.sum(logvar.exp() + mu.pow(2) - 1 - logvar)
  #
  return rec_loss + kl_div

# train function
def train(epoch):

  # train function
  model.train()
  train_loss = 0

  for i, (images, _) in enumerate(trainloader):
    images = images.to(device)
    rec_img, mu, logvar = model(images)
    loss = loss_function(rec_img, images, mu, logvar)

    optim.zero_grad()
    loss.backward()
    optim.step()

    train_loss += loss.item()

    if epoch % 400 and epoch > 0:
      #print(f'epoch {epoch}, batch {i}/{len(trainloader)}, loss {loss.item()/len(images)}')
      print(f'----> Epoch {epoch}, Average loss {train_loss/len(trainloader.dataset)}')

  #print(f'----> Epoch {epoch}, Average loss {train_loss/len(trainloader.dataset)}')

# Test function
def test(epoch):
  model.eval()
  test_loss = 0

  with torch.no_grad():
    for i, (images, _) in enumerate(testloader):
      images = images.to(device)
      rec_img, mu, logvar = model(images)
      loss = loss_function(rec_img, images, mu, logvar)
      test_loss += loss.item()

      if i == 0:
        comparison = torch.cat([images[:5], rec_img.view(batch_size, 1, 28, 28)[:5]])
        save_image(comparison.cpu(), f'results/reconstruction_{epoch}.png', nrow=5)

    print(f'----> Average test loss {test_loss/len(testloader.dataset)}')

In [None]:
model = VAE().to(device)
#model.load_state_dict(torch.load('/content/drive/MyDrive/MultiModalDeepLearning/audio_notebooks/autoenc.pth'))

In [5]:
fpath = '/content/drive/MyDrive/video_feat/features/audioVideo_features.csv'

df = pd.read_csv(fpath)
#df.tail()

In [6]:
df2 = df.transpose()
df2.head()

y = list(df.columns)
#print(y)
X = df2.values
#print(X)

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(y)
list(le.classes_)
y_new = le.transform(y)

In [7]:
class MyDataset(Dataset):

  def __init__(self,fpath):
    df = pd.read_csv(fpath)
    df2 = df.transpose()

    X = df2.values
    y = le.transform(list(df.columns))

    self.x_train = torch.tensor(X,dtype=torch.float32)
    self.y_train = torch.tensor(y,dtype=torch.float32)

  def __len__(self):
    return len(self.y_train)

  def __getitem__(self,idx):
    return self.x_train[idx],self.y_train[idx]

In [8]:
myDs=MyDataset(fpath)
trainloader=  DataLoader(myDs,batch_size=10,shuffle=True)
testloader =  DataLoader(myDs,batch_size=10,shuffle=False)

next(iter(trainloader))

[tensor([[ 0.0382,  0.1334,  3.1703,  ...,  0.4380, -0.0162,  0.0599],
         [ 0.0150,  0.0793,  3.1717,  ...,  0.6241, -0.0064, -0.0152],
         [ 0.0463,  0.0900,  3.1739,  ...,  0.3940,  0.0678, -0.1526],
         ...,
         [ 0.0728,  0.0374,  3.1686,  ...,  0.7290, -0.1541,  0.0455],
         [ 0.0518,  0.0554,  3.1475,  ...,  0.5607, -0.1018, -0.0076],
         [ 0.0319,  0.0713,  3.2474,  ...,  0.4053, -0.0041,  0.0103]]),
 tensor([66., 29., 17., 60., 23., 69., 70., 46., 37.,  5.])]

In [9]:
# main function
for epoch in range(1, epochs+1000):
  train(epoch)
  #test(epoch)

[1;30;43mΗ έξοδος ροής περικόπηκε στις τελευταίες 5000 γραμμές.[0m
----> Epoch 453, Average loss -9742.356944444444
----> Epoch 453, Average loss -11858.187326388888
----> Epoch 453, Average loss -13833.476736111112
----> Epoch 453, Average loss -15657.47013888889
----> Epoch 453, Average loss -17594.127951388888
----> Epoch 454, Average loss -1938.1859375
----> Epoch 454, Average loss -4056.7274305555557
----> Epoch 454, Average loss -6047.251388888889
----> Epoch 454, Average loss -7881.182465277778
----> Epoch 454, Average loss -9912.392708333333
----> Epoch 454, Average loss -11746.201215277777
----> Epoch 454, Average loss -13613.338541666666
----> Epoch 454, Average loss -15655.2359375
----> Epoch 454, Average loss -17530.024479166666
----> Epoch 455, Average loss -1881.7559027777777
----> Epoch 455, Average loss -3838.0699652777776
----> Epoch 455, Average loss -5635.977083333333
----> Epoch 455, Average loss -7698.1046875
----> Epoch 455, Average loss -9740.565277777778
---->

In [8]:
#model.load_state_dict('/content/drive/MyDrive/MultiModalDeepLearning/audio_notebooks/autoenc.pth')
torch.save(model.state_dict(), '/content/drive/MyDrive/video_feat/autoenc_vidAud.pth')

In [17]:
### Testing latent space
testdat, labels = next(iter(testloader))
for i in range(0, 8):
  testdat_temp, labels_temp = next(iter(testloader))
  testdat = torch.cat([testdat,testdat_temp])
  labels = torch.cat([labels, labels_temp])


testdat = testdat.to(device)

testdat = torch.tensor(X,dtype=torch.float32)
y2 = le.transform(list(df.columns))
labels = torch.tensor(y2,dtype=torch.float32)

_, mu, logvar = model(testdat)
z = model.reparameterize(mu, logvar).data.cpu().numpy()


#z3 = np.append(z, labels)
#z3 = np.append(z, y2.reshape(-1), axis = 1)

In [20]:
np.save('/content/drive/MyDrive/video_feat/features/classes.npy', le.classes_)

In [12]:
df3 = pd.DataFrame(data = z, columns = ['x', 'y'])
df3['labels'] = y2
df3.head()

Unnamed: 0,x,y,labels
0,0.072882,-1.031886,0
1,0.118468,1.82244,1
2,-2.555076,0.10523,2
3,-1.036142,-0.167495,3
4,-3.358587,-1.058625,4


In [22]:
df3.to_csv('/content/drive/MyDrive/video_feat/features/audioVideo_z.csv', index=False)
torch.save(model.state_dict(), '/content/drive/MyDrive/video_feat/features/autoenc.pth')

In [21]:
import plotly.express as px

labals = labels.detach().to('cpu').numpy()
libels = []
for item in labals:
  item = str(item)
  libels.append(item)

fig = px.scatter(x=z[:, 0], y=z[:, 1], color=libels)
fig.show()

In [None]:
from sklearn.cluster import KMeans

nclust = int(len(z) / 5)

kmeans = KMeans(n_clusters=nclust, random_state=0, n_init="auto").fit(z)
kmeans.labels_

array([13,  9, 11, 17,  4,  2, 11,  3,  5,  3,  2,  6,  9,  3,  5, 17,  9,
       14, 14, 15, 12,  5,  5, 13,  6,  0,  5,  2, 15, 14,  5,  4,  8,  4,
       17, 10,  0, 17,  0,  0, 15,  5,  5, 17,  0,  3,  7,  6,  8,  3,  2,
        2, 15, 16, 12, 14,  5,  1,  8,  6,  7, 15, 14, 17, 14, 17,  7, 14,
        5,  8,  4, 11,  4,  3,  5, 14,  8,  1,  6, 11,  9,  4,  5, 12,  2,
        3,  2,  1,  1,  8], dtype=int32)

In [None]:
from sklearn.neighbors import NearestNeighbors
neigh = NearestNeighbors(n_neighbors=5)
neigh.fit(z)


nearest = neigh.kneighbors([[0,1]], return_distance=False).reshape(-1)
y3 = le.inverse_transform(y2)

most_similar = []
for k in nearest:
  most_similar.append(y3[k])

print(most_similar)

['GAYLE-abcdefu', 'Bruno-Mars-That-s-What-I-Like', 'M-neskin-Beggin', 'Lil-Nas-X-Jack-Harlow-INDUSTRY-BABY', 'Kendrick-Lamar-HUMBLE']


In [None]:
def getNmax(array, N):
  idx = (-array[0]).argsort()[:N]
  return idx

def scaleProbs(max_prob):
  max_probs_scaled = ((max_prob - min(max_prob)) / (max(max_prob) - min(max_prob))).round(2)
  max_probs_scaled = ((max_prob - min(max_prob)) / (max_prob - min(max_prob)).sum()).round(2)

  return max_probs_scaled

def getNmaxClassesProbs(clf_classes, preds, N):
  max_classes = []
  max_probs = []

  nmax = getNmax(preds, N)

  for i in nmax:
    max_classes.append(clf_classes[i])
    max_probs.append(preds[0][i])

  max_probs_scaled = scaleProbs(max_probs)

  return max_classes, max_probs_scaled

clf = joblib.load('/content/drive/MyDrive/MultiModalDeepLearning/classifiers/audio_clf.joblib')

#pred = clf.predict_proba([X[0]])
pred = clf.predict_proba([X[0]])
max_cl, max_prob = getNmaxClassesProbs(clf.classes_, pred, 5)

print(max_prob)
print(max_cl)

[0.61 0.33 0.03 0.03 0.  ]
['Kendrick-Lamar-HUMBLE', 'ROSAL-A-DESPECH-', 'Doja-Cat-Woman', 'Billie-Eilish-bad-guy', 'Future-Mask-Off-Official-Music-Video-xvZqHgFz51I-']
