In [1]:
import os
import numpy as np
import plotly.graph_objs as go
import plotly
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets, transforms
from torchvision.utils import make_grid, save_image
from torch.nn import functional as F
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
from scipy.stats import norm
import scipy.io
import os
from math import ceil
import joblib
from sklearn import preprocessing
import plotly.express as px

device = ( 'cuda' if torch.cuda.is_available() else 'cpu' )

In [2]:
# Autoencoder

# Hyperparameters
batch_size = 128
device = device = ( 'cuda' if torch.cuda.is_available() else 'cpu' )

img_size = 650    # 28x28x1
hidden_dim = 64  # hidden layer dim
#z_dim = 20        # latent space dimension (encoder)
z_dim = 2        # latent space dimension (encoder)

epochs = 10

# model architecture
class VAE(nn.Module):
  def __init__(self):
    super(VAE, self).__init__()

    # encoder
    self.fc1 = nn.Linear(img_size, hidden_dim)
    self.fc2_mean = nn.Linear(hidden_dim, z_dim)
    self.fc2_logvar = nn.Linear(hidden_dim, z_dim)
    #decoder
    self.fc3 = nn.Linear(z_dim, hidden_dim)
    self.fc4 = nn.Linear(hidden_dim, img_size)

  def encode(self, x):
    h = F.relu(self.fc1(x))
    mu = self.fc2_mean(h)         # compute mean of latent
    logvar = self.fc2_logvar(h)   # compute logvar of latent
    return mu, logvar

  def reparameterize(self, mu, logvar):
    # p(z|x) = mu + std * eps
    std = torch.exp(logvar/2)
    eps = torch.randn_like(std)
    return (mu + eps * std)

  def decode(self, z):
    h = F.relu(self.fc3(z))
    out = torch.sigmoid(self.fc4(h))
    return out

  def forward(self, x):
    # batch_size x 1 x 28 x 28 -> batch_size x 784
    #print(x.view(-1, img_size).shape)
    mu, logvar = self.encode(x.view(-1, img_size))
    z = self.reparameterize(mu, logvar)
    reconstructed = self.decode(z)
    return reconstructed, mu, logvar

# Initialize model, optimizer
audioVae = VAE().to(device)
audioVae.load_state_dict(torch.load('/content/drive/MyDrive/video_feat/features/autoenc.pth'))

<All keys matched successfully>

In [3]:
encoder = preprocessing.LabelEncoder()
encoder.classes_ = np.load('/content/drive/MyDrive/video_feat/features/classes.npy')

# these are the encoded video clips
songs_z = pd.read_csv('/content/drive/MyDrive/video_feat/features/audioVideo_z.csv')
songs_z.head()

labels2 = encoder.inverse_transform(songs_z.labels)
#print(labels)
songs_z_normal = songs_z
songs_z_normal.labels = labels2
songs_z_normal.head()
#songs_z = audioVae.encode(songs)

#test_song = torch.randn(138)

#test_song_z = audioVae.encode(test_song)

Unnamed: 0,x,y,labels
0,0.072882,-1.031886,24kGoldn-Mood
1,0.118468,1.82244,5-Seconds-of-Summer-Youngblood
2,-2.555076,0.10523,Alec-Benjamin-Let-Me-Down-Slowly
3,-1.036142,-0.167495,Anne-Marie-2002
4,-3.358587,-1.058625,Axwell-and-Ingrosso-More-Than-You-Know


In [4]:
from sklearn.neighbors import NearestNeighbors
neigh = NearestNeighbors(n_neighbors=5)

neigh.fit(songs_z[['x', 'y']].values)


#nearest = neigh.kneighbors([[0,1]], return_distance=False).reshape(-1)
#y3 = le.inverse_transform(y2)

#most_similar = []
#for k in nearest:
#  most_similar.append(y3[k])

#print(most_similar)

nearest = neigh.kneighbors([[0,1]], return_distance=False).reshape(-1)
#y3 = le.inverse_transform(y2)

most_similar = []
for k in nearest:
  most_similar.append(labels2[k])

print(most_similar)

['Ti-sto-Ava-Max-The-Motto', 'Billie-Eilish-bad-guy', 'Marshmello-x-Jonas-Brothers-Leave-Before-You-Love-Me', 'Taylor-Swift-The-Man', 'Kygo-Selena-Gomez-It-Ain-t-Me']


In [5]:
# correct solution:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0) # only difference


neighbor_distance, neighbor_index = neigh.kneighbors([[0,1]], return_distance=True)
ha = (neighbor_distance.reshape(-1, 1)+100)

neighbor_distance = softmax(ha).round(3)*100
#print(neighbor_distance)
neighbor_index = encoder.inverse_transform(neighbor_index.reshape(-1))
#print(neighbor_index)

for i in range(len(neighbor_index)):
  print(f'track{i+1}: {neighbor_index[i]}, probability: {neighbor_distance[i]}')


track1: Ti-sto-Ava-Max-The-Motto, probability: [15.2]
track2: Billie-Eilish-bad-guy, probability: [19.5]
track3: Marshmello-x-Jonas-Brothers-Leave-Before-You-Love-Me, probability: [21.4]
track4: Taylor-Swift-The-Man, probability: [21.9]
track5: Kygo-Selena-Gomez-It-Ain-t-Me, probability: [22.1]


In [6]:
!git clone https://github.com/tyiannak/pyAudioAnalysis.git

!pip install pyAudioAnalysis

#!pip install numpy
#!pip install matplotlib
#!pip install scipy
!pip install sklearn
#!pip install hmmlearn
!pip install simplejson
!pip install eyed3
!pip install pydub

remote: Total 3031 (delta 233), reused 415 (delta 209), pack-reused 2534[K
Receiving objects: 100% (3031/3031), 167.82 MiB | 28.40 MiB/s, done.
Resolving deltas: 100% (1794/1794), done.
Updating files: 100% (276/276), done.
Collecting pyAudioAnalysis
  Downloading pyAudioAnalysis-0.3.14.tar.gz (41.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.3/41.3 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyAudioAnalysis
  Building wheel for pyAudioAnalysis (setup.py) ... [?25l[?25hdone
  Created wheel for pyAudioAnalysis: filename=pyAudioAnalysis-0.3.14-py3-none-any.whl size=41264374 sha256=540735cd4e0d2f21105cd59d777edd11b8a5b97d31b208f014f9a94e6137bc09
  Stored in directory: /root/.cache/pip/wheels/a7/54/73/fa830689c2440d2c81ff175c60e374930ad1607a8881e0f43f
Successfully built pyAudioAnalysis
Installing collected packages: pyAudioAnalysis
Successfully installed pyAud

In [7]:
from pyAudioAnalysis import MidTermFeatures as aF
#from pyAudioAnalysis import audioTrainTest as aFa
import os
import numpy as np
import plotly.graph_objs as go
import plotly

In [8]:
fpath = '/content/drive/MyDrive/multi_modal/test'

classes = os.listdir(fpath)
print(classes)
#print(dirs)
dirs= []
for file in os.listdir(fpath):
    d = os.path.join(fpath, file)
    if os.path.isdir(d):
        dirs.append(d)

print(len(dirs))

class_names = [os.path.basename(d) for d in classes]

m_win, m_step, s_win, s_step = 1, 1, 0.1, 0.05

features = []
class_n = []
i= 0
for d in dirs: # get feature matrix for each directory (class)
    name = os.path.basename(d)
    class_n.append(name)

    f, files, fn = aF.directory_feature_extraction(d, m_win, m_step,
                                                   s_win, s_step)
    features.append(f)
    print(i)
    i+=1

['.ipynb_checkpoints', 'test1', 'g.py']
2
0
Analyzing file 1 of 1: /content/drive/MyDrive/multi_modal/test/test1/test1.wav


  sampling_rate, signal = wavfile.read(input_file) # from scipy.io


Feature extraction complexity ratio: 7.7 x realtime
1


In [9]:
arr = np.array(features[1])
arr2 = np.zeros(512)
arr3 = np.concatenate((arr, arr2), axis=0)
print(arr3.shape)
#print(arr[1])



testdat = torch.tensor(arr3,dtype=torch.float32)

_, mu, logvar = audioVae(testdat)
z = audioVae.reparameterize(mu, logvar).data.cpu().numpy()
print(z)

(650,)
[[-1.0149125 -1.1761402]]


In [10]:
# correct solution:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0) # only difference


neighbor_distance, neighbor_index = neigh.kneighbors(z, return_distance=True)
n_dist = neighbor_distance # for circle color
ha = (neighbor_distance.reshape(-1, 1)+100)

neighbor_distance = softmax(ha).round(3)*100
#print(neighbor_distance)
neighbor_index = encoder.inverse_transform(neighbor_index.reshape(-1))
#print(neighbor_index)

for i in range(len(neighbor_index)):
  print(f'track{i+1}: {neighbor_index[i]}, probability: {neighbor_distance[i]}')

track1: LISA-MONEY-, probability: [18.3]
track2: Harry-Styles-Golden, probability: [19.3]
track3: Imagine-Dragons-Thunder, probability: [20.1]
track4: Billie-Eilish-Khalid-lovely, probability: [20.3]
track5: Lady-Gaga-The-Cure, probability: [22.]


In [11]:
def extract_features(fpath):
    classes = os.listdir(fpath)
    #print(classes)
    #print(dirs)
    dirs= []
    for file in os.listdir(fpath):
        d = os.path.join(fpath, file)
        if os.path.isdir(d):
            dirs.append(d)

    #print(len(dirs))

    class_names = [os.path.basename(d) for d in classes]

    m_win, m_step, s_win, s_step = 1, 1, 0.1, 0.05

    features = []
    class_n = []
    i= 0
    for d in dirs: # get feature matrix for each directory (class)
        name = os.path.basename(d)
        class_n.append(name)

        f, files, fn = aF.directory_feature_extraction(d, m_win, m_step,
                                                      s_win, s_step)
        features.append(f)
        #print(i)
        i+=1

    arr = np.array(features[1])
    arr2 = np.zeros(512)
    arr3 = np.concatenate((arr, arr2), axis=0)
    testdat = torch.tensor(arr3,dtype=torch.float32)

    return testdat



In [12]:
def similarityAudio(fpath):

  audio_extr_f = extract_features(fpath)

  _, mu, logvar = audioVae(audio_extr_f)

  epsilon = 0.1
  std = torch.exp(logvar/2)
  eps = torch.randn_like(std)
  z = (mu + epsilon * std).data.cpu().numpy()
  #z = audioVae.reparameterize(mu, logvar).data.cpu().numpy()
  neighbor_distance, neighbor_index = neigh.kneighbors(z, return_distance=True)
  neighbor_color = neighbor_index
  #print(neighbor_color)
  plot_radius = float(np.array(np.max(neighbor_distance)))
  #print(f'pr {plot_radius}')
  ha = (neighbor_distance.reshape(-1, 1)+100)

  neighbor_distance = softmax(ha).round(3)*100
  neighbor_distance = softmax(100 - neighbor_distance).round(3)*100
  #print(neighbor_distance)
  neighbor_index = encoder.inverse_transform(neighbor_index.reshape(-1))
  #print(neighbor_index)
  print()
  print()
  for i in range(len(neighbor_index)):
    print(f'track{i+1}: {neighbor_index[i]}, probability: {neighbor_distance[i]}')


  #fig = px.scatter(x=songs_z.iloc[:, 0], y=songs_z.iloc[:, 1])
  #fig = px.scatter(x=z[:, 0], y=z[:, 1])
  #fig.show()

  import plotly.express as px
  import plotly.graph_objects as go

  fig1 = px.scatter(x=songs_z.iloc[:, 0], y=songs_z.iloc[:, 1])
  fig2 = px.scatter(x=z[:, 0], y=z[:, 1], color_discrete_sequence=['red'])
  #print(list(neighbor_color.reshape(-1)))
  z4 = songs_z.iloc[list(neighbor_color.reshape(-1))]
  #print(z4)
  fig4 = px.scatter(x=z4['x'], y=z4['y'], color =neighbor_index) #color_discrete_sequence=['LightGreen'])
  fig4.update_traces(
    marker=dict(size=8, symbol="star-diamond", line=dict(width=2, color="DarkSlateGrey")),
    selector=dict(mode="markers"),
  )


  fig3 = go.Figure(data=fig1.data + fig2.data + fig4.data)
  print(f'lp {z[:, 0]- plot_radius}')
  fig3.add_shape(type="circle", xref="x", yref="y", x0=float(z[:, 0])-plot_radius,
                 y0=float(z[:, 1])-plot_radius, x1=float(z[:, 0])+plot_radius, y1=float(z[:, 1])+plot_radius,
                 line_color="LightGreen", fillcolor="LightSalmon", opacity=0.2,)

  # Set figure size
  #fig3.update_layout(width=400, height=400)
  fig3.update_yaxes(
    scaleanchor="x",
    scaleratio=1,
  )

  fig3.show()

In [13]:
print(list(neighbor_index.reshape(-1)))

['LISA-MONEY-', 'Harry-Styles-Golden', 'Imagine-Dragons-Thunder', 'Billie-Eilish-Khalid-lovely', 'Lady-Gaga-The-Cure']


Audio Based Similarity Retrieval

In [14]:
similarityAudio(fpath)

Analyzing file 1 of 1: /content/drive/MyDrive/multi_modal/test/test1/test1.wav


  sampling_rate, signal = wavfile.read(input_file) # from scipy.io


Feature extraction complexity ratio: 5.9 x realtime


track1: Billie-Eilish-Khalid-lovely, probability: [86.]
track2: LISA-MONEY-, probability: [10.5]
track3: Imagine-Dragons-Thunder, probability: [1.9]
track4: POP-SMOKE-WHAT-YOU-KNOW-BOUT-LOVE, probability: [1.4]
track5: Lost-Frequencies-ft-Calum-Scott-Where-Are-You-Now, probability: [0.1]
lp [-2.0607164]


In [18]:
def similarityAudio2(fpath):

  audio_extr_f = extract_features(fpath)

  _, mu, logvar = audioVae(audio_extr_f)

  epsilon = 0.1
  std = torch.exp(logvar/2)
  eps = torch.randn_like(std)
  z = (mu + epsilon * std).data.cpu().numpy()
  #z = audioVae.reparameterize(mu, logvar).data.cpu().numpy()
  neighbor_distance, neighbor_index = neigh.kneighbors(z, return_distance=True)
  neighbor_color = neighbor_index
  #print(neighbor_color)
  plot_radius = float(np.array(np.max(neighbor_distance)))
  #print(f'pr {plot_radius}')
  ha = (neighbor_distance.reshape(-1, 1)+100)

  neighbor_distance = softmax(ha).round(3)*100
  neighbor_distance = softmax(100 - neighbor_distance).round(3)*100
  #print(neighbor_distance)
  neighbor_index = encoder.inverse_transform(neighbor_index.reshape(-1))
  #print(neighbor_index)
  print()
  print()
  for i in range(len(neighbor_index)):
    print(f'track{i+1}: {neighbor_index[i]}, probability: {neighbor_distance[i]}')


  #fig = px.scatter(x=songs_z.iloc[:, 0], y=songs_z.iloc[:, 1])
  #fig = px.scatter(x=z[:, 0], y=z[:, 1])
  #fig.show()

  import plotly.express as px
  import plotly.graph_objects as go

  fig1 = px.scatter(x=songs_z.iloc[:, 0], y=songs_z.iloc[:, 1])
  fig2 = px.scatter(x=z[:, 0], y=z[:, 1], color_discrete_sequence=['red'])
  #print(list(neighbor_color.reshape(-1)))
  z4 = songs_z.iloc[list(neighbor_color.reshape(-1))]
  #print(z4)
  fig4 = px.scatter(x=z4['x'], y=z4['y'], color =neighbor_index) #color_discrete_sequence=['LightGreen'])
  fig4.update_traces(
    marker=dict(size=8, symbol="star-diamond", line=dict(width=2, color="DarkSlateGrey")),
    selector=dict(mode="markers"),
  )


  fig3 = go.Figure(data=fig1.data + fig2.data + fig4.data)
  print(f'lp {z[:, 0]- plot_radius}')
  fig3.add_shape(type="circle", xref="x", yref="y", x0=float(z[:, 0])-plot_radius,
                 y0=float(z[:, 1])-plot_radius, x1=float(z[:, 0])+plot_radius, y1=float(z[:, 1])+plot_radius,
                 line_color="LightGreen", fillcolor="LightSalmon", opacity=0.2,)

  # Set figure size
  #fig3.update_layout(width=400, height=400)
  fig3.update_yaxes(
    scaleanchor="x",
    scaleratio=1,
  )

  return fig3

In [None]:
!pip install dash # The core dash backend
!pip install dash-daq

# Import packages
from dash import Dash, html, dash_table, dcc, callback, Output, Input
import pandas as pd
import plotly.express as px

# Incorporate data
df = songs_z

# Initialize the app
app = Dash(__name__)

# App layout
app.layout = html.Div([
    html.Div(children='My First App with Data, Graph, and Controls'),
    html.Hr(),
    dcc.RadioItems(options=['pop', 'lifeExp', 'gdpPercap'], value='lifeExp', id='controls-and-radio-item'),
    dash_table.DataTable(data=df.to_dict('records'), page_size=6),
    dcc.Graph(figure={}, id='controls-and-graph')
])

# Add controls to build the interaction
@callback(
    Output(component_id='controls-and-graph', component_property='figure'),
    Input(component_id='controls-and-radio-item', component_property='value')
)
def update_graph(col_chosen):
    fig = similarityAudio2(fpath)
    return fig

# Run the app
if __name__ == '__main__':
    app.run(debug=True)