In [1]:
import numpy as np
import os, sys, librosa
from librosa import display
from scipy import signal
from matplotlib import pyplot as plt
import matplotlib.gridspec as gridspec
import IPython.display as ipd
import pandas as pd
from numba import jit
from IPython.display import Audio 
import IPython

import cdpam

from scipy.stats import pearsonr, spearmanr

import scipy.spatial as sp

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import numpy as np

import librosa
from IPython.display import Audio, display
from PIL import Image
import matplotlib.pyplot as plt
import scipy.stats as stats
import collections as c

from torch.nn.modules.module import _addindent

import copy
import os
import math

import soundfile as sf
from matplotlib.pyplot import figure

from sklearn.preprocessing import MinMaxScaler

%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

### Per Antognini et al. (Synthesizing Diverse, High-Quality Audio Textures) - https://arxiv.org/abs/1806.08002:
#### Input representation 
1. Ensure hop_length is less than half of win_length
2. Get absolute value of stft, add 1 and take natural logarithm. 
   Adding 1 guarantees that the log-spectrogram is finite and positive

#### Network architecture
1. 6 CNN layers, with filter sizes 2^n
2. Each layer with 512 filters
3. Each filter randomly drawn from Glorot initialization (In PyTorch - torch.nn.init.xavier_uniform) 


In [2]:
BOOTSTRAP_ITERS = 10 #To generate SEMs and Means.

N_PARALLEL_CNNS=6

N_FFT = 512 
K_HOP = 128 
N_FREQ= 257
N_FILTERS = 512

possible_kernels = [2,4,8,16,64,128,256,512,1024,2048]
filters = [0]*N_PARALLEL_CNNS
for j in range(N_PARALLEL_CNNS):
    filters[j]=possible_kernels[j]
    

use_cuda = torch.cuda.is_available() #use GPU if available
print('GPU available =',use_cuda)
dtype = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor

GPU available = False


In [3]:
def read_audio_spectrum(filename):
    #x, fs  = sf.read(filename)
    x, fs  = librosa.load(filename, sr=16000)
    R = np.abs(librosa.stft(x, n_fft=N_FFT, hop_length=K_HOP, win_length=N_FFT,  center=False))  
    R += 1
    R = np.log(R)
    return R,fs


'''
Make input such that frequency bins of Spectrogram are channels.
i.e. if Spectrogram shape is 257 X 247 => Input to convolution should be 1 X 257 X 1 X 247 
(PyTorch uses batch X channels X Height X Width)
'''
def prepare_input(filename):
    R, fs = read_audio_spectrum(filename)
    a_style = np.ascontiguousarray(R[None,None,:,:])
    a_style = torch.from_numpy(a_style).permute(0,2,1,3) 
    converted_img = Variable(a_style).type(dtype)
    return converted_img

# Glorot initialization
def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        torch.nn.init.xavier_uniform_(m.weight)

In [4]:

# This variable lets you 'tap' into the style_net to retrieve Gram Matrices at the layer matching that name.
style_layers_default = ['relu_2']


class style_net(nn.Module):
    """Here create the network you want to use by adding/removing layers in nn.Sequential"""
    def __init__(self, num_channels, num_filters, filter_size):
        super(style_net, self).__init__()
        self.layers = nn.Sequential(c.OrderedDict([
                            ('conv1',nn.Conv2d(num_channels, num_filters, kernel_size=(1,filter_size),bias=False)),
                            ('relu1',nn.ReLU())]))

            
    def forward(self,input):
        out = self.layers(input)
        return out

class GramMatrix(nn.Module):
    '''Compute the feature correlations'''
    def forward(self, input):
        a, b, c, d = input.size()         
        features = input.view(b, a * c * d)
        features2=features.unsqueeze(0)
        G = torch.matmul(features2, torch.transpose(features2, 1,2))
        return G.div(a * c * d)


    
'''
Initialize parallel CNNs as suggested by Antognini et al.
Typical values:
num_cnns = N_PARALLEL_CNNS
per_cnn_num_channels = N_FREQ
per_cnn_num_filters = N_FILTERS
per_cnn_filter_sizes = [2,4,8,16,64,128]
'''
def initialize_parallel_cnns(num_cnns, per_cnn_num_channels, per_cnn_num_filters, per_cnn_filter_sizes):
    cnnlist=[] 
    for j in range(num_cnns) :
        cnn = style_net(per_cnn_num_channels, per_cnn_num_filters, per_cnn_filter_sizes[j])
        cnn.apply(lambda x: weights_init(x))
        for param in cnn.parameters():
            param.requires_grad = False
        if use_cuda:
            cnn = cnn.cuda()

        cnnlist.append(cnn)
        
    return cnnlist


In [5]:
def get_gram(cnn, formatted_input, style_layers=style_layers_default):
    
    result = []
    cnn = copy.deepcopy(cnn)
    
    model = nn.Sequential()
    layer_list = list(cnn.layers)
    gram = GramMatrix()
     
    i = 1  
    for layer in layer_list:
        if isinstance(layer, nn.Conv2d): 
            name = "conv_" + str(i)
            model.add_module(name, layer)
            
            if name in style_layers: 
                target_feature = model(formatted_input).clone()
                target_feature_gram = gram(target_feature)
                target_feature_gram=torch.flatten(target_feature_gram)
                target_feature_gram=target_feature_gram.numpy()
                target_feature_gram=target_feature_gram.reshape(512,512)
                target_feature_gram= MinMaxScaler().fit_transform(target_feature_gram)
                
                result.append((target_feature_gram))

        if isinstance(layer, nn.ReLU):
            name = "relu_" + str(i)
            model.add_module(name, layer)
            
            if name in style_layers:
                target_feature = model(formatted_input).clone()
                target_feature_gram = gram(target_feature)
                target_feature_gram=torch.flatten(target_feature_gram)
                target_feature_gram=target_feature_gram.numpy()
                target_feature_gram=target_feature_gram.reshape(512,512)
                target_feature_gram= MinMaxScaler().fit_transform(target_feature_gram)
                result.append((target_feature_gram))
               
        if isinstance(layer, nn.MaxPool2d): 
            name = "pool_" + str(i)
            model.add_module(name, layer)
            
            if name in style_layers:
                target_feature = model(formatted_input).clone()
                target_feature_gram = gram(target_feature)
                target_feature_gram=torch.flatten(target_feature_gram)
                target_feature_gram=target_feature_gram.numpy()
                target_feature_gram=target_feature_gram.reshape(512,512)
                target_feature_gram= MinMaxScaler().fit_transform(target_feature_gram)
                result.append((target_feature_gram))
                
        i += 1
    return result

## Define Gram Loss (normalized w.r.t a target as defined in Antognini et. al)

### Based on the normalized gram loss defined in Antognini et al. (paper link at the top of this notebook)

#### Find Gram loss between two Gram Matrices. Normalized by the 'referenced' or 'target' gram matrix

$$Gram\_Loss_{G, \hat{G}} = \frac{\sum_{k,\mu,v}(G_{\mu,v}^k - \hat{G}_{\mu,v}^k)^2}{\sum_{k,\mu,v}(\hat{G}_{\mu,v}^k)^2}$$


In [6]:
def get_antognini_gram_loss(gm_1, gm_2):
    # Here both gm_1 and gm_2 are of shape 6 X 1 X 512 X 512
    frobenius_sum_of_gm_diff = 0
    frobenius_sum_of_target_norm = 0
    for layer_num in range(gm_1.shape[0]):
        frobenius_sum_of_gm_diff += np.linalg.norm(gm_1[layer_num][0] - gm_2[layer_num][0])**2
        frobenius_sum_of_target_norm += np.linalg.norm(gm_2[layer_num][0])**2
    return frobenius_sum_of_gm_diff/frobenius_sum_of_target_norm

In [7]:
def compute_cos_distance(gram1,gram2): 
    cos = nn.CosineSimilarity(dim=0, eps=1e-6)
    gram1_ = np.squeeze(gram1, axis=1)
    gram2_ = np.squeeze(gram2, axis=1)
    distance = np.zeros((6,1))
    for i in range(6): 
        temp= cos(torch.from_numpy(gram1_[i].flatten()), torch.from_numpy(gram2_[i].flatten())) 
        distance[i]=temp
    return 1-distance.mean()

In [8]:
cnnlist = initialize_parallel_cnns(N_PARALLEL_CNNS, N_FREQ, N_FILTERS, filters)

In [9]:
def get_param_sense_losses(audio_dir, test_type='gan', num_interp_steps=10, dir_list=None):
    example_dirs = [_ for _ in os.listdir(audio_dir) if _ !='.DS_Store']
    print(example_dirs)
    all_gm_losses = np.empty((0,num_interp_steps))
    for example_dir in example_dirs:
        
        if example_dir in dir_list:

            print('Analysing example ', example_dir)
            if test_type == 'gan':
                example_loc_path = os.path.join(audio_dir, example_dir, 'one_z_pitch_sweep')
            else:
                example_loc_path = os.path.join(audio_dir, example_dir)

            example_loc = os.listdir(example_loc_path)
            example_loc = [_ for _ in os.listdir(example_loc_path) if '.wav' in _]
            if test_type == 'gan':
                example_loc.sort(key=lambda x:int(x.split('_')[3].split('.')[0]))
            elif test_type == 'morph2':
                example_loc.sort(key=lambda x:float(x.split('-')[3]))
            else:
                example_loc.sort(key=lambda x:float(x.split('_')[1].split('.wav')[0]))

            #print(example_loc)
            audio_file_list = []
            for example_audio in example_loc:
                if '.wav' in example_audio and example_audio:
                    audio_file_list.append(os.path.join(example_loc_path, example_audio))

            audio_0 = prepare_input(audio_file_list[0])
            audio_0_gm = []
            for j in range(N_PARALLEL_CNNS):
                temp = get_gram(cnnlist[j], audio_0, style_layers=style_layers_default)
                audio_0_gm.append(temp)
            audio_0_gm = np.array(audio_0_gm)

            example_gm_losses = np.array([])
            for audio_file in audio_file_list:
                audio_i = prepare_input(audio_file)

                audio_i_gm = []
                for j in range(N_PARALLEL_CNNS):
                    temp = get_gram(cnnlist[j], audio_i, style_layers=style_layers_default)
                    audio_i_gm.append(temp)
                audio_i_gm = np.array(audio_i_gm)
                example_gm_losses = np.append(example_gm_losses, compute_cos_distance(audio_0_gm, audio_i_gm))
            #print(example_gm_losses)
            all_gm_losses = np.append(all_gm_losses, np.array([example_gm_losses]), axis=0)

    print('GM Loss array', np.mean(all_gm_losses, axis=0))
    return np.mean(all_gm_losses, axis=0)

# MorphGAN

In [10]:
morphgan_audio='/Users/purnimakamath/appdir/Github/ieee-tx-on-mm/data/water-wind/morphgan/audio/semantic'

water_dirs = ['2022-08-24 21:38','2022-08-24 21:39','2022-08-24 21:40','2022-08-24 21:41','2022-08-24 21:42']
morphgan_paramsense_losses_water = get_param_sense_losses(morphgan_audio, test_type='gan', 
                                                          num_interp_steps=11, dir_list=water_dirs)

wind_dirs = ['2022-08-24 21:44','2022-08-24 21:45','2022-08-24 21:46','2022-08-24 21:47','2022-08-24 21:48']
morphgan_paramsense_losses_wind = get_param_sense_losses(morphgan_audio, test_type='gan', 
                                                          num_interp_steps=11, dir_list=wind_dirs)

print('----------Results--------')
lin_line = [a for a in np.arange(0,1.1,0.1)]
print('MorphGAN: Corr coef for Semantic Param Sensitivity Water = ', pearsonr(lin_line, morphgan_paramsense_losses_water))
print('MorphGAN: Corr coef for Semantic Param Sensitivity Wind = ', pearsonr(lin_line, morphgan_paramsense_losses_wind))

['2022-08-24 21:48', '2022-08-24 21:41', '2022-08-24 21:46', '2022-08-24 21:47', '2022-08-24 21:40', '2022-08-24 21:38', '2022-08-24 21:39', '2022-08-24 21:45', '2022-08-24 21:42', '2022-08-24 21:44']
Analysing example  2022-08-24 21:41
Analysing example  2022-08-24 21:40
Analysing example  2022-08-24 21:38
Analysing example  2022-08-24 21:39
Analysing example  2022-08-24 21:42
GM Loss array [0.         0.01269024 0.0208034  0.02930466 0.04034504 0.06204682
 0.08139823 0.09305373 0.10165636 0.109344   0.11778615]
['2022-08-24 21:48', '2022-08-24 21:41', '2022-08-24 21:46', '2022-08-24 21:47', '2022-08-24 21:40', '2022-08-24 21:38', '2022-08-24 21:39', '2022-08-24 21:45', '2022-08-24 21:42', '2022-08-24 21:44']
Analysing example  2022-08-24 21:48
Analysing example  2022-08-24 21:46
Analysing example  2022-08-24 21:47
Analysing example  2022-08-24 21:45
Analysing example  2022-08-24 21:44
GM Loss array [0.         0.04532412 0.11898211 0.20578746 0.28231331 0.34086918
 0.37687345 0.42165

# One Hot GAN

In [11]:
onehot_audio='/Users/purnimakamath/appdir/Github/ieee-tx-on-mm/data/water-wind/onehot/audio/semantic'

water_dirs = ['2022-08-24 22:31','2022-08-24 22:33','2022-08-24 22:34','2022-08-24 22:35','2022-08-24 22:36']
onehot_paramsense_losses_water = get_param_sense_losses(onehot_audio, test_type='gan', 
                                                        num_interp_steps=11, dir_list=water_dirs)

wind_dirs = ['2022-08-24 22:26','2022-08-24 22:27','2022-08-24 22:28','2022-08-24 2221:29','2022-08-24 22:30']
onehot_paramsense_losses_wind = get_param_sense_losses(onehot_audio, test_type='gan',
                                                       num_interp_steps=11, dir_list=wind_dirs)

print('----------Results--------')
lin_line = [a for a in np.arange(0,1.1,0.1)]
print('One Hot: Corr coef for Semantic Param Sensitivity Water = ', pearsonr(lin_line, onehot_paramsense_losses_water))
print('One Hot: Corr coef for Semantic Param Sensitivity Wind = ', pearsonr(lin_line, onehot_paramsense_losses_wind))

['2022-08-24 22:35', '2022-08-24 22:33', '2022-08-24 22:34', '2022-08-24 22:29', '2022-08-24 22:27', '2022-08-24 22:26', '2022-08-24 22:28', '2022-08-24 22:31', '2022-08-24 22:36', '2022-08-24 22:30']
Analysing example  2022-08-24 22:35
Analysing example  2022-08-24 22:33
Analysing example  2022-08-24 22:34
Analysing example  2022-08-24 22:31
Analysing example  2022-08-24 22:36
GM Loss array [0.         0.16265204 0.22498458 0.2589373  0.30608114 0.32786236
 0.34416411 0.33627304 0.30581516 0.20112201 0.19798566]
['2022-08-24 22:35', '2022-08-24 22:33', '2022-08-24 22:34', '2022-08-24 22:29', '2022-08-24 22:27', '2022-08-24 22:26', '2022-08-24 22:28', '2022-08-24 22:31', '2022-08-24 22:36', '2022-08-24 22:30']
Analysing example  2022-08-24 22:27
Analysing example  2022-08-24 22:26
Analysing example  2022-08-24 22:28
Analysing example  2022-08-24 22:30
GM Loss array [0.         0.1518331  0.28248233 0.41201909 0.49096375 0.50662022
 0.51339376 0.52625141 0.52387473 0.48966953 0.46946993

# Mix

In [12]:
mix_audio='/Users/purnimakamath/appdir/Github/ieee-tx-on-mm/data/water-wind/mix/audio/semantic'

water_dirs = ['0','1','2','3','4','5']
mix_paramsense_losses_water = get_param_sense_losses(mix_audio, test_type='mix', 
                                                        num_interp_steps=11, dir_list=water_dirs)

wind_dirs = ['6','7','8','9','10','11']
mix_paramsense_losses_wind = get_param_sense_losses(mix_audio, test_type='mix',
                                                       num_interp_steps=11, dir_list=wind_dirs)

print('----------Results--------')
lin_line = [a for a in np.arange(0,1.1,0.1)]
print('Mix: Corr coef for Semantic Param Sensitivity Water = ', pearsonr(lin_line, mix_paramsense_losses_water))
print('Mix: Corr coef for Semantic Param Sensitivity Wind = ', pearsonr(lin_line, mix_paramsense_losses_wind))


['9', '0', '11', '7', '6', '1', '10', '8', '4', '3', '2', '5']
Analysing example  0
Analysing example  1
Analysing example  4
Analysing example  3
Analysing example  2
Analysing example  5
GM Loss array [0.         0.00472904 0.01249819 0.02383863 0.03938796 0.06138348
 0.09206909 0.13467153 0.19149764 0.26188955 0.31608174]
['9', '0', '11', '7', '6', '1', '10', '8', '4', '3', '2', '5']
Analysing example  9
Analysing example  11
Analysing example  7
Analysing example  6
Analysing example  10
Analysing example  8
GM Loss array [0.         0.0929633  0.17010932 0.22786329 0.27356359 0.3111529
 0.34455669 0.37387544 0.4007599  0.42232825 0.43077639]
----------Results--------
Mix: Corr coef for Semantic Param Sensitivity Water =  (0.9410272397855062, 1.5785749221401356e-05)
Mix: Corr coef for Semantic Param Sensitivity Wind =  (0.9658595826372901, 1.398807912871057e-06)


# Morph2

In [13]:
morph2_audio='/Users/purnimakamath/appdir/Github/ieee-tx-on-mm/data/water-wind/morph2/audio/semantic'

water_dirs = ['0','1','2','3','4','5']
morph2_paramsense_losses_water = get_param_sense_losses(morph2_audio, test_type='morph2', 
                                                        num_interp_steps=11, dir_list=water_dirs)

wind_dirs = ['6','7','8','9','10','11']
morph2_paramsense_losses_wind = get_param_sense_losses(morph2_audio, test_type='morph2',
                                                       num_interp_steps=11, dir_list=wind_dirs)

print('----------Results--------')
lin_line = [a for a in np.arange(0,1.1,0.1)]
print('Morph2: Corr coef for Semantic Param Sensitivity Water = ', pearsonr(lin_line, morph2_paramsense_losses_water))
print('Morph2: Corr coef for Semantic Param Sensitivity Wind = ', pearsonr(lin_line, morph2_paramsense_losses_wind))


['9', '0', '11', '7', '6', '1', '10', '8', '4', '3', '2', '5']
Analysing example  0
Analysing example  1
Analysing example  4
Analysing example  3
Analysing example  2
Analysing example  5
GM Loss array [0.         0.13553231 0.14017034 0.1480988  0.17383175 0.17727922
 0.22767127 0.28618371 0.30452233 0.31608174 0.24597192]
['9', '0', '11', '7', '6', '1', '10', '8', '4', '3', '2', '5']
Analysing example  9
Analysing example  11
Analysing example  7
Analysing example  6
Analysing example  10
Analysing example  8
GM Loss array [0.         0.10119768 0.09723039 0.09800444 0.0946367  0.04965055
 0.11818325 0.15529053 0.16999443 0.16886074 0.43077639]
----------Results--------
Morph2: Corr coef for Semantic Param Sensitivity Water =  (0.9014542131776159, 0.00015009031673983753)
Morph2: Corr coef for Semantic Param Sensitivity Wind =  (0.7602970280225969, 0.0066045317029890796)
