In [1]:
import argparse
import csv
import datetime
import glob
import math
import os
import sys
import time
import numpy as np
from collections import Counter
import pandas as pd 

from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit

In [2]:
import torch
import data_loader

In [3]:
import keras
from keras import optimizers
from keras import backend as K
from keras.models import Sequential, Model
from keras.layers import Input, Convolution2D, MaxPooling2D, Dense, Dropout, Activation, Flatten, merge
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import ELU

Using TensorFlow backend.


In [4]:
# Root directory of the input audio. Its entries are listed into `musicDir`
# and each entry is then listed again as a sub-directory of audio files.
# NOTE(review): hard-coded absolute path - adjust before running elsewhere.
AUDIO_PATH = '/root/HSE_final/segm/audio'

In [5]:
from tqdm import tqdm_notebook
import pickle

In [None]:
# Names of the entries directly under AUDIO_PATH; the feature-extraction loop
# treats each one as a sub-directory of audio files.
# os.listdir returns entries in arbitrary order.
musicDir = os.listdir(AUDIO_PATH)

In [None]:
def create_spectrograms_new(filelist):
    """Parse audio files into spectrograms padded by two extra columns.

    Every file is run through ``data_loader.SpectrogramParser`` (16 kHz,
    15 ms hamming window, 3.9 ms stride, ``max_frame`` normalisation). Each
    resulting spectrogram is widened along axis 1 by two columns filled with
    that spectrogram's own mean value.

    Parameters
    ----------
    filelist : sequence of str
        Audio file paths. Each one is joined onto ``AUDIO_PATH`` with
        ``os.path.join``, so an absolute path is used unchanged.

    Returns
    -------
    list of numpy.ndarray
        One padded spectrogram per input file, in input order.
    """
    print("Reading and processing", len(filelist), "audio files")
    parser = data_loader.SpectrogramParser(
        {'sample_rate': 16000, 'window_size': 0.015,
         'window_stride': 0.0039, 'window': 'hamming'},
        normalize='max_frame')
    list_spectrograms = []
    for sample in tqdm_notebook(filelist):
        filepath = os.path.join(AUDIO_PATH, sample)
        # Convert to numpy once instead of once for the data and once for the mean.
        spectrogram = parser.parse_audio(filepath).numpy()
        # The padding height follows the spectrogram itself rather than a
        # hard-coded 121, so the parser settings can change without breaking this.
        # NOTE(review): the two extra columns presumably bring the width up to
        # the 1451 columns the model input expects - confirm.
        padding = np.full((spectrogram.shape[0], 2), spectrogram.mean(), dtype='float32')
        list_spectrograms.append(np.append(spectrogram, padding, axis=1))
    return list_spectrograms

In [19]:
# Feature extraction: for every sub-directory of AUDIO_PATH, build the
# spectrograms, run them through `nn_2_way`, and write one pickle per audio
# file holding [layer_output[0][i], layer_output[1][i]].
# NOTE: this cell relies on `standardize`, `add_channel` and `nn_2_way`, which
# must already be defined in the kernel before it is run.

# Maximum number of samples passed to nn_2_way in a single call.
max_batch = 1100

for types in tqdm_notebook(musicDir):
    list_of_files = []
    list_of_names = []
    for i in os.listdir(AUDIO_PATH + '/' + types):
        if i != '.ipynb_checkpoints':
            list_of_files.append(AUDIO_PATH + '/' + types + '/' + i)
            # NOTE(review): keeps only the text before the FIRST dot, so
            # "a.1.wav" and "a.2.wav" would both map to "a" and overwrite
            # each other - confirm file names never contain extra dots.
            list_of_names.append(i.split('.')[0])
    if not list_of_files:
        # Nothing to process; np.stack would raise on an empty list.
        continue
    data = create_spectrograms_new(list_of_files)
    data = np.stack(data, axis=0)
    data = standardize(data)
    data = add_channel(data, n_channels=1)
    # Run the network in fixed-size batches. The previous two-way split at
    # 1100 left the second chunk unbounded for directories with > 2200 files.
    batch_outputs = [nn_2_way([data[start:start + max_batch]])
                     for start in range(0, data.shape[0], max_batch)]
    layer_output = [np.concatenate([out[k] for out in batch_outputs], axis=0)
                    for k in range(2)]
    # exist_ok lets the cell be re-run without failing on an existing directory.
    os.makedirs('/root/HSE_final/features/audio/' + types, exist_ok=True)
    for i in range(len(layer_output[0])):
        list_pickles = [layer_output[0][i], layer_output[1][i]]
        with open('/root/HSE_final/features/audio/'+types + '/'+list_of_names[i]+'.pickle', 'wb') as f:
            pickle.dump(list_pickles, f)
         
        

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

Reading and processing 125 audio files


HBox(children=(IntProgress(value=0, max=125), HTML(value='')))

Reading and processing 1602 audio files


HBox(children=(IntProgress(value=0, max=1602), HTML(value='')))




In [12]:
def standardize(data):
    """Standardize a stack of 2-D arrays with sklearn's ``StandardScaler``.

    The input of shape ``(N, ydim, xdim)`` is flattened to
    ``(N, ydim * xdim)`` because the scaler works on 2-D input, a freshly
    created ``StandardScaler`` is fitted and applied to it, and the result
    is reshaped back to ``(N, ydim, xdim)``.
    """
    n_samples, n_rows, n_cols = data.shape

    # The scaler needs one row per sample, so collapse the two image axes.
    flat = data.reshape(n_samples, n_cols * n_rows)
    flat_scaled = preprocessing.StandardScaler().fit_transform(flat)

    # Restore the original layout.
    return flat_scaled.reshape(n_samples, n_rows, n_cols)

In [13]:
def add_channel(data, n_channels=1):
    """Append a trailing channel axis to a batch of 2-D arrays.

    Parameters
    ----------
    data : numpy.ndarray
        Array of shape ``(N, ydim, xdim)``.
    n_channels : int, optional
        Size of the new trailing axis. ``1`` (default) gives a single
        grey-scale channel. Larger values (e.g. ``3`` for RGB-shaped input)
        replicate the single plane into every channel.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(N, ydim, xdim, n_channels)``.

    Raises
    ------
    ValueError
        If ``n_channels`` is smaller than 1, or ``data`` is not 3-D.
    """
    if n_channels < 1:
        raise ValueError("n_channels must be >= 1, got %r" % (n_channels,))

    N, ydim, xdim = data.shape
    data = data.reshape(N, ydim, xdim, 1)

    # A plain reshape to (..., n_channels) cannot work for n_channels > 1
    # because the element count would change; replicate the plane instead.
    if n_channels > 1:
        data = np.repeat(data, n_channels, axis=-1)

    return data

In [14]:
# Metrics

def precision(y_true, y_pred):
    """Precision computed with Keras backend ops.

    Counts the rounded, [0, 1]-clipped entries of ``y_true * y_pred`` and
    divides by the count of rounded, [0, 1]-clipped entries of ``y_pred``.
    ``K.epsilon()`` is added to the denominator to avoid division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

def recall(y_true, y_pred):
    """Recall computed with Keras backend ops.

    Counts the rounded, [0, 1]-clipped entries of ``y_true * y_pred`` and
    divides by the count of rounded, [0, 1]-clipped entries of ``y_true``.
    ``K.epsilon()`` is added to the denominator to avoid division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

# Metric specification: the built-in 'accuracy' name plus the two custom
# backend functions defined above.
# NOTE(review): presumably passed to model.compile(metrics=...) elsewhere;
# it is not referenced in the visible cells.
metrics = ['accuracy', precision, recall]

In [17]:
# Backend function that maps the model's first-layer input to two
# intermediate activations: the output of the second-to-last layer and the
# output of the third-to-last layer, returned in that order.
# NOTE: requires `model` to exist already. It is created by the load_model
# cell, which appears after this one in the file but carries an earlier
# execution count (In [16] vs In [17]); on a fresh kernel run that cell first.
nn_2_way = K.function([model.layers[0].input],
                    [model.layers[-2].output,model.layers[-3].output])

In [16]:
# Load the saved Keras model from 'my_model.h5' (path relative to the
# working directory). custom_objects maps the names 'precision' and 'recall'
# to the functions defined in this notebook, so those must be defined first.
# NOTE(review): presumably the model was compiled with these custom metrics,
# which is why they have to be supplied for deserialisation - confirm.
from keras.models import load_model
model = load_model('my_model.h5',custom_objects={'precision': precision,'recall':recall})
        

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.


In [18]:
# Print the layer-by-layer architecture (layer type, output shape, parameter
# count) of the loaded model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 121, 1451, 1)      0         
_________________________________________________________________
bn_0_freq (BatchNormalizatio (None, 121, 1451, 1)      5804      
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 121, 1451, 64)     640       
_________________________________________________________________
bn1 (BatchNormalization)     (None, 121, 1451, 64)     256       
_________________________________________________________________
elu_1 (ELU)                  (None, 121, 1451, 64)     0         
_________________________________________________________________
pool1 (MaxPooling2D)         (None, 60, 362, 64)       0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 60, 362, 64)       36928     
__________