In [1]:
# Environment setup for the `winston_base` kernel (run in a shell, not in this notebook):
# conda env create python=3.7.3 -f winston_base.yml
# conda install -c anaconda ipykernel
# python -m ipykernel install --user --name=winston_base

# conda env create python=3.7.13 -f HF.yml 
# conda env create python=3.10.4 -f tf2_new.yml 
# conda env create python=3.7.0 -f ssl.yml


In [2]:
import argparse
from smd.data import preprocessing
from smd.data import postprocessing
import smd.utils as utils
import numpy as np
import tensorflow as tf
import keras.models
from tqdm import tqdm
import os
import glob
import shutil
import json

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
# Folder that holds "Short_files.json", the per-file metadata that later cells
# read, enrich with predictions, and write back.
output_root = '../INPUTS_OUTPUTS/Outputs'

In [4]:




def test_data_processing(file, mean, std):
    if os.path.splitext(file)[1] == '.npy':
        spec = np.load(file)
    else:
        audio = utils.load_audio(file)
        spec = preprocessing.get_spectrogram(audio)
    mels = preprocessing.get_scaled_mel_bands(spec)
    mels = preprocessing.normalize(mels, mean, std)
    return mels.T


def _default_output_path(file):
    """Return the default annotation path for ``file``.

    A final ``.npy`` or ``.wav`` extension is replaced by ``_prediction.txt``;
    for any other name the suffix is simply appended. Only the last extension
    of the file name is touched, so ``.npy``/``.wav`` occurring elsewhere in the
    path (e.g. in a directory name) is preserved.
    """
    stem, extension = os.path.splitext(file)
    if extension in ('.npy', '.wav'):
        return stem + "_prediction.txt"
    return file + "_prediction.txt"


def _predict_file(model, file, mean, std, smoothing):
    """Run the model on a single file and return the resulting annotation."""
    x = test_data_processing(file, mean, std)
    # The model is fed one sample at a time, so add a leading batch axis of 1.
    x = x.reshape((1, x.shape[0], x.shape[1]))
    output = model.predict(x, batch_size=1, verbose=0)[0].T
    output = postprocessing.apply_threshold(output)
    if smoothing:
        output = postprocessing.smooth_output(output)
    return preprocessing.label_to_annotation(output)


def predict(data_path, output_file, model_path, mean_path, std_path, smoothing):
    """Annotate one file, or every ``.npy``/``.wav`` file in a directory.

    Args:
        data_path: Path to a single input file or to a directory. For a
            directory, every ``*.npy`` and ``*.wav`` file directly inside it is
            processed.
        output_file: Destination of the annotation when ``data_path`` is a
            file. Pass ``""`` to write next to the input as
            ``<name>_prediction.txt``. Must be ``""`` when ``data_path`` is a
            directory.
        model_path: Path handed to ``keras.models.load_model``.
        mean_path: ``.npy`` file with the normalization mean.
        std_path: ``.npy`` file with the normalization standard deviation.
        smoothing: When truthy, ``postprocessing.smooth_output`` is applied to
            the thresholded model output.

    Raises:
        ValueError: If ``data_path`` is a directory and ``output_file`` is not
            empty. This is checked before anything is loaded.
    """
    is_directory = os.path.isdir(data_path)
    # Fail fast: reject the invalid argument combination before paying for the
    # model load.
    if is_directory and output_file != "":
        raise ValueError("It is possible to set an output file only if the input is a file.")

    mean = np.load(mean_path)
    std = np.load(std_path)

    print("Loading the model " + model_path + "..")
    with tf.device('/cpu:0'):
        model = keras.models.load_model(model_path)
    print("Start the prediction..")

    if is_directory:
        directory = os.path.abspath(data_path)
        files = glob.glob(directory + "/*.npy") + glob.glob(directory + "/*.wav")
        for file in tqdm(files):
            annotation = _predict_file(model, file, mean, std, smoothing)
            utils.save_annotation(annotation, _default_output_path(file))
    else:
        file = os.path.abspath(data_path)
        annotation = _predict_file(model, file, mean, std, smoothing)
        # An explicit output file wins; otherwise derive the name from the input.
        output_path = output_file if output_file != "" else _default_output_path(file)
        utils.save_annotation(annotation, output_path)

In [5]:
# Base prefix for every path built below; presumably the project root relative
# to the notebook's working directory (TODO confirm).
root = '../'

# Output folder, relative to `root`. `root + output_location` resolves to the
# same folder as `output_root`.
output_location = 'INPUTS_OUTPUTS/Outputs/'

In [6]:

# Folder "Short_split_file/" inside the output location. The name matches the
# `data_path` parameter of predict() -- presumably kept for calling it directly.
data_path =root + output_location + "Short_split_file/"

# Empty string: predict() then derives "<name>_prediction.txt" from the input.
output_file =""

# Keras checkpoint (.hdf5) that the model-loading cell restores.
model_path = root + "speech-music-detection/checkpoint/weights.28-0.13exp1_blstm.hdf5"

# Normalization statistics (.npy) loaded with np.load and passed to
# test_data_processing().
mean_path = root + "speech-music-detection/checkpoint/mean_gtzan_esc-50_muspeak_musan.npy"

std_path = root + "speech-music-detection/checkpoint/std_gtzan_esc-50_muspeak_musan.npy"

# When True, postprocessing.smooth_output() is applied to the thresholded output.
smoothing = True

In [7]:
# Normalization statistics handed to test_data_processing() for every file;
# loaded in order: mean first, then std.
mean, std = map(np.load, (mean_path, std_path))

In [8]:
# Restore the trained Keras model from `model_path`. The load is wrapped in a
# CPU device scope, so the model is built under '/cpu:0'.
print("Loading the model " + model_path + "..")
with tf.device('/cpu:0'):
    model = keras.models.load_model(model_path)
print("Start the prediction..")

Loading the model ../speech-music-detection/checkpoint/weights.28-0.13exp1_blstm.hdf5..




Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Start the prediction..


In [9]:
# Per-file metadata: a mapping whose values carry the wav path under
# ['filepaths']['wav'] (read by the inference loop).
short_files_json = os.path.join(output_root, "Short_files.json")
with open(short_files_json, mode="r") as openfile:
    audio_files = json.loads(openfile.read())

In [10]:
# Run the speech/music model on every wav listed in `audio_files` and attach the
# resulting annotation to each entry under 'speech_music_pred'.
print("Music/speech inference: ")
for filename, f_info in tqdm(audio_files.items()):
    f_path = f_info['filepaths']['wav']
    x = test_data_processing(f_path, mean, std)
    # The model is fed one sample at a time: add a leading batch axis of 1.
    x = x.reshape((1, x.shape[0], x.shape[1]))
    output = postprocessing.apply_threshold(
        model.predict(x, batch_size=1, verbose=0)[0].T
    )
    output = postprocessing.smooth_output(output) if smoothing else output
    # Order the segments by their first field (presumably the start time).
    annotation = sorted(
        preprocessing.label_to_annotation(output),
        key=lambda segment: segment[0],
    )
    f_info['speech_music_pred'] = annotation

  0%|          | 0/499 [00:00<?, ?it/s]

Music/speech inference: 


100%|██████████| 499/499 [00:53<00:00,  9.34it/s]


In [11]:
# Serialize first, then open the file: if serialization fails, the existing
# "Short_files.json" is left untouched.
json_object = json.dumps(audio_files, indent=4)
short_files_json = os.path.join(output_root, "Short_files.json")
with open(short_files_json, mode="w") as outfile:
    outfile.write(json_object)