In [22]:
from __future__ import absolute_import
import glob
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import specgram
import soundfile as sf
from tqdm import tqdm
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.utils import to_categorical
from keras.optimizers import SGD
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D
from keras.layers.convolutional import Conv2D
from sklearn.svm import SVC


In [2]:
def extract_feature(file_name):
    """Generates feature input (mfccs, chroma, mel, contrast, tonnetz).
    -*- author: mtobeiyf https://github.com/mtobeiyf/audio-classification -*-
    """
    X, sample_rate = sf.read(file_name, dtype='float32')
    if X.ndim > 1:
        X = X[:,0]
    X = X.T

    stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T,axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
    mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T,axis=0)
    return mfccs, chroma, mel, contrast, tonnetz


In [3]:
def parse_audio_files(parent_dir, sub_dirs, file_ext=None, verbose=True):
    """Parses directory in search of specified file types, then compiles feature data from them.
    -*- adapted from code by mtobeiyf https://github.com/mtobeiyf/audio-classification -*-
    """
    # by default test for only these types
    if file_ext == None:
        file_types = ['*.ogg', '*.wav']
    else:
        file_types = []
        file_types.push(file_ext)
    features, labels = np.empty((0,193)), np.empty(0)
    for label, sub_dir in enumerate(sub_dirs):
        for file_ext in file_types:
            # file names
            iter = glob.glob(os.path.join(parent_dir, sub_dir, file_ext))
            if len(iter) > 0:
                if verbose: print('Reading', os.path.join(parent_dir, sub_dir, file_ext), '...')
                for fn in tqdm(iter):
                    ext_features = get_ext_features(fn)
                    if type(ext_features) is np.ndarray:
                        features = np.vstack([features, ext_features])
                        labels = np.append(labels, label)
    return np.array(features), np.array(labels, dtype = np.int)

In [4]:
def get_ext_features(fn):
    """Returns features for individual audio file.
    -*- adapted from code by mtobeiyf https://github.com/mtobeiyf/audio-classification -*-
    """
    try:
        mfccs, chroma, mel, contrast, tonnetz = extract_feature(fn)
        ext_features = np.hstack([mfccs, chroma, mel, contrast, tonnetz])
        return ext_features
    except Exception as e:
        print("[Error] extract feature error. %s" % (e))
        return None


In [5]:
def parse_audio_file(fn):
    """Returns features of single audio file
    -*- adapted from code by mtobeiyf https://github.com/mtobeiyf/audio-classification -*-
    """
    features = np.empty((0,193))
    ext_features = get_ext_features(fn)
    features = np.vstack([features,ext_features])
    return np.array(features)


In [6]:
def svm(num_classes):
    """Support vector machine.
    -*- ref: mtobeiyf https://github.com/mtobeiyf/audio-classification -*-
    """
    from sklearn.svm import SVC

    return SVC(C=20.0, gamma=0.00001)

In [7]:
def nn(num_classes):
    """Multi-layer perceptron.
    """
    model = Sequential()
    model.add(Dense(256, input_dim=193))
    model.add(Activation('relu'))
    model.add(Dropout(0.6))
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dropout(0.6))
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))

    return model


In [8]:
def cnn(num_classes):
    """1D Convolutional Neural Network.
    -*- ref: panotti https://github.com/drscotthawley/panotti -*-
    """
    from keras.layers import Embedding
    from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D

    activation = 'softmax' if num_classes > 2 else 'sigmoid'
    model = Sequential()
    model.add(Conv1D(64, 3, input_shape=(193, 1)))
    model.add(Activation('relu'))
    model.add(Conv1D(64, 3))
    model.add(Activation('relu'))
    model.add(MaxPooling1D(3))
    model.add(Conv1D(128, 3))
    model.add(Activation('relu'))
    model.add(Conv1D(128, 3))
    model.add(Activation('relu'))
    model.add(GlobalAveragePooling1D())
    model.add(Dropout(0.5))
    model.add(Dense(num_classes))
    model.add(Activation(activation))

    return model


In [10]:
def feature_extraction(data_path):
    """Parses audio files in supplied data path.
    -*- author: mtobeiyf https://github.com/mtobeiyf/audio-classification -*-
    """
    r = os.listdir(data_path)
    r.sort()
    features, labels = parse_audio_files(data_path, r)
    return features, labels

In [25]:
def train(features, labels, type='cnn', num_classes=None, print_summary=False,
    save_model=False, lr=0.01, loss_type=None, epochs=50, optimizer='SGD', verbose=True):
    """Trains model based on provided feature & target data
    Options:
    - epochs: The number of iterations. Default is 50.
    - lr: Learning rate. Increase to speed up training time, decrease to get more accurate results (if your loss is 'jumping'). Default is 0.01.
    - optimiser: Default is 'SGD'.
    - print_summary: Prints a summary of the model you'll be training. Default is False.
    - loss_type: Classification type. Default is categorical for >2 classes, and binary otherwise.
    """
    print("SA MODEL MUNIR")
    labels = labels.ravel()
    if num_classes == None: num_classes = np.max(labels, axis=0)

    model = cnn(num_classes)
    if print_summary == True: model.summary()

    if loss_type == None:
        loss_type = 'binary' if num_classes <= 2 else 'categorical'
    
    print("*****")
    print(loss_type)
    print("*****")
    model.compile(optimizer=SGD(lr=lr),
                  loss='%s_crossentropy' % loss_type,
                  metrics=['accuracy'])

    if loss_type == 'categorical':
        y = to_categorical(labels - 1, num_classes=num_classes)
    else:
        y = labels - 1

    x = np.expand_dims(features, axis=2)

    model.fit(x, y, batch_size=64, epochs=epochs, verbose=verbose)

    return model


In [12]:
def predict(model, data_path):
    """Trains model based on provided feature & target data
    Options:
    - epochs: The number of iterations. Default is 50.
    - lr: Learning rate. Increase to speed up training time, decrease to get more accurate results (if your loss is 'jumping'). Default is 0.01.
    - optimiser: Default is 'SGD'.
    - print_summary: Prints a summary of the model you'll be training. Default is False.
    - type: Classification type. Default is categorical for >2 classes, and binary otherwise.
    """
    x_data = parse_audio_file(data_path)
    X_train = np.expand_dims(x_data, axis=2)
    pred = model.predict(X_train)
    return pred


In [13]:
def print_leaderboard(pred, data_path):
    """Pretty prints leaderboard of top matches
    """
    r = os.listdir(data_path)
    r.sort()
    sorted = np.argsort(pred)
    count = 0
    for index in (-pred).argsort()[0]:
        print('%d.' % (count + 1), r[index + 1], str(round(pred[0][index]*100)) + '%', '(index %s)' % index)
        count += 1


In [14]:
features, labels = feature_extraction('/home/munir/Desktop/pyAudioClassification-master/example/data')

  0%|          | 0/40 [00:00<?, ?it/s]

Reading /home/munir/Desktop/pyAudioClassification-master/example/data/101 - Dog/*.ogg ...


100%|██████████| 40/40 [00:36<00:00,  1.06it/s]
  0%|          | 0/40 [00:00<?, ?it/s]

Reading /home/munir/Desktop/pyAudioClassification-master/example/data/102 - Rooster/*.ogg ...


100%|██████████| 40/40 [00:31<00:00,  1.21it/s]
  0%|          | 0/40 [00:00<?, ?it/s]

Reading /home/munir/Desktop/pyAudioClassification-master/example/data/103 - Pig/*.ogg ...


100%|██████████| 40/40 [00:35<00:00,  1.09it/s]
  0%|          | 0/40 [00:00<?, ?it/s]

Reading /home/munir/Desktop/pyAudioClassification-master/example/data/104 - Cow/*.ogg ...


100%|██████████| 40/40 [00:35<00:00,  1.22it/s]
  0%|          | 0/40 [00:00<?, ?it/s]

Reading /home/munir/Desktop/pyAudioClassification-master/example/data/105 - Frog/*.ogg ...


100%|██████████| 40/40 [00:41<00:00,  1.02it/s]


In [26]:
model=train(features, labels,lr=0.0001,epochs=100,print_summary=True)

SA MODEL MUNIR
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_5 (Conv1D)            (None, 191, 64)           256       
_________________________________________________________________
activation_6 (Activation)    (None, 191, 64)           0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 189, 64)           12352     
_________________________________________________________________
activation_7 (Activation)    (None, 189, 64)           0         
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 63, 64)            0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 61, 128)           24704     
_________________________________________________________________
activation_8 (Activation)    (None, 61, 128)           0     

In [27]:
pred = predict(model, '/home/munir/Desktop/pyAudioClassification-master/example/cow_test.wav')
print_leaderboard(pred, '/home/munir/Desktop/pyAudioClassification-master/example/data')

1. 104 - Cow 54.0% (index 2)
2. 105 - Frog 31.0% (index 3)
3. 103 - Pig 12.0% (index 1)
4. 102 - Rooster 3.0% (index 0)
