# audio classification

https://developers.google.com/mediapipe/solutions/audio/audio_classifier

In [1]:
!wget -O classifier.tflite -q https://storage.googleapis.com/mediapipe-models/audio_classifier/yamnet/float32/1/yamnet.tflite

In [2]:
import urllib

audio_file_name = 'speech_16000_hz_mono.wav'
url = f'https://storage.googleapis.com/mediapipe-assets/{audio_file_name}'
urllib.request.urlretrieve(url, audio_file_name)

('speech_16000_hz_mono.wav', <http.client.HTTPMessage at 0x10782a410>)

In [3]:
from IPython.display import Audio, display

file_name = 'speech_16000_hz_mono.wav'
display(Audio(file_name, autoplay=False))

In [5]:
import numpy as np

from mediapipe.tasks import python
from mediapipe.tasks.python.components import containers
from mediapipe.tasks.python import audio
from scipy.io import wavfile

# Customize and associate model for Classifier
base_options = python.BaseOptions(model_asset_path='classifier.tflite')
options = audio.AudioClassifierOptions(
    base_options=base_options, max_results=4)

# Create classifier, segment audio clips, and classify
with audio.AudioClassifier.create_from_options(options) as classifier:
    sample_rate, wav_data = wavfile.read(audio_file_name)
    audio_clip = containers.AudioData.create_from_array(
        wav_data.astype(float) / np.iinfo(np.int16).max, sample_rate)
    classification_result_list = classifier.classify(audio_clip)

    assert(len(classification_result_list) == 5)

    # Iterate through clips to display classifications
    for idx, timestamp in enumerate([0, 975, 1950, 2925]):
        classification_result = classification_result_list[idx]
        top_category = classification_result.classifications[0].categories[0]
        print(f'Timestamp {timestamp}: {top_category.category_name} ({top_category.score:.2f})')

Timestamp 0: Speech (0.92)
Timestamp 975: Speech (0.99)
Timestamp 1950: Speech (0.98)
Timestamp 2925: Speech (1.00)


INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


In [9]:
audio_clip

<mediapipe.tasks.python.components.containers.audio_data.AudioData at 0x137d93520>

In [10]:
classification_result_list

[ClassificationResult(classifications=[Classifications(categories=[Category(index=0, score=0.91796875, display_name='', category_name='Speech'), Category(index=500, score=0.05859375, display_name='', category_name='Inside, small room'), Category(index=494, score=0.015625, display_name='', category_name='Silence'), Category(index=3, score=0.0078125, display_name='', category_name='Narration, monologue')], head_index=0, head_name='scores')], timestamp_ms=0),
 ClassificationResult(classifications=[Classifications(categories=[Category(index=0, score=0.98828125, display_name='', category_name='Speech'), Category(index=3, score=0.0078125, display_name='', category_name='Narration, monologue'), Category(index=500, score=0.00390625, display_name='', category_name='Inside, small room'), Category(index=2, score=0.0, display_name='', category_name='Conversation')], head_index=0, head_name='scores')], timestamp_ms=975),
 ClassificationResult(classifications=[Classifications(categories=[Category(in