# Introduction

For this notebook we wanted to take the model previously trained on the dataset from the Freesound General-Purpose Audio Tagging Challenge and apply transfer learning to a smaller, similar dataset. We chose the UrbanSound dataset, which can be downloaded from here: https://urbansounddataset.weebly.com/urbansound.html.
This dataset contains 1302 labeled sound recordings from 10 classes.

# Setup

In [2]:
import librosa
import librosa.display
from IPython.display import Audio
import matplotlib.pyplot as plt
from pathlib import Path
import os
import pandas as pd
import numpy as np
from fastai.vision.all import *

# Get the data

In [3]:
# Root folder holding the datasets, given relative to the working directory.
DATA = Path('../../data')
# Folder of the UrbanSound dataset.
URBAN_SOUND = DATA/'urban-sound'
# Raw recordings; the listing further down shows one sub-folder per class.
URBAN_SOUND_RECORDINGS = URBAN_SOUND/'data'
# Leftover, unused path (AUDIO_TAGGING is not defined in this notebook):
#AUDIO_RECORDINGS = AUDIO_TAGGING/'audio_train'

In [4]:
# Destination folder for the generated spectrogram images.
dst_path = URBAN_SOUND/'train'
# No parents=True here, so URBAN_SOUND itself must already exist.
dst_path.mkdir(exist_ok=True)

In [5]:
URBAN_SOUND.ls()

(#5) [Path('../../data/urban-sound/train'),Path('../../data/urban-sound/data'),Path('../../data/urban-sound/FREESOUNDCREDITS.txt'),Path('../../data/urban-sound/.DS_Store'),Path('../../data/urban-sound/UrbanSound_README.txt')]

In [6]:
URBAN_SOUND_RECORDINGS.ls()

(#10) [Path('../../data/urban-sound/data/car_horn'),Path('../../data/urban-sound/data/jackhammer'),Path('../../data/urban-sound/data/siren'),Path('../../data/urban-sound/data/street_music'),Path('../../data/urban-sound/data/engine_idling'),Path('../../data/urban-sound/data/gun_shot'),Path('../../data/urban-sound/data/drilling'),Path('../../data/urban-sound/data/dog_bark'),Path('../../data/urban-sound/data/children_playing'),Path('../../data/urban-sound/data/air_conditioner')]

In [11]:
from pydub import AudioSegment
from pydub.utils import which

# Point pydub at the ffmpeg executable located via `which`.
# NOTE(review): `which` presumably returns None when ffmpeg is not installed,
# in which case later pydub decoding/encoding would fail — confirm.
AudioSegment.converter = which("ffmpeg")

# Data preprocessing

The recordings in the dataset vary greatly in length. Some are just shy of a second, while others are several minutes long. We therefore need to preprocess the data so that the model is given more uniform inputs. We found that cutting the longer recordings to 20 seconds gave a much better result. We also had to remove some recordings that were in unsupported file formats.

In [13]:
# Quick experiment: trim a single recording and verify the resulting duration.
# NOTE(review): despite its name, `children_playing` points at the 'street_music'
# class folder; the name is kept because a later cell reuses it.
children_playing = URBAN_SOUND_RECORDINGS/'street_music'
children_playing.ls()
# Load one recording and report its duration before trimming.
sound = AudioSegment.from_file(children_playing/'7390.mp3')
print(f'Duration before cut: {sound.duration_seconds}')
# Slice indices are in milliseconds (10000 -> 10.0 s, see the printed output).
s = sound[:10000]
# NOTE(review): 'test.mp3' is written into the class folder and is not removed
# in this cell, so the later directory walks will treat it as a recording.
s.export(children_playing/'test.mp3', format='mp3')
# Reload the exported clip to confirm its new length.
sound = AudioSegment.from_file(children_playing/'test.mp3')
print(f'Duration after cut: {sound.duration_seconds}')

Duration before cut: 20.0
Duration after cut: 10.0


###### Converting the .mp3 files to .wav files

In [14]:
def mp3_to_wav(root=None):
    """Re-encode every ``.mp3`` file below ``root`` as WAV audio, in place.

    The file is overwritten under its original name, so after this runs the
    files still carry the ``.mp3`` extension but contain WAV data.

    Parameters
    ----------
    root : path-like, optional
        Directory tree to walk. Defaults to ``URBAN_SOUND_RECORDINGS``.
    """
    if root is None:
        root = URBAN_SOUND_RECORDINGS

    for subdir, _dirs, files in os.walk(root):
        for file in files:
            if not file.endswith('.mp3'):
                continue
            filepath = os.path.join(subdir, file)
            sound = AudioSegment.from_file(filepath)
            # Same destination as the source: the original bytes are replaced.
            sound.export(filepath, format='wav')

In [9]:
#mp3_to_wav()

In [10]:
import soundfile as sf

In [11]:
# Inspect the format, subtype and endianness soundfile reports for the sample file.
# NOTE(review): the output shows 'WAV' for a file named .mp3 — presumably because
# mp3_to_wav() was run at some point and re-encodes files without renaming them.
# NOTE(review): the SoundFile handle `f` is never closed in this cell.
f = sf.SoundFile(children_playing/'7390.mp3')
f.format, f.subtype, f.endian

('WAV', 'PCM_16', 'FILE')

###### Looping through each recording in the directory, removing files of unsupported file formats and cutting recordings over 20 seconds

In [20]:
# Walk every class folder, delete files we cannot use and trim long recordings.
# WARNING: this cell is destructive — files are deleted and overwritten in place.
MAX_CLIP_SECONDS = 20  # longer recordings are cut down to their first 20 seconds
SKIPPED_SUFFIXES = ('.csv', '.json', '.DS_Store', '.aif', '.flac')

count = 0       # recordings that were trimmed
numRemoved = 0  # files that were deleted

for subdir, _dirs, files in os.walk(URBAN_SOUND_RECORDINGS):
    for file in files:
        filepath = os.path.join(subdir, file)

        if filepath.endswith(SKIPPED_SUFFIXES):
            # Metadata, OS clutter and audio formats we chose not to use.
            os.remove(filepath)
            numRemoved += 1
            print(f'Removed files: {numRemoved}', end='\r')
            continue

        try:
            # Probe only: open the file to see whether soundfile can read it,
            # then close the handle right away so it is not leaked.
            with sf.SoundFile(filepath):
                pass
        except Exception:
            # `Exception` rather than a bare except, so that interrupting the
            # kernel does not end up deleting the file being inspected.
            os.remove(filepath)  # Remove files with unsupported format
            numRemoved += 1
            print(f'Removed files: {numRemoved}', end='\r')
            continue

        try:
            sound = AudioSegment.from_file(filepath)
            file_format = file.split(".")[-1]
            if sound.duration_seconds > MAX_CLIP_SECONDS:
                # pydub slices are indexed in milliseconds.
                first_twenty = sound[:MAX_CLIP_SECONDS * 1000]
                first_twenty.export(filepath, format=file_format)
                count += 1
                print(f'Converted audio clips: {count}', end='\r')
        except Exception:
            # The recording could not be decoded or re-encoded: drop it.
            os.remove(filepath)
            numRemoved += 1
            print(f'Removed files: {numRemoved}', end='\r')

In [21]:
def log_mel_spec_tfm(fname, label, src_path , dst_path):
    """Save one recording as a log-mel spectrogram PNG inside its class folder.

    Parameters
    ----------
    fname : str or Path
        File name of the recording inside ``src_path``.
    label : str
        Class name; the image is written to ``dst_path / label``.
    src_path : Path
        Folder that contains the recording.
    dst_path : Path
        Root folder for the generated images.

    The image is written to ``dst_path / label / <stem of fname>.png``.
    """
    # No `sr` is passed, so librosa.load applies its default sample rate.
    data, sample_rate = librosa.load(src_path/fname)

    n_fft = 1024
    hop_length = 512
    n_mels = 80
    fmin = 20
    fmax = sample_rate / 2  # half the sample rate: the highest representable frequency

    # The signal is passed as `y=`: recent librosa releases accept the audio
    # argument by keyword only, and older releases accept the keyword as well.
    mel_spec_power = librosa.feature.melspectrogram(y=data, sr=sample_rate, n_fft=n_fft,
                                                    hop_length=hop_length,
                                                    n_mels=n_mels, power=2.0,
                                                    fmin=fmin, fmax=fmax)

    # Convert power to decibels relative to the loudest bin of this recording.
    mel_spec_db = librosa.power_to_db(mel_spec_power, ref=np.max)

    class_dir = dst_path / label
    class_dir.mkdir(parents=True, exist_ok=True)

    # Use the stem instead of slicing off four characters, so extensions of
    # any length (.wav, .flac, .aiff, ...) produce a clean '<name>.png'.
    dst_fname = class_dir / (Path(fname).stem + '.png')
    plt.imsave(dst_fname, mel_spec_db)

In [22]:
import warnings
# Silence every warning from here on.
# NOTE(review): this is a blanket filter, so it also hides warnings unrelated
# to audio loading; a narrower filter would be safer.
warnings.filterwarnings('ignore')

## Converting the recordings

As with the previous notebooks, we want to represent the recordings as images to be used in a CNN. Therefore, as before, we convert the recordings to mel spectrograms. The images are saved in folders where the folder name is the class.

In [33]:
def convert_to_spec():
    """Convert every recording under ``URBAN_SOUND_RECORDINGS`` to a spectrogram image.

    Each file is handed to ``log_mel_spec_tfm`` with the name of its containing
    folder as the class label, and the image is written below ``dst_path``.
    A running count is printed every ten files.
    """
    count = 0
    for subdir, _dirs, files in os.walk(URBAN_SOUND_RECORDINGS):
        source_path = Path(subdir)
        # Folder name = class label; Path.name works with any path separator.
        label = source_path.name
        for file in files:
            if file.startswith('.'):
                # Hidden files such as .DS_Store are not audio and cannot be loaded.
                continue
            count += 1
            if count % 10 == 0:
                print(count, end='\r')
            log_mel_spec_tfm(file, label, source_path, dst_path)

In [34]:
#convert_to_spec()

In [35]:
dst_path

Path('../../data/urban-sound/train')

In [36]:
!tree -L 1 $dst_path

[01;34m../../data/urban-sound/train[00m
├── [01;34mair_conditioner[00m
├── [01;34mcar_horn[00m
├── [01;34mchildren_playing[00m
├── [01;34mdog_bark[00m
├── [01;34mdrilling[00m
├── [01;34mengine_idling[00m
├── [01;34mgun_shot[00m
├── [01;34mjackhammer[00m
├── [01;34msiren[00m
└── [01;34mstreet_music[00m

10 directories, 0 files
