##### Funktionen und eine Klasse um aus gelabelten ordner mit soundfiles einen fertigen Datensatz, bestehend aus Spectogrammen zum anlernen und validieren sowie zum anschließenden testen des learners erstellen.



In [1]:
%ls

copy_engine_from_ESC-50.ipynb  edit_soundfiles.ipynb   README.md
create_dataset.ipynb           learner_v1.ipynb        [0m[01;34msac_test[0m/
create_spectogram.ipynb        learner_v2.ipynb        [01;34mtemp[0m/
[01;34mdatasets[0m/                      merge_dataframes.ipynb


In [2]:
from pathlib import Path
import pandas as pd
import random
import shutil
# bibs für edit
import pydub
#bibs für spektogramme
import os
import imageio

import matplotlib
matplotlib.use('agg')

from matplotlib import pyplot as plt
from matplotlib import cm
from tqdm import tqdm
import pylab

import librosa
from librosa import display
import numpy as np

In [3]:
def create_spectrogram(source_filepath, destination_filepath):    
    y, sr = librosa.load(source_filepath, sr = 22050) # Use the default sampling rate of 22,050 Hz

    # Pre-emphasis filter
    pre_emphasis = 0.97
    y = np.append(y[0], y[1:] - pre_emphasis * y[:-1])

    # Compute spectrogram
    M = librosa.feature.melspectrogram(y, 
                                       sr, 
                                       fmax = sr/2, # Maximum frequency to be used on the on the MEL scale        
                                       n_fft=2048, 
                                       hop_length=512, 
                                       n_mels = 96, # As per the Google Large-scale audio CNN paper
                                       power = 2) # Power = 2 refers to squared amplitude
    # Power in DB
    log_power = librosa.power_to_db(M, ref=np.max)# Covert to dB (log) scale

    # Plotting the spectrogram and save as JPG without axes (just the image)
    pylab.figure(figsize=(5,5)) #was 14, 5
    pylab.axis('off') 
    pylab.axes([0., 0., 1., 1.], frameon=False, xticks=[], yticks=[]) # Remove the white edge
    librosa.display.specshow(log_power, cmap=cm.jet)
    #print(destination_filepath)
    pylab.savefig(destination_filepath, bbox_inches=None, pad_inches=0)
    pylab.close()

#create_spectrogram(src, dest)

In [4]:
def split_and_save(tgt_size, src_file, tgt_folder):
    
    
    if str(src_file)[-4:] == '.wav':
        soundf = pydub.AudioSegment.from_wav(src_file)
        fmt = 'wav'

    if str(src_file)[-4:] == '.mp3':
        soundf = pydub.AudioSegment.from_mp3(src_file)
        fmt = 'mp3'
    #soundf = pydub.AudioSegment.from_wav(src_file) #aktuell nur wav files -> mit if abfrage noch für mp3 anpassen
    for i, chunk in enumerate(soundf[::tgt_size]):
        #print(len(chunk))
        if len(chunk) == tgt_size: #soundfile wird nur gespeichert wenn es der target size entspricht --> letztes file wird verworfen wenn es zu kurz ist. Kann man sicher noch elleganter lösen in dem man letztes file mit leerer datei auffült oderso
            with open(tgt_folder + os.path.basename(src_file)[:-4] + "_%s.wav" % (i+1), "wb") as f:    
                chunk.export(f, format= 'wav')
                
    return()

In [5]:
def sflen(pth):
    if str(pth)[-4:] == '.wav':
        soundf = pydub.AudioSegment.from_wav(pth)
        sflen = len(soundf)
    elif str(pth)[-4:] == '.mp3':
        soundf = pydub.AudioSegment.from_mp3(pth)
        sflen = len(soundf)
    else:
        sflen = 0
        print("Warning: at least one file don't has valid datatype")
        
    return sflen
 

In [6]:
class Dataset():
    #initialisierung der Klasse
    def __init__(self, folderpath):
        self.pathlen = len(folderpath)
        self.folderpath = Path(folderpath)
        self.flst = list(self.folderpath.iterdir())
        self.lendict_p = {str(o)[:] : sflen(str(o)) for o in self.flst}
        self.lendict = {str(o)[len(folderpath):] : sflen(str(o)) for o in self.flst}
        self.lendf = pd.DataFrame({'soundfilelength' : self.lendict}) 
        
    #gibt allg. Infos über Datensatz aus    
    def info(self):
        #return len(list(self.folderpath.iterdir()))
        print("Number of files: " + str(len(self.flst)))
        print('Overall length: '+ str(self.lendf['soundfilelength'].sum()) + " ms")
        print('Max file length: '+ str(self.lendf['soundfilelength'].max()) + " ms")
        print('Min file length: '+ str(self.lendf['soundfilelength'].min()) + " ms")
        print('mean file length: '+ str(self.lendf['soundfilelength'].mean()) + " ms")
    
    #gibt Liste aller Dateien zurück    
    def filelist(self):
        return self.flst
    
    #gibt Länge aller Soundfiles zusammen aus
    def oalen(self):
        oalen = self.lendf['soundfilelength'].sum()
        #for i in self.filelist():
         #   if str(i)[-3:] == 'wav':
          #      soundf = pydub.AudioSegment.from_wav(i)
           #     oalen = oalen + len(soundf)
            #if str(i)[-3:] == 'mp3':
             #   soundf = pydub.AudioSegment.from_mp3(i)
              #  oalen = oalen + len(soundf)
        return oalen
    
    #gibt längstes Soundfile aus
    def maxfile(self):
        maxfile = self.lendf['soundfilelength'].idxmax()
        return maxfile
    
    #gibt kürzestes Soundfile aus
    def minfile(self):
        minfile = self.lendf['soundfilelength'].idxmin()
        return minfile
    
    #alle Dateien werden in Spectrogram umgewandelt
    def convert_all(self, destination_filepath): #konvertiert alle files im ordner so wie sie sind zu spectrogrammen
        x = 1
        for i in self.flst:
            create_spectrogram(str(i), destination_filepath + str(i)[data1.pathlen:-4] + ".jpg")
            x = x +1
        print("converted " + str(x) + " files to spectrogram")
        
    #alle Dateien werden in angebene Länge zerschnitten
    def split_all(self, tgt_size, tgt_folder):
        
        for i in self.flst:
            split_and_save(tgt_size, str(i), tgt_folder)
            
    # alle Dateien werden in angegebene Ziellänge zerschnitten und direkt ein Spectrogram erstellt         
    def split_and_convert(self, tgt_size, tgt_folder, csv = False, label = '', ds_label = '' ): #evtl. noch optionale angaben ob man gesplitete soundfiles behalten möchte oder nicht implementieren
        if tgt_folder[-1] != '/':
            tgt_folder = tgt_folder + '/'
        
        
        if label == '':
            print('testprint:' + os.path.basename(tgt_folder[:-1]))
            label = os.path.basename(tgt_folder[:-1])

        
        if ds_label == '':
            ds_label = str(os.path.basename(tgt_folder[:-1]))
            
        
        
        tempfolder = 'temp/'
        #Ordner für split daten erstellen alternativcode: (PATH).mkdir(exist_ok=True)
        if os.path.isdir(tempfolder) == False:
            os.makedirs(tempfolder)
            
        #Ordner für Spectrogramme erstellen
        if os.path.isdir(tgt_folder) == False:
            os.makedirs(tgt_folder)
            
        for i in self.flst:
            split_and_save(tgt_size, i, tempfolder)
            #daten convertieren
            x = 1
            for j in list(Path(tempfolder).iterdir()):
                create_spectrogram(str(j), tgt_folder + os.path.basename(str(j))[:-4] + ".jpg")
                x = x +1
                os.remove(j)
        #Rückmeldung
            print("converted " + str(x) + " files to spectrogram")  
        if csv == True:    
            #dataframe/csv erstellen
            filelist = list(Path(tgt_folder).iterdir())

            labellist = [label] * len(filelist)
            ds_labellist = [ds_label] * len(filelist)
            pathlist = [os.path.relpath(tgt_folder)] * len(filelist)

            df = pd.DataFrame({'Filename' : filelist, 'Label' : labellist, 'DS_Label' : ds_labellist, 'Path' : pathlist})

            #Filename von dateipfad bereinigen
            for i, row in df.iterrows():
                df.loc[i, 'Filename'] = str(os.path.basename(df.loc[i, 'Filename']))

            #DF unter label.csv abspeichern eine ordner ebene unter specto ordner
            df.to_csv(str(os.path.dirname(tgt_folder) + '.csv'))
            print('Path to .csv: ' + str(os.path.dirname(tgt_folder) + '.csv'))
        
        #return df
        
    
    #Dateien in Dataset order werden auf 3 ordner(learn, val und test) in angegebener Verteilung (z.B. 0.7,0.2,0.1) aufgeteilt
    def divide_data(self, tgt_dset_path, tgt_dist_learn = 0.7, tgt_dist_val = 0.2, tgt_dist_test = 0.1):
        
        if round(tgt_dist_learn + tgt_dist_val + tgt_dist_test, 4) != 1.0:
            print('sum of inputs are not 100%')
            return ()
        #ideale Verteilung ausrechnen
        oalength = self.oalen()
        
        tgt_time_learn = tgt_dist_learn * oalength
        tmp_time_learn = 0
        tgt_time_val = tgt_dist_val * oalength
        tmp_time_val = 0
        tgt_time_test = tgt_dist_test * oalength
        tmp_time_test = 0
        
        
        #ablageorte für verteilte Soundfiles erstellen
        if os.path.isdir(tgt_dset_path) == False:
            os.makedirs(tgt_dset_path)
        if os.path.isdir(tgt_dset_path + '/learn') == False:
            os.makedirs(tgt_dset_path + '/learn')
        if os.path.isdir(tgt_dset_path + '/val') == False:
            os.makedirs(tgt_dset_path + '/val')
        if os.path.isdir(tgt_dset_path + '/test') == False:
            os.makedirs(tgt_dset_path + '/test')
            
        learnpath = tgt_dset_path + '/learn'
        valpath = tgt_dset_path + '/val'
        testpath = tgt_dset_path + '/test'
        
        #Fehler bzw. delta funktion um aus tgt_time und tmp_time um zu bestimmen welche rornder die nächste datei bekommt?
        def dt_learn():
            return tmp_time_learn/(tgt_time_learn/100)
        
        def dt_val():
            return tmp_time_val/(tgt_time_val/100)
        
        def dt_test():
            return tmp_time_test/(tgt_time_test/100)
        
        #random liste erstellen
        randomlist = list(self.filelist())
        random.shuffle(randomlist)
        
        #random liste durchgehen und Datei immer zum Ordner mit größtem Fehler/Delta kopieren und tmp zeit des Ordners erhöhen
        for i in randomlist:
            #Abfragen welcher ordner am weitesten von zielwert abweicht
            if (dt_learn() < 100) and (dt_learn() <= dt_val()) and (dt_learn() <= dt_test()):
                #kopiere i nach learn ordner
                shutil.copy(i, learnpath)
                #adiere zeit von datei i zu tmp-time_val
                tmp_time_learn = tmp_time_learn + self.lendict_p[str(i)]
                
                
            elif (dt_val() < 100) and (dt_val() <= dt_learn()) and (dt_val() <= dt_test()):
                #kopiere i nach learn ordner
                shutil.copy(i, valpath)
                #adiere zeit von datei i zu tmp-time_val
                tmp_time_val = tmp_time_val + self.lendict_p[str(i)]
                
            elif (dt_test() < 100) and (dt_test() <= dt_val()) and (dt_test() <= dt_learn()):
                #kopiere i nach learn ordner
                shutil.copy(i, testpath)
                #adiere zeit von datei i zu tmp-time_val
                tmp_time_test = tmp_time_test + self.lendict_p[str(i)]
                
                

            
        #zeit von kopierter datei aus dict -> self.lendict_p['datasets/diesel_orig/diesel_01.wav']
        
        #print('oalen = ' + str(oalength), 'tgt time learn =' + str(tgt_time_learn) + ', tgt time val =' + str(tgt_time_val) + ', tgt time test =' + str(tgt_time_test))
        #print('real time learn = ' + str(tmp_time_learn) + 'real time val = ' + str(tmp_time_val) + 'real time test = ' + str(tmp_time_test))
        #print(str(dt_learn()) + ' ' + str(dt_val()) + ' ' + str(dt_test()))
        print('error learn: ' + str(round(dt_learn() - 100, 2)) + '%; error val: ' + str(round(dt_val() - 100, 2)) + '%; error test: ' + str(round(dt_test() - 100, 2)) + '%;')
        
#funktion die alle files unter x sekunden anzeigt wäre noch gut 
#funktion die die files der länge nach sortiert anzeigt (sowohl auf als auch absteigend)

#data1 = Dataset('/home/lt/techlabs/project-st-19-01-PredictivePollution/datasets/car_sounds_app')

In [23]:
#diesel = Dataset('/home/lt/techlabs/pp_raw_data/diesel/')

In [34]:
#benzin = Dataset('/home/lt/techlabs/pp_raw_data/benzin')

In [25]:
#diesel.info()

Number of files: 59
Overall length: 20922374 ms
Max file length: 3641842 ms
Min file length: 5000 ms
mean file length: 354616.5084745763 ms


In [35]:
#benzin.info()

Number of files: 346
Overall length: 21607292 ms
Max file length: 2651463 ms
Min file length: 3745 ms
mean file length: 62448.82080924856 ms


In [27]:
#benzin.maxfile()

'/94 Project Cars 2 Car Sounds Part 1.wav'

In [7]:
divide_path_benzin = '/home/lt/techlabs/pp_raw_data/pp_raw_data_div/benzin_div'
divide_path_diesel = '/home/lt/techlabs/pp_raw_data/pp_raw_data_div/diesel_div'

In [38]:
#benzin.divide_data(divide_path_benzin, 0.7,0.2,0.1)

error learn: 0.7%; error val: -1.91%; error test: -1.1%;


In [43]:
#diesel.divide_data(divide_path_diesel, 0.7,0.2,0.1)

error learn: -6.27%; error val: 14.41%; error test: 15.07%;


In [8]:
benzin_train = Dataset('/home/lt/techlabs/pp_raw_data/pp_raw_data_div/benzin_div/learn')
benzin_valid = Dataset('/home/lt/techlabs/pp_raw_data/pp_raw_data_div/benzin_div/val')
benzin_test = Dataset('/home/lt/techlabs/pp_raw_data/pp_raw_data_div/benzin_div/test')

In [9]:
diesel_train = Dataset('/home/lt/techlabs/pp_raw_data/pp_raw_data_div/diesel_div/learn')
diesel_valid = Dataset('/home/lt/techlabs/pp_raw_data/pp_raw_data_div/diesel_div/val')
diesel_test = Dataset('/home/lt/techlabs/pp_raw_data/pp_raw_data_div/diesel_div/test')

In [10]:
benzin_train.info()

Number of files: 220
Overall length: 15213371 ms
Max file length: 2651463 ms
Min file length: 5251 ms
mean file length: 69151.68636363637 ms


In [11]:
benzin_train.minfile()

'/salamisound-4212861-harley-davidson-883-iron.mp3'

In [12]:
benzin_valid.info()

Number of files: 73
Overall length: 4230291 ms
Max file length: 1199874 ms
Min file length: 6190 ms
mean file length: 57949.191780821915 ms


In [13]:
benzin_valid.minfile()

'/dbsacc.mp3'

In [14]:
benzin_test.info()

Number of files: 47
Overall length: 2136916 ms
Max file length: 770972 ms
Min file length: 6991 ms
mean file length: 45466.29787234042 ms


In [15]:
diesel_train.info()

Number of files: 41
Overall length: 13727214 ms
Max file length: 3641842 ms
Min file length: 5000 ms
mean file length: 334810.0975609756 ms


In [16]:
diesel_valid.info()

Number of files: 7
Overall length: 4787585 ms
Max file length: 1711387 ms
Min file length: 5000 ms
mean file length: 683940.7142857143 ms


In [17]:
diesel_test.info()

Number of files: 11
Overall length: 2407575 ms
Max file length: 1334729 ms
Min file length: 5000 ms
mean file length: 218870.45454545456 ms


In [18]:
milliseconds = 1000
foldername = 'dataset1_1sec'

In [19]:
benzin_train.split_and_convert(milliseconds, ('datasets/' + foldername + '/train/benzin'), csv = True , label = 'benzin', ds_label = 'train')

converted 9 files to spectrogram
converted 14 files to spectrogram
converted 8 files to spectrogram
converted 20 files to spectrogram
converted 12 files to spectrogram
converted 614 files to spectrogram
converted 2282 files to spectrogram
converted 17 files to spectrogram
converted 21 files to spectrogram
converted 23 files to spectrogram
converted 8 files to spectrogram
converted 31 files to spectrogram
converted 19 files to spectrogram
converted 10 files to spectrogram
converted 9 files to spectrogram
converted 22 files to spectrogram
converted 15 files to spectrogram
converted 21 files to spectrogram
converted 17 files to spectrogram
converted 15 files to spectrogram
converted 14 files to spectrogram
converted 15 files to spectrogram
converted 7 files to spectrogram
converted 17 files to spectrogram
converted 16 files to spectrogram
converted 10 files to spectrogram
converted 10 files to spectrogram
converted 13 files to spectrogram
converted 18 files to spectrogram
converted 15 fil

In [20]:
benzin_valid.split_and_convert(milliseconds, ('datasets/' + foldername + '/valid/benzin'), csv = True , label = 'benzin', ds_label = 'valid')

converted 11 files to spectrogram
converted 22 files to spectrogram
converted 22 files to spectrogram
converted 8 files to spectrogram
converted 8 files to spectrogram
converted 12 files to spectrogram
converted 12 files to spectrogram
converted 21 files to spectrogram
converted 14 files to spectrogram
converted 25 files to spectrogram
converted 16 files to spectrogram
converted 645 files to spectrogram
converted 614 files to spectrogram
converted 19 files to spectrogram
converted 18 files to spectrogram
converted 20 files to spectrogram
converted 504 files to spectrogram
converted 8 files to spectrogram
converted 11 files to spectrogram
converted 11 files to spectrogram
converted 13 files to spectrogram
converted 10 files to spectrogram
converted 14 files to spectrogram
converted 30 files to spectrogram
converted 11 files to spectrogram
converted 22 files to spectrogram
converted 21 files to spectrogram
converted 20 files to spectrogram
converted 11 files to spectrogram
converted 15 f

In [21]:
benzin_test.split_and_convert(milliseconds, ('datasets/' + foldername + '/test/benzin'), csv = True , label = 'benzin', ds_label = 'test')

converted 19 files to spectrogram
converted 8 files to spectrogram
converted 17 files to spectrogram
converted 706 files to spectrogram
converted 14 files to spectrogram
converted 11 files to spectrogram
converted 16 files to spectrogram
converted 7 files to spectrogram
converted 9 files to spectrogram
converted 11 files to spectrogram
converted 17 files to spectrogram
converted 14 files to spectrogram
converted 15 files to spectrogram
converted 38 files to spectrogram
converted 27 files to spectrogram
converted 19 files to spectrogram
converted 21 files to spectrogram
converted 10 files to spectrogram
converted 20 files to spectrogram
converted 13 files to spectrogram
converted 771 files to spectrogram
converted 12 files to spectrogram
converted 21 files to spectrogram
converted 17 files to spectrogram
converted 10 files to spectrogram
converted 16 files to spectrogram
converted 16 files to spectrogram
converted 8 files to spectrogram
converted 15 files to spectrogram
converted 11 fil

In [22]:
diesel_train.split_and_convert(milliseconds, ('datasets/' + foldername + '/train/diesel'), csv = True , label = 'diesel', ds_label = 'train')

converted 1389 files to spectrogram
converted 6 files to spectrogram
converted 179 files to spectrogram
converted 35 files to spectrogram
converted 610 files to spectrogram
converted 32 files to spectrogram
converted 6 files to spectrogram
converted 6 files to spectrogram
converted 1921 files to spectrogram
converted 504 files to spectrogram
converted 31 files to spectrogram
converted 6 files to spectrogram
converted 30 files to spectrogram
converted 6 files to spectrogram
converted 113 files to spectrogram
converted 6 files to spectrogram
converted 309 files to spectrogram
converted 6 files to spectrogram
converted 1280 files to spectrogram
converted 11 files to spectrogram
converted 198 files to spectrogram
converted 6 files to spectrogram
converted 290 files to spectrogram
converted 33 files to spectrogram
converted 431 files to spectrogram
converted 6 files to spectrogram
converted 3642 files to spectrogram
converted 6 files to spectrogram
converted 107 files to spectrogram
convert

In [23]:
diesel_valid.split_and_convert(milliseconds, ('datasets/' + foldername + '/valid/diesel'), csv = True , label = 'diesel', ds_label = 'valid')

converted 6 files to spectrogram
converted 876 files to spectrogram
converted 1483 files to spectrogram
converted 19 files to spectrogram
converted 1712 files to spectrogram
converted 179 files to spectrogram
converted 516 files to spectrogram
Path to .csv: datasets/dataset1_1sec/valid/diesel.csv


In [24]:
diesel_test.split_and_convert(milliseconds, ('datasets/' + foldername + '/test/diesel'), csv = True , label = 'diesel', ds_label = 'test')

converted 492 files to spectrogram
converted 6 files to spectrogram
converted 6 files to spectrogram
converted 41 files to spectrogram
converted 6 files to spectrogram
converted 1335 files to spectrogram
converted 204 files to spectrogram
converted 287 files to spectrogram
converted 6 files to spectrogram
converted 27 files to spectrogram
converted 6 files to spectrogram
Path to .csv: datasets/dataset1_1sec/test/diesel.csv
