In [18]:
import audio_metadata
import csv
import json
import librosa
import math
import numpy as np
import os
#import pandas as pd
import scipy.signal as signal
from time import time

mediadir = "W:\Music\FLAC"
folder_startswith = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "0"] # last crash after jimi hendrix
include_rootfiles = False
ofdir = "outfiles"
clip_dur = 1 # in seconds
nfft = 256
precision = 0 # decimal places

#df = None
#df_inuse = False
metacolnames = ['label', 'filename', 'clipnum']
pixelcolnames = []
allcolnames = []
datarows = []

genres = {}

lastdir = None

for root, dirs, files in os.walk(mediadir):
    if (lastdir is not None) and (lastdir != root) and (len(datarows) > 0):
        fp = os.path.normpath(lastdir)
        print(f"Writing new datafile {os.path.join(ofdir, '--'.join(fp.split(os.sep)[2:] + ['data.csv']))}")
        # Write the previous directory's dataframe, just for safety
        with open(os.path.join(ofdir, '--'.join(fp.split(os.sep)[2:] + ['data.csv'])), 'w', newline='') as data_outfile:
            dw = csv.DictWriter(data_outfile, fieldnames=allcolnames)
            dw.writeheader()
            dw.writerows(datarows)
        #df.to_csv(os.path.join(ofdir, '--'.join(fp.split(os.sep)[2:] + ['data.csv'])))
        with open(os.path.join(ofdir, '--'.join(fp.split(os.sep)[2:] + ['genres.json'])), "w") as genres_outfile:
            genres_outfile.write(json.dumps(genres))
        
        # Clear dataframe and re-init genres
        #df = None
        #df_inuse = False
        datarows = []
        genres = {}
        lastdir = root
    
    rootsplit = os.path.normpath(root).split(os.sep)
    if len(rootsplit) > 3 and (type(folder_startswith) == list) and (rootsplit[3].startswith(tuple(folder_startswith))):
        # Ignoring this folder
        print(f'ignoring folder {rootsplit[3]}')
        continue

    if len(rootsplit) == 3 and not include_rootfiles:
        print('ignoring files in root dir')
        continue

    for filename in files:
        t_start = time()
        if filename.lower().endswith(('.mp3', '.flac', '.wav')) and not filename.startswith('._'):
            print(os.path.join(root, filename))
            try:
                md = audio_metadata.load(os.path.join(root, filename))
                print(f"Loaded in {round(time() - t_start, 2)}s")
                genre = None
                if 'genre' in md['tags']:
                    gs = [e for s in [t.split(';') for t in md['tags']['genre']] for e in s]
                    genre = gs[0]
                    for g in gs:
                        if g in genres:
                            genres[g] += 1
                        else:
                            genres[g] = 1
                else:
                    continue
                
                t_start = time()
                samples, sample_rate = librosa.load(os.path.join(root, filename))
                print(f"Librosa loaded in {round(time() - t_start, 2)}s")
                real_clip_dur = clip_dur * 1.1
                samples_per_clip = sample_rate * clip_dur
                real_samples_per_clip = sample_rate * real_clip_dur

                t_start = time()
                f, t, sxx = signal.spectrogram(samples, nfft=nfft, return_onesided=True)
                print(f"Spectrogram in {round(time() - t_start, 2)}s")

                tbin_width = len(samples) / len(t)
                song_dur = len(samples) / sample_rate
                num_clips = math.floor(song_dur / clip_dur)
                clip_sample_starts = [c * samples_per_clip for c in range(0, num_clips)]
                clip_tbin_starts = [round(s / tbin_width) for s in clip_sample_starts]
                tbins_per_clip = math.floor(samples_per_clip / tbin_width)
                clip_tbin_ends = [s + tbins_per_clip for s in clip_tbin_starts]

                Sxx_dB = 10*np.log10(sxx)

                #if df is None:
                if len(datarows) == 0:
                    pixelcolnames = [f'pixel{n}' for n in range(0, len(f) * tbins_per_clip)]
                    allcolnames = metacolnames + pixelcolnames
                    #df = pd.DataFrame([], columns=allcolnames)
                    #df_inuse = True
                    if lastdir is None:
                        lastdir = root


                ### Talked with Tyler on 4/20 after class, instead of concatenating df's row by row try appending dicts to a list then use the list to make the dataframe
                ### To do that, find a way to convert the np array to a dict. I've already got a list of column names, so it shouldn't be too hard
                ### Also use time.time to check how long things are running, might be able to prove it is that pd.concat step that's increasing in duration each time

                t_start = time()
                for c in range(0, num_clips):
                    t_startclip = time()
                    #print(Sxx_dB[:, clip_tbin_starts[c]:clip_tbin_ends[c]].shape, t[clip_tbin_starts[c]], t[clip_tbin_ends[c]])
                    Sxx_dB_flat = np.ravel(Sxx_dB[:, clip_tbin_starts[c]:clip_tbin_ends[c]])
                    Sxx_dB_round = np.round(Sxx_dB_flat, precision)
                    row = dict(zip(pixelcolnames, Sxx_dB_round))
                    #print(Sxx_dB_flat)
                    #df = pd.concat([df, pd.DataFrame(Sxx_dB_flat.reshape(1, -1), columns=pixelcolnames)])
                    #df.iloc[-1, df.columns.get_loc('label')] = genre
                    #df.iloc[-1, df.columns.get_loc('filename')] = filename
                    #df.iloc[-1, df.columns.get_loc('clipnum')] = c
                    row['label'] = genre
                    row['filename'] = filename
                    row['clipnum'] = c
                    datarows.append(row)
                    #print(f"\tAppended clip in {round(time() - t_startclip, 2)}s")
                print(f"Finished appending clips in {round(time() - t_start, 2)}s")

            except audio_metadata.UnsupportedFormat as ufe:
                pass
                #print(f"Error with {filename}, skipping")
            except audio_metadata.FormatError as fe:
                pass
                #print(f"Format error with {filename}, skipping")

if (lastdir is not None) and (len(datarows) > 0):
    fp = os.path.normpath(lastdir)
    print(f"Writing new datafile {os.path.join(ofdir, '--'.join(fp.split(os.sep)[2:] + ['data.csv']))}")
    # Write the previous directory's dataframe, just for safety
    #df.to_csv(os.path.join(ofdir, '--'.join(fp.split(os.sep)[2:] + ['data.csv'])))
    with open(os.path.join(ofdir, '--'.join(fp.split(os.sep)[2:] + ['data.csv'])), 'w', newline='') as data_outfile:
        dw = csv.DictWriter(data_outfile, fieldnames=allcolnames)
        dw.writeheader()
        dw.writerows(datarows)
    with open(os.path.join(ofdir, '--'.join(fp.split(os.sep)[2:] + ['genres.json'])), "w") as genres_outfile:
        genres_outfile.write(json.dumps(genres))

ignoring files in root dir
W:\Music\FLAC\-NSYNC\Greatest Hits\01 Bye Bye Bye.flac
Loaded in 0.04s
Librosa loaded in 2.56s
Spectrogram in 0.08s


    divide by zero encountered in log10


Finished appending clips in 0.38s
W:\Music\FLAC\-NSYNC\Greatest Hits\02 Girlfriend (The Neptunes Remix).flac
Loaded in 0.03s
Librosa loaded in 3.52s
Spectrogram in 0.11s
Finished appending clips in 0.59s
W:\Music\FLAC\-NSYNC\Greatest Hits\03 This I Promise You.flac
Loaded in 0.04s
Librosa loaded in 3.05s
Spectrogram in 0.12s
Finished appending clips in 0.47s
W:\Music\FLAC\-NSYNC\Greatest Hits\04 It's Gonna Be Me.flac
Loaded in 0.03s
Librosa loaded in 2.43s
Spectrogram in 0.08s
Finished appending clips in 0.34s
W:\Music\FLAC\-NSYNC\Greatest Hits\05 God Must Have Spent A Little More Time On You (Remix).flac
Loaded in 0.04s
Librosa loaded in 2.91s
Spectrogram in 0.1s
Finished appending clips in 0.48s
W:\Music\FLAC\-NSYNC\Greatest Hits\06 I Want You Back.flac
Loaded in 0.04s
Librosa loaded in 2.47s
Spectrogram in 0.09s
Finished appending clips in 0.38s
W:\Music\FLAC\-NSYNC\Greatest Hits\07 Pop.flac
Loaded in 0.06s
Librosa loaded in 2.07s
Spectrogram in 0.08s
Finished appending clips in 0.4

FileNotFoundError: [Errno 2] No such file or directory