# Data Collection

The goal of this process is to take a bunch of audio files, extract their genre, and reformat them to a consistent array that can be placed in a CSV.

In [7]:
import audio_metadata
import csv
import json
import librosa
import math
import numpy as np
import os
#import pandas as pd # Throwing everything through a pandas df was taking too much time, so dump direct to CSV
import random
import scipy.signal as signal
from time import time

# Music is taken from the radio station's music library, mapped as a network drive here
mediadir = "W:\Music\MP3s"
folder_startswith = ["리안", "伊東"] # These albums cause encoding errors, ignoring
include_rootfiles = True
ofdir = "outfiles-2s"
clip_dur = 2 # in seconds
nfft = 256
precision = 0 # decimal places
clips_per_song = 5
start_end_width = 0.1 # ignore clips from the first and last (start_end_width) % of the song (to avoid fade ins/outs)

#df = None
#df_inuse = False
metacolnames = ['label', 'filename', 'clipnum']
pixelcolnames = []
allcolnames = []
datarows = []

genres = {}

lastdir = None

if not os.path.exists(ofdir):
    os.mkdir(ofdir)

for root, dirs, files in os.walk(mediadir):
    if (lastdir is not None) and (lastdir != root) and (len(datarows) > 0):
        # We just finished all the files in lastdir
        fp = os.path.normpath(lastdir)
        print(f"Writing new datafile {os.path.join(ofdir, '--'.join(fp.split(os.sep)[2:] + ['data.csv']))}")
        # Write the previous directory's dataframe, just for safety
        with open(os.path.join(ofdir, '--'.join(fp.split(os.sep)[2:] + ['data.csv'])), 'w', newline='') as data_outfile:
            dw = csv.DictWriter(data_outfile, fieldnames=allcolnames)
            dw.writeheader()
            dw.writerows(datarows)
        #df.to_csv(os.path.join(ofdir, '--'.join(fp.split(os.sep)[2:] + ['data.csv'])))
        with open(os.path.join(ofdir, '--'.join(fp.split(os.sep)[2:] + ['genres.json'])), "w") as genres_outfile:
            genres_outfile.write(json.dumps(genres))
        
        # Clear dataframe and re-init genres
        #df = None
        #df_inuse = False
        datarows = []
        genres = {}
        lastdir = root
    
    rootsplit = os.path.normpath(root).split(os.sep)
    if len(rootsplit) > 3 and (type(folder_startswith) == list) and (rootsplit[3].lower().startswith(tuple(folder_startswith))):
        # Ignoring this folder
        print(f'ignoring folder {rootsplit[3]}')
        continue

    if len(rootsplit) == 3 and not include_rootfiles:
        print('ignoring files in root dir')
        continue

    for filename in files:
        # For every file in this directory
        t_start = time()
        if filename.lower().endswith(('.mp3', '.flac', '.wav')) and not filename.startswith('._'):
            # Likely a valid audio file
            print(os.path.join(root, filename))
            try:
                # Load the file's metadata
                md = audio_metadata.load(os.path.join(root, filename))
                print(f"Loaded in {round(time() - t_start, 2)}s")
                genre = None
                if 'genre' in md['tags']:
                    # Get its genre. If there's multiple, take the first
                    gs = [e for s in [t.split(';') for t in md['tags']['genre']] for e in s]
                    genre = gs[0]
                    for g in gs:
                        if g in genres:
                            genres[g] += 1
                        else:
                            genres[g] = 1
                else:
                    # No genre, so we can't use it for this task. Skip over it
                    continue
                
                t_start = time()
                # Load the actual audio data
                samples, sample_rate = librosa.load(os.path.join(root, filename))
                print(f"Librosa loaded in {round(time() - t_start, 2)}s")
                # We have clip duration in seconds, so do some math to get clip duration in samples
                real_clip_dur = clip_dur * 1.1
                samples_per_clip = sample_rate * clip_dur
                real_samples_per_clip = sample_rate * real_clip_dur

                t_start = time()
                # Generate a spectrogram from the audio
                f, t, sxx = signal.spectrogram(samples, nfft=nfft, return_onesided=True)
                print(f"Spectrogram in {round(time() - t_start, 2)}s")

                # Figure out which columns in the spectrogram correspond to which samples
                tbin_width = len(samples) / len(t)
                song_dur = len(samples) / sample_rate
                if song_dur < ((clips_per_song * clip_dur) / 0.8) * 1.1:
                    # Not enough samples in the song
                    continue
                num_clips = math.floor(song_dur / clip_dur)
                clip_sample_starts = [c * samples_per_clip for c in range(0, num_clips)]
                clip_tbin_starts = [round(s / tbin_width) for s in clip_sample_starts]
                tbins_per_clip = math.floor(samples_per_clip / tbin_width)
                clip_tbin_ends = [s + tbins_per_clip for s in clip_tbin_starts]

                # Audio perception is logarathmic, so scale the data accordingly
                Sxx_dB = 10*np.log10(sxx)

                #if df is None:
                if len(datarows) == 0:
                    # Initialize a new datarows object, happens on first file in each directory
                    pixelcolnames = [f'pixel{n}' for n in range(0, len(f) * tbins_per_clip)]
                    allcolnames = metacolnames + pixelcolnames
                    #df = pd.DataFrame([], columns=allcolnames)
                    #df_inuse = True
                    if lastdir is None:
                        lastdir = root


                ### Talked with Tyler on 4/20 after class, instead of concatenating df's row by row try appending dicts to a list then use the list to make the dataframe
                ### To do that, find a way to convert the np array to a dict. I've already got a list of column names, so it shouldn't be too hard
                ### Also use time.time to check how long things are running, might be able to prove it is that pd.concat step that's increasing in duration each time

                t_start = time()
                # Instead of taking every clip of every song, pick {clips_per_song} clips at random
                clip_idxs = []
                clip_idx_start = math.ceil(len(clip_tbin_starts) * start_end_width)
                clip_idx_end = math.floor(len(clip_tbin_starts) * (1-start_end_width))
                for c in range(clips_per_song):
                    idx = random.randint(clip_idx_start, clip_idx_end)
                    while idx in clip_idxs:
                        idx = random.randint(clip_idx_start, clip_idx_end)
                    clip_idxs.append(idx)
                print(clip_idxs)

                for c in clip_idxs:
                    # Reformat the 2D spectrogram into a 1D array to be put into a CSV
                    t_startclip = time()
                    #print(Sxx_dB[:, clip_tbin_starts[c]:clip_tbin_ends[c]].shape, t[clip_tbin_starts[c]], t[clip_tbin_ends[c]])
                    Sxx_dB_flat = np.ravel(Sxx_dB[:, clip_tbin_starts[c]:clip_tbin_ends[c]])
                    Sxx_dB_round = np.round(Sxx_dB_flat, precision)
                    row = dict(zip(pixelcolnames, Sxx_dB_round))
                    #print(Sxx_dB_flat)
                    #df = pd.concat([df, pd.DataFrame(Sxx_dB_flat.reshape(1, -1), columns=pixelcolnames)])
                    #df.iloc[-1, df.columns.get_loc('label')] = genre
                    #df.iloc[-1, df.columns.get_loc('filename')] = filename
                    #df.iloc[-1, df.columns.get_loc('clipnum')] = c

                    # Add in metadata and label
                    row['label'] = genre
                    row['filename'] = filename
                    row['clipnum'] = c
                    datarows.append(row)
                    #print(f"\tAppended clip in {round(time() - t_startclip, 2)}s")
                print(f"Finished appending clips in {round(time() - t_start, 2)}s")

            except audio_metadata.UnsupportedFormat as ufe:
                pass
                #print(f"Error with {filename}, skipping")
            except audio_metadata.FormatError as fe:
                pass
                #print(f"Format error with {filename}, skipping")

if (lastdir is not None) and (len(datarows) > 0):
    # Write out the last folder's content
    fp = os.path.normpath(lastdir)
    print(f"Writing new datafile {os.path.join(ofdir, '--'.join(fp.split(os.sep)[2:] + ['data.csv']))}")
    # Write the previous directory's dataframe, just for safety
    #df.to_csv(os.path.join(ofdir, '--'.join(fp.split(os.sep)[2:] + ['data.csv'])))
    with open(os.path.join(ofdir, '--'.join(fp.split(os.sep)[2:] + ['data.csv'])), 'w', newline='') as data_outfile:
        dw = csv.DictWriter(data_outfile, fieldnames=allcolnames)
        dw.writeheader()
        dw.writerows(datarows)
    with open(os.path.join(ofdir, '--'.join(fp.split(os.sep)[2:] + ['genres.json'])), "w") as genres_outfile:
        genres_outfile.write(json.dumps(genres))

W:\Music\MP3s\14 - Eric Clapton - Layla.mp3
Loaded in 0.17s
Librosa loaded in 1.84s
Spectrogram in 0.16s


    divide by zero encountered in log10


[69, 54, 164, 113, 45]
Finished appending clips in 0.02s
W:\Music\MP3s\14 -  , The Yardbirds - For Your Love.mp3
Loaded in 0.12s
Librosa loaded in 0.63s
Spectrogram in 0.06s
[12, 26, 21, 19, 36]
Finished appending clips in 0.02s
W:\Music\MP3s\144 - King Harvest - A Little Bit Like Magic.mp3
Loaded in 0.16s
Librosa loaded in 0.59s
Spectrogram in 0.05s
[37, 20, 13, 8, 59]
Finished appending clips in 0.02s
W:\Music\MP3s\19 - Eric Clapton - Lay Down Sally.mp3
Loaded in 0.15s
Librosa loaded in 1.34s
Spectrogram in 0.09s
[92, 68, 21, 23, 46]
Finished appending clips in 0.02s
W:\Music\MP3s\2 - Ian & Sylvia - Early Morning Rain.mp3
Loaded in 0.14s
Librosa loaded in 1.0s
Spectrogram in 0.09s
[78, 69, 46, 64, 73]
Finished appending clips in 0.02s
W:\Music\MP3s\218 - Linda Ronstadt - You're No Good.mp3
Loaded in 0.12s
Librosa loaded in 0.93s
Spectrogram in 0.08s
[42, 14, 97, 34, 48]
Finished appending clips in 0.02s
W:\Music\MP3s\224 - Michael Jackson - Ben.mp3
Loaded in 0.13s
Librosa loaded in 0

    Ignoring ``TRCK``.
    Number frame values must consist only of digits and '/'.
    
    Ignoring ``TSOA`` frame with value ``['George Orcullo Ortha II +639209517019 Sycip Gorres Velayo & Co.']``.
    ``TSOA`` is not supported in the ID3v2.3 specification.
    
    Ignoring ``TRCK``.
    Number frame values must consist only of digits and '/'.
    
    Ignoring ``TSOA`` frame with value ``['George Orcullo Ortha II +639209517019 Sycip Gorres Velayo & Co.']``.
    ``TSOA`` is not supported in the ID3v2.3 specification.
    
    Ignoring ``TRCK``.
    Number frame values must consist only of digits and '/'.
    
    Ignoring ``TSOA`` frame with value ``['George Orcullo Ortha II +639209517019 Sycip Gorres Velayo & Co.']``.
    ``TSOA`` is not supported in the ID3v2.3 specification.
    
    Ignoring ``TRCK``.
    Number frame values must consist only of digits and '/'.
    
    Ignoring ``TSOA`` frame with value ``['George Orcullo Ortha II +639209517019 Sycip Gorres Velayo & Co.']``.
 

Loaded in 0.1s
W:\Music\MP3s\Blanchard's Friday Playlist\Ariana Grande - Everyday feat Future (Clean).mp3
Loaded in 0.05s
W:\Music\MP3s\Blanchard's Friday Playlist\Ariana Grande - Focus.mp3
Loaded in 0.04s
W:\Music\MP3s\Blanchard's Friday Playlist\Bebe Rexha - I Got You.mp3
Loaded in 0.03s
W:\Music\MP3s\Blanchard's Friday Playlist\Bruno Mars - 24k.mp3
Loaded in 0.03s
W:\Music\MP3s\Blanchard's Friday Playlist\Bruno Mars - Thats What I Like (Clean).mp3
Loaded in 0.03s
W:\Music\MP3s\Blanchard's Friday Playlist\Calvin Harris & Disciples - How Deep Is Your Love.mp3


    Ignoring ``TYER``.
    Year frame values must be 4-character number strings.
    
    Ignoring ``TYER``.
    Year frame values must be 4-character number strings.
    


Loaded in 0.13s
Librosa loaded in 0.87s
Spectrogram in 0.08s
[75, 16, 61, 73, 72]
Finished appending clips in 0.02s
W:\Music\MP3s\Blanchard's Friday Playlist\Calvin Harris - This Is What You Came For (feat. Rihanna).mp3
Loaded in 0.12s
Librosa loaded in 0.5s
Spectrogram in 0.08s
[96, 62, 33, 75, 28]
Finished appending clips in 0.02s
W:\Music\MP3s\Blanchard's Friday Playlist\Charlie Puth - One Call Away.mp3
Loaded in 0.13s
W:\Music\MP3s\Blanchard's Friday Playlist\Charlie Puth - We Don't Talk Anymore (feat. Selena Gomez).mp3
Loaded in 0.11s
Librosa loaded in 0.5s
Spectrogram in 0.08s
[40, 82, 68, 36, 52]
Finished appending clips in 0.02s
W:\Music\MP3s\Blanchard's Friday Playlist\Clean Bandit - Rockabye ft. Sean Paul.mp3
Loaded in 0.03s
W:\Music\MP3s\Blanchard's Friday Playlist\David Guetta - Bang My Head (feat. Sia & Fetty Wap).mp3
Loaded in 0.1s
W:\Music\MP3s\Blanchard's Friday Playlist\Daya - Hide Away.mp3
Loaded in 0.11s
W:\Music\MP3s\Blanchard's Friday Playlist\Daya - Sit Still, Loo

    Ignoring ``TRCK``.
    Number frame values must consist only of digits and '/'.
    
    Ignoring ``TSOA`` frame with value ``['George Orcullo Ortha II +639209517019 Sycip Gorres Velayo & Co.']``.
    ``TSOA`` is not supported in the ID3v2.3 specification.
    
    Ignoring ``TRCK``.
    Number frame values must consist only of digits and '/'.
    
    Ignoring ``TSOA`` frame with value ``['George Orcullo Ortha II +639209517019 Sycip Gorres Velayo & Co.']``.
    ``TSOA`` is not supported in the ID3v2.3 specification.
    


Loaded in 0.11s
Librosa loaded in 0.48s
Spectrogram in 0.08s
[50, 27, 26, 80, 30]
Finished appending clips in 0.02s
W:\Music\MP3s\Blanchard's Friday Playlist\DJ Snake & AlunaGeorge - You Know You Like It.mp3
Loaded in 0.08s
Librosa loaded in 1.06s
Spectrogram in 0.09s
[74, 87, 73, 78, 103]
Finished appending clips in 0.02s
W:\Music\MP3s\Blanchard's Friday Playlist\DJ Snake - Let Me Love You.mp3


    Ignoring ``TRCK``.
    Number frame values must consist only of digits and '/'.
    
    Ignoring ``TSOA`` frame with value ``['George Orcullo Ortha II +639209517019 Sycip Gorres Velayo & Co.']``.
    ``TSOA`` is not supported in the ID3v2.3 specification.
    
    Ignoring ``TRCK``.
    Number frame values must consist only of digits and '/'.
    
    Ignoring ``TSOA`` frame with value ``['George Orcullo Ortha II +639209517019 Sycip Gorres Velayo & Co.']``.
    ``TSOA`` is not supported in the ID3v2.3 specification.
    


Loaded in 0.32s
W:\Music\MP3s\Blanchard's Friday Playlist\DJ Snake - Middle (feat. Bipolar Sunshine).mp3
Loaded in 0.13s
W:\Music\MP3s\Blanchard's Friday Playlist\DNCE - Toothbrush.mp3
Loaded in 0.09s
Librosa loaded in 0.56s
Spectrogram in 0.09s
[99, 60, 96, 35, 32]
Finished appending clips in 0.02s
W:\Music\MP3s\Blanchard's Friday Playlist\Drake - One Dance (feat. Wizkid & Kyla).mp3
Loaded in 0.03s
Librosa loaded in 0.42s
Spectrogram in 0.07s
[59, 26, 36, 78, 70]
Finished appending clips in 0.02s
W:\Music\MP3s\Blanchard's Friday Playlist\Echosmith - Cool Kids.mp3
Loaded in 0.96s
Librosa loaded in 0.41s
Spectrogram in 0.08s
[71, 42, 56, 78, 61]
Finished appending clips in 0.02s
W:\Music\MP3s\Blanchard's Friday Playlist\Ed Sheeran - Shape Of You.mp3
Loaded in 0.03s
W:\Music\MP3s\Blanchard's Friday Playlist\Elle King - Ex's & Oh's.mp3


    Ignoring ``TRCK``.
    Number frame values must consist only of digits and '/'.
    
    Ignoring ``TSOA`` frame with value ``['George Orcullo Ortha II +639209517019 Sycip Gorres Velayo & Co.']``.
    ``TSOA`` is not supported in the ID3v2.3 specification.
    
    Ignoring ``TRCK``.
    Number frame values must consist only of digits and '/'.
    
    Ignoring ``TSOA`` frame with value ``['George Orcullo Ortha II +639209517019 Sycip Gorres Velayo & Co.']``.
    ``TSOA`` is not supported in the ID3v2.3 specification.
    
    Ignoring ``TRCK``.
    Number frame values must consist only of digits and '/'.
    
    Ignoring ``TSOA`` frame with value ``['George Orcullo Ortha II +639209517019 Sycip Gorres Velayo & Co.']``.
    ``TSOA`` is not supported in the ID3v2.3 specification.
    
    Ignoring ``TRCK``.
    Number frame values must consist only of digits and '/'.
    
    Ignoring ``TSOA`` frame with value ``['George Orcullo Ortha II +639209517019 Sycip Gorres Velayo & Co.']``.
 

Loaded in 0.1s
W:\Music\MP3s\Blanchard's Friday Playlist\Ellie Goulding - On My Mind.mp3
Loaded in 0.11s
W:\Music\MP3s\Blanchard's Friday Playlist\Fall Out Boy - Irresistible (feat. Demi Lovato).mp3
Loaded in 0.1s
W:\Music\MP3s\Blanchard's Friday Playlist\Flo Rida - G.D.F.R. (Feat.Sage The Gemini & Lookas).mp3
Loaded in 0.08s
Librosa loaded in 0.83s
Spectrogram in 0.07s
[51, 65, 43, 64, 77]
Finished appending clips in 0.02s
W:\Music\MP3s\Blanchard's Friday Playlist\Flo Rida - I Don't Like It, I Love It (Feat.Robin Thicke & Verdine White).mp3
Loaded in 0.08s
Librosa loaded in 0.97s
Spectrogram in 0.08s
[19, 14, 27, 49, 88]
Finished appending clips in 0.02s
W:\Music\MP3s\Blanchard's Friday Playlist\Flo Rida - My House.mp3
Loaded in 0.09s
W:\Music\MP3s\Blanchard's Friday Playlist\Hailee Steinfeld & Grey - Starving.mp3
Loaded in 0.11s
W:\Music\MP3s\Blanchard's Friday Playlist\Hey Violet - Guys My Age.mp3
Loaded in 0.04s
W:\Music\MP3s\Blanchard's Friday Playlist\James Bay - Let It Go.mp3


    Ignoring ``TRCK``.
    Number frame values must consist only of digits and '/'.
    
    Ignoring ``TSOA`` frame with value ``['George Orcullo Ortha II +639209517019 Sycip Gorres Velayo & Co.']``.
    ``TSOA`` is not supported in the ID3v2.3 specification.
    
    Ignoring ``TRCK``.
    Number frame values must consist only of digits and '/'.
    
    Ignoring ``TSOA`` frame with value ``['George Orcullo Ortha II +639209517019 Sycip Gorres Velayo & Co.']``.
    ``TSOA`` is not supported in the ID3v2.3 specification.
    


Loaded in 0.11s
W:\Music\MP3s\Blanchard's Friday Playlist\Jason Derulo - Want to Want Me.mp3
Loaded in 0.11s
Librosa loaded in 0.87s
Spectrogram in 0.08s
[31, 80, 75, 66, 36]
Finished appending clips in 0.02s
W:\Music\MP3s\Blanchard's Friday Playlist\Justin Bieber - What Do You Mean.mp3


    Ignoring ``TRCK``.
    Number frame values must consist only of digits and '/'.
    
    Ignoring ``TSOA`` frame with value ``['George Orcullo Ortha II +639209517019 Sycip Gorres Velayo & Co.']``.
    ``TSOA`` is not supported in the ID3v2.3 specification.
    
    Ignoring ``TRCK``.
    Number frame values must consist only of digits and '/'.
    
    Ignoring ``TSOA`` frame with value ``['George Orcullo Ortha II +639209517019 Sycip Gorres Velayo & Co.']``.
    ``TSOA`` is not supported in the ID3v2.3 specification.
    


Loaded in 0.11s
W:\Music\MP3s\Blanchard's Friday Playlist\Justin Timberlake - CAN'T STOP THE FEELING!.mp3
Loaded in 0.11s
Librosa loaded in 0.54s
Spectrogram in 0.09s
[28, 16, 90, 52, 46]
Finished appending clips in 0.02s
W:\Music\MP3s\Blanchard's Friday Playlist\Katy Perry - Chained To The Rhythm ft. Skip Marley.mp3
Loaded in 0.04s
W:\Music\MP3s\Blanchard's Friday Playlist\Kiiara - Gold.mp3
Loaded in 0.11s
Librosa loaded in 0.51s
Spectrogram in 0.09s
[31, 59, 15, 21, 39]
Finished appending clips in 0.02s
W:\Music\MP3s\Blanchard's Friday Playlist\Lillywood & Robin Schulz - Prayer in C (Robin Schulz Remix) [Radio Edit].mp3
Loaded in 0.14s
Librosa loaded in 0.81s
Spectrogram in 0.07s
[11, 35, 56, 69, 42]
Finished appending clips in 0.02s
W:\Music\MP3s\Blanchard's Friday Playlist\Machine Gun Kelly ft. Camila Cabello - Bad Things.mp3
Loaded in 0.03s
W:\Music\MP3s\Blanchard's Friday Playlist\Major Lazer - Cold Water.mp3


    Ignoring ``TRCK``.
    Number frame values must consist only of digits and '/'.
    
    Ignoring ``TSOA`` frame with value ``['George Orcullo Ortha II +639209517019 Sycip Gorres Velayo & Co.']``.
    ``TSOA`` is not supported in the ID3v2.3 specification.
    
    Ignoring ``TRCK``.
    Number frame values must consist only of digits and '/'.
    
    Ignoring ``TSOA`` frame with value ``['George Orcullo Ortha II +639209517019 Sycip Gorres Velayo & Co.']``.
    ``TSOA`` is not supported in the ID3v2.3 specification.
    
    Ignoring ``TRCK``.
    Number frame values must consist only of digits and '/'.
    
    Ignoring ``TSOA`` frame with value ``['George Orcullo Ortha II +639209517019 Sycip Gorres Velayo & Co.']``.
    ``TSOA`` is not supported in the ID3v2.3 specification.
    
    Ignoring ``TRCK``.
    Number frame values must consist only of digits and '/'.
    
    Ignoring ``TSOA`` frame with value ``['George Orcullo Ortha II +639209517019 Sycip Gorres Velayo & Co.']``.
 

Loaded in 0.12s
W:\Music\MP3s\Blanchard's Friday Playlist\Major Lazer - Lean On.mp3
Loaded in 0.08s
W:\Music\MP3s\Blanchard's Friday Playlist\Maroon 5 - Don't Wanna Know ft. Kendrick Lamar.mp3
Loaded in 0.05s
W:\Music\MP3s\Blanchard's Friday Playlist\Meghan Trainor - Like I'm Gonna Lose You.mp3


    Ignoring ``TRCK``.
    Number frame values must consist only of digits and '/'.
    
    Ignoring ``TSOA`` frame with value ``['George Orcullo Ortha II +639209517019 Sycip Gorres Velayo & Co.']``.
    ``TSOA`` is not supported in the ID3v2.3 specification.
    
    Ignoring ``TRCK``.
    Number frame values must consist only of digits and '/'.
    
    Ignoring ``TSOA`` frame with value ``['George Orcullo Ortha II +639209517019 Sycip Gorres Velayo & Co.']``.
    ``TSOA`` is not supported in the ID3v2.3 specification.
    
    Ignoring ``TRCK``.
    Number frame values must consist only of digits and '/'.
    
    Ignoring ``TSOA`` frame with value ``['George Orcullo Ortha II +639209517019 Sycip Gorres Velayo & Co.']``.
    ``TSOA`` is not supported in the ID3v2.3 specification.
    
    Ignoring ``TRCK``.
    Number frame values must consist only of digits and '/'.
    
    Ignoring ``TSOA`` frame with value ``['George Orcullo Ortha II +639209517019 Sycip Gorres Velayo & Co.']``.
 

Loaded in 0.1s
W:\Music\MP3s\Blanchard's Friday Playlist\Meghan Trainor - No.mp3
Loaded in 0.11s
W:\Music\MP3s\Blanchard's Friday Playlist\One Direction - Perfect.mp3
Loaded in 0.1s
W:\Music\MP3s\Blanchard's Friday Playlist\Rachel Platten - Stand By You.mp3
Loaded in 0.12s
W:\Music\MP3s\Blanchard's Friday Playlist\Robin Schulz - Sugar (feat. Francesco Yates).mp3
Loaded in 0.11s
W:\Music\MP3s\Blanchard's Friday Playlist\Rudimental Feat. Ed Sheeran - Lay It All On Me.mp3
Loaded in 0.11s
W:\Music\MP3s\Blanchard's Friday Playlist\Selena Gomez - Hands To Myself.mp3
Loaded in 0.04s
W:\Music\MP3s\Blanchard's Friday Playlist\Selena Gomez - Kill Em with Kindness.mp3
Loaded in 0.03s
Librosa loaded in 0.51s
Spectrogram in 0.08s
[24, 47, 92, 38, 78]
Finished appending clips in 0.02s
W:\Music\MP3s\Blanchard's Friday Playlist\Shawn Mendes & Camila Cabello - I Know What You Did Last Summer.mp3
Loaded in 0.1s
W:\Music\MP3s\Blanchard's Friday Playlist\Sia - Cheap Thrills (feat. Sean Paul).mp3
Loaded in

    Ignoring ``TRCK``.
    Number frame values must consist only of digits and '/'.
    
    Ignoring ``TSOA`` frame with value ``['George Orcullo Ortha II +639209517019 Sycip Gorres Velayo & Co.']``.
    ``TSOA`` is not supported in the ID3v2.3 specification.
    
    Ignoring ``TRCK``.
    Number frame values must consist only of digits and '/'.
    
    Ignoring ``TSOA`` frame with value ``['George Orcullo Ortha II +639209517019 Sycip Gorres Velayo & Co.']``.
    ``TSOA`` is not supported in the ID3v2.3 specification.
    
    Ignoring ``TRCK``.
    Number frame values must consist only of digits and '/'.
    
    Ignoring ``TSOA`` frame with value ``['George Orcullo Ortha II +639209517019 Sycip Gorres Velayo & Co.']``.
    ``TSOA`` is not supported in the ID3v2.3 specification.
    
    Ignoring ``TRCK``.
    Number frame values must consist only of digits and '/'.
    
    Ignoring ``TSOA`` frame with value ``['George Orcullo Ortha II +639209517019 Sycip Gorres Velayo & Co.']``.
 

Loaded in 0.11s
W:\Music\MP3s\Blanchard's Friday Playlist\Starley - Call On Me (Ryan Riback Remix).mp3
Loaded in 0.03s
W:\Music\MP3s\Blanchard's Friday Playlist\The Chainsmokers - Closer.mp3
Loaded in 0.14s
W:\Music\MP3s\Blanchard's Friday Playlist\The Chainsmokers - Paris.mp3
Loaded in 0.06s
W:\Music\MP3s\Blanchard's Friday Playlist\The Chainsmokers - Roses.mp3
Loaded in 0.04s
W:\Music\MP3s\Blanchard's Friday Playlist\The Weeknd - Can't Feel My Face.mp3
Loaded in 0.1s
Librosa loaded in 0.9s
Spectrogram in 0.08s
[34, 63, 36, 41, 35]
Finished appending clips in 0.02s
W:\Music\MP3s\Blanchard's Friday Playlist\The Weeknd - I Feel It Coming ft. Daft Punk.mp3
Loaded in 0.05s
W:\Music\MP3s\Blanchard's Friday Playlist\The Weeknd - In The Night.mp3
Loaded in 0.13s
W:\Music\MP3s\Blanchard's Friday Playlist\The Weeknd - StarBoy (Clean) Ft Daft Punk.mp3
Loaded in 0.03s
W:\Music\MP3s\Blanchard's Friday Playlist\Tori Kelly - Hollow.mp3
Loaded in 0.13s
W:\Music\MP3s\Blanchard's Friday Playlist\Twent

    Ignoring ``TRCK``.
    Number frame values must consist only of digits and '/'.
    
    Ignoring ``TSOA`` frame with value ``['George Orcullo Ortha II +639209517019 Sycip Gorres Velayo & Co.']``.
    ``TSOA`` is not supported in the ID3v2.3 specification.
    
    Ignoring ``TRCK``.
    Number frame values must consist only of digits and '/'.
    
    Ignoring ``TSOA`` frame with value ``['George Orcullo Ortha II +639209517019 Sycip Gorres Velayo & Co.']``.
    ``TSOA`` is not supported in the ID3v2.3 specification.
    


Loaded in 0.12s
W:\Music\MP3s\Blanchard's Friday Playlist\twenty one pilots - Ride.mp3
Loaded in 0.11s
Librosa loaded in 0.5s
Spectrogram in 0.08s
[25, 91, 17, 32, 48]
Finished appending clips in 0.02s
W:\Music\MP3s\Blanchard's Friday Playlist\twenty one pilots - Stressed Out.mp3
Loaded in 0.1s
Librosa loaded in 0.86s
Spectrogram in 0.08s
[48, 38, 17, 41, 46]
Finished appending clips in 0.02s
W:\Music\MP3s\Blanchard's Friday Playlist\Zara Larsson & MNEK - Never Forget You.mp3


    Ignoring ``TRCK``.
    Number frame values must consist only of digits and '/'.
    
    Ignoring ``TSOA`` frame with value ``['George Orcullo Ortha II +639209517019 Sycip Gorres Velayo & Co.']``.
    ``TSOA`` is not supported in the ID3v2.3 specification.
    
    Ignoring ``TRCK``.
    Number frame values must consist only of digits and '/'.
    
    Ignoring ``TSOA`` frame with value ``['George Orcullo Ortha II +639209517019 Sycip Gorres Velayo & Co.']``.
    ``TSOA`` is not supported in the ID3v2.3 specification.
    
    Ignoring ``SYLT``.
    Missing data in synchronized lyrics frame.
    
    Ignoring ``SYLT``.
    Missing data in synchronized lyrics frame.
    


Loaded in 0.1s
W:\Music\MP3s\Blanchard's Friday Playlist\ZAYN Taylor Swift - I Don't Wanna Live Forever.mp3
Loaded in 0.04s
W:\Music\MP3s\Blanchard's Friday Playlist\Zedd ft Selena Gomez - I Want You To Know.mp3
Loaded in 0.08s
Librosa loaded in 0.9s
Spectrogram in 0.09s
[26, 81, 51, 12, 89]
Finished appending clips in 0.02s
Writing new datafile outfiles-2s\MP3s--Blanchard's Friday Playlist--data.csv
W:\Music\MP3s\junk\Booker T. & The M.G.'s - Melting Pot.mp3
Loaded in 0.13s
Librosa loaded in 1.09s
Spectrogram in 0.21s
[118, 79, 96, 102, 137]
Finished appending clips in 0.02s
W:\Music\MP3s\junk\Boston Horns - Funkafized.mp3
Loaded in 0.1s
Librosa loaded in 0.67s
Spectrogram in 0.11s
[17, 52, 108, 132, 76]
Finished appending clips in 0.02s
W:\Music\MP3s\junk\Buddy Rich - Nuttville.mp3
Loaded in 0.12s
Librosa loaded in 0.64s
Spectrogram in 0.11s
[30, 80, 61, 111, 126]
Finished appending clips in 0.02s
W:\Music\MP3s\junk\Casiopea - Asayake.mp3
Loaded in 0.1s
Librosa loaded in 0.87s
Spectr

In [8]:
# Collect all the data files into one large csv and genre json

# The previous code block writes two files for each folder it scans, just so we don't lose all our work if it crashes most of the way through
# This code block combines all of those files into two large files: a data.csv with all rows generated, and a genres.json which lets us clean up genre names

import csv
import json
import os
from time import time

outdir = 'outfiles-2s/'

genre_list = []
cols = None
dw = None

with open('data-2s.csv', 'w', newline='') as of_data:
    for fp in os.listdir(outdir):
        # For each of the files generated by the last block
        t_start = time()
        print(fp)
        if fp.endswith('.csv'):
            # Data files
            with open(os.path.join(outdir, fp), 'r') as f:
                # Ran into memory problems trying to load all of the data files, so run through row by row shuffling them over
                dr = csv.reader(f)
                if cols is None:
                    # Check if we already have a header written
                    cols = next(dr)
                    dw = csv.writer(of_data)
                    dw.writerow(cols)
                else:
                    # Skip the header row
                    next(dr)
                row = next(dr, False)
                while (row != False):
                    # Write each row into the main data csv
                    dw.writerow(row)
                    row = next(dr, False)
            print(f"CSV in {round(time() - t_start, 2)}s")
        if fp.endswith('.json'):
            # Genre files
            with open(os.path.join(outdir, fp), 'r') as f:
                # These are smaller, so we just combine them into a list
                genre_list.append(json.load(f))
            print(f"JSON in {round(time() - t_start, 2)}s")

# Combine genre dicts into one
genres = {}
for gd in genre_list:
    for g in gd:
        if g in genres:
            # Genre already in our file, so just increment the count
            genres[g]['count'] = genres[g]['count'] + gd[g]
        else:
            # Genre not yet in our file, initialize count and provide a spot to put in a cleaned name
            genres[g] = {}
            genres[g]['count'] = gd[g]
            genres[g]['clean'] = ""

print(genres)
with open('genres-2s.json', 'w') as of_genre:
    # Write the genres to a JSON file
    json.dump(genres, of_genre)


FLAC---NSYNC--data.csv
CSV in 0.34s
FLAC---NSYNC--genres.json
JSON in 0.0s
FLAC--10,000 Maniacs--data.csv
CSV in 0.34s
FLAC--10,000 Maniacs--genres.json
JSON in 0.0s
FLAC--123--data.csv
CSV in 1.72s
FLAC--123--genres.json
JSON in 0.0s
FLAC--3 Doors Down--data.csv
CSV in 0.32s
FLAC--3 Doors Down--genres.json
JSON in 0.0s
FLAC--30 Seconds To Mars--data.csv
CSV in 0.32s
FLAC--30 Seconds To Mars--genres.json
JSON in 0.0s
FLAC--3OH!3--data.csv
CSV in 0.03s
FLAC--3OH!3--genres.json
JSON in 0.0s
FLAC--3OH!3--Streets Of Gold--data.csv
CSV in 0.37s
FLAC--3OH!3--Streets Of Gold--genres.json
JSON in 0.0s
FLAC--3OH!3--Want [Deluxe]--data.csv
CSV in 0.41s
FLAC--3OH!3--Want [Deluxe]--genres.json
JSON in 0.0s
FLAC--60's Afternoon--data.csv
CSV in 0.45s
FLAC--60's Afternoon--genres.json
JSON in 0.0s
FLAC--60's Beach House--data.csv
CSV in 0.83s
FLAC--60's Beach House--genres.json
JSON in 0.0s
FLAC--70's pop hits--data.csv
CSV in 11.77s
FLAC--70's pop hits--genres.json
JSON in 0.01s
FLAC--70's summer c

In [15]:
# Use the cleaned genres JSON to update genres to a standard set of genres

# In genres-corrected.json, we manually provided a label for each genre in the 'clean' field.
# This reduces the number of labels and lets us remove oddball genres or other unknown genres.

import csv
import json

genres = {}
with open('genres-corrected.json', 'r') as jfile:
    # Read in genre definitions
    genres = json.load(jfile)

with open('data-2s.csv', 'r') as ifile:
    r = csv.reader(ifile)
    with open('data-clean-2s.csv', 'w') as ofile:
        w = csv.writer(ofile)
        # Copy headers over
        w.writerow(next(r))
        # Get each individual row
        row = next(r, None)
        while row is not None:
            #print(f"{row[0]} -> {genres[row[0]]['clean']}")
            if genres[row[0]]['clean'] != 'skip':
                # If we renamed a genre as "skip" it means we don't want to include it. As long as clean name is not skip, update genre and write to the new file
                row[0] = genres[row[0]]['clean']
                w.writerow(row)
            row = next(r, None)
