In [1]:
# Audio Processing
import scipy.signal as sig
import scipy.io.wavfile as wav
import numpy as np
from pydub import AudioSegment

# Fma tools
#    utils will load .env: dotenv.load_dotenv(dotenv.find_dotenv())
import fma.utils

# general
import matplotlib.pyplot as plt
import IPython.display as ipd
import os
import random

# Environment vars
AUDIO_DIR should be set to the unzipped fma tracks data folder. ex. ~/fma-stft/data/fma_small

METADATA_DIR should be set to the unzipped fma metadata. ex. ~/fma-stft/data/fma_metadata

In [2]:
# Both locations come from the process environment (None when a variable is unset).
env = os.environ
AUDIO_DIR = env.get('AUDIO_DIR')
METADATA_DIR = env.get('METADATA_DIR')

# Echo the resolved directories so a misconfigured environment is visible right away.
print(
    "Audio Dir: {}\nMetadata Dir: {}".format(
        AUDIO_DIR,
        METADATA_DIR,
    )
)

Audio Dir: /home/actlab/mark/fma-stft/data/fma_small
Metadata Dir: /home/actlab/mark/fma-stft/data/fma_metadata


# Get genre-based subsets

Get the tracks that are in the 'small' subset and whose top-level genre is either 'Instrumental' or 'Electronic'.

In [3]:
# Read the full track metadata table (this can take a few seconds)
tracks = fma.utils.load(os.path.join(METADATA_DIR, 'tracks.csv'))

# Boolean mask for rows whose ('set', 'subset') value compares <= 'small',
# followed by the dataframe restricted to those rows
small = tracks[('set', 'subset')] <= 'small'
tracks_small = tracks.loc[small]

# Per-genre masks, computed on the 'small' rows only
instrumental = tracks_small[('track', 'genre_top')] == 'Instrumental'
electronic = tracks_small[('track', 'genre_top')] == 'Electronic'

# 'small' subset tracks narrowed down to a single top genre each
instrumental_tracks = tracks_small.loc[instrumental]
electronic_tracks = tracks_small.loc[electronic]

# Preview the first rows of the 'track' column group for each genre
print("Instrumental dataframe: ")
ipd.display(instrumental_tracks['track'].head())
print("Electronic dataframe: ")
ipd.display(electronic_tracks['track'].head())

  'category', categories=SUBSETS, ordered=True)


Instrumental dataframe: 


Unnamed: 0_level_0,bit_rate,comments,composer,date_created,date_recorded,duration,favorites,genre_top,genres,genres_all,information,interest,language_code,license,listens,lyricist,number,publisher,tags,title
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
10250,256000,0,,2009-04-09 07:19:43,NaT,76,1,Instrumental,[538],"[538, 18, 1235]",,7400,en,Attribution-Noncommercial-No Derivative Works ...,6577,,0,,[],Tree Symphony
11638,160000,0,,2009-04-28 07:44:13,2010-09-29,155,1,Instrumental,[538],"[538, 18, 1235]",,3620,en,Attribution-Noncommercial-Share Alike 3.0 Unit...,1460,,13,,[],Romantic Neck Tattoo
14568,320000,0,,2009-06-06 09:47:46,NaT,105,12,Instrumental,"[18, 1235]","[18, 1235]",,15064,en,Attribution-Noncommercial-Share Alike 2.0 UK: ...,11214,,1,,[],Monkey Dance
14569,320000,0,,2009-06-06 09:47:56,NaT,406,22,Instrumental,"[18, 1235]","[18, 1235]",,12282,en,Attribution-Noncommercial-Share Alike 2.0 UK: ...,6319,,2,,[],Gretchen's Tango
14570,320000,0,,2009-06-06 09:48:01,NaT,71,4,Instrumental,"[18, 1235]","[18, 1235]",,7371,en,Attribution-Noncommercial-Share Alike 2.0 UK: ...,3850,,3,,[],So Wunsch Ich Dir


Electronic dataframe: 


Unnamed: 0_level_0,bit_rate,comments,composer,date_created,date_recorded,duration,favorites,genre_top,genres,genres_all,information,interest,language_code,license,listens,lyricist,number,publisher,tags,title
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1482,256000,0,,2008-11-26 03:00:45,2008-11-26,216,7,Electronic,[15],[15],,93276,en,Attribution-Noncommercial-Share Alike 3.0 Unit...,77299,,2,,[],Reindeer Dance
3573,256000,0,,2008-12-04 19:58:19,2003-07-23,235,15,Electronic,[15],[15],"<p><span style=""margin: 0pt 5px; float: left;""...",7512,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,6678,,19,,[],Piggled
4519,256000,0,,2008-12-04 21:21:24,2008-10-24,117,13,Electronic,[15],[15],"<p><span style=""font-family: Verdana,Geneva,Ar...",33081,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,28728,,1,,[],Golden
4520,256000,0,,2008-12-04 21:21:26,2008-10-24,199,25,Electronic,[15],[15],"<p><span style=""font-family: Verdana,Geneva,Ar...",22786,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,18365,,2,,[],From Stardust to Sentience
4521,256000,0,,2008-12-04 21:21:29,2008-10-24,184,9,Electronic,[15],[15],"<p><span style=""font-family: Verdana,Geneva,Ar...",24779,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,23431,,3,,[],Namer


## Optional: Save subset .csv files for faster loading in the future
These files aren't used in this notebook; they might just be useful for other purposes.

In [4]:
# Write each subset dataframe next to the original metadata, one CSV per subset.
for _csv_name, _subset in (
    ('tracks_small.csv', tracks_small),
    ('tracks_small_instrumental.csv', instrumental_tracks),
    ('tracks_small_electronic.csv', electronic_tracks),
):
    _subset.to_csv(os.path.join(METADATA_DIR, _csv_name))

# Create file list from subsets
The index values from the dataframes in the last section correspond to the filenames of the actual audio files. This section creates a list of file paths for loading them.

In [5]:
def audio_paths_from_indices(index, audio_dir=None):
    """Build the mp3 file path for every track id in `index`.

    Files are laid out as ``<audio_dir>/<folder>/<track>.mp3`` where the folder
    is the track id integer-divided by 1000, zero-padded to 3 digits, and the
    file name is the track id zero-padded to 6 digits.

    Args:
        index: any iterable of integer track ids (numpy array, pandas Index,
            list, ...). An empty iterable yields an empty list.
        audio_dir: root folder of the audio files. Defaults to the
            module-level AUDIO_DIR when omitted.

    Returns:
        list of str: one path per track id, in the same order as `index`.
    """
    if audio_dir is None:
        audio_dir = AUDIO_DIR

    paths = []
    # Iterate the ids directly instead of indexing by position, so the function
    # does not depend on `.size` or on positional `index[i]` lookups.
    for track_id in index:
        track_id = int(track_id)
        # tracks are separated into folders of 1000 songs ordered sequentially
        folder_name = "{:03}".format(track_id // 1000)
        # folder/file names are 3/6 digits respectively, padded with 0
        file_name = "{:06}.mp3".format(track_id)
        paths.append(os.path.join(audio_dir, folder_name, file_name))
    return paths

In [6]:
# The dataframe index holds the track ids; pull them out as plain arrays
instrumental_index = instrumental_tracks.index.values
electronic_index = electronic_tracks.index.values

# The fma_small subset is genre-balanced, so there should be ~1000 of each
print(
    "# Instrumental Tracks: {}\n# Electronic Tracks: {}".format(
        instrumental_index.size,
        electronic_index.size,
    )
)

# Turn the track ids into mp3 file paths
instrumental_paths = audio_paths_from_indices(instrumental_index)
electronic_paths = audio_paths_from_indices(electronic_index)

# Show the first few paths of each list as a sanity check
print("Instrumental songs: ")
ipd.display(instrumental_paths[0:5])
print("Electronic songs: ")
ipd.display(electronic_paths[0:5])

# Instrumental Tracks: 1000
# Electronic Tracks: 1000
Instrumental songs: 


['/home/actlab/mark/fma-stft/data/fma_small/010/010250.mp3',
 '/home/actlab/mark/fma-stft/data/fma_small/011/011638.mp3',
 '/home/actlab/mark/fma-stft/data/fma_small/014/014568.mp3',
 '/home/actlab/mark/fma-stft/data/fma_small/014/014569.mp3',
 '/home/actlab/mark/fma-stft/data/fma_small/014/014570.mp3']

Electronic songs: 


['/home/actlab/mark/fma-stft/data/fma_small/001/001482.mp3',
 '/home/actlab/mark/fma-stft/data/fma_small/003/003573.mp3',
 '/home/actlab/mark/fma-stft/data/fma_small/004/004519.mp3',
 '/home/actlab/mark/fma-stft/data/fma_small/004/004520.mp3',
 '/home/actlab/mark/fma-stft/data/fma_small/004/004521.mp3']

# Create Spectrograms from files

Now we can calculate short-time Fourier transforms (STFTs), create spectrograms, and calculate mel-scale spectrograms.

### STFT / Spectrogram functions

Audio processing functions are found in utilities/audio_utils.py. That file makes use of several functions compiled from [Tim Sainburg's blog](https://timsainb.github.io/spectrograms-mfccs-and-inversion-in-python.html)

He additionally credits:
1. [Kyle Kastner](https://gist.github.com/kastnerkyle/179d6e9a88202ab0a2fe) for spectrogram and inversion code
2. [James Lyons](https://github.com/jameslyons/python_speech_features) for mel-scale and mel filters

Additional information about spectrograms and STFTs:
https://en.wikipedia.org/wiki/Short-time_Fourier_transform

### Additional utilities

In [None]:
def mono_sample_from_mp3(mp3_path, t_start = None, t_stop = None):
    """Load an mp3 file and extract a mono waveform from it.

    Input:
        mp3_path: path to the mp3 file
        t_start, t_stop: time endpoints in seconds of the audio to retrieve;
            both are forwarded unchanged to `mono_wave`

    Output:
        (audio_segment, wave) where
        audio_segment: the full pydub AudioSegment loaded from `mp3_path`
        wave: the mono-channel waveform returned by `mono_wave` for the
            given time bounds

    Notes:
        No error checking is done here; make sure the path and the time
        endpoints make sense.
    """
    # decode the whole file once, then derive the mono waveform from it
    segment = AudioSegment.from_mp3(mp3_path)
    return segment, mono_wave(segment, t_start, t_stop)
    

def mono_wave(audio_segment, t_start = None, t_stop = None):
    """Extract a mono, 1-D int16 waveform from an audio segment.

    Input:
        audio_segment: object exposing `raw_data` (bytes of interleaved
            samples), `channels` and `frame_rate`, e.g. a pydub AudioSegment
        t_start: start time in seconds (int or float); None means the
            beginning of the audio
        t_stop: stop time in seconds (int or float); None means the end of
            the audio

    Output:
        1-D numpy int16 array with the samples between the two time bounds.
        Stereo input is averaged into one channel; for any other channel
        count only the first channel is kept.

    Notes:
        The raw bytes are interpreted as 16-bit samples.
        NOTE(review): assumes the segment's sample width is 2 bytes -- confirm
        for inputs that are not 16-bit.
    """
    # np.frombuffer replaces the deprecated binary mode of np.fromstring
    raw_data = np.frombuffer(audio_segment.raw_data, dtype=np.int16)

    # raw data is interleaved channels [amp1 amp2 amp1 amp2 ...]; view it as
    # one row per frame and one column per channel (a trailing partial frame,
    # if any, is dropped)
    channels = audio_segment.channels
    n_frames = raw_data.size // channels
    frames = raw_data[:n_frames * channels].reshape(n_frames, channels)

    if channels == 2:
        # halve each channel before summing so the int16 range cannot overflow
        sample = (frames[:, 0] / 2 + frames[:, 1] / 2).astype(np.int16)
    else:
        # just use the first channel (could improve for multi-channel audio);
        # copy so the result is a writable 1-D array, not a read-only view
        sample = frames[:, 0].copy()

    # sample rate (Hz)
    sr = audio_segment.frame_rate

    # Slice bounds must be integers: convert seconds to whole sample offsets.
    # A missing bound becomes an open-ended slice, so no float duration is
    # ever used as an index.
    start = 0 if t_start is None else int(round(t_start * sr))
    stop = None if t_stop is None else int(round(t_stop * sr))

    return sample[start:stop]

### Making a spectrogram

In [8]:
# One example track per genre, taken at fixed positions in the path lists
inst_song = instrumental_paths[5]
elec_song = electronic_paths[111]

# Decode both mp3 files
inst_audio = AudioSegment.from_mp3(inst_song)
elec_audio = AudioSegment.from_mp3(elec_song)

# STFT of each track; every call yields (waveform, frequencies, times, STFT matrix)
(inst_wav, inst_freq, inst_time, inst_Zxx) = mono_stft(inst_audio)
(elec_wav, elec_freq, elec_time, elec_Zxx) = mono_stft(elec_audio)

# Data shape (should be the same for both!)
print(
    "Data Shape:\n  Instrumental: {}\n  Electronic: {}".format(
        inst_Zxx.shape,
        elec_Zxx.shape,
    )
)

Data Shape:
  Instrumental: (129, 3447)
  Electronic: (129, 3447)
