In [None]:
pip install ffmpeg

In [None]:
import os
import librosa
import math
import json

# Where the audio is read from and where the extracted features are written.
DATASET_PATH = "genres_original"
JSON_PATH = "data_10.json"

# Audio geometry: every track is assumed to span DURATION seconds at
# SAMPLE_RATE samples per second.
SAMPLE_RATE = 22050
DURATION = 30  # seconds
SAMPLES_PER_TRACK = DURATION * SAMPLE_RATE

def save_mfcc(dataset_path, json_path, n_mfcc=13, n_fft=2048, hop_length=512, num_segments=5):
    """Extract MFCCs from every ``.wav`` file under ``dataset_path`` and save them as JSON.

    Each sub-directory of ``dataset_path`` that contains ``.wav`` files is
    treated as one genre. Every track is cut into ``num_segments`` equally
    sized segments, so one audio file yields several training examples.

    The JSON document written to ``json_path`` has three keys:

    * ``"mapping"``: genre names; a genre's position in this list is its
      numeric label.
    * ``"mfcc"``: the training inputs, one list of MFCC vectors per stored
      segment.
    * ``"labels"``: the expected output for each stored segment, as an index
      into ``"mapping"``.

    Directories and files are visited in sorted order so the label numbering
    and the sample order are reproducible across machines and runs.

    Args:
        dataset_path: Root directory holding one sub-directory per genre.
            Files directly inside the root are ignored.
        json_path: Path of the JSON file to write.
        n_mfcc: Number of MFCC coefficients to extract per frame.
        n_fft: FFT window size passed to the MFCC extraction.
        hop_length: Hop, in samples, between consecutive analysis frames.
        num_segments: Number of segments each track is divided into.

    Raises:
        ValueError: If ``num_segments`` or ``hop_length`` is smaller than 1.
    """
    # Fail early with a clear message instead of a ZeroDivisionError below.
    if num_segments < 1:
        raise ValueError("num_segments must be a positive integer")
    if hop_length < 1:
        raise ValueError("hop_length must be a positive integer")

    data = {
        "mapping": [],
        "mfcc": [],
        "labels": []
    }

    num_samples_per_segment = int(SAMPLES_PER_TRACK / num_segments)
    # Rounded up to the next integer, e.g. 1.2 -> 2.
    expected_num_mfcc_vectors_per_segment = math.ceil(num_samples_per_segment / hop_length)

    # dirPath: folder currently being visited
    # dirNames: sub-folders of dirPath
    # fileNames: files inside dirPath
    for dirPath, dirNames, fileNames in os.walk(dataset_path):
        # os.walk gives no ordering guarantee; sorting in place fixes the
        # traversal order and therefore the genre -> label assignment.
        dirNames.sort()

        # Skip the root level: genres live in its sub-folders. Compare by
        # value, not identity -- equal strings need not be the same object.
        if dirPath == dataset_path:
            continue

        # Only .wav files are processed; a folder without any is not a genre
        # and must not occupy a label slot.
        wav_files = sorted(f for f in fileNames if f.endswith('.wav'))
        if not wav_files:
            continue

        # The semantic label is the folder name, e.g. genre/blues => "blues".
        # basename works with whichever path separator the platform uses.
        semantic_label = os.path.basename(os.path.normpath(dirPath))
        data["mapping"].append(semantic_label)
        # The label is by construction the genre's index in the mapping.
        label = len(data["mapping"]) - 1
        print("\nProcessing {}".format(semantic_label))

        # Process all the files of this genre
        for f in wav_files:
            # load audio file
            file_path = os.path.join(dirPath, f)
            signal, sr = librosa.load(file_path, sr=SAMPLE_RATE)

            # Process segments, extract MFCC, and store the data
            for s in range(num_segments):
                start_sample = num_samples_per_segment * s  # s=0 -> 0
                finish_sample = start_sample + num_samples_per_segment
                segment = signal[start_sample:finish_sample]

                # The track ended before this segment starts: there is
                # nothing left to analyse here or in any later segment.
                if len(segment) == 0:
                    break

                mfcc = librosa.feature.mfcc(y=segment,
                                            sr=sr,
                                            n_fft=n_fft,
                                            n_mfcc=n_mfcc,
                                            hop_length=hop_length)

                # Transpose so that len(mfcc) is the number of MFCC vectors.
                mfcc = mfcc.T

                # Store the segment only if it has the expected length, so
                # every stored example has the same number of vectors.
                if len(mfcc) == expected_num_mfcc_vectors_per_segment:
                    data["mfcc"].append(mfcc.tolist())
                    data["labels"].append(label)
                    print("{}, segment: {}".format(file_path, s + 1))

    # Save to JSON file
    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)

if __name__ == "__main__":
    # Script entry point: build the feature file for the configured dataset,
    # splitting every track into 10 segments.
    save_mfcc(dataset_path=DATASET_PATH, json_path=JSON_PATH, num_segments=10)
    print("Finished Processing All Files")
                             
    