# MFCCs File Creation

## Mount Drive

In [5]:
# Optional (Google Colab only): uncomment the two lines below to mount your Google Drive so the notebook can read and write the data
# from google.colab import drive
# drive.mount("/content/drive")

## Import Libraries

In [1]:
import json
import librosa
import numpy as np
import pandas as pd
import os
import warnings
warnings.filterwarnings("ignore")

## Extract Features from MP3s

In [4]:
import zipfile
import os

# Unpack the MP3 archive, but only when the extracted folder is not there yet
mp3_dir_present = os.path.exists('./Datasets/MP3s/')
if not mp3_dir_present:
    archive = zipfile.ZipFile('./Datasets/MP3s.zip', 'r')
    # The context manager closes the archive once extraction is finished
    with archive:
        archive.extractall('./Datasets/')

# Directory the generated MFCC file is written to.
PATH = "./Datasets/MFCCs/"

# Extraction settings
N_MFCC = 30          # number of MFCC coefficients
NUM_SEGMENTS = 30    # number of segments each audio track is cut into
TRACK_DURATION = 30  # seconds of audio used, counted from the start of the MP3
SAMPLE_RATE = 22050  # sample rate the audio is loaded at
SAMPLES_PER_TRACK = TRACK_DURATION * SAMPLE_RATE
SAMPLES_PER_SEGMENT = SAMPLES_PER_TRACK // NUM_SEGMENTS

# The output file name encodes the settings above,
# e.g. "mfcc20_NS10_D30_RNN-LSTM.json" stands for:
#   mfcc20 -> 20 coefficients
#   NS10   -> the audio track is cut into 10 segments
#   D30    -> the overall audio track duration is 30 seconds
FILE_NAME = f"mfcc{N_MFCC}_NS{NUM_SEGMENTS}_D{TRACK_DURATION}_RNN-LSTM.json"

# Container for the results: one label and one MFCC matrix per segment
data_json = dict(labels=[], mfcc=[])

# Read the ground-truth file used to look up the label of every track.
# The row of a track is the first row in which any column equals the track id;
# the label is taken from the second column of that row.
file_name = "./Datasets/trackGenre.csv"
data = pd.read_csv(file_name)

# Create the output directory up front so the final write cannot fail on a
# missing folder after all the features have been computed.
os.makedirs(PATH, exist_ok=True)

# Loop through all the MP3 files, compute the MFCCs of every segment and store
# them in data_json together with the label of the track.
# The listing is sorted so the order of the output entries is reproducible.
for file in sorted(os.listdir("./Datasets/MP3s")):
    # Ignore anything that is not an MP3 (e.g. hidden OS files)
    stem, extension = os.path.splitext(file)
    if extension.lower() != ".mp3":
        continue

    # The id of the track is the file name with the .mp3 removed
    try:
        track_id = int(stem)
    except ValueError:
        print(f"Skipping {file}: file name is not a numeric track id")
        continue

    # Look the track up BEFORE decoding the audio, so that unlabeled files
    # do not cost a (slow) MP3 decode.
    matching_rows = np.flatnonzero((data == track_id).any(axis=1).to_numpy())
    if matching_rows.size == 0:
        print(f"Skipping {file}: track id {track_id} not found in {file_name}")
        continue
    genre = data.iloc[int(matching_rows[0]), 1]
    if isinstance(genre, np.generic):
        # NumPy scalars are not JSON serializable; convert to a plain Python value
        genre = genre.item()

    song_name = f"./Datasets/MP3s/{file}"
    y, sr = librosa.load(song_name, sr=SAMPLE_RATE, duration=TRACK_DURATION)

    # Process all segments of the audio file
    for d in range(NUM_SEGMENTS):
        # Calculate start and finish sample for the current segment
        start = SAMPLES_PER_SEGMENT * d
        finish = start + SAMPLES_PER_SEGMENT

        # Tracks shorter than TRACK_DURATION run out of samples early. Stop at
        # the first incomplete segment so that every stored MFCC matrix has
        # the same number of frames.
        if finish > len(y):
            break

        # Extract the MFCCs; the audio must be passed by keyword (y=...),
        # recent librosa releases reject it as a positional argument.
        mfcc = librosa.feature.mfcc(y=y[start:finish], sr=sr, n_mfcc=N_MFCC)

        # Transposed so that each row holds the coefficients of one frame
        data_json["labels"].append(genre)
        data_json["mfcc"].append(mfcc.T.tolist())

with open(os.path.join(PATH, FILE_NAME), "w") as fp:
    json.dump(data_json, fp, indent=4)