In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import sklearn
import os
from os.path import join

In [None]:
#DATA Load

# Root folder of the competition dataset; the CSVs list the audio files of each split.
DATA_PATH = join('/kaggle','input','2021-ml-tp-p6')
pd_train = pd.read_csv(join(DATA_PATH, 'train_data.csv'))
pd_test = pd.read_csv(join(DATA_PATH, 'test_data.csv'))

# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would append a stray "None None" line to the output).
pd_train.info()
pd_test.info()

# **Feature extraction**

 1. Sampling&Quantization(continuous audio signal) --> Discrete audio signal
 2. Short Time Fourier Transform(Discrete audio signal) --> Spectrogram
 3. Mel-Filter(Spectrogram) --> Mel-Spectrogram
 4. Discrete Cosine Transform(Mel-Spectrogram) --> Mel Frequency Cepstrum Coefficient
 
All of the above processes are implemented through functions provided by the librosa library. The following functions are used:
1. librosa.load : Reading continuous audio signal (.wav file) as discrete audio signal.
2. librosa.stft : Splits the discrete audio signal into windowed frames and performs an FFT (Fast Fourier Transform) on each frame.  
[stft_documentation](https://librosa.org/doc/latest/generated/librosa.stft.html?highlight=stft)
3. librosa.feature.melspectrogram : Creates the Mel-Spectrogram by applying a Mel filter to the spectrogram obtained from the stft function.  
[melspectrogram_documentation](https://librosa.org/doc/latest/generated/librosa.feature.melspectrogram.html?highlight=melspectrogram)
4. librosa.feature.mfcc : MFCC is generated by performing DCT with the Mel-spectrogram obtained through the melspectrogram function as input.  [mfcc_documentation](https://librosa.org/doc/latest/generated/librosa.feature.mfcc.html?highlight=mfcc)

In [None]:
import librosa
import glob, pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import librosa, librosa.display 


def extract_feature(file_name, n_fft=462, sr=22050):
    """Extract three time-averaged speech features from one audio file.

    Parameters
    ----------
    file_name : str
        Path of the audio (.wav) file; handed to ``librosa.load``.
    n_fft : int, optional
        STFT window length in samples. Speech is treated as stationary over
        roughly 20-40 ms, so the window should fall in that range; the default
        462 is about 21 ms at 22050 Hz.
    sr : int, optional
        Sampling rate the audio is resampled to while loading.

    Returns
    -------
    tuple of three 1-D numpy arrays
        ``(spectrogram_feature, mel_spectrogram_feature, mfcc_feature)``, each
        obtained by averaging the corresponding 2-D representation over the
        frame (time) axis, so the length equals the number of frequency bins /
        mel bands / MFCC coefficients.
    """
    signal, sample_rate = librosa.load(file_name, sr=sr)

    # ---------- step 3. Spectrogram ----------
    # The STFT is complex-valued; np.abs turns it into a real magnitude
    # spectrogram of shape (frequency bins, frames).
    #
    # n_fft tuning history (best leaderboard score search):
    # 92ms: n_fft = 2048, 23ms: n_fft = 512
    # 20ms: 445.2 => 446 => 467 => 445 => 512 => 660 => 480 => 462
    spectrogram = np.abs(librosa.stft(signal, n_fft=n_fft))
    # Averaging over axis=1 (frames) leaves one value per frequency bin.
    spectrogram_feature = np.mean(spectrogram, axis=1)

    # ---------- step 4. Mel-spectrogram ----------
    # The mel filter bank is applied to the power spectrogram (magnitude squared).
    # The sampling rate actually used for loading is passed on so the filter
    # bank stays consistent with the audio when `sr` is not the librosa default.
    power_spectrogram = np.square(spectrogram)
    mel_spectrogram = librosa.feature.melspectrogram(S=power_spectrogram, sr=sample_rate)
    # Convert power values to decibels before averaging over frames.
    mel_spectrogram_db = librosa.power_to_db(mel_spectrogram)
    mel_spectrogram_feature = np.mean(mel_spectrogram_db, axis=1)

    # ---------- step 5. MFCC ----------
    # MFCCs are computed from the decibel-scaled mel-spectrogram, then averaged
    # over frames like the other two features.
    mfcc = librosa.feature.mfcc(S=mel_spectrogram_db)
    mfcc_feature = np.mean(mfcc, axis=1)

    return spectrogram_feature, mel_spectrogram_feature, mfcc_feature

# **Loading Data**

In [None]:
#Load the data and extract features for each sound file
from tqdm import tqdm
def load_data(data_info, isTrain=True):
    """Extract the speech features of every audio file listed in ``data_info``.

    Parameters
    ----------
    data_info : DataFrame-like
        Must provide a 'file_name' column; when ``isTrain`` is true it must
        also provide an 'emotion' column holding the labels.
    isTrain : bool, optional
        True  -> read files from DATA_PATH/train_data/train_data and return labels too.
        False -> read files from DATA_PATH/test_data/test_data.

    Returns
    -------
    isTrain=True : (features, labels)
        ``features`` is a dict with keys 'spectrogram', 'mel' and 'mfcc', each
        a list holding one feature vector per file; ``labels`` is a numpy
        array built from the 'emotion' column.
    isTrain=False : features
        The same dict, without labels.
    """
    # Both splits are stored as '<split>/<split>/<file>' under DATA_PATH.
    split_dir = 'train_data' if isTrain else 'test_data'
    file_list = data_info['file_name']

    # Read the labels before the slow extraction loop so that a missing
    # 'emotion' column is reported immediately.
    labels = np.array(list(data_info['emotion'])) if isTrain else None

    features = {'spectrogram': [], 'mel': [], 'mfcc': []}  # Dictionary of voice features

    # Iterating the column directly (instead of a zip object) lets tqdm show
    # the total number of files for both splits.
    for file_name in tqdm(file_list):
        audio_path = join(DATA_PATH, split_dir, split_dir, file_name)
        spectrogram_feature, mel_spectrogram_feature, mfcc_feature = extract_feature(audio_path)
        features['spectrogram'].append(spectrogram_feature)
        features['mel'].append(mel_spectrogram_feature)
        features['mfcc'].append(mfcc_feature)

    if isTrain:
        return features, labels
    return features

# Run feature extraction over every file listed in the train and test CSVs.
# The train call also returns the emotion labels as y_train.
train_data, y_train = load_data(pd_train, isTrain=True)
test_data = load_data(data_info=pd_test, isTrain=False)

# Model training and inference
1. Obtained **spectrogram** through Short Time Fourier Transform (stft) for voice signal,
2. By applying mel-filter to the obtained spectrogram, **mel-spectrogram** was obtained.
3. Finally, **mfcc** was calculated by taking discrete cosine transform (DCT) on the mel-spectrogram.

All of the features obtained above are features that can be used for model learning in various tasks using voice data (human voice classification, emotion classification through voice, etc.).
Compare how the accuracy of the model **differs depending on the feature** used for training.

- The classifier uses sklearn's **RandomForestClassifier**.

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Submission template; its 'emotion' column is overwritten with each model's predictions.
sample = pd.read_csv(join(DATA_PATH,'sample_submit.csv'))


# Train one RandomForest per feature type and write one submission file per
# feature ('spectrogram.csv', 'mel.csv', 'mfcc.csv') to the working directory.
for feature_name in train_data.keys():
    # Spectrogram, mel-spectrogram, and mfcc features exist in the train_data variable in dictionary form.
    x_train = np.array(train_data[feature_name])
    x_test = np.array(test_data[feature_name])

    rlf = RandomForestClassifier(criterion = 'entropy', random_state = 1) # setting parameters
    rlf.fit(x_train, y_train) # training model
    predict = rlf.predict(x_test) # predict

    # ---- Experiment log (Kaggle leaderboard scores) ----
    # class_weight => default (Points drop when used)
    # min_samples_split => default (Points drop when used)
    # min_samples_leaf => default (Points drop when used)

    # baseline 1 (spectrogram) : 0.46064
    # n_fft = 445, criterion = 'entropy', random_state = 1 // kaggle: 0.46527
    # n_fft = 512, criterion = 'entropy', random_state = 1 // kaggle : 0.48148

    # baseline 2 (mel-spectrogram) : 0.47222
    # n_fft = 445, criterion = 'entropy', random_state = 1 // kaggle: 0.46990
    # n_fft = 445, criterion = 'entropy', max_depth = 800, random_state = 1 // kaggle: 0.46990
    # n_fft = 445, criterion = 'gini', random_state = 1 // kaggle: 0.44907
    # n_fft = 512, criterion = 'entropy', random_state = 1 // kaggle: 0.49305 **
    # n_fft = 660, criterion = 'entropy', random_state = 1 // kaggle: 0.47453
    # n_fft = 480, criterion = 'entropy', random_state = 1 // kaggle: 0.48148
    # n_fft = 462, criterion = 'entropy', random_state = 1 // kaggle: 0.47453

    # baseline 3 (RF_baseline) : 0.51620
    # NOTE(review): presumably the mfcc feature (third key of train_data) - confirm.
    # n_fft = 445, criterion = 'entropy', random_state = 1 // kaggle: 0.56250
    # n_fft = 445, criterion = 'entropy', max_depth = 800, random_state = 1 // kaggle: 0.56250
    # n_fft = 512, criterion = 'entropy', random_state = 1 // kaggle : 0.56944
    # n_fft = 660, criterion = 'entropy', random_state = 1 // kaggle: 0.55787
    # n_fft = 480, criterion = 'entropy', random_state = 1 // kaggle: 0.57407 **
    # n_fft = 462, criterion = 'entropy', random_state = 1 // kaggle: 0.57175

    # predict is already 1-D with one label per test row, so it can be assigned
    # to the column directly; no reshape to a column vector is needed.
    sample['emotion'] = predict
    # Written relative to the current working directory, one file per feature.
    sample.to_csv(feature_name + '.csv', index=False, header=True)