## Wav파일

- Wav 파일에서 추출한 MFCC를 data frame의 feature로 표현.
- Dacon baseline 참조.

### Import

In [2]:
import random
import pandas as pd
import numpy as np
import os
import librosa

from tqdm.auto import tqdm

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder

import warnings
# Silence every warning category for the rest of the session; comment this out
# while debugging, since it also hides deprecation and numerical warnings.
warnings.filterwarnings(action='ignore') 

### Hyper parameter Setting

In [3]:
# Notebook-wide settings read by the seeding and feature-extraction cells.
CFG = dict(
    SR=16000,    # sample rate passed to librosa.load
    N_MFCC=32,   # number of MFCC coefficients to extract per recording
    SEED=41,     # value passed to seed_everything
)

### Fixed Random Seed

In [4]:
def seed_everything(seed):
    """Seed the global random number generators used in this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global RNG with
    ``seed`` and stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment
    variable.

    Args:
        seed: Integer seed value.
    """
    # NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this
    # assignment presumably only influences child processes - confirm if
    # hash-order reproducibility in this kernel matters.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)

seed_everything(CFG['SEED']) # Fix the random seed for reproducibility

### Data Preprocessing_1

In [7]:
# Load the metadata table of the unlabeled recordings; its `id` column is what
# get_mfcc_feature uses to locate each wav file.
unlabeled_df = pd.read_csv('../Data/Raw/open/unlabeled_data.csv')

In [11]:
def get_mfcc_feature(df, data_type, save_path):
    """Extract mean MFCC features for every recording in ``df`` and save them as CSV.

    For each value in ``df['id']`` the file
    ``../Data/Raw/open/<data_type>/<id zero-padded to 5 digits>.wav`` is loaded
    at ``CFG['SR']``, ``CFG['N_MFCC']`` MFCC coefficients are computed, and each
    coefficient is averaged over time. The averages are appended to ``df`` as
    columns ``mfcc_1`` .. ``mfcc_<N_MFCC>`` and the combined table is written to
    ``save_path`` without the index.

    If ``save_path`` already exists nothing is computed, which makes the cell
    cheap to re-run.

    Args:
        df: DataFrame with an ``id`` column; one wav file is read per row.
        data_type: Sub-folder of the raw data directory holding the wav files.
        save_path: Destination CSV path. Missing parent folders are created.

    Returns:
        None. The result is written to ``save_path``.
    """
    # Data folder path
    root_folder = '../Data/Raw/open'
    if os.path.exists(save_path):
        print(f'{save_path} is exist.')
        return

    # Create the output folder before the slow extraction loop so a missing
    # directory cannot make the final write fail after all audio is processed.
    os.makedirs(os.path.dirname(save_path) or '.', exist_ok=True)

    # The audio folder is the same for every file, so build it once.
    root_path = os.path.join(root_folder, data_type)

    features = []
    for uid in tqdm(df['id']):
        path = os.path.join(root_path, str(uid).zfill(5)+'.wav')

        # Load the wav file with librosa, resampled to the configured rate
        y, sr = librosa.load(path, sr=CFG['SR'])

        # Extract the MFCCs with librosa
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CFG['N_MFCC'])

        # Use the mean of each MFCC coefficient as the feature
        features.append([np.mean(coefficient) for coefficient in mfcc])

    # Append the extracted audio features to the original self-report dataframe.
    # The feature frame reuses df's index: concat(axis=1) aligns on index labels,
    # so a default 0..n-1 index would misalign rows (and pad with NaN) whenever
    # df has been filtered, split or otherwise re-indexed.
    mfcc_columns = ['mfcc_'+str(x) for x in range(1,CFG['N_MFCC']+1)]
    mfcc_df = pd.DataFrame(features, columns=mfcc_columns, index=df.index)
    merged = pd.concat([df, mfcc_df], axis=1)
    merged.to_csv(save_path, index=False)
    print('Done.')

In [12]:
# Write the features to Data/Interim, the location the loading cell below reads
# from, so a fresh Restart & Run All does not depend on moving the CSV by hand.
os.makedirs('../Data/Interim', exist_ok=True)
get_mfcc_feature(unlabeled_df, 'unlabeled', '../Data/Interim/unlabeled_mfcc_data.csv')

  0%|          | 0/1867 [00:00<?, ?it/s]

Done.


In [17]:
# Reload the saved feature table (metadata columns plus the mfcc_* columns)
# from the interim data folder.
unlabeled_df_2 = pd.read_csv('../Data/Interim/unlabeled_mfcc_data.csv')

In [22]:
# Display the reloaded table to check the appended MFCC columns.
unlabeled_df_2

Unnamed: 0,id,age,gender,respiratory_condition,fever_or_muscle_pain,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,...,mfcc_23,mfcc_24,mfcc_25,mfcc_26,mfcc_27,mfcc_28,mfcc_29,mfcc_30,mfcc_31,mfcc_32
0,9538,35,male,1,0,-1120.77430,3.262080,-1.125641,-0.140864,-1.190534,...,-0.328535,-0.153679,-0.185454,-0.072446,-0.184721,0.007735,0.042908,0.067729,-0.103394,0.029599
1,9539,40,female,0,1,-563.66266,40.990430,11.358990,10.386800,0.678367,...,-3.095698,-1.243139,-1.159546,-1.044881,-1.760962,-0.561652,-1.371108,-0.277004,-1.092461,-1.453109
2,9540,33,male,0,0,-471.12717,14.603409,-2.081593,0.689829,-2.041967,...,-1.585414,1.074778,-0.647867,0.288036,-1.733079,0.312302,-0.947721,1.203055,0.150240,0.336698
3,9541,35,male,0,0,-479.80826,9.709410,-4.041464,-1.983426,-4.197571,...,-1.153454,-0.358307,-1.067606,0.646617,-1.439341,0.566152,-0.633585,-0.243688,-0.643032,0.327573
4,9542,54,female,0,0,-735.22080,34.609220,9.149350,14.897414,-1.717604,...,-6.462830,3.713589,-5.910144,3.659088,-5.101023,3.369651,-4.941661,3.173295,-4.588366,2.735559
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1862,11400,8,female,0,0,-460.73500,22.145943,-5.924560,6.891037,-4.565688,...,-1.168158,0.281287,-0.027616,-0.804502,-0.053456,1.071405,-1.741596,-0.108647,-0.523533,1.001609
1863,11401,29,male,0,1,-331.94794,34.791584,6.310560,7.249037,-11.100346,...,-6.816239,-1.239252,-7.371759,-4.854304,-5.922822,-2.728520,-0.768779,-1.207135,0.277877,2.442800
1864,11402,17,male,0,0,-429.88678,26.009357,-0.204722,1.174492,-4.136636,...,3.359710,1.737853,-1.403238,-3.027201,-2.623329,1.607557,2.409688,3.169971,0.131990,0.524453
1865,11403,22,male,0,0,-647.73620,27.062975,7.233326,16.181942,4.624071,...,-1.979098,-0.542786,-1.097919,0.407303,-0.257606,-0.489647,-1.584274,0.220399,-1.117770,-1.005962
