<a href="https://colab.research.google.com/github/msmsm104/Dacon_covid19/blob/main/Preprocessing/mfcc/Data_preprocessing_1_0611(32).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Wav파일

    - data frame에 feature로 표현.
    - Dacon baseline 참조.

In [None]:
!pip install librosa

Collecting librosa
  Downloading librosa-0.9.1-py3-none-any.whl (213 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.1/213.1 KB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
Collecting pooch>=1.0
  Downloading pooch-1.6.0-py3-none-any.whl (56 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 KB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting soundfile>=0.10.2
  Downloading SoundFile-0.10.3.post1-py2.py3.cp26.cp27.cp32.cp33.cp34.cp35.cp36.pp27.pp32.pp33-none-macosx_10_5_x86_64.macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.whl (613 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m613.4/613.4 KB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m[36m0:00:01[0m
Collecting numba>=0.45.1
  Downloading numba-0.55.2-cp38-cp38-macosx_10_14_x86_64.whl (2.3 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m

### Import

In [None]:
import random
import pandas as pd
import numpy as np
import os
import librosa

from tqdm.auto import tqdm

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder

import warnings
warnings.filterwarnings(action='ignore') 

### Hyperparameter Settings

In [None]:
# Global configuration shared by the cells below.
CFG = dict(
    SR=16000,    # sampling rate passed to librosa.load
    N_MFCC=32,   # number of MFCC vectors to extract per audio file
    SEED=41,     # value handed to seed_everything
)

### Fix Random Seed

In [None]:
def seed_everything(seed):
    """Seed the random number sources used by this notebook.

    Seeds NumPy's global RNG and Python's ``random`` module, and stores the
    seed (as a string) in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Seed value applied to every generator.
    """
    np.random.seed(seed)
    random.seed(seed)
    # NOTE(review): setting PYTHONHASHSEED here presumably only influences
    # interpreters launched afterwards, not the current process - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)

seed_everything(CFG['SEED']) # Fix the random seed

### Data Preprocessing_1

In [None]:
# Load the train/test metadata tables; their `id` column is used below to
# locate the matching .wav file for each row.
train_df = pd.read_csv('./Raw/open/train_data.csv')
test_df = pd.read_csv('./Raw/open/test_data.csv')

In [None]:
def get_mfcc_feature(df, data_type, save_path, root_folder='./Raw/open'):
    """Extract mean MFCC features for every audio file and save them as CSV.

    For each value in ``df['id']`` the file
    ``<root_folder>/<data_type>/<id zero-padded to 5 digits>.wav`` is loaded
    with librosa at ``CFG['SR']``, ``CFG['N_MFCC']`` MFCC vectors are
    extracted, and each vector is averaged into a single feature. The
    resulting ``mfcc_1 .. mfcc_N`` columns are appended to the columns of
    ``df`` and the combined table is written to ``save_path``.

    Args:
        df: DataFrame holding the metadata; must contain an ``id`` column.
            It is not modified.
        data_type: Name of the sub-folder of ``root_folder`` that holds the
            .wav files (the notebook uses ``'train'`` and ``'test'``).
        save_path: Destination CSV path. If it already exists, a message is
            printed and nothing is recomputed.
        root_folder: Directory that contains the ``data_type`` sub-folders.

    Returns:
        None. The result is delivered through the CSV written to ``save_path``.
    """
    # Cheap cache guard: skip the extraction when the output is already there.
    if os.path.exists(save_path):
        print(f'{save_path} is exist.')
        return

    n_mfcc = CFG['N_MFCC']
    # The audio folder does not change per file, so build it once.
    wav_dir = os.path.join(root_folder, data_type)

    features = []
    for uid in tqdm(df['id']):
        path = os.path.join(wav_dir, str(uid).zfill(5) + '.wav')

        # Load the wav file, resampled to the configured sampling rate.
        y, sr = librosa.load(path, sr=CFG['SR'])

        # Extract the MFCCs and use the mean of each one as a feature.
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        features.append([np.mean(coef) for coef in mfcc])

    mfcc_df = pd.DataFrame(
        features,
        columns=['mfcc_' + str(i) for i in range(1, n_mfcc + 1)],
    )
    # concat(axis=1) aligns on index labels. `features` is ordered like the
    # rows of `df`, so pair them positionally by resetting the index first;
    # otherwise a filtered/sampled `df` would yield NaN-padded, misaligned rows.
    merged = pd.concat([df.reset_index(drop=True), mfcc_df], axis=1)
    merged.to_csv(save_path, index=False)
    print('Done.')

In [None]:
# Extract MFCC features for both splits and write them to CSV
# (each call is a no-op if its output file already exists).
get_mfcc_feature(train_df, 'train', './train_mfcc_data.csv')
get_mfcc_feature(test_df, 'test', './test_mfcc_data.csv')

  0%|          | 0/3805 [00:00<?, ?it/s]

Done.


  0%|          | 0/5732 [00:00<?, ?it/s]

Done.


In [None]:
# Read the saved feature tables (metadata + mfcc_* columns) back from disk.
train_df_2 = pd.read_csv('./train_mfcc_data.csv')
test_df_2 = pd.read_csv('./test_mfcc_data.csv')

In [None]:
# Display the reloaded training feature table.
train_df_2

Unnamed: 0,id,age,gender,respiratory_condition,fever_or_muscle_pain,covid19,mfcc_1,mfcc_2,mfcc_3,mfcc_4,...,mfcc_23,mfcc_24,mfcc_25,mfcc_26,mfcc_27,mfcc_28,mfcc_29,mfcc_30,mfcc_31,mfcc_32
0,1,24,female,0,1,0,-274.93472,29.345425,-19.152718,-7.836880,...,-2.320942,2.150005,-0.925417,2.116030,-0.192730,2.417784,-4.736650,1.237788,-1.600426,-1.462419
1,2,51,male,0,0,0,-311.56317,52.478150,-0.098957,-11.070889,...,-6.494778,0.545812,-6.261986,-2.384403,-6.743353,0.255105,-0.966994,-2.113054,-2.433555,0.881178
2,3,22,male,0,0,0,-438.29000,46.588910,-22.689060,-3.607528,...,-0.156510,-1.682014,2.618637,1.244486,-0.074025,-0.964130,-0.735731,-0.420304,0.795621,0.411339
3,4,29,female,1,0,0,-368.42610,46.939358,-7.443123,-3.694383,...,-0.155855,3.839285,-2.503368,2.750743,1.758510,2.094587,0.295868,1.737648,-0.654136,1.847976
4,5,23,male,0,0,0,-535.19446,7.165523,-7.422007,2.231186,...,-0.144311,-0.413284,-1.452623,0.235582,-0.973687,0.777570,-0.735323,1.141641,-0.497988,1.190929
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3800,3801,53,male,0,0,0,-328.48767,68.190380,-40.383747,2.297682,...,-0.168663,1.255295,-4.972386,-0.053485,1.204618,3.961083,-4.195477,2.746365,-4.138545,-2.572868
3801,3802,25,male,0,0,0,-386.25732,73.931350,-12.114974,-0.500643,...,-2.914732,4.907058,1.142662,6.096552,2.885290,3.611389,-0.631856,3.847093,2.540035,5.938597
3802,3803,26,female,0,0,0,-347.20593,58.544130,12.969809,27.973340,...,-2.396356,-2.741529,-2.629161,-1.906816,-3.183893,-1.446224,-0.021817,-1.601471,-1.630300,-1.382297
3803,3804,27,female,0,0,0,-179.11195,70.697860,-14.571251,-10.143574,...,-7.733275,5.680011,-2.057645,1.684632,-1.205147,-0.134177,-4.729319,0.557955,-5.249906,-0.278406


In [None]:
# Display the reloaded test feature table.
test_df_2

Unnamed: 0,id,age,gender,respiratory_condition,fever_or_muscle_pain,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,...,mfcc_23,mfcc_24,mfcc_25,mfcc_26,mfcc_27,mfcc_28,mfcc_29,mfcc_30,mfcc_31,mfcc_32
0,3806,48,female,1,0,-619.98790,9.328649,0.821094,2.373343,-0.171163,...,-0.313271,-0.053711,-0.029855,0.043562,-0.405293,-0.246975,-0.110937,0.019196,-0.155829,-0.130705
1,3807,24,female,0,0,-493.60570,8.492651,-2.758263,-2.801817,-2.770232,...,-0.677968,-0.643835,-0.822677,-0.113844,-1.210391,0.273594,-0.685921,0.440101,-0.960277,0.371881
2,3808,29,male,0,0,-1058.92520,-2.354820,-7.397019,-0.686011,-5.422640,...,-0.005907,3.130717,2.591598,0.466579,-0.126740,1.610208,-3.389459,-2.868738,-1.301445,-2.902404
3,3809,39,female,0,0,-362.16165,27.119738,-9.231675,9.747519,-1.013352,...,-3.150050,-0.271802,0.034901,0.414227,-2.836006,0.105023,0.518484,2.139494,0.944032,1.029428
4,3810,34,male,0,0,-387.59268,45.165874,-14.454832,-2.965205,-9.931789,...,-4.803989,-3.594219,0.289146,-5.298121,1.614137,3.108044,-5.373443,0.106868,2.054100,-1.994548
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5727,9533,43,male,0,0,-363.31592,48.014732,6.568864,11.673552,-7.893135,...,-2.381601,1.899532,1.335172,-0.633322,1.076864,-1.657736,-5.816984,-1.341339,-1.719530,-0.579730
5728,9534,48,female,0,1,-650.22455,2.844074,2.063639,5.160135,-4.574105,...,-1.014055,0.853542,-0.998176,0.421669,-0.817565,1.201457,-1.148051,0.335183,0.076959,-0.215559
5729,9535,44,female,0,0,-335.09128,54.100250,-15.533144,13.702870,-9.206227,...,-2.222737,-2.004309,-2.250272,-1.290327,-2.485475,-1.268755,-4.346352,0.055307,-1.181318,0.464578
5730,9536,25,female,0,0,-534.98730,17.522040,-9.687601,5.585782,-8.310497,...,-1.565546,-1.021776,-1.453681,-1.553367,-1.581636,-1.428654,-1.613187,-0.816451,-0.766406,0.130751
