<a href="https://colab.research.google.com/github/msmsm104/Dacon/blob/TIL/Preprocessing/Data_preprocessing_1_0612(64).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Wav파일

- 각 Wav 파일에서 추출한 오디오 feature를 data frame의 컬럼으로 표현.
- Dacon baseline 코드 참조.

### Import

In [None]:
import random
import pandas as pd
import numpy as np
import os
import librosa

from tqdm.auto import tqdm

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder

import warnings
warnings.filterwarnings(action='ignore') 

### Hyper parameter Setting

In [None]:
# Notebook-wide settings; the cells below look these up by key.
CFG = {
    'SR': 16000,     # sampling rate handed to librosa.load
    'N_MFCC': 64,    # number of MFCC vectors to extract
    'SEED': 41,      # value passed to seed_everything
}

### Fixed Random Seed

In [None]:
def seed_everything(seed):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``.

    Also stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Value forwarded to ``random.seed`` and ``np.random.seed``.

    Returns:
        None.
    """
    # NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so
    # this presumably only influences processes launched afterwards — confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)

# Fix the seed using the value stored in the notebook configuration.
seed_everything(seed=CFG['SEED'])

### Data Preprocessing_1

In [None]:
# Read the train and test CSV tables from the raw data folder.
train_df, test_df = (
    pd.read_csv('./Raw/open/train_data.csv'),
    pd.read_csv('./Raw/open/test_data.csv'),
)

In [None]:
def get_mfcc_feature(df, data_type, save_path, root_folder='./Raw/open'):
    """Extract mean MFCC features for every audio file and save them as CSV.

    For each value in ``df['id']`` the file
    ``<root_folder>/<data_type>/<id zero-padded to 5 characters>.wav`` is
    loaded with librosa at ``CFG['SR']``, ``CFG['N_MFCC']`` MFCCs are
    computed, and every row of the resulting MFCC matrix is reduced to its
    mean. The means are appended to ``df`` as columns ``mfcc_1`` ..
    ``mfcc_<N_MFCC>`` and the combined table is written to ``save_path``
    without the index.

    Args:
        df: DataFrame with an ``id`` column identifying the wav files. It is
            not modified; its index is ignored when the features are joined.
        data_type: Name of the sub-folder of ``root_folder`` holding the wav
            files (the notebook uses ``'train'`` and ``'test'``).
        save_path: Destination CSV path. If it already exists, a message is
            printed and no work is done.
        root_folder: Folder containing the ``data_type`` sub-folders.
            Defaults to the notebook's raw data location.

    Returns:
        None. The result is only written to ``save_path``.
    """
    # Treat an existing output file as a cache hit and skip the extraction.
    if os.path.exists(save_path):
        print(f'{save_path} is exist.')
        return

    # The audio folder is the same for every id, so build it once.
    audio_dir = os.path.join(root_folder, data_type)

    features = []
    for uid in tqdm(df['id']):
        wav_path = os.path.join(audio_dir, str(uid).zfill(5) + '.wav')

        # Load the wav file, resampled to the configured rate.
        y, sr = librosa.load(wav_path, sr=CFG['SR'])

        # Compute the MFCC matrix for this clip.
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CFG['N_MFCC'])

        # Use the mean of each MFCC row as one feature value.
        features.append([np.mean(coeff) for coeff in mfcc])

    columns = ['mfcc_' + str(i) for i in range(1, CFG['N_MFCC'] + 1)]
    mfcc_df = pd.DataFrame(features, columns=columns)

    # concat(axis=1) aligns rows by index label and mfcc_df is indexed
    # 0..n-1 in iteration order, so give df the same positional index first.
    # Otherwise a filtered or shuffled df would be misaligned and NaN-padded.
    merged = pd.concat([df.reset_index(drop=True), mfcc_df], axis=1)
    merged.to_csv(save_path, index=False)
    print('Done.')

In [None]:
# Build the MFCC feature CSV for the train split, then for the test split.
get_mfcc_feature(
    df=train_df,
    data_type='train',
    save_path='./train_mfcc_data(64).csv',
)
get_mfcc_feature(
    df=test_df,
    data_type='test',
    save_path='./test_mfcc_data(64).csv',
)

  0%|          | 0/3805 [00:00<?, ?it/s]

Done.


  0%|          | 0/5732 [00:00<?, ?it/s]

Done.


In [None]:
# Read back the two CSV files that the get_mfcc_feature calls were asked to write.
train_df_2, test_df_2 = (
    pd.read_csv('./train_mfcc_data(64).csv'),
    pd.read_csv('./test_mfcc_data(64).csv'),
)