## Import

In [21]:
import random
import pandas as pd
import numpy as np
import os


import librosa

from tqdm.auto import tqdm

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder

import datetime as dt

import warnings
warnings.filterwarnings(action='ignore')


In [4]:
# Notebook-wide settings, read by the feature-extraction and training cells.
CFG = dict(
    SR=16000,    # sampling rate passed to librosa.load
    N_MFCC=32,   # number of MFCC vectors to extract
    SEED=41,     # random seed shared by seed_everything and the model
)

## Fix the Random Seed

In [5]:
def seed_everything(seed):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global RNG with
    ``seed`` and stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment
    variable.

    Args:
        seed: Integer seed value.
    """
    # NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so this
    # assignment is only seen by processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

seed_everything(seed=CFG['SEED'])  # fix the seed

## Data Pre-Processing 1

In [6]:
# Load the self-reported metadata tables for both splits and show their shapes.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('./train_data.csv', './test_data.csv')
)
print(train_df.shape, test_df.shape)

(3805, 6) (5732, 5)


In [58]:
# Preview the first 30 training rows.
# NOTE(review): the saved output shows mfcc_* columns, so it was presumably
# rendered after train_df had been reloaded from ./train_mfcc_data.csv;
# re-run the notebook top to bottom to refresh it.
train_df.head(n=30)

Unnamed: 0,id,age,gender,respiratory_condition,fever_or_muscle_pain,covid19,mfcc_1,mfcc_2,mfcc_3,mfcc_4,...,mfcc_23,mfcc_24,mfcc_25,mfcc_26,mfcc_27,mfcc_28,mfcc_29,mfcc_30,mfcc_31,mfcc_32
0,1,24,female,0,1,0,-276.01898,30.51934,-20.31462,-6.689037,...,-2.679408,2.454339,-1.176285,2.314315,-0.339533,2.514413,-4.784703,1.239072,-1.556883,-1.54877
1,2,51,male,0,0,0,-312.99362,54.14133,-1.74855,-9.437219,...,-7.248304,1.238725,-6.89497,-1.810402,-7.259594,0.715028,-1.372265,-1.760624,-2.73518,1.13419
2,3,22,male,0,0,0,-438.60306,46.675842,-22.771935,-3.527923,...,-0.136723,-1.707353,2.649277,1.208829,-0.033701,-1.008729,-0.687255,-0.472232,0.850566,0.35384
3,4,29,female,1,0,0,-369.261,47.762012,-8.256503,-2.89135,...,-0.38923,4.033148,-2.658165,2.867084,1.679876,2.136411,0.289793,1.709179,-0.592465,1.754549
4,5,23,male,0,0,0,-535.68915,7.509357,-7.762263,2.56766,...,-0.27936,-0.292286,-1.559678,0.328864,-1.053423,0.84406,-0.788914,1.18274,-0.527028,1.208361
5,6,50,male,0,0,0,-466.56113,48.221058,0.981815,-5.019971,...,-6.165534,-2.498237,-3.924848,-1.116564,-1.391424,-0.872687,-2.562458,-2.176331,-2.892324,-2.088193
6,7,33,male,0,0,1,-392.494,20.924644,5.937879,-0.093119,...,-2.169045,-0.130023,-2.887418,-0.308717,-1.502242,0.788942,0.384869,2.016832,0.892808,0.571174
7,8,24,female,1,0,0,-494.9477,13.273375,-18.981974,6.067473,...,0.021254,0.408402,-0.353228,1.308711,-1.117597,1.724223,-1.864057,-1.428695,-0.704404,-0.183166
8,9,46,female,0,0,0,-445.49744,14.977826,-10.269142,1.112762,...,-3.916616,0.684809,-2.865527,1.779373,-1.57021,-0.500411,0.248475,-0.361006,1.072051,0.139806
9,10,29,male,0,0,0,-758.77875,9.09663,0.964701,1.325225,...,-0.820875,0.42445,-1.110906,0.288346,-0.726582,0.700959,-0.59736,0.322036,-0.124574,0.309019


#### 30개 파일들을 직접 듣고 대략적으로 어떤 느낌인지 들어보자
#### . . .
#### 전혀 모르겠다. 안 걸린 녀석이 걸린 녀석처럼 기침한다...
#### 이게 가능한 주제일까?

In [8]:
# Preview the first five test rows (the test table has no covid19 label column).
test_df.head(n=5)

Unnamed: 0,id,age,gender,respiratory_condition,fever_or_muscle_pain
0,3806,48,female,1,0
1,3807,24,female,0,0
2,3808,29,male,0,0
3,3809,39,female,0,0
4,3810,34,male,0,0


In [57]:
# Show only the training rows labelled covid19 == 1.
train_df.loc[train_df['covid19'].eq(1)]

Unnamed: 0,id,age,gender,respiratory_condition,fever_or_muscle_pain,covid19,mfcc_1,mfcc_2,mfcc_3,mfcc_4,...,mfcc_23,mfcc_24,mfcc_25,mfcc_26,mfcc_27,mfcc_28,mfcc_29,mfcc_30,mfcc_31,mfcc_32
6,7,33,male,0,0,1,-392.49400,20.924644,5.937879,-0.093119,...,-2.169045,-0.130023,-2.887418,-0.308717,-1.502242,0.788942,0.384869,2.016832,0.892808,0.571174
14,15,30,male,0,0,1,-412.97330,44.789910,-8.668566,11.191884,...,0.952792,1.688580,0.732676,1.733848,0.207823,1.893757,0.144773,-0.967192,-1.463389,0.279266
15,16,24,female,0,0,1,-233.63118,39.077225,-15.857936,-6.900169,...,-5.625091,1.376487,-4.725483,3.535540,-4.929983,2.132927,-1.723180,0.493292,1.273371,-0.321275
22,23,31,female,1,0,1,-716.39970,29.925772,-2.197279,2.240374,...,-1.179505,-1.754365,-1.844654,-1.171494,-0.696895,0.258444,-1.404548,-0.520643,-1.533232,-0.667150
26,27,12,female,0,1,1,-428.77863,63.758984,-36.968487,-18.417837,...,-2.291507,-4.197083,-0.514003,3.390683,-0.009868,2.103752,4.144619,6.069761,2.793978,2.994054
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3756,3757,41,male,0,0,1,-601.51996,14.495886,-2.129152,0.518551,...,-1.121872,-0.115825,-0.526331,-0.039565,-0.893215,0.544577,-0.842302,0.421216,0.194496,-0.074324
3758,3759,13,other,1,0,1,-298.47144,102.732090,-14.013776,2.078021,...,-3.258604,-0.852211,-1.616615,-0.719892,-0.836753,-0.372074,-1.362027,1.931951,-0.946276,-1.495326
3772,3773,27,male,1,1,1,-477.26105,23.522530,-13.082051,-6.686589,...,0.359540,1.995889,0.919962,0.758989,-0.305339,1.554368,-0.852398,0.371463,-0.496212,0.135323
3779,3780,34,male,0,0,1,-746.53600,71.652770,-22.068201,42.747646,...,-0.532155,-4.653461,-0.201263,-4.644353,2.041364,2.707447,-1.563985,3.313875,3.301421,1.862299


In [9]:
# (rows, columns) of the covid19 == 1 subset, to gauge class imbalance.
train_df.loc[train_df['covid19'].eq(1)].shape

(306, 6)

In [12]:
def get_mfcc_feature(df, data_type, save_path):
    """Extract mean-MFCC features for every wav file and save them as a CSV.

    For each value in ``df['id']`` the file
    ``./wav_dataset/<data_type>/<id zero-padded to 5 characters>.wav`` is
    loaded with ``sr=CFG['SR']`` and ``CFG['N_MFCC']`` MFCCs are computed.
    The mean of each row of the returned MFCC array becomes one feature, so
    every file contributes one row of ``mfcc_1 .. mfcc_<N_MFCC>`` columns.

    Args:
        df: DataFrame with an ``id`` column. Its columns are written out
            unchanged, followed by the MFCC columns.
        data_type: Name of the sub-folder of ``./wav_dataset`` holding the
            wav files (this notebook passes ``'train'`` and ``'test'``).
        save_path: Destination CSV path. If the file already exists, nothing
            is recomputed and the function returns immediately.

    Returns:
        None. The combined table is written to ``save_path``.
    """
    # A previous run already produced the file: skip the expensive audio pass.
    if os.path.exists(save_path):
        print(f'{save_path} is exist.')
        return

    # Data folder path; it does not depend on the file id, so build it once.
    root_path = os.path.join('./wav_dataset', data_type)

    features = []
    for uid in tqdm(df['id']):
        path = os.path.join(root_path, str(uid).zfill(5)+'.wav')

        # Load the wav file with librosa.
        y, sr = librosa.load(path, sr=CFG['SR'])

        # Extract the MFCCs with librosa.
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=CFG['N_MFCC'])

        # Use the mean of each extracted MFCC row as the feature.
        features.append([np.mean(coefficient) for coefficient in mfcc])

    # Append the extracted audio features to the self-reported metadata frame.
    mfcc_df = pd.DataFrame(features, columns=['mfcc_'+str(x) for x in range(1,CFG['N_MFCC']+1)])
    # concat(axis=1) aligns rows by index label, while mfcc_df is numbered
    # 0..n-1 in iteration order. Resetting df's index keeps row i of the
    # metadata next to row i of the features even when df was filtered or
    # shuffled (otherwise the result is padded with misaligned NaN rows).
    df = pd.concat([df.reset_index(drop=True), mfcc_df], axis=1)
    df.to_csv(save_path, index=False)
    print('Done.')

In [13]:
# Build (or reuse, if already on disk) the MFCC feature tables for both splits.
get_mfcc_feature(df=train_df, data_type='train', save_path='./train_mfcc_data.csv')
get_mfcc_feature(df=test_df, data_type='test', save_path='./test_mfcc_data.csv')

  0%|          | 0/3805 [00:00<?, ?it/s]

Done.


  0%|          | 0/5732 [00:00<?, ?it/s]

Done.


In [None]:
# Re-check the first five test rows after feature extraction
# (get_mfcc_feature writes a CSV and does not modify test_df itself).
test_df.head(n=5)

## Data Pre-Processing 2

In [14]:
# Load the training table that combines the wav MFCC features with the
# self-reported status columns. This rebinds train_df to the wider table.
train_df = pd.read_csv('./train_mfcc_data.csv')

# Split into the label (y) and the model input (x); 'id' is not a feature.
train_y = train_df['covid19']
train_x = train_df.drop(columns=['covid19', 'id'])

In [15]:
def onehot_encoding(ohe, x):
    """Replace the ``gender`` column of ``x`` with one-hot indicator columns.

    Args:
        ohe: An already fitted one-hot encoder (fit on the training data)
            exposing ``transform`` and ``categories_``.
        x: DataFrame containing a ``gender`` column.

    Returns:
        A new DataFrame with ``gender`` dropped and one column per entry of
        ``ohe.categories_[0]`` appended, on the same index as ``x``.
    """
    encoded = ohe.transform(x['gender'].values.reshape(-1,1))
    # An encoder configured for sparse output returns a sparse matrix, which
    # cannot be combined with the explicit column labels below; densify it.
    if hasattr(encoded, 'toarray'):
        encoded = encoded.toarray()
    # Build the encoded frame on x's own index: concat(axis=1) aligns rows by
    # label, so a default 0..n-1 index would misalign (and NaN-pad) the result
    # whenever x has been filtered, shuffled or otherwise re-indexed.
    encoded_df = pd.DataFrame(encoded, columns=ohe.categories_[0], index=x.index)
    return pd.concat([x.drop(columns=['gender']), encoded_df], axis=1)

In [16]:
# The 'gender' column needs extra preprocessing -> apply OneHotEncoder.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# name, so try the new keyword first and fall back for older installations.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)
ohe.fit(train_x['gender'].values.reshape(-1,1))
train_x = onehot_encoding(ohe, train_x)

## Train

In [17]:
# Train scikit-learn's multi-layer perceptron classifier with a fixed seed.
# fit() returns the estimator itself, so construction and training can be chained.
model = MLPClassifier(random_state=CFG['SEED']).fit(train_x, train_y)

## Inference

In [18]:
# Apply the same preprocessing used for the training data to the test data.
# To avoid data leakage, reuse the encoder that was fit on the training data only.
test_x = onehot_encoding(
    ohe,
    pd.read_csv('./test_mfcc_data.csv').drop(columns=['id']),
)

# Model inference
preds = model.predict(test_x)

## Submission

In [56]:
# Fill the sample submission's covid19 column with the predictions and save it.
submission = pd.read_csv('./sample_submission.csv').assign(covid19=preds)
submission.to_csv('submit2.csv', index=False)

In [None]:
# Timestamp string such as '20240131_235959'.
# Uses the `dt` alias from the import cell: `datetime` is not bound on a fresh
# kernel at this point, and once imported it is the module, which has no now().
date_now = dt.datetime.now().strftime('%Y%m%d_%H%M%S')

In [32]:
import datetime

In [39]:
# Scratch check: a one-week timedelta is displayed in days.
dt.timedelta(weeks=1)

datetime.timedelta(days=7)

In [34]:
# Scratch check: display the timedelta class itself.
dt.timedelta

datetime.timedelta