# Setting

## Library

In [1]:
import pandas as pd
import numpy as np

from scipy import stats

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA, KernelPCA, SparsePCA, TruncatedSVD, IncrementalPCA
from sklearn.decomposition import TruncatedSVD

import os
from tqdm.auto import tqdm
import random
import time
import datetime 

In [2]:
import librosa
import librosa.display
import IPython.display as ipd

In [3]:
import warnings
# Globally ignore every warning from here on; library warnings raised by later cells will not be shown.
warnings.filterwarnings(action='ignore') 

## Fixed Random Seed

In [4]:
def seed_everything(seed):
    """Seed the global random generators used by this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both Python's ``random`` module and NumPy's legacy global RNG.

    Args:
        seed: Value handed to ``random.seed`` and ``np.random.seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)


# Fix the seed once, before any stochastic step runs.
seed_everything(42)

# Load Data Set

## Google Drive Mount

In [5]:
# Colab-only step: mount Google Drive at /content/drive so files stored there can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Unzip File

In [6]:
!unzip -qq '/content/drive/MyDrive/머신러닝 엔지니어링/데이콘/기계 고장 진단/data/기계_고장.zip'

## Load Train / Test Set

In [7]:
# Read the sample metadata tables (sample id, wav path, fan type) from the working directory.
df_train = pd.read_csv('./train.csv') # every training sample is a normal (non-faulty) one
df_test = pd.read_csv('./test.csv')

In [8]:
print(df_train.shape)
df_train.head()

(1279, 4)


Unnamed: 0,SAMPLE_ID,SAMPLE_PATH,FAN_TYPE,LABEL
0,TRAIN_0000,./train/TRAIN_0000.wav,2,0
1,TRAIN_0001,./train/TRAIN_0001.wav,0,0
2,TRAIN_0002,./train/TRAIN_0002.wav,0,0
3,TRAIN_0003,./train/TRAIN_0003.wav,2,0
4,TRAIN_0004,./train/TRAIN_0004.wav,2,0


In [9]:
print(df_test.shape)
df_test.head()

(1514, 3)


Unnamed: 0,SAMPLE_ID,SAMPLE_PATH,FAN_TYPE
0,TEST_0000,./test/TEST_0000.wav,2
1,TEST_0001,./test/TEST_0001.wav,2
2,TEST_0002,./test/TEST_0002.wav,0
3,TEST_0003,./test/TEST_0003.wav,0
4,TEST_0004,./test/TEST_0004.wav,0


# Feature Engineering

# Scaling

In [17]:
def scaled_df(df_train, df_train_fan_type, df_test, df_test_fan_type, scaler, fan_type=False):
  """Scale train/test features independently for every fan type.

  For each FAN_TYPE value found in the training labels, ``scaler`` is
  re-fitted on that fan type's training rows and then applied to the test
  rows of the same fan type, so no statistics are shared across fan types.

  Args:
    df_train: Training feature frame, one row per sample.
    df_train_fan_type: Frame holding a 'FAN_TYPE' column whose rows match
      ``df_train`` by position.
    df_test: Test feature frame, one row per sample.
    df_test_fan_type: Frame holding a 'FAN_TYPE' column whose rows match
      ``df_test`` by position.
    scaler: Object exposing ``fit_transform`` and ``transform``. It is
      re-fitted once per fan type, so afterwards it holds the fit of the
      last fan type processed.
    fan_type: When true, the FAN_TYPE column is prepended to both results.

  Returns:
    Tuple ``(train, test)`` of scaled frames in the original row order with
    a positional (0..n-1) index. Feature columns are labelled 0..k-1; the
    input column names are not kept.

  Raises:
    ValueError: If a feature frame and its FAN_TYPE frame differ in length,
      or if the test set contains a fan type absent from the training set
      (there would be nothing to fit the scaler on).
  """
  # Work on positional indices everywhere so features and labels always line up,
  # even when the caller's frames carry a non-default index.
  train_features = df_train.reset_index(drop=True)
  test_features = df_test.reset_index(drop=True)
  train_labels = df_train_fan_type[['FAN_TYPE']].reset_index(drop=True)
  test_labels = df_test_fan_type[['FAN_TYPE']].reset_index(drop=True)

  if len(train_features) != len(train_labels) or len(test_features) != len(test_labels):
    raise ValueError('Feature frames and FAN_TYPE frames must have the same number of rows.')

  train_kinds = train_labels['FAN_TYPE']
  test_kinds = test_labels['FAN_TYPE']

  # Every fan type present in training is handled (previously only 0 and 2 were,
  # and rows of any other type were silently dropped).
  known_kinds = sorted(train_kinds.dropna().unique())
  unseen_kinds = sorted(set(test_kinds.dropna().unique()) - set(known_kinds))
  if unseen_kinds:
    raise ValueError(
        f'Test set contains FAN_TYPE values missing from the training set: {unseen_kinds}')

  scaled_train_parts = []
  scaled_test_parts = []
  for kind in known_kinds:
    train_rows = train_features.loc[train_kinds == kind]
    test_rows = test_features.loc[test_kinds == kind]

    # Fit on this fan type's training rows only, then reuse that fit for its test rows.
    scaled_train_parts.append(
        pd.DataFrame(scaler.fit_transform(train_rows), index=train_rows.index))
    if len(test_rows) > 0:
      scaled_test_parts.append(
          pd.DataFrame(scaler.transform(test_rows), index=test_rows.index))

  def _restore_order(parts):
    # Stack the per-fan-type pieces and put rows back in their original order.
    return pd.concat(parts, axis=0).sort_index() if parts else pd.DataFrame()

  scaled_train = _restore_order(scaled_train_parts)
  scaled_test = _restore_order(scaled_test_parts)

  if fan_type:
    scaled_train = pd.concat([train_labels, scaled_train], axis=1)
    scaled_test = pd.concat([test_labels, scaled_test], axis=1)

  return scaled_train, scaled_test

## Feature Extraction

### Zero Crossing Rate

In [13]:
def get_zero_crossing_feature(df, delta=False):
    """Build one zero-crossing-rate feature per audio file.

    Every path in ``df['SAMPLE_PATH']`` is loaded with librosa at
    ``sr=16000``; the librosa zero-crossing-rate output (optionally replaced
    by its first-order delta) is averaged row by row.

    Args:
        df: Frame with a 'SAMPLE_PATH' column of audio file paths.
        delta: When equal to True, the delta of the zero-crossing rate is
            averaged instead of the rate itself.

    Returns:
        DataFrame with one row per path and a single column named
        'Zero_Crossing_Rate' (or 'Zero_Crossing_Rate_delta' when ``delta``
        is set). Its shape is printed before returning.
    """
    # Same equality test as the flag has always used: True (or 1) enables the delta.
    use_delta = delta == True
    column_name = 'Zero_Crossing_Rate_delta' if use_delta else 'Zero_Crossing_Rate'

    rows = []
    for wav_path in tqdm(df['SAMPLE_PATH']):
        signal, _ = librosa.load(wav_path, sr=16000)

        rate = librosa.feature.zero_crossing_rate(y=signal)
        if use_delta:
            rate = librosa.feature.delta(rate, order=1)

        # One mean per row of the librosa output becomes the feature value.
        rows.append([np.mean(track) for track in rate])

    zero_df = pd.DataFrame(rows, columns=[column_name])
    print(zero_df.shape)
    return zero_df

In [14]:
# Extract the raw (unscaled) mean zero-crossing rate for every train and test recording.
zero_train = get_zero_crossing_feature(df_train, delta=False)
zero_test = get_zero_crossing_feature(df_test, delta=False)

  0%|          | 0/1279 [00:00<?, ?it/s]

(1279, 1)


  0%|          | 0/1514 [00:00<?, ?it/s]

(1514, 1)


In [15]:
zero_train.head()

Unnamed: 0,Zero_Crossing_Rate
0,0.133064
1,0.047472
2,0.057276
3,0.130589
4,0.142584


In [18]:
# Scale the zero-crossing feature per fan type and keep FAN_TYPE as the first column.
# NOTE: zero_train / zero_test are overwritten with their scaled versions, so re-running
# this cell alone would feed already-scaled frames (now carrying FAN_TYPE) back in.
scaler = RobustScaler()
zero_train, zero_test= scaled_df(zero_train,
                                 df_train,
                                 zero_test,
                                 df_test,
                                 scaler,
                                 fan_type=True)

# scaled_df does not keep feature names, so the column labels are set explicitly here.
zero_train.columns = ['FAN_TYPE', 'Zero_Crossing_Rate']
zero_test.columns = ['FAN_TYPE', 'Zero_Crossing_Rate']

In [19]:
zero_train.head()

Unnamed: 0,FAN_TYPE,Zero_Crossing_Rate
0,2,0.034359
1,0,-0.34379
2,0,0.507758
3,2,-0.332601
4,2,1.446437


In [20]:
# Same extraction with delta=True: the mean first-order delta of the zero-crossing rate per recording.
zero_delta_train = get_zero_crossing_feature(df_train, delta=True)
zero_delta_test = get_zero_crossing_feature(df_test, delta=True)

  0%|          | 0/1279 [00:00<?, ?it/s]

(1279, 1)


  0%|          | 0/1514 [00:00<?, ?it/s]

(1514, 1)


In [21]:
zero_delta_train.head()

Unnamed: 0,Zero_Crossing_Rate_delta
0,9.8e-05
1,7.4e-05
2,0.000116
3,0.000155
4,4.7e-05


In [22]:
# Scale the delta feature per fan type and keep FAN_TYPE as the first column.
# NOTE: the inputs are overwritten by the scaled outputs; re-run the extraction cell before re-running this one.
scaler = RobustScaler()
zero_delta_train, zero_delta_test= scaled_df(zero_delta_train,
                                             df_train,
                                             zero_delta_test,
                                             df_test,
                                             scaler,
                                             fan_type=True)

# scaled_df does not keep feature names, so the column labels are set explicitly here.
zero_delta_train.columns = ['FAN_TYPE','Zero_Crossing_Rate_delta']
zero_delta_test.columns = ['FAN_TYPE','Zero_Crossing_Rate_delta']

### RMS

In [23]:
def get_rms_feature(df, delta=False):
    """Build one RMS-energy feature per audio file.

    Every path in ``df['SAMPLE_PATH']`` is loaded with librosa at
    ``sr=16000``; the librosa RMS output (optionally replaced by its
    first-order delta) is reduced with a plain arithmetic mean, row by row.

    Args:
        df: Frame with a 'SAMPLE_PATH' column of audio file paths.
        delta: When equal to True, the delta of the RMS series is averaged
            instead of the RMS series itself.

    Returns:
        DataFrame with one row per path and a single column named 'RMS'
        (or 'RMS_delta' when ``delta`` is set). Its shape is printed before
        returning.
    """
    # Same equality test as the flag has always used: True (or 1) enables the delta.
    use_delta = delta == True
    column_name = 'RMS_delta' if use_delta else 'RMS'

    rows = []
    for wav_path in tqdm(df['SAMPLE_PATH']):
        signal, _ = librosa.load(wav_path, sr=16000)

        energy = librosa.feature.rms(y=signal)
        if use_delta:
            energy = librosa.feature.delta(energy, order=1)

        # Plain mean (not a trimmed mean) of each row of the librosa output.
        rows.append([np.mean(track) for track in energy])

    rms_df = pd.DataFrame(rows, columns=[column_name])
    print(rms_df.shape)
    return rms_df

In [24]:
# Extract the raw (unscaled) mean RMS value for every train and test recording.
rms_train = get_rms_feature(df_train, delta=False)
rms_test = get_rms_feature(df_test, delta=False)

  0%|          | 0/1279 [00:00<?, ?it/s]

(1279, 1)


  0%|          | 0/1514 [00:00<?, ?it/s]

(1514, 1)


In [25]:
rms_train.head()

Unnamed: 0,RMS
0,0.005121
1,0.004604
2,0.004401
3,0.005163
4,0.004931


In [26]:
# Scale the RMS feature per fan type and keep FAN_TYPE as the first column.
# NOTE: the inputs are overwritten by the scaled outputs; re-run the extraction cell before re-running this one.
scaler = RobustScaler()
rms_train, rms_test= scaled_df(rms_train,
                               df_train,
                               rms_test,
                               df_test,
                               scaler,
                               fan_type=True)

# scaled_df does not keep feature names, so the column labels are set explicitly here.
rms_train.columns = ['FAN_TYPE', 'RMS']
rms_test.columns = ['FAN_TYPE', 'RMS']

In [27]:
rms_train.head()

Unnamed: 0,FAN_TYPE,RMS
0,2,0.469431
1,0,-0.054741
2,0,-0.924593
3,2,0.749485
4,2,-0.802393


In [39]:
# Same extraction with delta=True: the mean first-order delta of the RMS series per recording.
rms_delta_train = get_rms_feature(df_train, delta=True)
rms_delta_test = get_rms_feature(df_test, delta=True)

  0%|          | 0/1279 [00:00<?, ?it/s]

(1279, 1)


  0%|          | 0/1514 [00:00<?, ?it/s]

(1514, 1)


In [40]:
rms_delta_train.head()

Unnamed: 0,RMS_delta
0,-2e-06
1,-1e-06
2,-2e-06
3,-1e-06
4,-3e-06


In [41]:
# Scale the RMS-delta feature per fan type and keep FAN_TYPE as the first column.
# NOTE: the inputs are overwritten by the scaled outputs; re-run the extraction cell before re-running this one.
scaler = RobustScaler()
rms_delta_train, rms_delta_test= scaled_df(rms_delta_train,
                                           df_train,
                                           rms_delta_test,
                                           df_test,
                                           scaler,
                                           fan_type=True)

# scaled_df does not keep feature names, so the column labels are set explicitly here.
rms_delta_train.columns = ['FAN_TYPE', 'RMS_delta']
rms_delta_test.columns = ['FAN_TYPE', 'RMS_delta']

### MFCC

In [42]:
def get_mfcc_feature(df, n_mfcc=128):
    """Build time-averaged MFCC features, one row per audio file.

    Every path in ``df['SAMPLE_PATH']`` is loaded with librosa at
    ``sr=16000``; MFCCs are computed with ``dct_type=2`` and each coefficient
    row is reduced to its mean.

    Args:
        df: Frame with a 'SAMPLE_PATH' column of audio file paths.
        n_mfcc: Number of MFCCs requested from librosa. Defaults to 128, the
            value this function previously hard-coded.

    Returns:
        DataFrame with one row per path and columns 'MFCC_0', 'MFCC_1', ...
        An empty ``df`` yields an empty frame with ``n_mfcc`` columns instead
        of raising IndexError. The frame's shape is printed before returning.
    """
    features = []
    for path in tqdm(df['SAMPLE_PATH']):
        # Load the wav file with librosa.
        y, sr = librosa.load(path, sr=16000)

        # Compute the MFCC matrix with librosa.
        mfcc = librosa.feature.mfcc(y=y,
                                    sr=sr,
                                    n_mfcc=n_mfcc,
                                    dct_type=2)

        # The mean of every coefficient row is used as the feature value.
        features.append([np.mean(coefficient) for coefficient in mfcc])

    # Take the width from the data when there is any; fall back to n_mfcc so an
    # empty input still produces correctly labelled columns.
    width = len(features[0]) if features else n_mfcc
    columns = ['MFCC_' + str(i) for i in range(width)]

    mfcc_df = pd.DataFrame(features,
                           columns=columns)

    print(mfcc_df.shape)

    return mfcc_df

In [47]:
# Extract the raw (unscaled) time-averaged MFCC vector for every train and test recording.
mfcc_train = get_mfcc_feature(df_train)
mfcc_test = get_mfcc_feature(df_test)

  0%|          | 0/1279 [00:00<?, ?it/s]

(1279, 128)


  0%|          | 0/1514 [00:00<?, ?it/s]

(1514, 128)


In [48]:
mfcc_train.head()

Unnamed: 0,MFCC_0,MFCC_1,MFCC_2,MFCC_3,MFCC_4,MFCC_5,MFCC_6,MFCC_7,MFCC_8,MFCC_9,...,MFCC_118,MFCC_119,MFCC_120,MFCC_121,MFCC_122,MFCC_123,MFCC_124,MFCC_125,MFCC_126,MFCC_127
0,-332.689484,96.704391,-14.929521,21.968111,-8.563829,-2.02196,-11.857611,3.893353,-5.748076,3.539912,...,0.53368,0.660617,0.524346,-0.307885,-0.814918,-0.123952,0.535305,0.113357,-0.800878,-0.867296
1,-438.377899,142.276978,-2.118732,30.589058,0.734739,15.532813,-2.802753,4.227826,-1.891904,3.577837,...,0.179785,-0.031554,0.05012,0.377868,0.766223,0.740194,0.287944,0.007076,0.350023,0.168382
2,-419.17099,123.297798,10.11094,21.655056,-1.095648,11.256332,-3.402523,1.567492,3.890199,3.804655,...,0.472421,0.330321,0.200077,0.07306,0.516295,0.852534,0.380594,-0.057465,-0.105068,-0.298017
3,-333.733124,97.450333,-13.966936,22.235878,-9.349174,-2.870443,-11.308705,6.399221,-2.479952,3.890206,...,0.084635,0.459112,-0.024202,0.227796,-0.581687,-0.259305,-0.126211,0.116488,-0.928069,-0.161903
4,-333.012543,90.00338,-21.694469,14.749146,-18.316071,-9.914346,-16.342524,2.575432,-6.690783,-0.875636,...,0.058081,0.142688,-0.039779,0.551953,-0.547507,-0.372035,-0.214538,0.094469,-0.619701,-0.231777


In [49]:
# Scale the MFCC features per fan type and keep FAN_TYPE as the first column.
# NOTE: the inputs are overwritten by the scaled outputs; re-run the extraction cell before re-running this one.
scaler = RobustScaler()
mfcc_train, mfcc_test= scaled_df(mfcc_train,
                                 df_train,
                                 mfcc_test,
                                 df_test,
                                 scaler,
                                 fan_type=True)

# Restore readable labels: FAN_TYPE first, then MFCC_0..MFCC_{k-1} for the remaining k columns.
mfcc_train.columns = ['FAN_TYPE'] + ['MFCC_'+str(i) for i in range(len(mfcc_train.columns)-1)]
mfcc_test.columns = ['FAN_TYPE'] + ['MFCC_'+str(i) for i in range(len(mfcc_test.columns)-1)]

In [50]:
mfcc_train.head()

Unnamed: 0,FAN_TYPE,MFCC_0,MFCC_1,MFCC_2,MFCC_3,MFCC_4,MFCC_5,MFCC_6,MFCC_7,MFCC_8,...,MFCC_118,MFCC_119,MFCC_120,MFCC_121,MFCC_122,MFCC_123,MFCC_124,MFCC_125,MFCC_126,MFCC_127
0,2,0.690073,0.114611,0.123688,0.07154,0.52138,0.803779,0.856026,-0.493195,-0.880081,...,1.228098,1.704697,1.202587,-0.981697,-1.002171,-0.184173,1.812308,-0.012843,-0.669357,-1.577192
1,0,-1.703092,0.668559,-0.2341,0.031342,0.113181,0.24357,-0.116445,-0.638444,0.364353,...,-0.253236,-0.94408,-0.756294,-0.038057,0.697295,0.526572,0.05762,-0.350332,0.215434,0.008132
2,0,0.376408,-0.810147,1.061598,-0.635059,-0.387792,-0.655307,-0.27163,-2.172539,3.149782,...,0.523302,-0.275989,-0.360429,-0.682915,0.16913,0.826519,0.255326,-0.505536,-0.829216,-1.12563
3,2,0.322113,0.314324,0.464049,0.119507,0.319502,0.5472,1.188367,0.886476,0.742252,...,-0.144778,1.086816,-0.164982,0.154664,-0.448467,-0.608437,-0.181108,-0.003359,-1.061261,0.141075
4,2,0.576171,-1.679476,-2.268337,-1.22165,-1.985492,-1.582854,-1.85941,-1.21881,-1.348051,...,-0.225962,0.116558,-0.203816,0.842311,-0.367322,-0.961788,-0.447271,-0.070059,-0.111112,-0.02913


## Dimension Reduction

In [53]:
def dimension_reduction(train, test, method, fan_type=False):
  """Apply a dimensionality-reduction method separately for every fan type.

  For each FAN_TYPE value found in ``train``, ``method`` is re-fitted on that
  fan type's training rows (FAN_TYPE column removed) and then applied to the
  test rows of the same fan type.

  Args:
    train: Training frame containing a 'FAN_TYPE' column plus feature columns.
    test: Test frame with the same layout as ``train``.
    method: Object exposing ``fit_transform`` and ``transform``. It is
      re-fitted once per fan type, so afterwards it holds the fit of the last
      fan type processed.
    fan_type: When true, the FAN_TYPE column is prepended to both results.

  Returns:
    Tuple ``(train, test)`` of reduced frames that keep the input row index
    and order; component columns are labelled 0..k-1. The input frames are
    not modified.

  Raises:
    ValueError: If the test set contains a fan type absent from the training
      set (there would be nothing to fit ``method`` on).
  """
  train_kinds = train['FAN_TYPE']
  test_kinds = test['FAN_TYPE']

  # Every fan type present in training is handled (previously only 0 and 2 were,
  # and rows of any other type were silently dropped).
  known_kinds = sorted(train_kinds.dropna().unique())
  unseen_kinds = sorted(set(test_kinds.dropna().unique()) - set(known_kinds))
  if unseen_kinds:
    raise ValueError(
        f'Test set contains FAN_TYPE values missing from the training set: {unseen_kinds}')

  # drop() returns new frames, so the caller's inputs are never touched.
  train_features = train.drop(columns='FAN_TYPE')
  test_features = test.drop(columns='FAN_TYPE')

  reduced_train_parts = []
  reduced_test_parts = []
  for kind in known_kinds:
    train_rows = train_features.loc[train_kinds == kind]
    test_rows = test_features.loc[test_kinds == kind]

    # Fit on this fan type's training rows only, then reuse that fit for its test rows.
    reduced_train_parts.append(
        pd.DataFrame(method.fit_transform(train_rows), index=train_rows.index))
    if len(test_rows) > 0:
      reduced_test_parts.append(
          pd.DataFrame(method.transform(test_rows), index=test_rows.index))

  def _restore_order(parts):
    # Stack the per-fan-type pieces and put rows back in index order.
    return pd.concat(parts, axis=0).sort_index() if parts else pd.DataFrame()

  reduced_train = _restore_order(reduced_train_parts)
  reduced_test = _restore_order(reduced_test_parts)

  if fan_type:
    reduced_train = pd.concat([train[['FAN_TYPE']], reduced_train], axis=1)
    reduced_test = pd.concat([test[['FAN_TYPE']], reduced_test], axis=1)

  return reduced_train, reduced_test

## Zero Crossing Rate

In [60]:
# Per-fan-type SparsePCA on the scaled zero-crossing feature; FAN_TYPE is dropped from the result.
# NOTE(review): the input has a single feature column, so a 1-component reduction cannot shrink it — confirm this step is wanted.
method = SparsePCA(n_components=1, alpha=0.001)
pca_train_zero, pca_test_zero = dimension_reduction(train=zero_train, test=zero_test, method=method)

In [61]:
pca_train_zero.head()

Unnamed: 0,0
0,-0.223134
1,-0.526685
2,0.316432
3,-0.586461
4,1.174963


In [62]:
# Per-fan-type SparsePCA on the scaled zero-crossing delta feature; FAN_TYPE is dropped from the result.
method = SparsePCA(n_components=1, alpha=0.001)
pca_train_zero_delta, pca_test_zero_delta = dimension_reduction(
    train=zero_delta_train,
    test=zero_delta_test,
    method=method,
)

In [63]:
pca_train_zero_delta.head()

Unnamed: 0,0
0,-0.102517
1,-0.22926
2,0.356011
3,0.724337
4,-0.840928


## RMS

In [64]:
# Per-fan-type SparsePCA on the scaled RMS feature; FAN_TYPE is dropped from the result.
method = SparsePCA(n_components=1, alpha=0.001)
pca_train_rms, pca_test_rms = dimension_reduction(train=rms_train, test=rms_test, method=method)

In [None]:
pca_train_rms.head()

In [65]:
# Per-fan-type SparsePCA on the scaled RMS delta feature; FAN_TYPE is dropped from the result.
method = SparsePCA(n_components=1, alpha=0.001)
pca_train_rms_delta, pca_test_rms_delta = dimension_reduction(
    train=rms_delta_train,
    test=rms_delta_test,
    method=method,
)

In [66]:
pca_train_rms_delta.head()

Unnamed: 0,0
0,-0.745011
1,-0.145898
2,-0.283579
3,-0.393632
4,-1.003171


## MFCC

In [73]:
# Fit a full PCA on the scaled MFCC features of fan type 0 and report how many leading
# components are needed for the cumulative explained-variance ratio to reach 0.999.
pca = PCA()
pca.fit(mfcc_train.loc[mfcc_train['FAN_TYPE']==0].drop(columns='FAN_TYPE'))
cumsum = np.cumsum(pca.explained_variance_ratio_)
# argmax returns the first index where the threshold is met; +1 turns that index into a count.
N_COMPONETS = np.argmax(cumsum>=0.999) + 1
print(N_COMPONETS)

121


In [74]:
# Same probe for fan type 2. This overwrites pca, cumsum and N_COMPONETS from the fan-type-0 probe.
pca = PCA()
pca.fit(mfcc_train.loc[mfcc_train['FAN_TYPE']==2].drop(columns='FAN_TYPE'))
cumsum = np.cumsum(pca.explained_variance_ratio_)
# argmax returns the first index where the threshold is met; +1 turns that index into a count.
N_COMPONETS = np.argmax(cumsum>=0.999) + 1
print(N_COMPONETS)

119


In [None]:
# Time the per-fan-type SparsePCA reduction of the scaled MFCC features.
start = time.time()

# NOTE(review): 120 components is presumably chosen from the 121 / 119 reported by the
# two explained-variance probes — confirm.
method = SparsePCA(n_components=120, alpha=0.001)

pca_train_mfcc, pca_test_mfcc = dimension_reduction(mfcc_train,
                                                    mfcc_test,
                                                    method)

end = time.time()

# Elapsed time is end - start; the reversed subtraction yielded a negative duration.
times = end - start
times = str(datetime.timedelta(seconds=times))
print(times)

## Concat Data Set

In [None]:
# Join all reduced feature blocks side by side (aligned on the row index) into one
# train matrix and one test matrix. The blocks carry integer column labels, so labels repeat
# until they are renumbered.
preprocessed_train = pd.concat([
                                pca_train_zero,
                                pca_train_zero_delta,
                                pca_train_rms,
                                pca_train_rms_delta,
                                pca_train_mfcc
                               ], axis=1)

preprocessed_test = pd.concat([
                               pca_test_zero,
                               pca_test_zero_delta,
                               pca_test_rms,
                               pca_test_rms_delta,
                               pca_test_mfcc
                              ], axis=1)

In [None]:
# Give every column a unique positional label 0..n-1.
preprocessed_train.columns = list(range(preprocessed_train.shape[1]))
preprocessed_test.columns = list(range(preprocessed_test.shape[1]))

In [None]:
print(preprocessed_train.shape)
preprocessed_train.head()

# Modeling

## 모델 정의

In [None]:
# Local Outlier Factor configured with novelty=True; it is fitted on the training features
# and then asked to predict on the test features in the following cells.
# NOTE(review): n_neighbors=1 bases each score on a single nearest neighbour — confirm this is intended.
model = LocalOutlierFactor(n_neighbors=1, 
                           p=2, # Minkowski distance: 1 is the Manhattan distance, 2 is the Euclidean distance
                           algorithm='auto',
                           contamination='auto',
                           novelty=True)

## 모델 추론

In [None]:
model.fit(preprocessed_train)

In [None]:
def get_pred_label(model_pred):
    """Convert outlier-detector predictions into submission labels.

    The detector convention handled here is 1 = normal and -1 = anomalous
    (the model defined in this notebook is a LocalOutlierFactor); the
    returned labels use 0 = normal and 1 = anomalous. Any other value is
    passed through unchanged.

    Args:
        model_pred: Array-like of predictions. Lists and tuples are accepted;
            they are converted to an array first so the element-wise
            comparison works (a plain list previously left the 1s unmapped).

    Returns:
        NumPy array of the same shape with the relabelled predictions.
    """
    preds = np.asarray(model_pred)
    # Both comparisons look at the original values, so the two mappings cannot interfere.
    return np.where(preds == 1, 0, np.where(preds == -1, 1, preds))

In [None]:
# Predict on the test features and convert the detector output straight to submission labels.
test_pred = get_pred_label(model.predict(preprocessed_test))

# Submission

In [None]:
submit = pd.read_csv('./sample_submission.csv')

In [None]:
# Write the predicted labels into the submission template, assigned by position.
# presumably sample_submission.csv lists the samples in the same order as test.csv — verify.
submit['LABEL'] = test_pred
submit.head()

In [None]:
submit['LABEL'].value_counts()

In [None]:
submit.to_csv('./submit149.csv', index=False)