# Reference

* 차원 축소 파트의 부가 설명은 [핸즈온 비지도 학습](https://github.com/francis-kang/handson-unsupervised-learning) (Book & Git Hub) 을 참조했습니다

* LOF 파트의 부가 설명은 [고려대학교 산업경영공학부 03-4 : Anomaly Detecton](https://www.youtube.com/watch?v=ODNAyt1h6Eg) (You Tube) 를 참조했습니다

# Setting

## Check CPU

* Colab의 CPU를 활용하여 머신러닝 엔지니어링 수행

* 추후 Sparse PCA에의 경우 실행이 완료되기까지 오래 걸림

In [473]:
!head /proc/cpuinfo

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 79
model name	: Intel(R) Xeon(R) CPU @ 2.20GHz
stepping	: 0
microcode	: 0xffffffff
cpu MHz		: 2200.230
cache size	: 56320 KB
physical id	: 0


## Library

In [1]:
import pandas as pd
import numpy as np

from scipy import stats

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA, KernelPCA, SparsePCA, TruncatedSVD, IncrementalPCA
from sklearn.decomposition import TruncatedSVD

import os
from tqdm.auto import tqdm
import random
import time
import datetime 

In [2]:
import librosa
import librosa.display
import IPython.display as ipd

In [3]:
import warnings
warnings.filterwarnings(action='ignore') 

## Fixed Random Seed

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

# Load Data Set

## Google Drive Mount

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Unzip File


In [6]:
!unzip -qq '/content/drive/MyDrive/머신러닝 엔지니어링/데이콘/기계 고장 진단/data/기계_고장.zip'

## Load Train / Test Split

In [7]:
df_train = pd.read_csv('./train.csv') # 모두 정상 Sample
df_test = pd.read_csv('./test.csv')

In [8]:
print(df_train.shape)
df_train.head()

(1279, 4)


Unnamed: 0,SAMPLE_ID,SAMPLE_PATH,FAN_TYPE,LABEL
0,TRAIN_0000,./train/TRAIN_0000.wav,2,0
1,TRAIN_0001,./train/TRAIN_0001.wav,0,0
2,TRAIN_0002,./train/TRAIN_0002.wav,0,0
3,TRAIN_0003,./train/TRAIN_0003.wav,2,0
4,TRAIN_0004,./train/TRAIN_0004.wav,2,0


In [9]:
print(df_test.shape)
df_test.head()

(1514, 3)


Unnamed: 0,SAMPLE_ID,SAMPLE_PATH,FAN_TYPE
0,TEST_0000,./test/TEST_0000.wav,2
1,TEST_0001,./test/TEST_0001.wav,2
2,TEST_0002,./test/TEST_0002.wav,0
3,TEST_0003,./test/TEST_0003.wav,0
4,TEST_0004,./test/TEST_0004.wav,0


# Feature Engineering

## Scaling

**Scaling 하는 이유**

* 추후 Sparse PCA 및 Kernel PCA 수행

* PCA는 원본 피처들을 상대적 범위에 매우 민감하기 때문이다

**Processing**

* FAN TYPE 별 (0 & 2)로 각각 Scaling

* 별도로 Scaling하는 이유는 [DACON 코드 공유](https://dacon.io/competitions/official/236036/codeshare/7134?page=1&dtype=recent)에서 추출된 피처를 시각화한 결과 FAN TYPE별로 데이터 분포가 크게 다르다는 것을 확인하였기 때문입니다

* 추후 Robust Scaler를 사용

* 디폴트 값인 quantile_range=(25.0, 75.0) 
대신 quantile_range=(15.0, 85.0)로 설정 -> 이상치 기준을 좀 더 약하게 설정

In [10]:
def scaled_df(df_train, df_train_fan_type, df_test, df_test_fan_type, scaler, fan_type=False):

  df_train_fan_type = df_train_fan_type[['FAN_TYPE']]

  df_train = pd.concat([
                        df_train.reset_index(drop=True),
                        df_train_fan_type.reset_index(drop=True)
                       ],
                       axis=1)
  
  df_test_fan_type = df_test_fan_type[['FAN_TYPE']]

  df_test = pd.concat([
                       df_test.reset_index(drop=True),
                       df_test_fan_type.reset_index(drop=True)
                      ],
                      axis=1)
  
  train_type_0 = df_train.loc[(df_train['FAN_TYPE']==0)]
  train_type_2 = df_train.loc[(df_train['FAN_TYPE']==2)]

  test_type_0 = df_test.loc[(df_test['FAN_TYPE']==0)]
  test_type_2 = df_test.loc[(df_test['FAN_TYPE']==2)]

  train_type_0.drop(columns='FAN_TYPE', inplace=True)
  train_type_2.drop(columns='FAN_TYPE', inplace=True)
  test_type_0.drop(columns='FAN_TYPE', inplace=True)
  test_type_2.drop(columns='FAN_TYPE', inplace=True)

  list_train_0_index = list(train_type_0.index)
  list_train_2_index = list(train_type_2.index)

  list_test_0_index = list(test_type_0.index)
  list_test_2_index = list(test_type_2.index)

  scaled_train_type_0 = scaler.fit_transform(train_type_0)
  scaled_test_type_0 = scaler.transform(test_type_0)

  scaled_train_type_2 = scaler.fit_transform(train_type_2)
  scaled_test_type_2 = scaler.transform(test_type_2)

  train_type_0 = pd.DataFrame(scaled_train_type_0)
  train_type_2 = pd.DataFrame(scaled_train_type_2)

  test_type_0 = pd.DataFrame(scaled_test_type_0)
  test_type_2 = pd.DataFrame(scaled_test_type_2)

  train_type_0.index = list_train_0_index
  train_type_2.index = list_train_2_index

  test_type_0.index = list_test_0_index
  test_type_2.index = list_test_2_index

  df_train = pd.concat([train_type_0, train_type_2], axis=0)
  df_test = pd.concat([test_type_0, test_type_2], axis=0)

  df_train.sort_index(inplace=True)
  df_test.sort_index(inplace=True)

  if fan_type == False:

    pass
  
  if fan_type == True:

    df_train = pd.concat([df_train_fan_type, df_train], axis=1)
    df_test = pd.concat([df_test_fan_type, df_test], axis=1)

  return df_train, df_test

## Feature Extraction

다음의 다양한 Feature Extraction 기법들의 설명은 [DACON 코드 공유](https://dacon.io/competitions/official/236036/codeshare/7415?page=1&dtype=recent) 를 참조해주세요

### Zero Crossing Rate

In [11]:
def get_zero_crossing_feature(df, delta=False):
    features = []
    for path in tqdm(df['SAMPLE_PATH']):
        # librosa패키지를 사용하여 wav 파일 load
        y, sr = librosa.load(path, sr=16000)
        
        # librosa패키지를 사용하여 zero crossing rate 추출
        zero = librosa.feature.zero_crossing_rate(y=y)
                              
        if delta == True:

          zero = librosa.feature.delta(zero, order=1)

        y_feature = []
        # 추출된 zero crossing rate들의 산술평균을 Feature로 사용
        for e in zero:

            e = np.mean(e)

            y_feature.append(e)

        features.append(y_feature)
    
    zero_df = pd.DataFrame(features,
                           columns=['Zero_Crossing_Rate'])
    
    if delta == True:
      
      zero_df = pd.DataFrame(features,
                             columns=['Zero_Crossing_Rate_delta'])

    print(zero_df.shape)

    return zero_df

In [12]:
zero_train = get_zero_crossing_feature(df_train, delta=False)
zero_test = get_zero_crossing_feature(df_test, delta=False)

  0%|          | 0/1279 [00:00<?, ?it/s]

(1279, 1)


  0%|          | 0/1514 [00:00<?, ?it/s]

(1514, 1)


In [13]:
zero_train.head()

Unnamed: 0,Zero_Crossing_Rate
0,0.133064
1,0.047472
2,0.057276
3,0.130589
4,0.142584


In [14]:
scaler = RobustScaler(quantile_range=(15.0, 85.0))
scaled_zero_train, scaled_zero_test= scaled_df(zero_train,
                                               df_train,
                                               zero_test,
                                               df_test,
                                               scaler,
                                               fan_type=True)

scaled_zero_train.columns = ['FAN_TYPE', 'Zero_Crossing_Rate']
scaled_zero_test.columns = ['FAN_TYPE', 'Zero_Crossing_Rate']

In [15]:
scaled_zero_train.head()

Unnamed: 0,FAN_TYPE,Zero_Crossing_Rate
0,2,0.023551
1,0,-0.237934
2,0,0.351415
3,2,-0.227974
4,2,0.991428


In [16]:
zero_delta_train = get_zero_crossing_feature(df_train, delta=True)
zero_delta_test = get_zero_crossing_feature(df_test, delta=True)

  0%|          | 0/1279 [00:00<?, ?it/s]

(1279, 1)


  0%|          | 0/1514 [00:00<?, ?it/s]

(1514, 1)


In [17]:
zero_delta_train.head()

Unnamed: 0,Zero_Crossing_Rate_delta
0,9.8e-05
1,7.4e-05
2,0.000116
3,0.000155
4,4.7e-05


In [18]:
scaler = RobustScaler(quantile_range=(15.0, 85.0))
scaled_zero_delta_train, scaled_zero_delta_test= scaled_df(zero_delta_train,
                                                           df_train,
                                                           zero_delta_test,
                                                           df_test,
                                                           scaler,
                                                           fan_type=True)

scaled_zero_delta_train.columns = ['FAN_TYPE', 'Zero_Crossing_Rate_delta']
scaled_zero_delta_test.columns = ['FAN_TYPE', 'Zero_Crossing_Rate_delta']

### RMS

In [19]:
def get_rms_feature(df, delta=False):
    features = []
    for path in tqdm(df['SAMPLE_PATH']):
        # librosa패키지를 사용하여 wav 파일 load
        y, sr = librosa.load(path, sr=16000)
        
        # librosa패키지를 사용하여 RMS 추출
        rms = librosa.feature.rms(y=y)

        if delta == True:

          rms = librosa.feature.delta(rms, order=1)

        y_feature = []
        # 추출된 RMS의 산술평균을 Feature로 사용
        for e in rms:

            e = np.mean(e)

            y_feature.append(e)

        features.append(y_feature)
    
    rms_df = pd.DataFrame(features,
                           columns=['RMS'])
    
    if delta == True:

      rms_df = pd.DataFrame(features,
                           columns=['RMS_delta'])

    print(rms_df.shape)

    return rms_df

In [20]:
rms_train = get_rms_feature(df_train, delta=False)
rms_test = get_rms_feature(df_test, delta=False)

  0%|          | 0/1279 [00:00<?, ?it/s]

(1279, 1)


  0%|          | 0/1514 [00:00<?, ?it/s]

(1514, 1)


In [21]:
rms_train.head()

Unnamed: 0,RMS
0,0.005121
1,0.004604
2,0.004401
3,0.005163
4,0.004931


In [22]:
scaler = RobustScaler(quantile_range=(15.0, 85.0))
scaled_rms_train, scaled_rms_test= scaled_df(rms_train,
                               df_train,
                               rms_test,
                               df_test,
                               scaler,
                               fan_type=True)

scaled_rms_train.columns = ['FAN_TYPE', 'RMS']
scaled_rms_test.columns = ['FAN_TYPE', 'RMS']

In [23]:
scaled_rms_train.head()

Unnamed: 0,FAN_TYPE,RMS
0,2,0.316796
1,0,-0.034583
2,0,-0.584116
3,2,0.50579
4,2,-0.541495


In [24]:
rms_delta_train = get_rms_feature(df_train, delta=True)
rms_delta_test = get_rms_feature(df_test, delta=True)

  0%|          | 0/1279 [00:00<?, ?it/s]

(1279, 1)


  0%|          | 0/1514 [00:00<?, ?it/s]

(1514, 1)


In [25]:
rms_delta_train.head()

Unnamed: 0,RMS_delta
0,-2e-06
1,-1e-06
2,-2e-06
3,-1e-06
4,-3e-06


In [26]:
scaler = RobustScaler(quantile_range=(15.0, 85.0))
scaled_rms_delta_train, scaled_rms_delta_test= scaled_df(rms_delta_train,
                                                         df_train,
                                                         rms_delta_test,
                                                         df_test,
                                                         scaler,
                                                         fan_type=True)

scaled_rms_delta_train.columns = ['FAN_TYPE', 'RMS_delta']
scaled_rms_delta_test.columns = ['FAN_TYPE', 'RMS_delta']

### Poly

In [27]:
def get_poly_feature(df, delta=False):
    features = []
    for path in tqdm(df['SAMPLE_PATH']):
        # librosa패키지를 사용하여 wav 파일 load
        y, sr = librosa.load(path, sr=16000)
        
        # librosa패키지를 사용하여 poly 추출
        poly = librosa.feature.poly_features(y=y,
                                             sr=sr,
                                             order=2)

        if delta == True:

          poly = librosa.feature.delta(poly, order=1)

        y_feature = []
        for e in poly:

            # 추출된 Poly들의 산술평균을 Feature로 사용
            e = np.mean(e)

            y_feature.append(e)

        features.append(y_feature)

    columns = ['Poly'+str(i) for i in range(len(features[0]))]
    
    poly_df = pd.DataFrame(features,
                           columns=columns)

    print(poly_df.shape)

    return poly_df

In [28]:
poly_train = get_poly_feature(df_train, delta=False)
poly_test = get_poly_feature(df_test, delta=False)

  0%|          | 0/1279 [00:00<?, ?it/s]

(1279, 3)


  0%|          | 0/1514 [00:00<?, ?it/s]

(1514, 3)


In [29]:
poly_train.head()

Unnamed: 0,Poly0,Poly1,Poly2
0,9.069388e-09,-0.000104,0.299626
1,9.505948e-09,-9.8e-05,0.226299
2,8.967728e-09,-9.3e-05,0.219882
3,9.204813e-09,-0.000106,0.300868
4,7.493559e-09,-9e-05,0.275208


In [30]:
scaler = RobustScaler(quantile_range=(15.0, 85.0))
scaled_poly_train, scaled_poly_test= scaled_df(poly_train,
                                               df_train,
                                               poly_test,
                                               df_test,
                                               scaler,
                                               fan_type=True)

scaled_poly_train.columns = ['FAN_TYPE'] + ['Poly_'+str(i) for i in range(len(scaled_poly_train.columns)-1)]
scaled_poly_test.columns = ['FAN_TYPE'] + ['Poly_'+str(i) for i in range(len(scaled_poly_test.columns)-1)]

In [31]:
scaled_poly_train.head()

Unnamed: 0,FAN_TYPE,Poly_0,Poly_1,Poly_2
0,2,0.223591,-0.255098,0.327539
1,0,0.121875,-0.078487,-0.096472
2,0,-0.460621,0.46512,-0.474343
3,2,0.338239,-0.353803,0.383175
4,2,-1.110466,0.999232,-0.765924


### MFCC

In [32]:
def get_mfcc_feature(df, delta=False):
    features = []
    for path in tqdm(df['SAMPLE_PATH']):
        # librosa패키지를 사용하여 wav 파일 load
        y, sr = librosa.load(path, sr=16000)
        
        # librosa패키지를 사용하여 mfcc 추출
        mfcc = librosa.feature.mfcc(y=y,
                                    sr=sr,
                                    n_mfcc=128,
                                    dct_type=2)
        
        if delta == True:

          mfcc = librosa.feature.delta(mfcc, order=1)

        y_feature = []
        # 추출된 MFCC들의 산술평균을 Feature로 사용
        for e in mfcc:

            e = np.mean(e)

            y_feature.append(e)

        features.append(y_feature)

    columns = ['MFCC_'+str(i) for i in range(len(features[0]))]

    if delta == True:

      mfcc_df = pd.DataFrame(features,
                           columns=['MFCC_delta_'+str(i) for i in range(len(features[0]))])
    
    mfcc_df = pd.DataFrame(features,
                           columns=columns)

    print(mfcc_df.shape)

    return mfcc_df

In [33]:
mfcc_train = get_mfcc_feature(df_train)
mfcc_test = get_mfcc_feature(df_test)

  0%|          | 0/1279 [00:00<?, ?it/s]

(1279, 128)


  0%|          | 0/1514 [00:00<?, ?it/s]

(1514, 128)


In [34]:
mfcc_train.head()

Unnamed: 0,MFCC_0,MFCC_1,MFCC_2,MFCC_3,MFCC_4,MFCC_5,MFCC_6,MFCC_7,MFCC_8,MFCC_9,...,MFCC_118,MFCC_119,MFCC_120,MFCC_121,MFCC_122,MFCC_123,MFCC_124,MFCC_125,MFCC_126,MFCC_127
0,-332.689484,96.704391,-14.929521,21.968111,-8.563829,-2.02196,-11.857611,3.893353,-5.748076,3.539912,...,0.53368,0.660617,0.524346,-0.307885,-0.814918,-0.123952,0.535305,0.113357,-0.800878,-0.867296
1,-438.377899,142.276978,-2.118732,30.589058,0.734739,15.532813,-2.802753,4.227826,-1.891904,3.577837,...,0.179785,-0.031554,0.05012,0.377868,0.766223,0.740194,0.287944,0.007076,0.350023,0.168382
2,-419.17099,123.297798,10.11094,21.655056,-1.095648,11.256332,-3.402523,1.567492,3.890199,3.804655,...,0.472421,0.330321,0.200077,0.07306,0.516295,0.852534,0.380594,-0.057465,-0.105068,-0.298017
3,-333.733124,97.450333,-13.966936,22.235878,-9.349174,-2.870443,-11.308705,6.399221,-2.479952,3.890206,...,0.084635,0.459112,-0.024202,0.227796,-0.581687,-0.259305,-0.126211,0.116488,-0.928069,-0.161903
4,-333.012543,90.00338,-21.694469,14.749146,-18.316071,-9.914346,-16.342524,2.575432,-6.690783,-0.875636,...,0.058081,0.142688,-0.039779,0.551953,-0.547507,-0.372035,-0.214538,0.094469,-0.619701,-0.231777


In [35]:
scaler = RobustScaler(quantile_range=(15.0, 85.0))
scaled_mfcc_train, scaled_mfcc_test= scaled_df(mfcc_train,
                                               df_train,
                                               mfcc_test,
                                               df_test,
                                               scaler,
                                               fan_type=True)

scaled_mfcc_train.columns = ['FAN_TYPE'] + ['MFCC_'+str(i) for i in range(len(scaled_mfcc_train.columns)-1)]
scaled_mfcc_test.columns = ['FAN_TYPE'] + ['MFCC_'+str(i) for i in range(len(scaled_mfcc_test.columns)-1)]

### Spectral Centroid

In [36]:
def get_spectral_centroid_feature(df):
    features = []
    for path in tqdm(df['SAMPLE_PATH']):
        # librosa패키지를 사용하여 wav 파일 load
        y, sr = librosa.load(path, sr=16000)
        
        # librosa패키지를 사용하여 Spectral Centroid 추출
        centroid = librosa.feature.spectral_centroid(y=y, sr=sr)

        y_feature = []
        for e in centroid:

            # 추출된 Spectral Centroid들의 산술평균을 Feature로 사용
            e = np.mean(e)

            y_feature.append(e)

        features.append(y_feature)

    columns = ['spectral_centroid_'+str(i) for i in range(len(features[0]))]
    
    centroid_df = pd.DataFrame(features,
                           columns=columns)

    print(centroid_df.shape)

    return centroid_df

In [37]:
spectral_centroid_train = get_spectral_centroid_feature(df_train)
spectral_centroid_test = get_spectral_centroid_feature(df_test)

  0%|          | 0/1279 [00:00<?, ?it/s]

(1279, 1)


  0%|          | 0/1514 [00:00<?, ?it/s]

(1514, 1)


In [38]:
spectral_centroid_train.head()

Unnamed: 0,spectral_centroid_0
0,1746.248047
1,966.565838
2,1206.676823
3,1731.6838
4,1845.114687


In [39]:
scaler = RobustScaler(quantile_range=(15.0, 85.0))
scaled_centroid_train, scaled_centroid_test= scaled_df(spectral_centroid_train,
                                                       df_train,
                                                       spectral_centroid_test,
                                                       df_test,
                                                       scaler,
                                                       fan_type=True)

scaled_centroid_train.columns = ['FAN_TYPE'] + ['Centroid_'+str(i) for i in range(len(scaled_centroid_train.columns)-1)]
scaled_centroid_test.columns = ['FAN_TYPE'] + ['Centroid_'+str(i) for i in range(len(scaled_centroid_test.columns)-1)]

### Spectral Flatness

In [40]:
def get_spectral_flatness_feature(df):
    features = []
    for path in tqdm(df['SAMPLE_PATH']):
        # librosa패키지를 사용하여 wav 파일 load
        y, sr = librosa.load(path, sr=16000)
        
        # librosa패키지를 사용하여 Spectral Flatness 추출
        flatness = librosa.feature.spectral_flatness(y=y)

        y_feature = []
        for e in flatness:

            # 추출된 Spectral Flatness들의 산술평균을 Feature로 사용
            e = np.mean(e)

            y_feature.append(e)

        features.append(y_feature)

    columns = ['spectral_flatness_'+str(i) for i in range(len(features[0]))]
    
    flatness_df = pd.DataFrame(features,
                           columns=columns)

    print(flatness_df.shape)

    return flatness_df

In [41]:
spectral_flatness_train = get_spectral_flatness_feature(df_train)
spectral_flatness_test = get_spectral_flatness_feature(df_test)

  0%|          | 0/1279 [00:00<?, ?it/s]

(1279, 1)


  0%|          | 0/1514 [00:00<?, ?it/s]

(1514, 1)


In [42]:
scaler = RobustScaler(quantile_range=(15.0, 85.0))
scaled_flatness_train, scaled_flatness_test= scaled_df(spectral_flatness_train,
                                                       df_train,
                                                       spectral_flatness_test,
                                                       df_test,
                                                       scaler,
                                                       fan_type=True)

scaled_flatness_train.columns = ['FAN_TYPE'] + ['Flatness_'+str(i) for i in range(len(scaled_flatness_train.columns)-1)]
scaled_flatness_test.columns = ['FAN_TYPE'] + ['Flatness_'+str(i) for i in range(len(scaled_flatness_test.columns)-1)]

In [43]:
scaled_flatness_train.head()

Unnamed: 0,FAN_TYPE,Flatness_0
0,2,0.061541
1,0,-0.390927
2,0,0.444324
3,2,-0.239922
4,2,0.984202


### Spectral Bandwidth

In [44]:
def get_spectral_bandwidth_feature(df):
    features = []
    for path in tqdm(df['SAMPLE_PATH']):
        # librosa패키지를 사용하여 wav 파일 load
        y, sr = librosa.load(path, sr=16000)
        
        # librosa패키지를 사용하여 Spectral Bandwidth 추출
        bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)

        y_feature = []
        for e in bandwidth:

            # 추출된 Spectral Bandwidth들의 산술평균을 Feature로 사용
            e = np.mean(e)

            y_feature.append(e)

        features.append(y_feature)

    columns = ['spectral_bandwidth_'+str(i) for i in range(len(features[0]))]
    
    bandwidth_df = pd.DataFrame(features,
                           columns=columns)

    print(bandwidth_df.shape)

    return bandwidth_df

In [45]:
spectral_bandwidth_train = get_spectral_bandwidth_feature(df_train)
spectral_bandwidth_test = get_spectral_bandwidth_feature(df_test)

  0%|          | 0/1279 [00:00<?, ?it/s]

(1279, 1)


  0%|          | 0/1514 [00:00<?, ?it/s]

(1514, 1)


In [46]:
spectral_bandwidth_train.head()

Unnamed: 0,spectral_bandwidth_0
0,1731.017118
1,1345.701719
2,1619.794231
3,1727.921959
4,1727.019823


In [47]:
scaler = RobustScaler(quantile_range=(15.0, 85.0))
scaled_bandwidth_train, scaled_bandwidth_test= scaled_df(spectral_bandwidth_train,
                                                         df_train,
                                                         spectral_bandwidth_test,
                                                         df_test,
                                                         scaler,
                                                         fan_type=True)

scaled_bandwidth_train.columns = ['FAN_TYPE'] + ['Bandwidth_'+str(i) for i in range(len(scaled_bandwidth_train.columns)-1)]
scaled_bandwidth_test.columns = ['FAN_TYPE'] + ['Bandwidth_'+str(i) for i in range(len(scaled_bandwidth_test.columns)-1)]

In [48]:
scaled_bandwidth_train.head()

Unnamed: 0,FAN_TYPE,Bandwidth_0
0,2,0.150569
1,0,-0.392872
2,0,0.636029
3,2,0.038035
4,2,0.005235


### Spectral Contrast

In [49]:
def get_spectral_contrast_feature(df):
    features = []
    for path in tqdm(df['SAMPLE_PATH']):
        # librosa패키지를 사용하여 wav 파일 load
        y, sr = librosa.load(path, sr=16000)
        
        # librosa패키지를 사용하여 Spectral Contrast 추출
        contrast = librosa.feature.spectral_contrast(y=y, sr=sr)

        y_feature = []
        for e in contrast:

            # 추출된 Spectral Contrast들의 산술평균을 Feature로 사용
            e = np.mean(e)

            y_feature.append(e)

        features.append(y_feature)

    columns = ['spectral_contrast_'+str(i) for i in range(len(features[0]))]
    
    contrast_df = pd.DataFrame(features,
                           columns=columns)

    print(contrast_df.shape)

    return contrast_df

In [50]:
spectral_contrast_train = get_spectral_contrast_feature(df_train)
spectral_contrast_test = get_spectral_contrast_feature(df_test)

  0%|          | 0/1279 [00:00<?, ?it/s]

(1279, 7)


  0%|          | 0/1514 [00:00<?, ?it/s]

(1514, 7)


In [51]:
spectral_contrast_train.head()

Unnamed: 0,spectral_contrast_0,spectral_contrast_1,spectral_contrast_2,spectral_contrast_3,spectral_contrast_4,spectral_contrast_5,spectral_contrast_6
0,16.696441,12.598988,14.744823,15.444423,15.381834,15.325508,52.93793
1,20.836776,11.471659,15.647303,15.401937,15.448171,17.106672,47.677406
2,21.56204,12.946827,16.768408,14.494164,15.234417,16.276049,51.275824
3,17.150536,12.129708,15.350182,15.838211,15.511526,15.375888,53.091559
4,16.29705,12.64251,14.279309,15.827333,15.504191,15.40679,53.62698


In [52]:
scaler = RobustScaler(quantile_range=(15.0, 85.0))
scaled_contrast_train, scaled_contrast_test= scaled_df(spectral_contrast_train,
                                                       df_train,
                                                       spectral_contrast_test,
                                                       df_test,
                                                       scaler,
                                                       fan_type=True)

scaled_contrast_train.columns = ['FAN_TYPE'] + ['Contrast_'+str(i) for i in range(len(scaled_contrast_train.columns)-1)]
scaled_contrast_test.columns = ['FAN_TYPE'] + ['Contrast_'+str(i) for i in range(len(scaled_contrast_test.columns)-1)]

In [53]:
scaled_contrast_train.head()

Unnamed: 0,FAN_TYPE,Contrast_0,Contrast_1,Contrast_2,Contrast_3,Contrast_4,Contrast_5,Contrast_6
0,2,-0.273056,0.204572,0.026918,-0.090305,-0.159104,-0.309974,-0.493331
1,0,-0.127188,-0.394326,0.337215,0.143336,0.41583,0.350862,-0.372677
2,0,0.239205,2.013587,1.061655,-0.053658,0.205869,-0.142922,0.636158
3,2,-0.033861,-0.559301,1.198136,0.295089,0.185306,-0.066247,-0.116739
4,2,-0.483436,0.275415,-0.873733,0.284443,0.165827,0.083247,1.195744


### Spectral Rolloff

In [54]:
def get_spectral_rolloff_feature(df):
    features = []
    for path in tqdm(df['SAMPLE_PATH']):
        # librosa패키지를 사용하여 wav 파일 load
        y, sr = librosa.load(path, sr=16000)
        
        # librosa패키지를 사용하여 Spectral Rolloff 추출
        rolloff = librosa.feature.spectral_rolloff(y=y,
                                                   roll_percent=0.99
                                                   )

        y_feature = []
        for e in rolloff:

            # 추출된 Spectral Rolloff들의 산술평균을 Feature로 사용
            e = np.mean(e)

            y_feature.append(e)

        features.append(y_feature)

    columns = ['spectral_rolloff_'+str(i) for i in range(len(features[0]))]
    
    rolloff_df = pd.DataFrame(features,
                           columns=columns)

    print(rolloff_df.shape)

    return rolloff_df

In [55]:
spectral_rolloff_train = get_spectral_rolloff_feature(df_train)
spectral_rolloff_test = get_spectral_rolloff_feature(df_test)

  0%|          | 0/1279 [00:00<?, ?it/s]

(1279, 1)


  0%|          | 0/1514 [00:00<?, ?it/s]

(1514, 1)


In [56]:
spectral_rolloff_train.head()

Unnamed: 0,spectral_rolloff_0
0,9802.31996
1,8825.827036
2,9794.030021
3,9772.978391
4,9821.548491


In [57]:
scaler = RobustScaler(quantile_range=(15.0, 85.0))
scaled_rolloff_train, scaled_rolloff_test= scaled_df(spectral_rolloff_train,
                                                     df_train,
                                                     spectral_rolloff_test,
                                                     df_test,
                                                     scaler,
                                                     fan_type=True)

scaled_rolloff_train.columns = ['FAN_TYPE'] + ['Rolloff_'+str(i) for i in range(len(scaled_rolloff_train.columns)-1)]
scaled_rolloff_test.columns = ['FAN_TYPE'] + ['Rolloff_'+str(i) for i in range(len(scaled_rolloff_test.columns)-1)]

In [58]:
scaled_rolloff_train.head()

Unnamed: 0,FAN_TYPE,Rolloff_0
0,2,0.169205
1,0,-0.28355
2,0,0.701038
3,2,-0.121787
4,2,0.359902


### Chroma

In [59]:
def get_chroma_stft_feature(df, delta=False):
    features = []
    for path in tqdm(df['SAMPLE_PATH']):
        # librosa패키지를 사용하여 wav 파일 load
        y, sr = librosa.load(path, sr=16000)
        
        # librosa패키지를 사용하여 Chroma 추출
        chroma = librosa.feature.chroma_stft(y=y, sr=sr,
                                             n_chroma=12)
        
        if delta == True:

          chroma = librosa.feature.delta(chroma, order=1)

        y_feature = []
        for e in chroma:

            # 추출된 Chroma들의 산술평균을 Feature로 사용
            e = np.mean(e)

            y_feature.append(e)

        features.append(y_feature)

    columns = ['Chroma_stft_'+str(i) for i in range(len(features[0]))]

    if delta == True:

      columns = ['Chroma_stft_delta_'+str(i) for i in range(len(features[0]))]

    chroma_df = pd.DataFrame(features,
                             columns=columns)
    
    print(chroma_df.shape)

    return chroma_df

In [60]:
chroma_stft_train = get_chroma_stft_feature(df_train)
chroma_stft_test = get_chroma_stft_feature(df_test)

  0%|          | 0/1279 [00:00<?, ?it/s]

(1279, 12)


  0%|          | 0/1514 [00:00<?, ?it/s]

(1514, 12)


In [61]:
chroma_stft_train.head()

Unnamed: 0,Chroma_stft_0,Chroma_stft_1,Chroma_stft_2,Chroma_stft_3,Chroma_stft_4,Chroma_stft_5,Chroma_stft_6,Chroma_stft_7,Chroma_stft_8,Chroma_stft_9,Chroma_stft_10,Chroma_stft_11
0,0.354076,0.449614,0.691313,0.737164,0.868736,0.368256,0.373235,0.550208,0.682635,0.442081,0.371972,0.448557
1,0.699832,0.883752,0.749316,0.479447,0.313469,0.294811,0.24159,0.255539,0.305991,0.353667,0.5045,0.578966
2,0.643748,0.743568,0.532853,0.347367,0.660792,0.713795,0.348799,0.212565,0.232006,0.249673,0.320929,0.44495
3,0.351917,0.345404,0.483628,0.583686,0.923054,0.483008,0.306648,0.415996,0.64316,0.518243,0.316253,0.422899
4,0.394154,0.36135,0.502068,0.585573,0.874034,0.493424,0.344644,0.505876,0.762814,0.576599,0.379295,0.471015


In [62]:
scaler = RobustScaler(quantile_range=(15.0, 85.0))
scaled_chroma_stft_train, scaled_chroma_stft_test= scaled_df(chroma_stft_train,
                                               df_train,
                                               chroma_stft_test,
                                               df_test,
                                               scaler,
                                               fan_type=True)

scaled_chroma_stft_train.columns = ['FAN_TYPE'] + ['Chroma_'+str(i) for i in range(len(scaled_chroma_stft_train.columns)-1)]
scaled_chroma_stft_test.columns = ['FAN_TYPE'] + ['Chroma_'+str(i) for i in range(len(scaled_chroma_stft_test.columns)-1)]

In [63]:
scaled_chroma_stft_train.head()

Unnamed: 0,FAN_TYPE,Chroma_0,Chroma_1,Chroma_2,Chroma_3,Chroma_4,Chroma_5,Chroma_6,Chroma_7,Chroma_8,Chroma_9,Chroma_10,Chroma_11
0,2,-0.394228,0.377207,1.164614,1.055614,-0.355021,-0.952141,0.772094,0.860575,0.120377,-0.637291,0.214456,-0.024834
1,0,-0.373921,-0.186859,0.213031,0.28403,-0.055385,0.007851,-0.181074,-0.302759,-0.19261,-0.105073,0.235855,-0.107041
2,0,-0.61805,-0.787387,-0.538009,-0.288858,2.39096,3.166592,0.596117,-0.563794,-0.608292,-0.681294,-0.851684,-0.764548
3,2,-0.424546,-0.484012,-0.588819,-0.500971,0.451603,0.155424,-0.213623,-0.452014,-0.295258,0.003206,-0.557816,-0.349953
4,2,0.168704,-0.352225,-0.433129,-0.481836,-0.276351,0.255962,0.348851,0.427013,0.964559,0.493961,0.315956,0.25972


## Dimension Reduction

FAN TYPE 별 (0 &2)로 각각 차원 축소 실행

차원 축소가 좋은 이유

* 차원 축소를 통해 고차원 데이터를 저차원 공간에 투영해 중복 정보를 제거하면서 가능한 핵심 정보 유지

* 데이터를 낮은 차원으로 축소시키면 노이즈가 많이 줄어들기 때문에 머신러닝 알고리즘이 흥미로운 패턴을 더 효과적이고 효율적으로 식별할 수 있음

In [64]:
def dimension_reduction(train, test, method, fan_type=False):

  df_fan_type_train = train[['FAN_TYPE']]
  df_fan_type_test = test[['FAN_TYPE']]

  train_0 = train.loc[train['FAN_TYPE']==0]
  train_2 = train.loc[train['FAN_TYPE']==2]

  test_0 = test.loc[test['FAN_TYPE']==0]
  test_2 = test.loc[test['FAN_TYPE']==2]

  index_train_0 = list(train_0.index)
  index_train_2 = list(train_2.index)

  index_test_0 = list(test_0.index)
  index_test_2 = list(test_2.index)

  train_0.drop(columns='FAN_TYPE', inplace=True)
  train_2.drop(columns='FAN_TYPE', inplace=True)
  test_0.drop(columns='FAN_TYPE', inplace=True)
  test_2.drop(columns='FAN_TYPE', inplace=True)

  train_0 = method.fit_transform(train_0)
  test_0 = method.transform(test_0)

  train_2 = method.fit_transform(train_2)
  test_2 = method.transform(test_2)

  train_0 = pd.DataFrame(train_0)
  train_2 = pd.DataFrame(train_2)
  test_0 = pd.DataFrame(test_0)
  test_2 = pd.DataFrame(test_2)

  train_0.index = index_train_0
  train_2.index = index_train_2

  test_0.index = index_test_0
  test_2.index = index_test_2

  train = pd.concat([train_0, train_2], axis=0)
  test = pd.concat([test_0, test_2], axis=0)

  train.sort_index(inplace=True)
  test.sort_index(inplace=True)

  if fan_type == False:

    pass

  if fan_type == True:

    train = pd.concat([df_fan_type_train, train], axis=1)
    test = pd.concat([df_fan_type_test, test], axis=1)

  return train, test

**PCA**

* 가능한 한 분산 (핵심 정보)를 보존하면서 데이터의 저차원 표현을 찾음

* 피처들 간 상관관계를 다룸

* 일부 피처들 간 상관관계가 매우 높으면 PCA는 상관관계가 높은 피처들을 결합해 선형적인 상관관계가 없는, 더 작은 수의 피처들로 데이터를 표현

**Sparse PCA**

* 일반 PCA 알고리즘은 모든 입력 변수에 선형 결합을 탐색해 원본 피처 공간을 최대한 조밀하게 줄인다

* 일반 PCA 알고리즘은 모든 입력 변수에 선형 결합을 탐색해 원본 피처 공간을 최대한 조밀하게 줄인다

* alpha라는 하이퍼 파라미터로 제어함으로써 희소성을 어느 정도 유지할 수 있다

* 희소 PCA는 일부 입력 변수에서만 선형 결합을 탐색해 원본 피처 공간을 어느 정도 줄이지만 일반 PCA만큼 조밀하게 만들지는 않음



```
method = SparsePCA(n_components=N_COMPONETS, alpha=0.001)
```



**Kernel PCA**

* 비선형 PCA 유형 중 하나인 커널 PCA는 원본 데이터 포인트 쌍들에 대해 유사성 함수를 실행시켜 비선형적으로 차원을 축소

* 커널 PCA는 이 유사성 함수를 학습함으로써 데이터 포인트 대부분이 있는 암시적 피처 공간을 매핑하고 이 공간을 원본 피처 셋 보다 훨씬 더 작은 수의 차원을 만듬

* 이 방법은 원본 피처 셋을 선형으로 분리할 수 없는 경우에 특히 효과적이다



```
method = KernelPCA(n_components=N_COMPONETS)
```



### Zero Crossing Rate

In [65]:
method = SparsePCA(n_components=1, alpha=0.001)

pca_train_zero, pca_test_zero = dimension_reduction(scaled_zero_train,
                                                    scaled_zero_test,
                                                    method)

In [66]:
pca_train_zero.head()

Unnamed: 0,0
0,-0.152942
1,-0.364515
2,0.219
3,-0.401976
4,0.805352


In [67]:
method = SparsePCA(n_components=1, alpha=0.01)

pca_train_zero_delta, pca_test_zero_delta = dimension_reduction(scaled_zero_delta_train,
                                                                scaled_zero_delta_test,
                                                                method)

In [68]:
pca_train_zero_delta.head()

Unnamed: 0,0
0,-0.065346
1,-0.147925
2,0.229709
3,0.461705
4,-0.536022


### RMS

In [69]:
method = SparsePCA(n_components=1, alpha=0.001)

pca_train_rms, pca_test_rms = dimension_reduction(scaled_rms_train,
                                                  scaled_rms_test,
                                                  method)

In [70]:
pca_train_rms.head()

Unnamed: 0,0
0,0.361154
1,-0.026647
2,-0.57074
3,0.548277
4,-0.488639


In [71]:
method = SparsePCA(n_components=1, alpha=0.001)

pca_train_rms_delta, pca_test_rms_delta = dimension_reduction(scaled_rms_delta_train,
                                                              scaled_rms_delta_test,
                                                              method)

In [72]:
pca_train_rms_delta.head()

Unnamed: 0,0
0,-0.487964
1,-0.090191
2,-0.175303
3,-0.257819
4,-0.657053


### Poly

In [437]:
pca = PCA()
pca.fit(scaled_poly_train.drop(columns='FAN_TYPE'))
cumsum = np.cumsum(pca.explained_variance_ratio_)
N_COMPONETS = np.argmax(cumsum>=0.999) + 1
print(N_COMPONETS)

2


In [456]:
method = KernelPCA(n_components=N_COMPONETS)

pca_train_poly, pca_test_poly = dimension_reduction(scaled_poly_train,
                                                    scaled_poly_test,
                                                    method)

In [457]:
pca_train_poly.head()

Unnamed: 0,0,1
0,-0.727118,0.058641
1,-0.224784,-0.234795
2,0.655601,-0.187121
3,-0.884441,0.02355
4,1.404687,0.121559


### MFCC

In [76]:
pca = PCA()
pca.fit(scaled_mfcc_train.drop(columns='FAN_TYPE'))
cumsum = np.cumsum(pca.explained_variance_ratio_)
N_COMPONETS = np.argmax(cumsum>=0.999) + 1
print(N_COMPONETS)

124


In [77]:
start = time.time()

method = SparsePCA(n_components=N_COMPONETS, alpha=0.001)

pca_train_mfcc, pca_test_mfcc = dimension_reduction(scaled_mfcc_train,
                                                    scaled_mfcc_test,
                                                    method)

end = time.time()

times = end - start
times = str(datetime.timedelta(seconds=times)).split('.')[0]
print('Colab CPU 기준 ->'+ times)

0:46:26


### Spectral Centroid

In [104]:
pca = PCA()
pca.fit(scaled_centroid_train.drop(columns='FAN_TYPE'))
cumsum = np.cumsum(pca.explained_variance_ratio_)
N_COMPONETS = np.argmax(cumsum>=0.999) + 1
print(N_COMPONETS)

1


In [105]:
method = SparsePCA(n_components=N_COMPONETS, alpha=0.001)

pca_train_centroid, pca_test_centroid = dimension_reduction(scaled_centroid_train,
                                                            scaled_centroid_test,
                                                            method)

In [106]:
pca_train_centroid.head()

Unnamed: 0,0
0,-0.292144
1,-0.579412
2,0.299278
3,-0.46429
4,0.876438


### Spectral Flatness

In [107]:
pca = PCA()
pca.fit(scaled_flatness_train.drop(columns='FAN_TYPE'))
cumsum = np.cumsum(pca.explained_variance_ratio_)
N_COMPONETS = np.argmax(cumsum>=0.999) + 1
print(N_COMPONETS)

1


In [108]:
method = SparsePCA(n_components=N_COMPONETS, alpha=0.001)

pca_train_flatness, pca_test_flatness = dimension_reduction(scaled_flatness_train,
                                                            scaled_flatness_test,
                                                            method)

### Spectral Bandwidth

In [109]:
pca = PCA()
pca.fit(scaled_bandwidth_train.drop(columns='FAN_TYPE'))
cumsum = np.cumsum(pca.explained_variance_ratio_)
N_COMPONETS = np.argmax(cumsum>=0.999) + 1
print(N_COMPONETS)

1


In [110]:
method = SparsePCA(n_components=N_COMPONETS, alpha=0.001)

pca_train_bandwidth, pca_test_bandwidth = dimension_reduction(scaled_bandwidth_train,
                                                              scaled_bandwidth_test,
                                                              method)

### Spectral Contrast

In [111]:
pca = PCA()
pca.fit(scaled_contrast_train.drop(columns='FAN_TYPE'))
cumsum = np.cumsum(pca.explained_variance_ratio_)
N_COMPONETS = np.argmax(cumsum>=0.999) + 1
print(N_COMPONETS)

7


In [112]:
method = SparsePCA(n_components=N_COMPONETS, alpha=0.001)

pca_train_contrast, pca_test_contrast = dimension_reduction(scaled_contrast_train,
                                                            scaled_contrast_test,
                                                            method)

### Spectral Rolloff

In [113]:
pca = PCA()
pca.fit(scaled_rolloff_train.drop(columns='FAN_TYPE'))
cumsum = np.cumsum(pca.explained_variance_ratio_)
N_COMPONETS = np.argmax(cumsum>=0.999) + 1
print(N_COMPONETS)

1


In [114]:
method = SparsePCA(n_components=N_COMPONETS, alpha=0.001)

pca_train_rolloff, pca_test_rolloff = dimension_reduction(scaled_rolloff_train,
                                                          scaled_rolloff_test,
                                                          method)

### Chroma

In [115]:
pca = PCA()
pca.fit(scaled_chroma_stft_train.drop(columns='FAN_TYPE'))
cumsum = np.cumsum(pca.explained_variance_ratio_)
N_COMPONETS = np.argmax(cumsum>=0.999) + 1
print(N_COMPONETS)

12


In [116]:
method = SparsePCA(n_components=N_COMPONETS, alpha=0.001)

pca_train_chroma, pca_test_chroma = dimension_reduction(scaled_chroma_stft_train,
                                                        scaled_chroma_stft_test,
                                                        method)

## Concat Data Set

In [458]:
preprocessed_train = pd.concat([
                                df_train[['FAN_TYPE']],
                                pca_train_zero,
                                pca_train_zero_delta,
                                pca_train_rms,
                                pca_train_rms_delta,
                                pca_train_poly,
                                pca_train_mfcc,
                                #pca_train_centroid,
                                pca_train_flatness,
                                #pca_train_bandwidth,
                                #pca_train_contrast,
                                #pca_train_rolloff,
                                #pca_train_chroma
                               ], axis=1)

preprocessed_test = pd.concat([
                               df_test[['FAN_TYPE']],
                               pca_test_zero,
                               pca_test_zero_delta,
                               pca_test_rms,
                               pca_test_rms_delta,
                               pca_test_poly,
                               pca_test_mfcc,
                               #pca_test_centroid,
                               pca_test_flatness,
                               #pca_test_bandwidth,
                               #pca_test_contrast,
                               #pca_test_rolloff,
                               #pca_test_chroma
                              ], axis=1)

# Modeling

**LOF (Local Outlier Factor)**

* 해당 데이터의 Local Density를 기반으로 Novelty Score 산출

* Novelty Score가 정규화 되어있지 않기에 해당 모델을 다른 Data Set에 적용하는 것이 좋지 못할 수 있음

* 그래서 FAN TYPE 별로 각기 다른 Data Set이라고 가정

* 결론적으로 FAN TYPE 별로 각각 모델링 수행

## FAN TYPE (0 & 2) 별로 데이터셋 분리

In [459]:
train_0 = preprocessed_train.loc[preprocessed_train['FAN_TYPE']==0]
train_2 = preprocessed_train.loc[preprocessed_train['FAN_TYPE']==2]

test_0 = preprocessed_test.loc[preprocessed_test['FAN_TYPE']==0]
test_2 = preprocessed_test.loc[preprocessed_test['FAN_TYPE']==2]

In [460]:
train_0.drop(columns='FAN_TYPE', inplace=True)
train_2.drop(columns='FAN_TYPE', inplace=True)

test_0.drop(columns='FAN_TYPE', inplace=True)
test_2.drop(columns='FAN_TYPE', inplace=True)

In [461]:
index_0 = list(test_0.index)
index_2 = list(test_2.index)

## FAN TYPE 별로 각각 모델 추론

**하이퍼 파라미터**

* *n_neighbors = 1*

  -  경험적으로 1로 설정하였을 때 성능이 가장 좋았습니다

* *p = 2*

  - 유클리드 거리를 사용 시 성능이 가장 좋았습니다

  - 부가적으로 맨하탄 거리 (p = 1) 가 유클리드 거리보다 좋은 경우는 다음의 [깃허브 블로그](https://seoyoungh.github.io/deep-learning/distance-metrics/)를 참조해주세요

* *contamination='auto'*

  - Test Set에 contamination이 얼마나 있는 지 알 수 없음
  - 해당 하이퍼 파라미터는 Threshold 설정에 영향을 줌
  - 대회 규칙 상 Anomaly Score를 바탕으로 Threshold를 산정하는 것은 Data Leakage에 해당
  - 규칙 위반이 우려스러워 'auto'로 설정

* *novelty=True*

  - True로 설정해야 Novelty Detection이 가능

In [485]:
n_neighbors = 1
p = 2

model_0 = LocalOutlierFactor(n_neighbors=n_neighbors, 
                           p=p, # 민코프스키 거리 -> 1 : 맨하탄 거리와 같음 / 2 : 유클리드 거리와 같음
                           algorithm='auto',
                           contamination='auto',
                           novelty=True)

model_2 = LocalOutlierFactor(n_neighbors=n_neighbors, 
                           p=p, # 민코프스키 거리 -> 1 : 맨하탄 거리와 같음 / 2 : 유클리드 거리와 같음
                           algorithm='auto',
                           contamination='auto',
                           novelty=True)

In [486]:
model_0.fit(train_0)
model_2.fit(train_2)

LocalOutlierFactor(n_neighbors=1, novelty=True)

In [487]:
def get_pred_label(model_pred):
    # IsolationForest 모델 출력 (1:정상, -1:불량) 이므로 (0:정상, 1:불량)로 Label 변환
    model_pred = np.where(model_pred == 1, 0, model_pred)
    model_pred = np.where(model_pred == -1, 1, model_pred)
    return model_pred

In [488]:
test_pred_0 = model_0.predict(test_0) 
test_pred_0 = get_pred_label(test_pred_0)

test_pred_2 = model_2.predict(test_2) 
test_pred_2 = get_pred_label(test_pred_2)

In [489]:
test_pred_0 = pd.DataFrame(test_pred_0, columns=['LABEL'])
test_pred_2 = pd.DataFrame(test_pred_2, columns=['LABEL'])

test_pred_0.index = index_0
test_pred_2.index = index_2

In [490]:
final = pd.concat([test_pred_0, test_pred_2], axis=0)
final.sort_index(inplace=True)

# Submission

In [491]:
submit = pd.read_csv('./sample_submission.csv')
submit['LABEL'] = final['LABEL']

submit.head()

Unnamed: 0,SAMPLE_ID,LABEL
0,TEST_0000,0
1,TEST_0001,0
2,TEST_0002,1
3,TEST_0003,1
4,TEST_0004,1


In [492]:
submit['LABEL'].value_counts()

1    821
0    693
Name: LABEL, dtype: int64

In [493]:
submit.to_csv('./s3.csv', index=False)

In [494]:
best = pd.read_csv('/content/s1.csv')
best.head()

Unnamed: 0,SAMPLE_ID,LABEL
0,TEST_0000,0
1,TEST_0001,0
2,TEST_0002,1
3,TEST_0003,1
4,TEST_0004,1


In [495]:
count = 0
for test, best in zip(submit['LABEL'], best['LABEL']):

  if test != best:

    count += 1

print(count)

0
