# Setting

## Library

In [None]:
import pandas as pd
import numpy as np

import librosa

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA, KernelPCA, SparsePCA, TruncatedSVD, IncrementalPCA

from tqdm.auto import tqdm
import random

In [None]:
import warnings
warnings.filterwarnings(action='ignore') 

## Fix Random Seed

In [None]:
def seed_everything(seed):
    """Seed the Python and NumPy global random number generators.

    Parameters
    ----------
    seed : int
        Value passed to ``random.seed`` and ``np.random.seed`` and stored
        (as a string) in the ``PYTHONHASHSEED`` environment variable.
    """
    # Imported here so the function does not rely on a notebook-level
    # `import os` (the import cell does not provide one).
    import os

    random.seed(seed)
    # NOTE: hash randomization is decided at interpreter start-up, so this
    # assignment only influences processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(41)  # fix the seed

# Load Data Set

## Google Drive Mount

In [None]:
# Mount Google Drive at /content/drive; the unzip and submission cells
# read from and write to paths underneath it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Unzip File

In [None]:
!unzip '/content/drive/MyDrive/머신러닝 엔지니어링/데이콘/기계 고장 진단/data/기계_고장.zip'

Archive:  /content/drive/MyDrive/머신러닝 엔지니어링/데이콘/기계 고장 진단/data/기계_고장.zip
  inflating: sample_submission.csv   
   creating: test/
  inflating: test/TEST_0000.wav      
  inflating: test/TEST_0001.wav      
  inflating: test/TEST_0002.wav      
  inflating: test/TEST_0003.wav      
  inflating: test/TEST_0004.wav      
  inflating: test/TEST_0005.wav      
  inflating: test/TEST_0006.wav      
  inflating: test/TEST_0007.wav      
  inflating: test/TEST_0008.wav      
  inflating: test/TEST_0009.wav      
  inflating: test/TEST_0010.wav      
  inflating: test/TEST_0011.wav      
  inflating: test/TEST_0012.wav      
  inflating: test/TEST_0013.wav      
  inflating: test/TEST_0014.wav      
  inflating: test/TEST_0015.wav      
  inflating: test/TEST_0016.wav      
  inflating: test/TEST_0017.wav      
  inflating: test/TEST_0018.wav      
  inflating: test/TEST_0019.wav      
  inflating: test/TEST_0020.wav      
  inflating: test/TEST_0021.wav      
  in

# Load Train / Test Set

In [None]:
# Load the per-sample metadata tables from the working directory.
train_df = pd.read_csv('./train.csv') # all samples are normal
test_df = pd.read_csv('./test.csv')

In [None]:
print(train_df.shape)
train_df.head()

(1279, 4)


Unnamed: 0,SAMPLE_ID,SAMPLE_PATH,FAN_TYPE,LABEL
0,TRAIN_0000,./train/TRAIN_0000.wav,2,0
1,TRAIN_0001,./train/TRAIN_0001.wav,0,0
2,TRAIN_0002,./train/TRAIN_0002.wav,0,0
3,TRAIN_0003,./train/TRAIN_0003.wav,2,0
4,TRAIN_0004,./train/TRAIN_0004.wav,2,0


In [None]:
print(test_df.shape)
test_df.head()

(1514, 3)


Unnamed: 0,SAMPLE_ID,SAMPLE_PATH,FAN_TYPE
0,TEST_0000,./test/TEST_0000.wav,2
1,TEST_0001,./test/TEST_0001.wav,2
2,TEST_0002,./test/TEST_0002.wav,0
3,TEST_0003,./test/TEST_0003.wav,0
4,TEST_0004,./test/TEST_0004.wav,0


# Preprocessing

## Log Spectrogram

In [None]:
def get_Log_spectogram_feature(df, n_fft, hop_length, method, sr=16000):
    """Summarise each audio file's log-amplitude spectrogram row by row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a 'SAMPLE_PATH' column of audio file paths.
    n_fft : int
        FFT window size forwarded to ``librosa.stft``.
    hop_length : int
        Hop length forwarded to ``librosa.stft``.
    method : {'mean', 'max', 'min', 'std'}
        Statistic computed over every row of the log spectrogram.
    sr : int, default 16000
        Sampling rate forwarded to ``librosa.load``.

    Returns
    -------
    list of list
        One inner list per file, holding one statistic per spectrogram row.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    reducers = {'mean': np.mean, 'max': np.max, 'min': np.min, 'std': np.std}
    # Validate up front: an unknown method used to fall through every branch
    # and silently produce empty feature rows.
    if method not in reducers:
        raise ValueError(
            f"method must be one of {sorted(reducers)}, got {method!r}"
        )
    reduce_row = reducers[method]

    features = []
    for path in tqdm(df['SAMPLE_PATH']):
        # Load the wav file with librosa.
        wav, _ = librosa.load(path, sr=sr)

        # Short-time Fourier transform -> magnitude -> decibel scale.
        stft = librosa.stft(wav, n_fft=n_fft, hop_length=hop_length)
        log_spectrogram = librosa.amplitude_to_db(np.abs(stft))

        # One summary value per spectrogram row becomes the feature vector.
        features.append([reduce_row(row) for row in log_spectrogram])

    return features

In [None]:
train_mean_features = get_Log_spectogram_feature(train_df, 2048, 512, 'mean')
test_mean_features = get_Log_spectogram_feature(test_df, 2048, 512, 'mean')

  0%|          | 0/1279 [00:00<?, ?it/s]

  0%|          | 0/1514 [00:00<?, ?it/s]

In [None]:
print(len(train_mean_features))
print(len(train_mean_features[0]))

1279
1025


In [None]:
print(len(test_mean_features))
print(len(test_mean_features[0]))

1514
1025


In [None]:
train_mean = pd.DataFrame(train_mean_features)
test_mean = pd.DataFrame(test_mean_features)

In [None]:
train_mean.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1015,1016,1017,1018,1019,1020,1021,1022,1023,1024
0,-35.347874,-25.905037,-19.537817,-14.494222,-13.260056,-12.644492,-9.643282,-8.555812,-8.163176,-10.114017,...,-72.116684,-72.116951,-72.117035,-72.11705,-72.117073,-72.117104,-72.117126,-72.117126,-72.117149,-72.117149
1,-35.412148,-20.08873,-7.292066,-3.005912,-8.03932,-9.869509,-8.647006,-4.698098,2.474855,2.566782,...,-68.14325,-68.14325,-68.14325,-68.14325,-68.14325,-68.14325,-68.14325,-68.14325,-68.14325,-68.14325
2,-37.093132,-29.866112,-7.312017,-2.921619,-10.504422,-15.366261,-13.717054,-7.64609,1.957633,2.902606,...,-69.630402,-69.63028,-69.630188,-69.630142,-69.630028,-69.629982,-69.629951,-69.629921,-69.629913,-69.629913
3,-35.364395,-26.645517,-19.679157,-13.863233,-11.105224,-9.701788,-9.171187,-9.329742,-6.861063,-8.132969,...,-71.678711,-71.678749,-71.678848,-71.678947,-71.679024,-71.679092,-71.679161,-71.679199,-71.679214,-71.679222
4,-40.350456,-30.134981,-25.00893,-21.979902,-21.576363,-18.452465,-13.367365,-16.731131,-16.89319,-14.55592,...,-73.701256,-73.700607,-73.700233,-73.699821,-73.699554,-73.699341,-73.699219,-73.699066,-73.699005,-73.698944


In [None]:
test_mean.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1015,1016,1017,1018,1019,1020,1021,1022,1023,1024
0,-39.355,-30.789387,-25.561352,-21.873693,-20.098482,-16.698969,-12.333606,-13.840805,-13.36737,-14.496398,...,-69.498238,-69.498253,-69.498314,-69.498375,-69.498428,-69.498405,-69.498444,-69.498451,-69.498474,-69.498466
1,-35.324364,-15.370223,-11.399692,-12.068342,-9.710267,-9.931806,-9.560322,-11.447298,-10.471428,-5.833041,...,-73.835838,-73.835686,-73.835678,-73.835587,-73.835411,-73.835365,-73.835335,-73.835312,-73.835289,-73.835274
2,-36.689728,-17.221697,-1.499329,3.800602,-2.946502,-12.478737,-8.711549,-6.441259,2.124046,5.696384,...,-67.372917,-67.372902,-67.372902,-67.372902,-67.372902,-67.372894,-67.372879,-67.372879,-67.372879,-67.372879
3,-38.447994,-26.483551,-3.762146,0.01383,-8.301869,-11.10117,-7.351457,0.914515,14.103933,13.82081,...,-61.407848,-61.407902,-61.407921,-61.407959,-61.407986,-61.407997,-61.408009,-61.408028,-61.408035,-61.408035
4,-35.658146,-18.815004,-1.379795,4.058747,-2.595324,-10.339049,-8.676261,-6.728096,1.055684,4.162109,...,-68.244232,-68.244331,-68.244362,-68.2444,-68.244423,-68.244431,-68.244461,-68.244469,-68.244484,-68.244484


## Scaler

In [None]:
# Take explicit copies so that modifying these frames downstream can neither
# touch nor warn about train_df / test_df.
df_train_FAN_TYPE = train_df[['FAN_TYPE']].copy()
df_test_FAN_TYPE = test_df[['FAN_TYPE']].copy()

In [None]:
def two_2_one(df):
  """Return a copy of ``df`` with 'FAN_TYPE' recoded to a 0/1 flag.

  Rows whose 'FAN_TYPE' equals 2 become 1; every other row becomes 0.
  The input frame is left untouched.
  """
  recoded = df.copy()

  # Compute the mask once, before any value is overwritten.
  is_type_2 = (recoded['FAN_TYPE'] == 2)
  recoded.loc[is_type_2, 'FAN_TYPE'] = 1
  recoded.loc[~is_type_2, 'FAN_TYPE'] = 0

  return recoded

In [None]:
df_train_FAN_TYPE = two_2_one(df_train_FAN_TYPE)
df_test_FAN_TYPE = two_2_one(df_test_FAN_TYPE)

In [None]:
train_mean = pd.concat([df_train_FAN_TYPE, train_mean], axis=1)
test_mean = pd.concat([df_test_FAN_TYPE, test_mean], axis=1)

In [None]:
train_type_0 = train_mean.loc[(train_mean['FAN_TYPE']==0)]
train_type_1 = train_mean.loc[(train_mean['FAN_TYPE']==1)]

test_type_0 = test_mean.loc[(test_mean['FAN_TYPE']==0)]
test_type_1 = test_mean.loc[(test_mean['FAN_TYPE']==1)]

In [None]:
# Remove the grouping column by rebinding; the frames are boolean-filtered
# selections, so they are not modified in place.
train_type_0 = train_type_0.drop(columns='FAN_TYPE')
train_type_1 = train_type_1.drop(columns='FAN_TYPE')
test_type_0 = test_type_0.drop(columns='FAN_TYPE')
test_type_1 = test_type_1.drop(columns='FAN_TYPE')

In [None]:
list_test_0_index = list(test_type_0.index)
list_test_1_index = list(test_type_1.index)

In [None]:
# Fit one scaler per fan type on the training rows and apply it to the
# matching test rows. Separate instances keep both sets of fitted statistics
# available and remove the dependence on statement order.
scaler_type_0 = StandardScaler()
scaled_train_type_0 = scaler_type_0.fit_transform(train_type_0)
scaled_test_type_0 = scaler_type_0.transform(test_type_0)

scaler_type_1 = StandardScaler()
scaled_train_type_1 = scaler_type_1.fit_transform(train_type_1)
scaled_test_type_1 = scaler_type_1.transform(test_type_1)

In [None]:
train_type_0 = pd.DataFrame(scaled_train_type_0)
train_type_1 = pd.DataFrame(scaled_train_type_1)

test_type_0 = pd.DataFrame(scaled_test_type_0)
test_type_1 = pd.DataFrame(scaled_test_type_1)

In [None]:
test_type_0.index = list_test_0_index
test_type_1.index = list_test_1_index

In [None]:
# Shuffle the training rows with a fixed seed so re-running this cell gives
# the same order.
train_mean = pd.concat([train_type_0, train_type_1], axis=0).sample(frac=1, random_state=41)
# Test rows must stay in their original index order, because predictions are
# later written to the submission positionally; restore that order here.
test_mean = pd.concat([test_type_0, test_type_1], axis=0).sort_index()

In [None]:
train_mean.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1015,1016,1017,1018,1019,1020,1021,1022,1023,1024
523,0.432437,1.292493,1.364098,1.501142,0.936076,0.501472,0.214237,0.179731,0.406727,1.21807,...,-0.561871,-0.561315,-0.561036,-0.560757,-0.560586,-0.5604,-0.560266,-0.56019,-0.56013,-0.560114
61,-1.59796,-1.281586,-0.280591,-0.408974,-1.01812,-1.134887,-0.804841,-1.207995,-0.626269,-0.888799,...,-0.720484,-0.720472,-0.720454,-0.72044,-0.720426,-0.720415,-0.720407,-0.720401,-0.720396,-0.720396
107,0.404953,0.534827,0.320743,0.477207,1.180661,0.786761,0.991483,-0.035026,-0.236867,-0.222757,...,-0.835557,-0.835578,-0.835552,-0.83553,-0.835524,-0.835513,-0.835497,-0.835499,-0.835478,-0.83547
550,-1.369684,-1.192651,-1.139571,-0.922914,-1.087537,-0.910867,-1.123925,-0.931405,-0.864792,-0.984684,...,1.500385,1.500479,1.500449,1.500461,1.500466,1.500485,1.50049,1.500489,1.500497,1.500498
133,2.758851,0.722974,1.83322,2.019969,2.378014,2.036689,0.332087,-0.127215,-0.806466,-0.676352,...,-1.388575,-1.388565,-1.388564,-1.388558,-1.388561,-1.388566,-1.388567,-1.38856,-1.388556,-1.388564


In [None]:
# Order the test rows by their index, then preview the first rows.
test_mean = test_mean.sort_index()
test_mean.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1015,1016,1017,1018,1019,1020,1021,1022,1023,1024
0,-0.935695,-1.078155,-1.137769,-1.230735,-1.291903,-0.932995,-0.510092,-0.729744,-0.727521,-0.882619,...,3.142006,3.142042,3.142042,3.142032,3.142014,3.142058,3.142048,3.142059,3.142047,3.142056
1,0.609356,2.050255,1.828894,1.261829,1.483765,1.271384,0.776251,0.075191,0.147667,1.014868,...,-0.763313,-0.763149,-0.763126,-0.763028,-0.762851,-0.762796,-0.762758,-0.762731,-0.762705,-0.76269
2,-0.702899,1.78675,11.997741,15.373803,5.841507,0.162599,1.107177,0.056035,-0.931542,1.63247,...,0.904666,0.904698,0.904717,0.904733,0.904746,0.904766,0.90479,0.904797,0.904801,0.904802
3,-1.557471,-0.512114,7.024182,6.382328,1.102275,0.654025,1.759637,6.308419,10.380006,8.962739,...,7.237484,7.237459,7.237461,7.237442,7.237427,7.237427,7.237425,7.237412,7.237406,7.237409
4,-0.201518,1.391279,12.26047,15.986754,6.152284,0.9259,1.124105,-0.187776,-1.940302,0.24817,...,-0.020366,-0.020457,-0.020471,-0.020497,-0.020508,-0.020505,-0.020529,-0.020531,-0.020543,-0.020542


## PCA

In [None]:
# Pin random_state: depending on the scikit-learn version, the 'auto' solver
# may select a randomized SVD for this size, which otherwise draws from the
# global NumPy random state.
pca = PCA(n_components=750, random_state=41)

In [None]:
pca_train_mean = pca.fit_transform(train_mean)
pca_test_mean = pca.transform(test_mean)

In [None]:
print(len(pca_train_mean))
print(len(pca_train_mean[0]))

1279
750


In [None]:
print(len(pca_test_mean))
print(len(pca_test_mean[0]))

1514
750


# Modeling

## Model Fit

In [None]:
model = IsolationForest(n_estimators=1000,
                        max_samples=256,
                        contamination='auto',
                        random_state=41,
                        verbose=1)

In [None]:
model.fit(pca_train_mean)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.7s finished


IsolationForest(max_samples=256, n_estimators=1000, random_state=41, verbose=1)

## Prediction

In [None]:
def get_pred_label(model_pred):
    """Convert IsolationForest predictions to submission labels.

    IsolationForest outputs 1 for normal and -1 for anomalous samples; this
    maps them to 0 (normal) and 1 (faulty). Any other value is passed through
    unchanged.

    Parameters
    ----------
    model_pred : array-like
        Predictions as returned by ``model.predict``.

    Returns
    -------
    numpy.ndarray
        Array of the same shape holding the converted labels.
    """
    # Coerce first: comparing a plain list with `== 1` yields a single False
    # instead of an element-wise mask, which would mislabel every sample.
    preds = np.asarray(model_pred)
    labels = np.where(preds == 1, 0, preds)
    return np.where(preds == -1, 1, labels)

In [None]:
test_pred = model.predict(pca_test_mean) # model prediction
test_pred = get_pred_label(test_pred)

# Submission

In [None]:
submit = pd.read_csv('./sample_submission.csv')

In [None]:
# Labels are assigned by position, so verify that the submission rows are in
# the same order as the test rows before writing them.
assert list(submit['SAMPLE_ID']) == list(test_df['SAMPLE_ID']), 'submission rows do not line up with test_df'
submit['LABEL'] = test_pred
submit.head()

Unnamed: 0,SAMPLE_ID,LABEL
0,TEST_0000,1
1,TEST_0001,0
2,TEST_0002,1
3,TEST_0003,1
4,TEST_0004,0


In [None]:
submit['LABEL'].value_counts()

1    783
0    731
Name: LABEL, dtype: int64

In [None]:
save_path = '/content/drive/MyDrive/머신러닝 엔지니어링/데이콘/기계 고장 진단/submission/'
# Take the component count from the fitted PCA object so the filename always
# matches the configuration that produced the predictions.
submit.to_csv(save_path + f'Log_spectogram_Standard_PCA_{pca.n_components}.csv', index=False)