<a href="https://colab.research.google.com/github/namwootree/Breakdown-in-Machine/blob/main/MFCC_%EA%B8%B0%EB%B0%98_Feature_%EC%B6%94%EC%B6%9C_%2B_PCA_%2B_Isolation_Forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting

## Library

In [40]:
import pandas as pd
import numpy as np

import librosa

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA, KernelPCA, SparsePCA, TruncatedSVD, IncrementalPCA

from tqdm.auto import tqdm
import random

In [2]:
import librosa
import librosa.display
import IPython.display as ipd

In [3]:
import warnings
# Silence warnings for the rest of the session; no category filter is given, so all categories are ignored.
# NOTE(review): this also hides any warning a later cell may emit — confirm that is intended.
warnings.filterwarnings(action='ignore') 

## Fix Random Seed

In [4]:
def seed_everything(seed):
    """Seed the random number generators this notebook relies on.

    Args:
        seed: Integer seed applied to Python's `random` module and NumPy's
            legacy global generator, and exported as PYTHONHASHSEED.
    """
    # `os` is not imported by the notebook's import cells; without this the
    # function raises NameError before NumPy ever gets seeded.
    import os

    random.seed(seed)
    # Only affects interpreters started after this point (e.g. subprocesses);
    # hash randomization of the running kernel is decided at startup.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(41)  # fix the seed

# Load Data Set

## Google Drive Mount

In [5]:
# Mount Google Drive under /content/drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Unzip File

In [6]:
!unzip '/content/drive/MyDrive/머신러닝 엔지니어링/데이콘/기계 고장 진단/data/기계_고장.zip'

Archive:  /content/drive/MyDrive/머신러닝 엔지니어링/데이콘/기계 고장 진단/data/기계_고장.zip
  inflating: sample_submission.csv   
   creating: test/
  inflating: test/TEST_0000.wav      
  inflating: test/TEST_0001.wav      
  inflating: test/TEST_0002.wav      
  inflating: test/TEST_0003.wav      
  inflating: test/TEST_0004.wav      
  inflating: test/TEST_0005.wav      
  inflating: test/TEST_0006.wav      
  inflating: test/TEST_0007.wav      
  inflating: test/TEST_0008.wav      
  inflating: test/TEST_0009.wav      
  inflating: test/TEST_0010.wav      
  inflating: test/TEST_0011.wav      
  inflating: test/TEST_0012.wav      
  inflating: test/TEST_0013.wav      
  inflating: test/TEST_0014.wav      
  inflating: test/TEST_0015.wav      
  inflating: test/TEST_0016.wav      
  inflating: test/TEST_0017.wav      
  inflating: test/TEST_0018.wav      
  inflating: test/TEST_0019.wav      
  inflating: test/TEST_0020.wav      
  inflating: test/TEST_0021.wav      
  in

## Load Train / Test Set

In [7]:
# Metadata tables read from the extracted archive.
train_df = pd.read_csv('./train.csv') # all normal samples
test_df = pd.read_csv('./test.csv')

In [8]:
# Row/column count followed by a preview of the training metadata.
print(train_df.shape)
train_df.head()

(1279, 4)


Unnamed: 0,SAMPLE_ID,SAMPLE_PATH,FAN_TYPE,LABEL
0,TRAIN_0000,./train/TRAIN_0000.wav,2,0
1,TRAIN_0001,./train/TRAIN_0001.wav,0,0
2,TRAIN_0002,./train/TRAIN_0002.wav,0,0
3,TRAIN_0003,./train/TRAIN_0003.wav,2,0
4,TRAIN_0004,./train/TRAIN_0004.wav,2,0


In [9]:
# Row/column count followed by a preview of the test metadata.
print(test_df.shape)
test_df.head()

(1514, 3)


Unnamed: 0,SAMPLE_ID,SAMPLE_PATH,FAN_TYPE
0,TEST_0000,./test/TEST_0000.wav,2
1,TEST_0001,./test/TEST_0001.wav,2
2,TEST_0002,./test/TEST_0002.wav,0
3,TEST_0003,./test/TEST_0003.wav,0
4,TEST_0004,./test/TEST_0004.wav,0


# Preprocessing

## MFCC

In [10]:
def get_mfcc_feature(df, sr=32000, n_mfcc=128):
    """Build one averaged MFCC vector per audio file listed in `df`.

    Args:
        df: Table-like object whose 'SAMPLE_PATH' entry iterates over audio
            file paths readable by `librosa.load`.
        sr: Sampling rate requested from `librosa.load` (default 32000).
        n_mfcc: Number of MFCC coefficients to extract per file (default 128).

    Returns:
        A list with one entry per path; each entry is a list holding the mean
        of every row of the MFCC matrix returned for that file.
    """
    features = []
    for path in tqdm(df['SAMPLE_PATH']):
        # Load the wav file; use the rate librosa reports back for the MFCC call.
        y, loaded_sr = librosa.load(path, sr=sr)

        # Extract the MFCC matrix (one row per coefficient).
        mfcc = librosa.feature.mfcc(y=y, sr=loaded_sr, n_mfcc=n_mfcc)

        # Use the mean of each coefficient row as the feature value.
        features.append([np.mean(coefficient) for coefficient in mfcc])
    return features

In [11]:
# Extract the mean-MFCC vectors for every train and test file (each wav is read from disk).
train_mean_features = get_mfcc_feature(train_df)
test_mean_features = get_mfcc_feature(test_df)

  0%|          | 0/1279 [00:00<?, ?it/s]

  0%|          | 0/1514 [00:00<?, ?it/s]

In [12]:
# Sanity check: number of training samples and number of features per sample.
print(len(train_mean_features))
print(len(train_mean_features[0]))

1279
128


In [13]:
# Sanity check: number of test samples and number of features per sample.
print(len(test_mean_features))
print(len(test_mean_features[0]))

1514
128


In [31]:
# Wrap the feature lists in DataFrames: one row per sample, one column per MFCC coefficient.
train_mean = pd.DataFrame(train_mean_features)
test_mean = pd.DataFrame(test_mean_features)

## Scaler

In [32]:
# Take independent copies of the FAN_TYPE column so that recoding it later
# never writes into (or warns about) a slice of the metadata tables.
df_train_FAN_TYPE = train_df[['FAN_TYPE']].copy()
df_test_FAN_TYPE = test_df[['FAN_TYPE']].copy()

In [33]:
def two_2_one(df):
  """Recode FAN_TYPE into a binary flag without modifying the input frame.

  Rows whose FAN_TYPE equals 2 become 1; every other row becomes 0.

  Args:
      df: DataFrame containing a 'FAN_TYPE' column.

  Returns:
      A copy of `df` whose 'FAN_TYPE' column holds the recoded values, cast
      back to the dtype the column had on input.
  """
  recoded = df.copy()
  is_type_2 = df['FAN_TYPE'] == 2
  # Go through int64 first so the flag is 1/0 (not True/False) before the
  # column is cast back to its original dtype.
  recoded['FAN_TYPE'] = is_type_2.astype('int64').astype(df['FAN_TYPE'].dtype)

  return recoded

In [34]:
# Collapse FAN_TYPE to a binary flag (2 -> 1, everything else -> 0).
# NOTE: running this cell a second time maps every row to 0, because no value equals 2 any more.
df_train_FAN_TYPE = two_2_one(df_train_FAN_TYPE)
df_test_FAN_TYPE = two_2_one(df_test_FAN_TYPE)

In [35]:
# Prepend the binary FAN_TYPE column to the feature tables.
# NOTE(review): axis=1 concat aligns on the index — presumably both sides carry the default 0..n-1 index; verify.
train_mean = pd.concat([df_train_FAN_TYPE, train_mean], axis=1)
test_mean = pd.concat([df_test_FAN_TYPE, test_mean], axis=1)

In [36]:
# Partition the train and test feature tables by the binary fan-type flag.
train_type_0 = train_mean[train_mean['FAN_TYPE'] == 0]
train_type_1 = train_mean[train_mean['FAN_TYPE'] == 1]

test_type_0 = test_mean[test_mean['FAN_TYPE'] == 0]
test_type_1 = test_mean[test_mean['FAN_TYPE'] == 1]

In [37]:
# Drop the grouping column by rebinding instead of mutating the filtered frames
# in place; errors='ignore' keeps the cell re-runnable once the column is gone.
train_type_0 = train_type_0.drop(columns='FAN_TYPE', errors='ignore')
train_type_1 = train_type_1.drop(columns='FAN_TYPE', errors='ignore')
test_type_0 = test_type_0.drop(columns='FAN_TYPE', errors='ignore')
test_type_1 = test_type_1.drop(columns='FAN_TYPE', errors='ignore')

In [38]:
# Remember the row labels of each test subset before scaling.
list_test_0_index = list(test_type_0.index)
list_test_1_index = list(test_type_1.index)

In [41]:
# Standardize each fan-type group with statistics fitted on that group's training rows.
# A single scaler object is reused, so statement order matters: the type-0 test
# rows must be transformed before the scaler is refitted on the type-1 training rows.
scaler = StandardScaler()

# Fit on type-0 training rows, then apply the same statistics to type-0 test rows.
scaled_train_type_0 = scaler.fit_transform(train_type_0)
scaled_test_type_0 = scaler.transform(test_type_0)

# Refit on type-1 training rows (this discards the type-0 statistics).
scaled_train_type_1 = scaler.fit_transform(train_type_1)
scaled_test_type_1 = scaler.transform(test_type_1)

In [42]:
# Wrap the scaled outputs in new DataFrames; no index is passed, so each frame gets a fresh default index.
train_type_0 = pd.DataFrame(scaled_train_type_0)
train_type_1 = pd.DataFrame(scaled_train_type_1)

test_type_0 = pd.DataFrame(scaled_test_type_0)
test_type_1 = pd.DataFrame(scaled_test_type_1)

In [43]:
# Reattach the saved row labels to the scaled test subsets (the training subsets keep their default index).
test_type_0.index = list_test_0_index
test_type_1.index = list_test_1_index

In [44]:
# Recombine both fan-type groups and shuffle the rows.
# NOTE(review): sample(frac=1) is given no random_state, so the order presumably depends on the global NumPy seed — confirm.
train_mean = pd.concat([train_type_0, train_type_1], axis=0).sample(frac=1)
test_mean = pd.concat([test_type_0, test_type_1], axis=0).sample(frac=1)

In [45]:
# Preview the scaled, shuffled training features.
train_mean.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
301,0.364857,0.365368,0.032132,-0.216262,-1.517586,0.108418,-1.229697,-0.482914,1.110092,-0.737181,...,0.995086,1.111533,-0.984323,1.492683,0.232445,-0.460429,-0.720559,-2.619105,0.803335,0.294926
602,-0.167857,-0.258583,0.483605,-0.502193,0.06176,0.220245,1.243183,1.433317,0.161966,0.123262,...,0.469356,0.973106,1.232352,-0.443513,0.688837,-1.310771,-0.81839,2.751305,1.192629,1.668512
215,-1.549584,1.076044,1.048852,-0.322436,0.939959,0.052006,-0.032668,0.453038,-0.90528,0.484523,...,-0.630069,-0.627409,0.169338,-0.3983,0.230314,0.66098,-0.025067,0.002382,-0.793084,-0.430481
361,0.396813,-0.175353,0.568949,-0.220057,0.811756,1.010279,0.980538,1.132334,0.550156,1.078653,...,-1.352394,-0.025324,0.684899,-1.28097,-1.274932,-1.038861,-1.356441,-0.664401,-0.245658,0.834869
407,-0.112066,0.788206,0.444398,0.426954,0.590016,0.078867,0.176109,0.46274,0.271171,0.486583,...,0.075424,-0.147753,-0.670601,-1.399189,-0.1382,0.623872,-1.075272,-0.600172,0.700387,0.484894


In [46]:
# Put the shuffled test rows back into ascending index order, then preview them.
test_mean = test_mean.sort_index()
test_mean.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,1.3384,-2.5698,2.051633,-1.501721,0.442548,-0.966467,-2.178181,-0.304918,-1.989249,0.260554,...,1.43881,0.136429,0.535596,-0.731663,0.382937,0.698665,-1.028586,0.69189,0.544138,0.144299
1,-0.718562,0.428065,0.62321,0.431034,0.805687,0.382957,0.129573,0.209586,-0.023102,0.722876,...,-0.788645,-0.367236,-0.334699,-1.161288,-0.268045,1.403638,-0.081697,-0.8502,0.008999,-0.081465
2,0.972004,2.00491,0.025305,-1.176084,0.471019,-1.244178,-0.442965,0.589031,-0.559023,1.196105,...,0.791017,-0.110557,-0.147829,-0.629898,0.819616,0.719169,1.034276,-0.010646,-2.357399,0.62583
3,0.52809,-1.549946,4.734266,-2.035259,0.020047,3.674541,-0.012407,2.565498,0.50672,1.133868,...,-0.27612,-0.146947,2.347306,0.924715,-0.365262,-0.343103,-1.048537,0.373566,-0.399537,-1.568171
4,-0.069009,2.239444,0.160408,-1.290154,0.354189,-1.194461,-0.251774,0.671878,-0.587287,0.728987,...,-0.879554,-0.647192,0.272658,-0.470288,1.267057,2.341174,1.656096,0.492615,-0.897662,0.799759


## PCA

In [116]:
# NOTE(review): n_components=128 equals the number of MFCC features, so every component is kept
# (no dimensionality reduction) — confirm that is intended.
pca = PCA(n_components=128)

In [117]:
# Fit PCA on the training features only, then project the test features with the same fitted components.
pca_train_mean = pca.fit_transform(train_mean)
pca_test_mean = pca.transform(test_mean)

In [105]:
# Sanity check: rows and components of the projected training set.
print(len(pca_train_mean))
print(len(pca_train_mean[0]))

1279
128


In [106]:
# Sanity check: rows and components of the projected test set.
print(len(pca_test_mean))
print(len(pca_test_mean[0]))

1514
128


# Modeling

## Model Fit

In [107]:
# Isolation Forest with 1000 trees, 256 samples drawn per tree, and a fixed seed.
# contamination='auto' leaves the decision threshold to the library's default rule.
model = IsolationForest(n_estimators=1000,
                        max_samples=256,
                        contamination='auto',
                        random_state=41,
                        verbose=1)

In [108]:
# Fit the forest on the PCA-projected training features (no labels are passed).
model.fit(pca_train_mean)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.7s finished


IsolationForest(max_samples=256, n_estimators=1000, random_state=41, verbose=1)

## Prediction

In [109]:
def get_pred_label(model_pred):
    """Convert IsolationForest predictions to submission labels.

    IsolationForest outputs 1 for normal and -1 for anomalous samples; the
    returned labels use 0 for normal and 1 for anomalous. Any other value is
    passed through unchanged.

    Args:
        model_pred: Array-like of predictions (ndarray, list, tuple, ...).

    Returns:
        numpy.ndarray with the same shape as the input holding the remapped labels.
    """
    # Coerce first: comparing a plain list with `== 1` yields a single bool,
    # which would make np.where relabel every element the same way.
    model_pred = np.asarray(model_pred)
    is_normal = model_pred == 1
    is_anomaly = model_pred == -1
    return np.where(is_normal, 0, np.where(is_anomaly, 1, model_pred))

In [110]:
# Score the PCA-projected test features and map the raw outputs to 0/1 labels in one step.
test_pred = get_pred_label(model.predict(pca_test_mean))

# Submission

In [111]:
# Load the submission template shipped with the dataset.
submit = pd.read_csv('./sample_submission.csv')

In [112]:
# Fill in the predicted labels; the array is assigned by position.
# NOTE(review): assumes test_pred is in the same row order as sample_submission.csv — TODO confirm.
submit['LABEL'] = test_pred
submit.head()

Unnamed: 0,SAMPLE_ID,LABEL
0,TEST_0000,0
1,TEST_0001,0
2,TEST_0002,1
3,TEST_0003,1
4,TEST_0004,1


In [113]:
# Count how many test samples were assigned to each label.
submit['LABEL'].value_counts()

1    779
0    735
Name: LABEL, dtype: int64

In [91]:
# Write the submission file to Google Drive (without the index column).
# NOTE(review): the file name says "PCA_100" while the PCA cell uses n_components=128 — confirm which run this file belongs to.
save_path = '/content/drive/MyDrive/머신러닝 엔지니어링/데이콘/기계 고장 진단/submission/'
submit.to_csv(save_path + 'MFCC_Standard_PCA_100.csv', index=False)