In [2]:
import numpy as np # for general handling of numerical data
import pandas as pd # for dataframes
import librosa
import librosa.display as dis
import matplotlib.pyplot as plt
import scipy
import csv
import os
from sklearn.svm import OneClassSVM
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn import metrics
from scipy import signal 
np.random.seed(10)
from sklearn.covariance import EllipticEnvelope 
from sklearn.metrics import average_precision_score
outliers_fraction = 0.25
from sklearn.metrics import roc_auc_score
from window_slider import Slider

In [3]:
# builds the csv column names: filename, 20 cwt features, 10 stft features, label
def create_header_with_label():
    """Return the list of column names used by the feature CSV files.

    The list is 'filename', then 'cwt1' .. 'cwt20', then 'stft1' .. 'stft10',
    and finally 'label' (32 entries in total).
    """
    cwt_columns = [f'cwt{k}' for k in range(1, 21)]
    stft_columns = [f'stft{k}' for k in range(1, 11)]
    return ['filename', *cwt_columns, *stft_columns, 'label']

In [4]:
# writes a fresh csv file called `name` whose only row is `header`
def create_csv(name,header):
    """Create (or overwrite) the CSV file `name` so that it holds just the header row.

    name   -- path of the file to write
    header -- sequence of column names written as the first (and only) row

    Returns `name` unchanged so the call can be used inline.
    """
    with open(name, 'w', newline='') as out:
        csv.writer(out).writerow(header)
    return name

In [5]:
# extracts cwt/stft features from every audio file in `path` and writes them into a csv file
def write_data_2_csv_from_wav(path,name):
    """Write one feature row per audio file found directly inside `path` to the CSV `name`.

    path -- directory holding the audio files; hidden entries (leading '.')
            and sub-directories are skipped
    name -- path of the CSV file; it is (re)created with the header from
            create_header_with_label() and then filled row by row

    Each row is: filename, the mean of the 4th power of every CWT row
    (20 wavelet widths), the mean of the 4th power of every STFT magnitude
    row, and the label (1 if the filename starts with 'ab', otherwise 0).
    Returns None.
    """
    header = create_header_with_label()
    create_csv(name,header)
    # one CWT row per width -> 20 values, matching the cwt1..cwt20 columns
    widths = np.arange(1, 21)
    # Open the output once instead of once per row.
    with open(name, 'a', newline='') as out:
        writer = csv.writer(out)
        for filename in os.listdir(path):
            # Use the same joined path for the isfile check and for loading, so a
            # `path` without a trailing separator still resolves correctly.
            file = os.path.join(path, filename)
            if filename.startswith('.') or not os.path.isfile(file):
                continue
            y, _ = librosa.load(file)
            # extract stft matrix; with n_fft = 19 librosa yields 1 + 19 // 2 = 10
            # frequency rows, matching the stft1..stft10 columns
            stftmatr = np.abs(librosa.stft(y, n_fft = 19))
            stftmatr_square = np.square(np.square(stftmatr))
            # extract cwt matrix
            # NOTE(review): signal.cwt / signal.ricker are deprecated in recent SciPy
            # releases -- confirm the installed SciPy version still provides them.
            cwtmatr = signal.cwt(y, signal.ricker, widths)
            cwtmatr_square = np.square(np.square(cwtmatr))
            # Build the row as a list: joining with spaces and splitting again would
            # break the columns apart for any filename that contains whitespace.
            row = [filename]
            row.extend(f'{np.mean(c)}' for c in cwtmatr_square)
            row.extend(f'{np.mean(s)}' for s in stftmatr_square)
            # label abnormal data with 1 and normal data with 0
            row.append('1' if filename.startswith('ab') else '0')
            writer.writerow(row)

In [6]:
# implements sliding window
# extracts features from each window of each file and writes them into a csv file
def write_data_2_csv_from_wav_slider(path,name):
    """Write one feature row per sliding window of every audio file in `path` to the CSV `name`.

    path -- directory holding the audio files; hidden entries (leading '.')
            and sub-directories are skipped
    name -- path of the CSV file; it is (re)created with the header from
            create_header_with_label() and then filled row by row

    Every file is cut by window_slider.Slider into windows of round(sr) samples
    (one second of audio) overlapping by round(sr/2) samples, where sr is the
    sampling rate returned by librosa.load. Each window produces a row made of
    the filename, the mean of the 4th power of every CWT row, the mean of the
    4th power of every STFT magnitude row, and the label (1 if the filename
    starts with 'ab', otherwise 0). Returns None.
    """
    header = create_header_with_label()
    create_csv(name,header)
    # one CWT row per width -> 20 values, matching the cwt1..cwt20 columns
    widths = np.arange(1, 21)
    # Open the output once instead of once per window.
    with open(name, 'a', newline='') as out:
        writer = csv.writer(out)
        for filename in os.listdir(path):
            # Use the same joined path for the isfile check and for loading, so a
            # `path` without a trailing separator still resolves correctly.
            file = os.path.join(path, filename)
            if filename.startswith('.') or not os.path.isfile(file):
                continue
            y, sr = librosa.load(file)
            size_overlap = round(sr/2)
            size_window = round(sr)
            slider = Slider(size_window,size_overlap)
            slider.fit(y)
            # the label depends only on the file, so compute it once per file
            label = '1' if filename.startswith('ab') else '0'
            while True:
                window = slider.slide()
                # with n_fft = 19 librosa yields 1 + 19 // 2 = 10 frequency rows
                stftmatr = np.abs(librosa.stft(window, n_fft = 19))
                stftmatr_square = np.square(np.square(stftmatr))
                # NOTE(review): signal.cwt / signal.ricker are deprecated in recent SciPy
                # releases -- confirm the installed SciPy version still provides them.
                cwtmatr = signal.cwt(window, signal.ricker, widths)
                cwtmatr_square = np.square(np.square(cwtmatr))
                # Build the row as a list: joining with spaces and splitting again would
                # break the columns apart for any filename that contains whitespace.
                row = [filename]
                row.extend(f'{np.mean(c)}' for c in cwtmatr_square)
                row.extend(f'{np.mean(s)}' for s in stftmatr_square)
                row.append(label)
                writer.writerow(row)
                if slider.reached_end_of_list():
                    break

In [8]:
# reads a feature csv and separates the pure feature columns from the full table
def load_csv(path):
    """Load the feature CSV at `path`.

    Returns a tuple (data, compare):
    data    -- the table without the 'filename' and 'label' columns
    compare -- the complete table as read, kept so that predictions can be
               compared against 'label' when the AUC is computed
    """
    compare = pd.read_csv(path)
    data = compare.drop(columns=['filename', 'label'])
    return data, compare
    

In [9]:
# standardizes the feature table
def preprocess_data(data):
    """Return `data` converted to a float array and transformed by a StandardScaler fitted on it."""
    features = np.array(data, dtype = float)
    return StandardScaler().fit_transform(features)

In [10]:
# one class support vector machine implementation with sklearn + auc score
def train_OCSVM_AUC(data):
    """Fit a One-Class SVM on `data`, flag anomalies and print the resulting AUC.

    Reads the module-level `outliers_fraction` and `compare`. Adds the column
    'anomalyOCSVM' (0 = normal, 1 = anomaly) to `compare`, prints its value
    counts and then the area under the ROC curve of those flags against
    compare['label']. Returns None.
    """
    # alternative noted in the original code: nu=0.95 * outliers_fraction + 0.05
    svm = OneClassSVM(nu=0.95 * outliers_fraction, gamma=0.1)
    svm.fit(data)
    # predict() output is recoded so that 1 -> 0 (normal) and -1 -> 1 (anomaly)
    compare['anomalyOCSVM'] = pd.Series(svm.predict(data)).map({1: 0, -1: 1})
    print(compare['anomalyOCSVM'].value_counts())
    scored = compare.drop(columns=['filename'])
    truth = scored['label'].to_numpy()
    flagged = scored['anomalyOCSVM'].to_numpy()
    # NOTE(review): the ROC curve is built from hard 0/1 flags, not from
    # continuous decision scores -- confirm this is the intended metric.
    false_pos, true_pos, _ = metrics.roc_curve(truth, flagged)
    print(metrics.auc(false_pos, true_pos))

In [11]:
# isolation forest implementation with sklearn + auc score
def train_IF_AUC(data):
    """Fit an Isolation Forest on `data`, flag anomalies and print the resulting AUC.

    Reads the module-level `outliers_fraction` and `compare`. Adds the column
    'anomalyIF' (0 = normal, 1 = anomaly) to `compare`, prints its value counts
    and then the area under the ROC curve of those flags against
    compare['label']. Returns None.
    """
    # dedicated, fixed-seed generator for the forest
    forest = IsolationForest(
        contamination = outliers_fraction,
        max_samples=100,
        random_state=np.random.RandomState(10),
    )
    forest.fit(data)
    # predict() output is recoded so that 1 -> 0 (normal) and -1 -> 1 (anomaly)
    compare['anomalyIF'] = pd.Series(forest.predict(data)).map({1: 0, -1: 1})
    print(compare['anomalyIF'].value_counts())
    scored = compare.drop(columns=['filename'])
    truth = scored['label'].to_numpy()
    flagged = scored['anomalyIF'].to_numpy()
    # NOTE(review): the ROC curve is built from hard 0/1 flags, not from
    # continuous anomaly scores -- confirm this is the intended metric.
    false_pos, true_pos, _ = metrics.roc_curve(truth, flagged)
    print(metrics.auc(false_pos, true_pos))

In [12]:
# k-means implementation with sklearn + auc score
def getDistanceByPoint(data, model):
    """Return each row's Euclidean distance to the centre of its assigned cluster.

    data  -- DataFrame of points; row k must correspond to model.labels_[k]
    model -- fitted clustering model exposing `cluster_centers_` and `labels_`

    Returns a float Series indexed like `data`.
    """
    if len(data) == 0:
        return pd.Series([], index=data.index, dtype=float)
    points = np.asarray(data, dtype=float)
    # labels_ are 0-based indices into cluster_centers_, so they are used as-is
    # (subtracting 1 would measure every point against a different centre).
    assigned = np.asarray(model.labels_, dtype=int)[:len(points)]
    centres = np.asarray(model.cluster_centers_, dtype=float)[assigned]
    # Positional, vectorised computation: works for any index on `data`.
    distance = np.linalg.norm(points - centres, axis=1)
    return pd.Series(distance, index=data.index)
def train_kmeans_AUC(data):
    """Flag anomalies by distance to the k-means centre in PCA space and print the AUC.

    Reads the module-level `outliers_fraction` and `compare`. `data` is reduced
    to two principal components, which are then standardized. Adds these
    columns to `compare`:
      'cluster'             -- assignment from the 2-cluster model
      'principal_feature1'  -- first standardized principal component
      'principal_feature2'  -- second standardized principal component
      'anomalyKmeans'       -- 1 for the `outliers_fraction` share of points
                               farthest from the 1-cluster centre, else 0
    Prints the value counts of 'cluster' and then the area under the ROC curve
    of 'anomalyKmeans' against compare['label']. Returns None.
    """
    components = PCA(n_components=2).fit_transform(data)
    # standardize these 2 new features
    scaled = preprocessing.StandardScaler().fit_transform(components)
    data = pd.DataFrame(scaled)

    # Only the 1-cluster and 2-cluster models are used below, so fit just those
    # two (in the same order as before) instead of every k in 1..49. This also
    # lets the function run on data sets with fewer than 49 samples.
    single_cluster = KMeans(n_clusters=1).fit(data)
    two_clusters = KMeans(n_clusters=2).fit(data)

    compare['cluster'] = two_clusters.predict(data)
    compare['principal_feature1'] = data[0]
    compare['principal_feature2'] = data[1]
    print(compare['cluster'].value_counts())

    # NOTE(review): distances use the 1-cluster model (distance to the overall
    # centre) while 'cluster' comes from the 2-cluster model -- confirm intended.
    distance = getDistanceByPoint(data, single_cluster)
    number_of_outliers = int(outliers_fraction*len(distance))
    # smallest distance among the `number_of_outliers` farthest points
    threshold = distance.nlargest(number_of_outliers).min()
    # anomalyKmeans holds the result of the cluster method (0:normal, 1:anomaly)
    compare['anomalyKmeans'] = (distance >= threshold).astype(int)

    y = compare['label'].to_numpy()
    pred = compare['anomalyKmeans'].to_numpy()
    fpr, tpr, _ = metrics.roc_curve(y, pred)
    print(metrics.auc(fpr, tpr))
    

In [18]:
# elliptic envelope implementation with sklearn + auc score
def train_EE_AUC(data):
    """Fit an Elliptic Envelope on `data`, flag anomalies and print the resulting AUC.

    Reads the module-level `outliers_fraction` and `compare`. Adds the column
    'anomalyEE' (0 = normal, 1 = anomaly) to `compare`, prints its value counts
    and then the area under the ROC curve of those flags against
    compare['label']. Returns None.
    """
    # support_fraction is a proportion of the samples, so it must not exceed 1;
    # the previous value of 1.5 is rejected by scikit-learn releases that
    # validate estimator parameters.
    # NOTE(review): 1.0 keeps every sample in the support, which appears to be
    # what 1.5 effectively did on older releases -- confirm results are unchanged.
    model = EllipticEnvelope(random_state=0, contamination = outliers_fraction, support_fraction = 1.0)
    model.fit(data)
    # predict() output is recoded so that 1 -> 0 (normal) and -1 -> 1 (anomaly)
    compare['anomalyEE'] = pd.Series(model.predict(data))
    compare['anomalyEE'] = compare['anomalyEE'].map( {1: 0, -1: 1} )
    print(compare['anomalyEE'].value_counts())
    y = compare['label'].to_numpy()
    pred = compare['anomalyEE'].to_numpy()
    fpr, tpr, _ = metrics.roc_curve(y, pred)
    print(metrics.auc(fpr, tpr))



In [27]:
# extract features and run the ml algorithms
header = create_header_with_label()
name = create_csv("data_id_06_slide.csv",header)
# the writer below recreates the header itself before appending the feature rows
write_data_2_csv_from_wav_slider("Dataset/id_06/",name)
data, compare = load_csv("data_id_06_slide.csv")
processed_data = preprocess_data(data)
# each detector is announced by a separator line and its own banner
for banner, train in (
    ("____OCSVM____", train_OCSVM_AUC),
    ("______IF_____", train_IF_AUC),
    ("______EE_____", train_EE_AUC),
    ("____KMeans___", train_kmeans_AUC),
):
    print("_____________")
    print(banner)
    train(processed_data)
print("_____________")

_____________
____OCSVM____
0    9500
1    2960
Name: anomalyOCSVM, dtype: int64
0.6875
_____________
______IF_____
0    9345
1    3115
Name: anomalyIF, dtype: int64
0.6556647940074907
_____________
______EE_____




0    9345
1    3115
Name: anomalyEE, dtype: int64
0.72874531835206
_____________
____KMeans___
0    9747
1    2713
Name: cluster, dtype: int64
0.6166666666666666
_____________


In [25]:
# load a file with already extracted features and run the ml algorithms
data, compare = load_csv("Extracted Data/MIMII/data_id_02_20_10.csv")
processed_data = preprocess_data(data)
# each detector is announced by a separator line and its own banner
for banner, train in (
    ("____OCSVM____", train_OCSVM_AUC),
    ("______IF_____", train_IF_AUC),
    ("______EE_____", train_EE_AUC),
    ("____KMeans___", train_kmeans_AUC),
):
    print("_____________")
    print(banner)
    train(processed_data)
print("_____________")

_____________
____OCSVM____
0    1016
1     319
Name: anomalyOCSVM, dtype: int64
0.6924157303370787
_____________
______IF_____
0    1001
1     334
Name: anomalyIF, dtype: int64
0.7673220973782773
_____________
______EE_____




0    1001
1     334
Name: anomalyEE, dtype: int64
0.7556179775280898
_____________
____KMeans___
1    690
0    645
Name: cluster, dtype: int64
0.8122659176029963
_____________


In [20]:
# remove the extracted features and the k-means helper columns for a compact overview
feature_columns = [f"cwt{k}" for k in range(1, 21)] + [f"stft{k}" for k in range(1, 11)]
helper_columns = ["cluster", "principal_feature1", "principal_feature2"]
anomalies = compare.drop(columns=feature_columns + helper_columns)

In [21]:
# keep only the rows that the isolation forest flagged as anomalies
flagged_by_if = anomalies["anomalyIF"] == 1
find_file = anomalies[flagged_by_if]

In [22]:
# shows the rows of `anomalies` where anomalyIF == 1 (the flagged sections)
find_file

Unnamed: 0,filename,label,anomalyOCSVM,anomalyIF,anomalyEE,anomalyKmeans
140,ab_000000058.wav,1,1,1,1,1
141,ab_000000058.wav,1,1,1,1,1
142,ab_000000058.wav,1,1,1,1,1
143,ab_000000058.wav,1,1,1,1,1
144,ab_000000058.wav,1,1,1,1,1
...,...,...,...,...,...,...
28475,ab_000000096.wav,1,1,1,1,1
28476,ab_000000096.wav,1,1,1,1,1
28477,ab_000000096.wav,1,1,1,1,1
28478,ab_000000096.wav,1,1,1,1,1
