# Preprocess Files

In [1]:
import librosa
import matplotlib.pyplot as plt
import os
import preprocessing
import soundfile

In [2]:
testSet = 'TestSet//'
trainSet = 'TrainSet//'

In [3]:
testSave = 'TestSetPrepped//'
trainSave = 'TrainSetPrepped//'

In [4]:
for file in os.listdir(testSet):
    if file.endswith('.wav'):
        file_path = testSet+file
        signal, sr = librosa.load(file_path)
        signal = preprocessing.endpoint_detection(signal)
        signal = preprocessing.normalization(signal)
        file_save_path = testSave+file
        soundfile.write(file_save_path, signal, sr)

In [5]:
for file in os.listdir(trainSet):
    if file.endswith('.wav'):
        file_path = trainSet+file
        signal, sr = librosa.load(file_path)
        signal = preprocessing.endpoint_detection(signal)
        signal = preprocessing.normalization(signal)
        file_save_path = trainSave+file
        soundfile.write(file_save_path, signal, sr)

# Extract Features

In [6]:
import FeatureExtract
import pandas as pd
import parselmouth

In [7]:
testSet = 'TestSetPrepped//'
trainSet = 'TrainSetPrepped//'

In [8]:
def extract_feature_df(filepath, start=0):
    feat_df = pd.DataFrame()
    for file in os.listdir(filepath):
        filename = os.fsdecode(file)
        if filename.endswith(".wav"):
            y, sr = librosa.load(filepath+filename)
            dur = librosa.get_duration(y=y, sr=sr)
            f_id = filename.split('.')[0]
            path = filepath+file
            report = FeatureExtract.get_report(path, start=start, end=dur)
            feats = FeatureExtract.get_feats(report, f_id)
            feat_df = pd.concat([feat_df, feats])
    return feat_df

In [9]:
test_df = extract_feature_df(testSet) 
test_df = test_df.drop(['Median pitch', 'Mean pitch', 'Minimum pitch', 'Maximum pitch'], axis=1)

In [10]:
train_df = extract_feature_df(trainSet) 
train_df = train_df.drop(['Median pitch', 'Mean pitch', 'Minimum pitch', 'Maximum pitch'], axis=1)

In [11]:
train_df.shape

(159, 23)

In [12]:
train_df.head()

Unnamed: 0,name,Standard deviation,Number of pulses,Number of periods,Mean period,Standard deviation of period,Fraction of locally unvoiced frames,Number of voice breaks,Degree of voice breaks,Jitter (local),...,Jitter (ddp),Shimmer (local),"Shimmer (local, dB)",Shimmer (apq3),Shimmer (apq5),Shimmer (apq11),Shimmer (dda),Mean autocorrelation,Mean noise-to-harmonics ratio,Mean harmonics-to-noise ratio
0,1047-a_n,3.584,358.0,357.0,0.004759,8.7e-05,0.0,0.0,0.0,0.828,...,1.498,1.165,0.101,0.573,0.71,1.314,1.719,0.990686,0.009443,21.104
0,1047-i_n,1.813,417.0,416.0,0.004627,4e-05,0.0,0.0,0.0,0.165,...,0.243,1.256,0.109,0.415,0.712,1.772,1.244,0.999518,0.000483,34.82
0,1047-u_n,2.307,294.0,293.0,0.004723,5.3e-05,0.0,0.0,0.0,0.328,...,0.578,1.9,0.166,0.969,1.127,1.831,2.907,0.998707,0.001295,29.423
0,1048-a_n,31.825,232.0,228.0,0.005992,0.001424,6.0,3.0,9.149,1.269,...,1.858,3.604,0.37,1.678,2.092,2.794,5.035,0.947451,0.073036,22.139
0,1048-i_n,6.612,279.0,278.0,0.004413,0.000132,0.0,0.0,0.0,0.452,...,0.613,2.452,0.206,1.266,1.267,2.177,3.797,0.996343,0.003687,26.142


In [13]:
train_df.columns

Index(['name', 'Standard deviation', 'Number of pulses', 'Number of periods',
       'Mean period', 'Standard deviation of period',
       'Fraction of locally unvoiced frames', 'Number of voice breaks',
       'Degree of voice breaks', 'Jitter (local)', 'Jitter (local, absolute)',
       'Jitter (rap)', 'Jitter (ppq5)', 'Jitter (ddp)', 'Shimmer (local)',
       'Shimmer (local, dB)', 'Shimmer (apq3)', 'Shimmer (apq5)',
       'Shimmer (apq11)', 'Shimmer (dda)', 'Mean autocorrelation',
       'Mean noise-to-harmonics ratio', 'Mean harmonics-to-noise ratio'],
      dtype='object')

# Add Demographic Data

In [14]:
test_meta = pd.read_csv('testSet.csv', index_col=0)

test_df[['RecordingID', 'RestOfName']] = test_df['name'].str.split('-',expand=True)
test_df['Sound'] = test_df['RestOfName'].str.split('_',expand=True)[0]
test_df['RecordingID'] = test_df['RecordingID'].astype('int')

test_df = test_df.drop('name', axis=1)
test_df = test_df.drop('RestOfName', axis=1)
test_df = test_df.reset_index(drop=True)

test_df = pd.merge(test_df, test_meta, how='left', on = 'RecordingID')

In [15]:
train_meta = pd.read_csv('trainSet.csv', index_col=0)

train_df[['RecordingID', 'RestOfName']] = train_df['name'].str.split('-',expand=True)
train_df['Sound'] = train_df['RestOfName'].str.split('_',expand=True)[0]
train_df['RecordingID'] = train_df['RecordingID'].astype('int')

train_df = train_df.drop('name', axis=1)
train_df = train_df.drop('RestOfName', axis=1)
train_df = train_df.reset_index(drop=True)

train_df = pd.merge(train_df, train_meta, how='left', on = 'RecordingID')

In [16]:
test_df.shape

(69, 30)

In [17]:
train_df.shape

(159, 30)

In [18]:
train_df.to_csv('train_features2.csv')

In [19]:
test_df.to_csv('test_features2.csv')

# Prepare for Classification

Train Data

In [20]:
X_train = train_df.drop(['RecordingID', 'Sound', 'Type', 'Date', 'SubjectID', 'Sex', 'Age', 'Pathology'], axis=1)
y_train = train_df['Type']

In [21]:
X_train = X_train.dropna()

In [22]:
df_index = X_train.index.values.tolist()
y_train = y_train.iloc[df_index]

In [23]:
print(X_train.shape)
print(y_train.shape)

(157, 22)
(157,)


Test Data

In [24]:
X_test = test_df.drop(['RecordingID', 'Sound', 'Type', 'Date', 'SubjectID', 'Sex', 'Age', 'Pathology'], axis=1)
y_test = test_df['Type']

In [25]:
X_test = X_test.dropna()

In [26]:
df_index = X_test.index.values.tolist()
y_test = y_test.iloc[df_index]

In [27]:
print(X_test.shape)
print(y_test.shape)

(66, 22)
(66,)


Results

In [28]:
results = test_df[['RecordingID', 'Sound', 'Type', 'Date', 'SubjectID', 'Sex', 'Age', 'Pathology']].copy()

In [29]:
results.head()

Unnamed: 0,RecordingID,Sound,Type,Date,SubjectID,Sex,Age,Pathology
0,100,a,n,19/01/1998,117,m,66,Healthy
1,100,i,n,19/01/1998,117,m,66,Healthy
2,100,u,n,19/01/1998,117,m,66,Healthy
3,1005,a,n,29/10/1998,804,m,43,Healthy
4,1005,i,n,29/10/1998,804,m,43,Healthy


# Build Decision Tree

In [30]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
import pickle
import matplotlib.pyplot as plt
import numpy as np
import EvaluateModel
from sklearn.model_selection import GridSearchCV

In [31]:
np.random.seed(42)

In [32]:
clf = DecisionTreeClassifier(criterion='entropy', max_depth=7, random_state=42)

In [33]:
import time
start_time = time.time()
clf = clf.fit(X_train,y_train)
print("--- %s seconds ---" % (time.time() - start_time))

--- 0.0040035247802734375 seconds ---


In [34]:
y_pred = clf.predict(X_test)

In [35]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7424242424242424


In [36]:
print(metrics.confusion_matrix(y_test, y_pred))

[[21  9]
 [ 8 28]]


In [37]:
print('precision -', metrics.precision_score(y_test, y_pred, pos_label='p'))
print('recall -', metrics.recall_score(y_test, y_pred, pos_label='p'))

precision - 0.7567567567567568
recall - 0.7777777777777778


In [38]:
pickle.dump(clf, open("DT2.sav", 'wb'))

In [39]:
results, _, _, _ = EvaluateModel.rec_results(clf, X_test, y_test, results)

In [40]:
results = results.rename(columns={'Pred':'DT_pred'})

# Build Random Forest

In [41]:
from sklearn.ensemble import RandomForestClassifier

In [42]:
clf=RandomForestClassifier(n_estimators=45, random_state=42)

In [43]:
import time
start_time = time.time()
clf.fit(X_train,y_train)
print("--- %s seconds ---" % (time.time() - start_time))#Train the model using the training sets y_pred=clf.predict(X_test)

--- 0.07683372497558594 seconds ---


In [44]:
y_pred=clf.predict(X_test)

In [45]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.803030303030303


In [46]:
print(metrics.confusion_matrix(y_test, y_pred))

[[22  8]
 [ 5 31]]


In [47]:
print('precision -', metrics.precision_score(y_test, y_pred, pos_label='p'))
print('recall -', metrics.recall_score(y_test, y_pred, pos_label='p'))

precision - 0.7948717948717948
recall - 0.8611111111111112


In [48]:
pickle.dump(clf, open("RF2.sav", 'wb'))

In [49]:
results, _, _, _ = EvaluateModel.rec_results(clf, X_test, y_test, results)

In [50]:
results = results.rename(columns={'Pred':'RF_pred'})

# Build SVM

In [51]:
from sklearn.svm import SVC

In [52]:
clf = SVC(C=100, gamma=1, kernel='linear')

In [53]:
import time
start_time = time.time()
clf.fit(X_train, y_train)
print("--- %s seconds ---" % (time.time() - start_time))

--- 8.193437099456787 seconds ---


In [54]:
y_pred=clf.predict(X_test)

In [55]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.5909090909090909


In [56]:
print(metrics.confusion_matrix(y_test, y_pred))

[[22  8]
 [19 17]]


In [57]:
print('precision -', metrics.precision_score(y_test, y_pred, pos_label='p'))
print('recall -', metrics.recall_score(y_test, y_pred, pos_label='p'))

precision - 0.68
recall - 0.4722222222222222


In [58]:
pickle.dump(clf, open("SVM2.sav", 'wb'))

In [59]:
results, _, _, _ = EvaluateModel.rec_results(clf, X_test, y_test, results)

In [60]:
results = results.rename(columns={'Pred':'SVM_pred'})

# Build Logistic Regression

In [61]:
from sklearn.linear_model import LogisticRegression

In [62]:
clf=LogisticRegression(random_state=42)

In [63]:
import time
start_time = time.time()
#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)
print("--- %s seconds ---" % (time.time() - start_time))#Train the model using the training sets y_pred=clf.predict(X_test)

--- 0.015233278274536133 seconds ---


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [64]:
y_pred=clf.predict(X_test)

In [65]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.6212121212121212


In [66]:
print(metrics.confusion_matrix(y_test, y_pred))

[[23  7]
 [18 18]]


In [67]:
print('precision -', metrics.precision_score(y_test, y_pred, pos_label='p'))
print('recall -', metrics.recall_score(y_test, y_pred, pos_label='p'))

precision - 0.72
recall - 0.5


In [68]:
pickle.dump(clf, open("LR2.sav", 'wb'))

In [69]:
results, _, _, _ = EvaluateModel.rec_results(clf, X_test, y_test, results)

In [70]:
results = results.rename(columns={'Pred':'LR_pred'})

In [71]:
results.to_csv('ClassifierResults2.csv')

# Add Noise to Test Set


In [72]:
import math
import random

In [73]:
def get_noise_from_sound(signal,noise,SNR):
    RMS_s=math.sqrt(np.mean(signal**2))
    #required RMS of noise
    RMS_n=math.sqrt(RMS_s**2/(pow(10,SNR/10)))
    
    #current RMS of noise
    RMS_n_current=math.sqrt(np.mean(noise**2))
    noise=noise*(RMS_n/RMS_n_current)
    
    return noise

In [74]:
save_path = 'TestNoisy2//'
path = 'TestSetPrepped//'

In [75]:
for file in os.listdir(path):
    filename = os.fsdecode(file)
    if filename.endswith(".mp3") or filename.endswith(".wav"):
        f_id = filename.split('.')[0]
        file_type = filename.split('.')[-1]
        y, sr = librosa.load(path+filename)
        noise=np.random.normal(0, 0.1, y.shape[0])
        snr = random.choice([5, 10, 20])
        noise = get_noise_from_sound(y,noise,snr)
        y_noisy = y+noise
        
        y_noisy = preprocessing.endpoint_detection(y_noisy)
        y_noisy = preprocessing.normalization(y_noisy)
        
        soundfile.write(save_path+filename, y_noisy, sr)

# Evaluate on Noisy

In [76]:
noisy_path = 'TestNoisy2//'

In [77]:
noisy_df = extract_feature_df(noisy_path) 
noisy_df = noisy_df.drop(['Median pitch', 'Mean pitch', 'Minimum pitch', 'Maximum pitch'], axis=1)

In [78]:
noisy_df.shape

(69, 23)

In [79]:
test_meta = pd.read_csv('testSet.csv', index_col=0)

noisy_df[['RecordingID', 'RestOfName']] = noisy_df['name'].str.split('-',expand=True)
noisy_df['Sound'] = noisy_df['RestOfName'].str.split('_',expand=True)[0]
noisy_df['RecordingID'] = noisy_df['RecordingID'].astype('int')

noisy_df = noisy_df.drop('name', axis=1)
noisy_df = noisy_df.drop('RestOfName', axis=1)
noisy_df = noisy_df.reset_index(drop=True)

noisy_df = pd.merge(noisy_df, test_meta, how='left', on = 'RecordingID')

In [80]:
X_test = noisy_df.drop(['RecordingID', 'Sound', 'Type', 'Date', 'SubjectID', 'Sex', 'Age', 'Pathology'], axis=1)
y_test = noisy_df['Type']

In [81]:
X_test = X_test.dropna()

In [82]:
df_index = X_test.index.values.tolist()
y_test = y_test.iloc[df_index]

In [83]:
print(X_test.shape)
print(y_test.shape)

(67, 22)
(67,)


In [84]:
noisy_results = noisy_df[['RecordingID', 'Sound', 'Type', 'Date', 'SubjectID', 'Sex', 'Age', 'Pathology']].copy()

## Decision Tree

In [85]:
clf = pickle.load(open('DT2.sav', 'rb'))

In [86]:
y_pred=clf.predict(X_test)

In [87]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.6268656716417911


In [88]:
print(metrics.confusion_matrix(y_test, y_pred))

[[10 20]
 [ 5 32]]


In [89]:
print('precision -', metrics.precision_score(y_test, y_pred, pos_label='p'))
print('recall -', metrics.recall_score(y_test, y_pred, pos_label='p'))

precision - 0.6153846153846154
recall - 0.8648648648648649


In [90]:
noisy_results, _, _, _ = EvaluateModel.rec_results(clf, X_test, y_test, noisy_results)

In [91]:
noisy_results = noisy_results.rename(columns={'Pred':'DT_pred'})

## Random Forest

In [92]:
clf = pickle.load(open('RF2.sav', 'rb'))

In [93]:
y_pred=clf.predict(X_test)

In [94]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.6268656716417911


In [95]:
print(metrics.confusion_matrix(y_test, y_pred))

[[ 8 22]
 [ 3 34]]


In [96]:
print('precision -', metrics.precision_score(y_test, y_pred, pos_label='p'))
print('recall -', metrics.recall_score(y_test, y_pred, pos_label='p'))

precision - 0.6071428571428571
recall - 0.918918918918919


In [97]:
noisy_results, _, _, _ = EvaluateModel.rec_results(clf, X_test, y_test, noisy_results)

In [98]:
noisy_results = noisy_results.rename(columns={'Pred':'RF_pred'})

## SVM

In [99]:
clf = pickle.load(open('SVM2.sav', 'rb'))

In [100]:
y_pred=clf.predict(X_test)

In [101]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.5223880597014925


In [102]:
print(metrics.confusion_matrix(y_test, y_pred))

[[ 7 23]
 [ 9 28]]


In [103]:
print('precision -', metrics.precision_score(y_test, y_pred, pos_label='p'))
print('recall -', metrics.recall_score(y_test, y_pred, pos_label='p'))

precision - 0.5490196078431373
recall - 0.7567567567567568


In [104]:
noisy_results, _, _, _ = EvaluateModel.rec_results(clf, X_test, y_test, noisy_results)

In [105]:
noisy_results = noisy_results.rename(columns={'Pred':'SVM_pred'})

## Logistic Regression

In [106]:
clf = pickle.load(open('LR2.sav', 'rb'))

In [107]:
y_pred=clf.predict(X_test)

In [108]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.4626865671641791


In [109]:
print(metrics.confusion_matrix(y_test, y_pred))

[[ 3 27]
 [ 9 28]]


In [110]:
print('precision -', metrics.precision_score(y_test, y_pred, pos_label='p'))
print('recall -', metrics.recall_score(y_test, y_pred, pos_label='p'))

precision - 0.509090909090909
recall - 0.7567567567567568


In [111]:
noisy_results, _, _, _ = EvaluateModel.rec_results(clf, X_test, y_test, noisy_results)

In [112]:
noisy_results = noisy_results.rename(columns={'Pred':'LR_pred'})

# Apply Bandpass

In [113]:
from scipy.signal import butter, lfilter
from scipy.spatial.distance import euclidean

In [114]:
def butter_bandpass(lowcut, highcut, fs, order=5):
    nyq = 0.5 * fs
    low = lowcut / nyq
    high = highcut / nyq
    b, a = butter(order, [low, high], btype='band')
    return b, a


def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    b, a = butter_bandpass(lowcut, highcut, fs, order=order)
    y = lfilter(b, a, data)
    return y

In [115]:
save_dir = 'BandpassTestSet//'
noisy_dir = 'TestNoisy2//'

In [116]:
for file in os.listdir(noisy_dir):
    if file.endswith('.wav'):
        file_path = noisy_dir+file
        signal, sr = librosa.load(file_path)
        clean_signal = butter_bandpass_filter(signal, 10, 3500, sr, 1)
        
        clean_signal = preprocessing.endpoint_detection(clean_signal)
        clean_signal = preprocessing.normalization(clean_signal)
        
        file_save_path = save_dir+file
        soundfile.write(file_save_path, clean_signal, sr)

# Evaluate on Bandpass

In [117]:
noisy_path = 'BandpassTestSet//'

In [118]:
noisy_df = extract_feature_df(noisy_path) 
noisy_df = noisy_df.drop(['Median pitch', 'Mean pitch', 'Minimum pitch', 'Maximum pitch'], axis=1)

In [119]:
noisy_df.shape

(69, 23)

In [120]:
test_meta = pd.read_csv('testSet.csv', index_col=0)

noisy_df[['RecordingID', 'RestOfName']] = noisy_df['name'].str.split('-',expand=True)
noisy_df['Sound'] = noisy_df['RestOfName'].str.split('_',expand=True)[0]
noisy_df['RecordingID'] = noisy_df['RecordingID'].astype('int')

noisy_df = noisy_df.drop('name', axis=1)
noisy_df = noisy_df.drop('RestOfName', axis=1)
noisy_df = noisy_df.reset_index(drop=True)

noisy_df = pd.merge(noisy_df, test_meta, how='left', on = 'RecordingID')

In [121]:
X_test = noisy_df.drop(['RecordingID', 'Sound', 'Type', 'Date', 'SubjectID', 'Sex', 'Age', 'Pathology'], axis=1)
y_test = noisy_df['Type']

In [122]:
X_test = X_test.dropna()

In [123]:
df_index = X_test.index.values.tolist()
y_test = y_test.iloc[df_index]

In [124]:
print(X_test.shape)
print(y_test.shape)

(66, 22)
(66,)


In [125]:
bandpass_results = noisy_df[['RecordingID', 'Sound', 'Type', 'Date', 'SubjectID', 'Sex', 'Age', 'Pathology']].copy()

## Decision Tree

In [126]:
clf = pickle.load(open('DT2.sav', 'rb'))

In [127]:
y_pred=clf.predict(X_test)

In [128]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.6515151515151515


In [129]:
print(metrics.confusion_matrix(y_test, y_pred))

[[12 18]
 [ 5 31]]


In [130]:
print('precision -', metrics.precision_score(y_test, y_pred, pos_label='p'))
print('recall -', metrics.recall_score(y_test, y_pred, pos_label='p'))

precision - 0.6326530612244898
recall - 0.8611111111111112


In [131]:
bandpass_results, _, _, _ = EvaluateModel.rec_results(clf, X_test, y_test, bandpass_results)

In [132]:
bandpass_results = bandpass_results.rename(columns={'Pred':'DT_pred'})

## Random Forest

In [133]:
clf = pickle.load(open('RF2.sav', 'rb'))

In [134]:
y_pred=clf.predict(X_test)

In [135]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.696969696969697


In [136]:
print(metrics.confusion_matrix(y_test, y_pred))

[[12 18]
 [ 2 34]]


In [137]:
print('precision -', metrics.precision_score(y_test, y_pred, pos_label='p'))
print('recall -', metrics.recall_score(y_test, y_pred, pos_label='p'))

precision - 0.6538461538461539
recall - 0.9444444444444444


In [138]:
bandpass_results, _, _, _ = EvaluateModel.rec_results(clf, X_test, y_test, bandpass_results)

In [139]:
bandpass_results = bandpass_results.rename(columns={'Pred':'RF_pred'})

## SVM

In [140]:
clf = pickle.load(open('SVM2.sav', 'rb'))

In [141]:
y_pred=clf.predict(X_test)

In [142]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.5303030303030303


In [143]:
print(metrics.confusion_matrix(y_test, y_pred))

[[ 7 23]
 [ 8 28]]


In [144]:
print('precision -', metrics.precision_score(y_test, y_pred, pos_label='p'))
print('recall -', metrics.recall_score(y_test, y_pred, pos_label='p'))

precision - 0.5490196078431373
recall - 0.7777777777777778


In [145]:
bandpass_results, _, _, _ = EvaluateModel.rec_results(clf, X_test, y_test, bandpass_results)

In [146]:
bandpass_results = bandpass_results.rename(columns={'Pred':'SVM_pred'})

## Logistic Regression

In [147]:
clf = pickle.load(open('LR2.sav', 'rb'))

In [148]:
y_pred=clf.predict(X_test)

In [149]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.5


In [150]:
print(metrics.confusion_matrix(y_test, y_pred))

[[ 5 25]
 [ 8 28]]


In [151]:
print('precision -', metrics.precision_score(y_test, y_pred, pos_label='p'))
print('recall -', metrics.recall_score(y_test, y_pred, pos_label='p'))

precision - 0.5283018867924528
recall - 0.7777777777777778


In [152]:
bandpass_results, _, _, _ = EvaluateModel.rec_results(clf, X_test, y_test, bandpass_results)

In [153]:
bandpass_results = bandpass_results.rename(columns={'Pred':'LR_pred'})

# Apply Weiner

In [154]:
from scipy.signal import wiener

In [155]:
save_dir = 'WeinerTestSet//'

In [156]:
for file in os.listdir(noisy_dir):
    if file.endswith('.wav'):
        file_path = noisy_dir+file
        signal, sr = librosa.load(file_path)
        clean_signal = wiener(signal, mysize=3, noise=0.1)
        
        clean_signal = preprocessing.endpoint_detection(clean_signal)
        clean_signal = preprocessing.normalization(clean_signal)
        
        file_save_path = save_dir+file
        soundfile.write(file_save_path, clean_signal, sr)

In [157]:
noisy_path = 'WeinerTestSet//'

In [158]:
noisy_df = extract_feature_df(noisy_path) 
noisy_df = noisy_df.drop(['Median pitch', 'Mean pitch', 'Minimum pitch', 'Maximum pitch'], axis=1)

In [159]:
noisy_df.shape

(69, 23)

In [160]:
test_meta = pd.read_csv('testSet.csv', index_col=0)

noisy_df[['RecordingID', 'RestOfName']] = noisy_df['name'].str.split('-',expand=True)
noisy_df['Sound'] = noisy_df['RestOfName'].str.split('_',expand=True)[0]
noisy_df['RecordingID'] = noisy_df['RecordingID'].astype('int')

noisy_df = noisy_df.drop('name', axis=1)
noisy_df = noisy_df.drop('RestOfName', axis=1)
noisy_df = noisy_df.reset_index(drop=True)

noisy_df = pd.merge(noisy_df, test_meta, how='left', on = 'RecordingID')

In [161]:
X_test = noisy_df.drop(['RecordingID', 'Sound', 'Type', 'Date', 'SubjectID', 'Sex', 'Age', 'Pathology'], axis=1)
y_test = noisy_df['Type']

In [162]:
X_test = X_test.dropna()

In [163]:
df_index = X_test.index.values.tolist()
y_test = y_test.iloc[df_index]

In [164]:
print(X_test.shape)
print(y_test.shape)

(66, 22)
(66,)


In [165]:
weiner_results = noisy_df[['RecordingID', 'Sound', 'Type', 'Date', 'SubjectID', 'Sex', 'Age', 'Pathology']].copy()

## Decision Tree

In [166]:
clf = pickle.load(open('DT2.sav', 'rb'))

In [167]:
y_pred=clf.predict(X_test)

In [168]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.6666666666666666


In [169]:
print(metrics.confusion_matrix(y_test, y_pred))

[[14 16]
 [ 6 30]]


In [170]:
print('precision -', metrics.precision_score(y_test, y_pred, pos_label='p'))
print('recall -', metrics.recall_score(y_test, y_pred, pos_label='p'))

precision - 0.6521739130434783
recall - 0.8333333333333334


In [171]:
weiner_results, _, _, _ = EvaluateModel.rec_results(clf, X_test, y_test, weiner_results)

In [172]:
weiner_results = weiner_results.rename(columns={'Pred':'DT_pred'})

## Random Forest

In [173]:
clf = pickle.load(open('RF2.sav', 'rb'))

In [174]:
y_pred=clf.predict(X_test)

In [175]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.6818181818181818


In [176]:
print(metrics.confusion_matrix(y_test, y_pred))

[[11 19]
 [ 2 34]]


In [177]:
print('precision -', metrics.precision_score(y_test, y_pred, pos_label='p'))
print('recall -', metrics.recall_score(y_test, y_pred, pos_label='p'))

precision - 0.6415094339622641
recall - 0.9444444444444444


In [178]:
weiner_results, _, _, _ = EvaluateModel.rec_results(clf, X_test, y_test, weiner_results)

In [179]:
weiner_results = weiner_results.rename(columns={'Pred':'RF_pred'})

## SVM

In [180]:
clf = pickle.load(open('SVM2.sav', 'rb'))

In [181]:
y_pred=clf.predict(X_test)

In [182]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.5606060606060606


In [183]:
print(metrics.confusion_matrix(y_test, y_pred))

[[ 9 21]
 [ 8 28]]


In [184]:
print('precision -', metrics.precision_score(y_test, y_pred, pos_label='p'))
print('recall -', metrics.recall_score(y_test, y_pred, pos_label='p'))

precision - 0.5714285714285714
recall - 0.7777777777777778


In [185]:
weiner_results, _, _, _ = EvaluateModel.rec_results(clf, X_test, y_test, weiner_results)

In [186]:
weiner_results = weiner_results.rename(columns={'Pred':'SVM_pred'})

## Logistic Regression

In [187]:
clf = pickle.load(open('LR2.sav', 'rb'))

In [188]:
y_pred=clf.predict(X_test)

In [189]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.5151515151515151


In [190]:
print(metrics.confusion_matrix(y_test, y_pred))

[[ 4 26]
 [ 6 30]]


In [191]:
print('precision -', metrics.precision_score(y_test, y_pred, pos_label='p'))
print('recall -', metrics.recall_score(y_test, y_pred, pos_label='p'))

precision - 0.5357142857142857
recall - 0.8333333333333334


In [192]:
weiner_results, _, _, _ = EvaluateModel.rec_results(clf, X_test, y_test, weiner_results)

In [193]:
weiner_results = weiner_results.rename(columns={'Pred':'LR_pred'})

# Apply Wavelet

In [194]:
import pywt
import numpy as np

In [195]:
def madev(d, axis=None):
    """ Mean absolute deviation of a signal """
    return np.mean(np.absolute(d - np.mean(d, axis)), axis)

def wavelet_denoising(x, wavelet, level, mode):
    coeff = pywt.wavedec(x, wavelet, mode="per")
    sigma = (1/0.6745) * madev(coeff[-level])
    uthresh = sigma * np.sqrt(2 * np.log(len(x)))
    coeff[1:] = (pywt.threshold(i, value=uthresh, mode=mode) for i in coeff[1:])
    return pywt.waverec(coeff, wavelet, mode='per')

In [196]:
save_dir = 'WaveletTestSet//'

In [197]:
for file in os.listdir(noisy_dir):
    if file.endswith('.wav'):
        file_path = noisy_dir+file
        signal, sr = librosa.load(file_path)
        clean_signal = wavelet_denoising(signal, wavelet='bior3.9', level=1, mode='hard')
        
        clean_signal = preprocessing.endpoint_detection(clean_signal)
        clean_signal = preprocessing.normalization(clean_signal)
        
        file_save_path = save_dir+file
        soundfile.write(file_save_path, clean_signal, sr)

## Evaluate on Wavelet

In [198]:
noisy_path = 'WaveletTestSet//'

In [199]:
noisy_df = extract_feature_df(noisy_path) 
noisy_df = noisy_df.drop(['Median pitch', 'Mean pitch', 'Minimum pitch', 'Maximum pitch'], axis=1)

In [200]:
noisy_df.shape

(69, 23)

In [201]:
test_meta = pd.read_csv('testSet.csv', index_col=0)

noisy_df[['RecordingID', 'RestOfName']] = noisy_df['name'].str.split('-',expand=True)
noisy_df['Sound'] = noisy_df['RestOfName'].str.split('_',expand=True)[0]
noisy_df['RecordingID'] = noisy_df['RecordingID'].astype('int')

noisy_df = noisy_df.drop('name', axis=1)
noisy_df = noisy_df.drop('RestOfName', axis=1)
noisy_df = noisy_df.reset_index(drop=True)

noisy_df = pd.merge(noisy_df, test_meta, how='left', on = 'RecordingID')

In [202]:
X_test = noisy_df.drop(['RecordingID', 'Sound', 'Type', 'Date', 'SubjectID', 'Sex', 'Age', 'Pathology'], axis=1)
y_test = noisy_df['Type']

In [203]:
X_test = X_test.dropna()

In [204]:
df_index = X_test.index.values.tolist()
y_test = y_test.iloc[df_index]

In [205]:
print(X_test.shape)
print(y_test.shape)

(65, 22)
(65,)


In [206]:
wavelet_results = noisy_df[['RecordingID', 'Sound', 'Type', 'Date', 'SubjectID', 'Sex', 'Age', 'Pathology']].copy()

## Decision Tree

In [207]:
clf = pickle.load(open('DT2.sav', 'rb'))

In [208]:
y_pred=clf.predict(X_test)

In [209]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.6153846153846154


In [210]:
print(metrics.confusion_matrix(y_test, y_pred))

[[ 9 21]
 [ 4 31]]


In [211]:
print('precision -', metrics.precision_score(y_test, y_pred, pos_label='p'))
print('recall -', metrics.recall_score(y_test, y_pred, pos_label='p'))

precision - 0.5961538461538461
recall - 0.8857142857142857


In [212]:
wavelet_results, _, _, _ = EvaluateModel.rec_results(clf, X_test, y_test, wavelet_results)

In [213]:
wavelet_results = wavelet_results.rename(columns={'Pred':'DT_pred'})

## Random Forest

In [214]:
clf = pickle.load(open('RF2.sav', 'rb'))

In [215]:
y_pred=clf.predict(X_test)

In [216]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.7230769230769231


In [217]:
print(metrics.confusion_matrix(y_test, y_pred))

[[13 17]
 [ 1 34]]


In [218]:
print('precision -', metrics.precision_score(y_test, y_pred, pos_label='p'))
print('recall -', metrics.recall_score(y_test, y_pred, pos_label='p'))

precision - 0.6666666666666666
recall - 0.9714285714285714


In [219]:
wavelet_results, _, _, _ = EvaluateModel.rec_results(clf, X_test, y_test, wavelet_results)

In [220]:
wavelet_results = wavelet_results.rename(columns={'Pred':'RF_pred'})

## SVM

In [221]:
clf = pickle.load(open('SVM2.sav', 'rb'))

In [222]:
y_pred=clf.predict(X_test)

In [223]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.47692307692307695


In [224]:
print(metrics.confusion_matrix(y_test, y_pred))

[[ 5 25]
 [ 9 26]]


In [225]:
print('precision -', metrics.precision_score(y_test, y_pred, pos_label='p'))
print('recall -', metrics.recall_score(y_test, y_pred, pos_label='p'))

precision - 0.5098039215686274
recall - 0.7428571428571429


In [226]:
wavelet_results, _, _, _ = EvaluateModel.rec_results(clf, X_test, y_test, wavelet_results)

In [227]:
wavelet_results = wavelet_results.rename(columns={'Pred':'SVM_pred'})

## Logistic Regression

In [228]:
clf = pickle.load(open('LR2.sav', 'rb'))

In [229]:
y_pred=clf.predict(X_test)

In [230]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.5692307692307692


In [231]:
print(metrics.confusion_matrix(y_test, y_pred))

[[ 5 25]
 [ 3 32]]


In [232]:
print('precision -', metrics.precision_score(y_test, y_pred, pos_label='p'))
print('recall -', metrics.recall_score(y_test, y_pred, pos_label='p'))

precision - 0.5614035087719298
recall - 0.9142857142857143


In [233]:
wavelet_results, _, _, _ = EvaluateModel.rec_results(clf, X_test, y_test, wavelet_results)

In [234]:
wavelet_results = wavelet_results.rename(columns={'Pred':'LR_pred'})

# Save Results

In [235]:
results.to_csv('results2.csv')

In [236]:
noisy_results.to_csv('noisy_results2.csv')

In [237]:
bandpass_results.to_csv('bandpass_results2.csv')

In [238]:
weiner_results.to_csv('weiner_results2.csv')

In [239]:
wavelet_results.to_csv('wavelet_results2.csv')