In [1]:
# Mount Google Drive at /content/drive; every data and module path used below lives under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import sys
# Add the project's FeatureExtraction folder to the import path
# (presumably where the FeatureExtract module imported later lives — confirm).
sys.path.append('/content/drive/MyDrive/CancerVsLaryngitis/FeatureExtraction')

In [3]:
%%capture

!pip install praat-parselmouth

In [4]:
import pandas as pd
import os
import librosa
import FeatureExtract

from sklearn.model_selection import train_test_split

In [5]:
def extract_feature_df(filepath, start=0):
    """Collect FeatureExtract features for every ``.wav`` file under ``filepath``.

    The directory tree is walked recursively. Each ``.wav`` file is loaded with
    librosa to measure its duration, ``FeatureExtract.get_report`` is run on the
    interval from ``start`` to that duration, and ``FeatureExtract.get_feats``
    converts the report into a per-file feature frame. The name of the folder
    that directly contains the file is stored in the ``pathology`` column.

    Args:
        filepath: Root directory searched (recursively) for ``.wav`` files.
        start: Start offset forwarded to ``FeatureExtract.get_report``
            (presumably seconds, like the librosa duration used as ``end`` —
            confirm against FeatureExtract).

    Returns:
        pandas.DataFrame: the per-file frames concatenated in directory-walk
        order, each keeping its own index (callers reset it afterwards). An
        empty DataFrame is returned when no ``.wav`` file is found.
    """
    # Collect per-file frames and concatenate once at the end; growing a
    # DataFrame with pd.concat inside the loop re-copies all rows every time.
    per_file_frames = []
    for subdir, _dirs, files in os.walk(filepath):
        # The containing folder name is the class label for all files in it.
        pathology = os.path.basename(subdir)
        for file in files:
            filename = os.fsdecode(file)
            if not filename.endswith(".wav"):
                continue
            path = os.path.join(subdir, filename)
            y, sr = librosa.load(path)
            dur = librosa.get_duration(y=y, sr=sr)
            # File id is everything before the first dot of the file name.
            f_id = filename.split('.')[0]
            report = FeatureExtract.get_report(path, start=start, end=dur)
            feats = FeatureExtract.get_feats(report, f_id)
            feats['pathology'] = pathology
            per_file_frames.append(feats)

    # pd.concat raises on an empty list, so keep the original empty-frame result.
    if not per_file_frames:
        return pd.DataFrame()
    return pd.concat(per_file_frames)

In [66]:
# Name of the training-data folder under DealWithData/; also reused in the name of the exported CSV.
data_type = 'TrainSetNotPrepped_a'

In [67]:
# Extract features from every .wav file found under the selected training folder.
features_df = extract_feature_df(f'/content/drive/MyDrive/CancerVsLaryngitis/DealWithData/{data_type}')

In [68]:
# The concatenated per-file frames carry their own indices; replace them with a fresh 0..n-1 range.
features_df = features_df.reset_index(drop=True)

In [69]:
# Label the identifier column 'file' instead of 'name'.
features_df = features_df.rename(columns={'name': 'file'})

In [70]:
# Preview the first rows of the raw (unscaled) training features.
features_df.head()

Unnamed: 0,file,Median pitch,Mean pitch,Standard deviation,Minimum pitch,Maximum pitch,Number of pulses,Number of periods,Mean period,Standard deviation of period,...,Shimmer (local),"Shimmer (local, dB)",Shimmer (apq3),Shimmer (apq5),Shimmer (apq11),Shimmer (dda),Mean autocorrelation,Mean noise-to-harmonics ratio,Mean harmonics-to-noise ratio,pathology
0,1311-a_n_noisy,168.53,169.103,2.256,163.253,176.994,174.0,173.0,0.005914,8.5e-05,...,2.655,0.249,1.376,1.515,2.039,4.128,0.984254,0.016435,20.3,Laryngitis
1,139-a_n_pitch,134.434,134.634,1.157,132.453,137.835,211.0,210.0,0.00743,0.000232,...,3.672,0.384,2.072,1.872,2.403,6.216,0.960431,0.051513,24.582,Laryngitis
2,1315-a_n_pitch,104.943,104.94,3.051,98.939,114.338,130.0,129.0,0.009528,0.000279,...,8.711,0.871,3.582,6.011,9.426,10.747,0.917601,0.10287,13.024,Laryngitis
3,139-a_n_noisy,126.815,127.053,1.239,124.866,131.494,199.0,198.0,0.007872,0.000294,...,5.707,0.595,3.122,2.222,2.216,9.366,0.948952,0.073983,23.849,Laryngitis
4,1315-a_n_noisy,99.041,99.031,3.085,92.902,108.977,123.0,122.0,0.010096,0.000314,...,7.146,0.628,3.321,4.832,6.858,9.963,0.952935,0.051534,14.204,Laryngitis


In [71]:
# Impute missing feature values with the maximum of their column.
# numeric_only=True keeps the text columns out of the reduction: taking max()
# over an object column that mixes strings and NaN can raise, and a string
# maximum is not a meaningful fill value for them.
max_value = features_df.max(numeric_only=True)

# Non-inplace assignment keeps the cell safe to re-run and avoids inplace pitfalls.
features_df = features_df.fillna(value=max_value)

In [72]:
from sklearn import preprocessing

In [73]:
# Feature columns to scale: every column except the first and the last one
# (in the frames previewed in this notebook those are 'file' and 'pathology').
column_list = features_df.columns.tolist()[1:-1]

In [74]:
# Learn each feature column's min/max from the training data, then rescale those
# columns with the fitted scaler. The scaler object is kept so the same mapping
# can be applied to other data later.
min_max_scaler = preprocessing.MinMaxScaler().fit(features_df[column_list])
features_df[column_list] = min_max_scaler.transform(features_df[column_list])

In [75]:
# Preview the training features after min-max scaling.
features_df.head()

Unnamed: 0,file,Median pitch,Mean pitch,Standard deviation,Minimum pitch,Maximum pitch,Number of pulses,Number of periods,Mean period,Standard deviation of period,...,Shimmer (local),"Shimmer (local, dB)",Shimmer (apq3),Shimmer (apq5),Shimmer (apq11),Shimmer (dda),Mean autocorrelation,Mean noise-to-harmonics ratio,Mean harmonics-to-noise ratio,pathology
0,1311-a_n_noisy,0.309028,0.30184,0.016766,0.374535,0.183043,0.32216,0.32216,0.324018,0.020991,...,0.071395,0.081216,0.073189,0.039233,0.055821,0.073214,0.967847,0.017597,0.629444,Laryngitis
1,139-a_n_pitch,0.194737,0.185766,0.006426,0.255965,0.105578,0.391061,0.391061,0.477129,0.066832,...,0.114512,0.14848,0.125488,0.054556,0.072544,0.125513,0.915965,0.057547,0.766965,Laryngitis
2,1315-a_n_pitch,0.095882,0.085771,0.024246,0.126947,0.059095,0.240223,0.240223,0.689073,0.081367,...,0.328147,0.391131,0.238954,0.232219,0.395204,0.239004,0.822689,0.116036,0.395767,Laryngitis
3,139-a_n_noisy,0.169197,0.160236,0.007198,0.226758,0.093034,0.368715,0.368715,0.521798,0.086271,...,0.200789,0.253612,0.204388,0.06958,0.063953,0.204413,0.890966,0.083137,0.743424,Laryngitis
4,1315-a_n_noisy,0.076098,0.065872,0.024566,0.103706,0.04849,0.227188,0.227188,0.746489,0.092408,...,0.261797,0.270055,0.219342,0.181611,0.277221,0.219367,0.89964,0.05757,0.433664,Laryngitis


In [76]:
# (rows, columns) of the training feature table.
features_df.shape

(320, 28)

In [77]:
# Save the scaled training features; the file name embeds data_type.
# to_csv writes the row index as an unnamed first column by default.
features_df.to_csv(f'/content/drive/MyDrive/CancerVsLaryngitis/FeatureSets/NotPrepped/praat_{data_type}.csv')

In [None]:
# Extract features from every .wav file under the TestSet folder.
# Note: this overwrites features_df, which held the training features until now.
features_df = extract_feature_df('/content/drive/MyDrive/CancerVsLaryngitis/TestSet/')

In [None]:
# Replace the repeated per-file indices with a fresh 0..n-1 range.
features_df = features_df.reset_index(drop=True)

In [None]:
# Label the identifier column 'file' instead of 'name'.
features_df = features_df.rename(columns={'name': 'file'})

In [None]:
# Preview the first rows of the raw (unscaled) test features.
features_df.head()

Unnamed: 0,file,Median pitch,Mean pitch,Standard deviation,Minimum pitch,Maximum pitch,Number of pulses,Number of periods,Mean period,Standard deviation of period,...,Shimmer (local),"Shimmer (local, dB)",Shimmer (apq3),Shimmer (apq5),Shimmer (apq11),Shimmer (dda),Mean autocorrelation,Mean noise-to-harmonics ratio,Mean harmonics-to-noise ratio,pathology
0,2402-phrase,137.872,136.679,16.078,77.495,167.353,185.0,180.0,0.007344,0.000982,...,9.399,0.887,2.962,5.235,11.599,8.886,0.941574,0.069793,14.824,Cancer
1,1403-phrase,121.156,166.51,133.346,75.312,601.335,245.0,200.0,0.006931,0.003098,...,12.924,1.271,5.218,7.909,11.339,15.653,0.838872,0.247572,10.733,Cancer
2,820-phrase,134.32,142.037,47.573,75.898,242.298,205.0,197.0,0.007066,0.002501,...,10.407,1.128,3.123,5.299,8.252,9.369,0.884008,0.149943,10.776,Cancer
3,1451-phrase,132.661,130.038,22.421,79.357,161.956,206.0,200.0,0.007741,0.001538,...,8.91,0.995,2.997,4.123,8.296,8.991,0.901992,0.130894,13.516,Cancer
4,2343-phrase,184.644,177.509,35.42,96.278,260.245,382.0,377.0,0.005622,0.001187,...,13.165,1.137,6.101,7.158,11.186,18.304,0.902721,0.13546,12.354,Cancer


In [None]:
# Impute missing test features with the per-column maxima computed on the
# training set (max_value), mirroring the imputation applied to the training
# data; otherwise NaNs pass straight through the scaler into the exported CSV.
features_df[column_list] = features_df[column_list].fillna(max_value[column_list].astype(float))
# Reuse the scaler fitted on the training features; it is deliberately not re-fitted here.
features_df[column_list] = min_max_scaler.transform(features_df[column_list])

In [None]:
# Preview the test features after applying the training-set scaler.
features_df.head()

Unnamed: 0,file,Median pitch,Mean pitch,Standard deviation,Minimum pitch,Maximum pitch,Number of pulses,Number of periods,Mean period,Standard deviation of period,...,Shimmer (local),"Shimmer (local, dB)",Shimmer (apq3),Shimmer (apq5),Shimmer (apq11),Shimmer (dda),Mean autocorrelation,Mean noise-to-harmonics ratio,Mean harmonics-to-noise ratio,pathology
0,2402-phrase,0.145569,0.161868,0.031745,0.068851,0.088376,0.210216,0.226141,0.577655,0.191522,...,0.242257,0.276642,0.157063,0.190702,0.174703,0.157042,0.935168,0.043677,0.706753,Cancer
1,1403-phrase,0.0872,0.281119,0.744088,0.054916,0.988288,0.328094,0.267635,0.523147,0.944018,...,0.432448,0.556934,0.364397,0.343423,0.168848,0.364358,0.642786,0.321359,0.479197,Cancer
2,820-phrase,0.133166,0.183287,0.223061,0.058657,0.243783,0.249509,0.261411,0.540982,0.732002,...,0.296644,0.452555,0.171859,0.194357,0.099322,0.171839,0.771283,0.168867,0.481589,Cancer
3,1451-phrase,0.127373,0.135321,0.070276,0.080737,0.077184,0.251473,0.267635,0.630262,0.389411,...,0.215874,0.355474,0.160279,0.127192,0.100313,0.160259,0.822482,0.139114,0.633997,Cancer
4,2343-phrase,0.308886,0.325088,0.149238,0.18875,0.280998,0.59725,0.634855,0.350118,0.264343,...,0.445452,0.459124,0.445547,0.300531,0.165402,0.445575,0.824557,0.146246,0.569363,Cancer


In [None]:
# (rows, columns) of the test feature table.
features_df.shape

(40, 28)

In [None]:
# Save the scaled test features (row index included as an unnamed first column).
# NOTE(review): the training CSV is written to FeatureSets/NotPrepped/ while this
# one goes to FeatureSets/ directly — confirm the location is intended.
features_df.to_csv('/content/drive/MyDrive/CancerVsLaryngitis/FeatureSets/praat_test.csv')