In [1]:
# Mount Google Drive at /content/drive; every data path used further down lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%%capture

!pip install opensmile

In [3]:
import opensmile
import pandas as pd

In [4]:
# Name of the dataset split folder; it is interpolated into the input directories and into the output CSV name.
data_type='TestSetNotPrepped_i'

In [5]:
# Input folders on Drive, one per class, inside the split directory selected by `data_type`.
# The folder name (Cancer / Laryngitis) is the second-to-last path component of each file,
# which is what format_features later reads back as the `pathology` label.
cancer_directory = f'/content/drive/MyDrive/CancerVsLaryngitis/DealWithData/{data_type}/Cancer/'
laryngitis_directory = f'/content/drive/MyDrive/CancerVsLaryngitis/DealWithData/{data_type}/Laryngitis/'

In [6]:
# openSMILE extractor configured for the eGeMAPSv02 feature set at the Functionals feature level.
# The same extractor instance is reused for both the test and the training split.
smile = opensmile.Smile(
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
    feature_level=opensmile.FeatureLevel.Functionals,
)


In [7]:
# Run the extractor over each class folder of the currently selected split.
cancer_feats = smile.process_folder(cancer_directory)
laryngitis_feats = smile.process_folder(laryngitis_directory)

In [8]:
def format_features(df):
    """Flatten an openSMILE feature frame and label each row from its path.

    The index is moved into ordinary columns. The full path found under
    ``file`` is split on '/': its last component becomes the new ``file``
    column and the component before it becomes ``pathology``. The ``start``,
    ``end`` and full-path columns are dropped afterwards.

    Args:
        df: Feature frame whose index provides ``file``, ``start`` and ``end``.

    Returns:
        A new DataFrame holding the feature columns followed by ``file`` and
        ``pathology``; the input frame is not modified.
    """
    flat = df.reset_index().rename(columns={'file': 'filename'})

    # Split once and reuse: [-1] is the file name, [-2] its parent folder.
    path_parts = flat['filename'].str.split('/')
    flat['file'] = path_parts.str[-1]
    flat['pathology'] = path_parts.str[-2]

    return flat.drop(columns=['start', 'end', 'filename'])

In [9]:
# Flatten each class frame and attach its `file` / `pathology` columns.
cancer_formatted = format_features(cancer_feats)
laryngitis_formatted = format_features(laryngitis_feats)

In [10]:
# Stack the two class frames row-wise into a single feature table (Cancer rows first).
opensmile_features = pd.concat([cancer_formatted,laryngitis_formatted])

In [11]:
# concat keeps each frame's own row labels, so they repeat; replace them with a fresh 0..n-1 index.
opensmile_features = opensmile_features.reset_index(drop=True)

In [12]:
# Preview the first rows of the combined table (unscaled at this point).
opensmile_features.head()

Unnamed: 0,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,F0semitoneFrom27.5Hz_sma3nz_percentile80.0,F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2,F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope,F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope,...,spectralFluxUV_sma3nz_amean,loudnessPeaksPerSec,VoicedSegmentsPerSec,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp,file,pathology
0,35.720882,0.003777,35.602367,35.719742,35.832245,0.229877,2.468703,0.874749,1.671981,1.009795,...,0.0,5.670103,0.531915,1.87,0.0,0.0,0.0,-20.204012,1047-i_n.wav,Cancer
1,35.73325,0.008257,35.426579,35.711403,35.903687,0.477108,5.268397,3.822458,5.024073,2.237234,...,0.0,7.594937,0.657895,1.51,0.0,0.0,0.0,-18.306902,1245-i_n.wav,Cancer
2,32.224842,0.019306,31.770058,32.450523,32.706463,0.936405,4.426754,1.163015,3.615649,3.901762,...,0.0,4.347826,0.917431,1.08,0.0,0.0,0.0,-11.693852,1383-i_n.wav,Cancer
3,27.768379,0.00872,27.520676,27.742825,28.00782,0.487144,7.011222,4.310144,4.984915,2.415398,...,0.0,6.493506,0.675676,1.47,0.0,0.0,0.0,-13.456758,1391-i_n.wav,Cancer
4,28.213228,0.009052,27.99991,28.127954,28.414236,0.414326,8.907628,3.785568,6.170148,2.295791,...,0.0,4.477612,1.639344,0.6,0.0,0.0,0.0,-10.453439,1403-i_n.wav,Cancer


In [13]:
from sklearn import preprocessing

In [14]:
# Columns to scale: everything except the trailing two, which are the
# non-numeric `file` and `pathology` columns appended by format_features.
column_list = opensmile_features.columns[:-2].tolist()

In [15]:
# Fit the scaler on the TRAINING split only, then apply it to the test-split
# features held in `opensmile_features`. Fitting on the test split leaks
# test-set statistics into the normalisation and makes any later
# `min_max_scaler.transform(...)` on training data use test-derived ranges.
#
# The training folders are derived the same way the Train section does it
# ('Test' -> 'Train' in the path); the replace is a no-op if the directory
# variables already point at the training split.
train_fit_features = pd.concat(
    [
        format_features(smile.process_folder(cancer_directory.replace('Test', 'Train'))),
        format_features(smile.process_folder(laryngitis_directory.replace('Test', 'Train'))),
    ]
).reset_index(drop=True)

min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit(train_fit_features[column_list])

# Scaled test values may fall outside [0, 1]: the min/max come from the training data.
opensmile_features[column_list] = min_max_scaler.transform(opensmile_features[column_list])

In [16]:
# Write the scaled table to Drive; the file name carries the split name from `data_type`.
# to_csv is called with its defaults, so the row index is written as an unnamed first column.
opensmile_features.to_csv(f'/content/drive/MyDrive/CancerVsLaryngitis/FeatureSets/NotPrepped/open_smile_{data_type}.csv')

## Train

In [17]:
# Point both input folders at the training split by swapping 'Test' for 'Train' in the paths.
# str.replace rewrites every occurrence of 'Test' in the string, not only the split folder name.
cancer_directory = cancer_directory.replace('Test', 'Train')
laryngitis_directory = laryngitis_directory.replace('Test', 'Train')

In [18]:
# Run the same extractor over each class folder; the directory variables were just repointed,
# so these names now hold the training-split features and overwrite the earlier test-split ones.
cancer_feats = smile.process_folder(cancer_directory)
laryngitis_feats = smile.process_folder(laryngitis_directory)

In [19]:
# Flatten each class frame and attach its `file` / `pathology` columns.
cancer_formatted = format_features(cancer_feats)
laryngitis_formatted = format_features(laryngitis_feats)

In [20]:
# Stack the two class frames row-wise; this rebinds `opensmile_features` to the new table.
opensmile_features = pd.concat([cancer_formatted,laryngitis_formatted])

In [21]:
# Replace the repeated per-class row labels left by concat with a fresh 0..n-1 index.
opensmile_features = opensmile_features.reset_index(drop=True)

In [22]:
# Preview the first rows of the combined table (unscaled at this point).
opensmile_features.head()

Unnamed: 0,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,F0semitoneFrom27.5Hz_sma3nz_percentile80.0,F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2,F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope,F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope,...,spectralFluxUV_sma3nz_amean,loudnessPeaksPerSec,VoicedSegmentsPerSec,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp,file,pathology
0,36.516285,0.013485,36.161934,36.580986,36.96859,0.806656,6.37433,1.217807,3.971865,3.641822,...,0.0,1.612903,0.847458,1.17,0.0,0.0,0.0,-14.608781,1048-i_n.wav,Cancer
1,36.516628,0.013493,36.160419,36.584641,36.969711,0.809292,6.44453,1.184312,4.023733,3.667306,...,0.0,1.612903,0.847458,1.17,0.0,0.0,0.0,-14.606191,1048-i_n_noisy.wav,Cancer
2,37.526333,0.012854,37.180408,37.603889,37.970726,0.790318,5.166751,1.676697,2.122839,1.146301,...,0.0,2.419355,0.847458,1.17,0.0,0.0,0.0,-15.742085,1048-i_n_pitch.wav,Cancer
3,36.500591,0.013434,36.068657,36.590546,36.939445,0.870789,4.280561,0.779048,3.871889,1.540892,...,0.0,1.449275,0.757576,1.31,0.0,0.0,0.0,-18.772852,1048-i_n_speed.wav,Cancer
4,19.422689,0.136436,18.427055,18.604952,19.332083,0.905027,0.688651,0.0,7.859972,0.0,...,0.0,13.19797,0.52356,1.9,0.0,0.0,0.0,-13.877299,110-i_n.wav,Cancer


In [23]:
# Apply the already-fitted `min_max_scaler` to the feature columns. `transform` does not refit,
# so the scaling ranges are whatever the earlier fit established; `column_list` is also reused as-is.
opensmile_features[column_list] = min_max_scaler.transform(opensmile_features[column_list])

In [24]:
# Rename the split ('Test' -> 'Train') so the CSV written below gets the training file name.
data_type=data_type.replace('Test', 'Train')
# Same destination folder and to_csv defaults as the earlier save; only the split name in the file name differs.
opensmile_features.to_csv(f'/content/drive/MyDrive/CancerVsLaryngitis/FeatureSets/NotPrepped/open_smile_{data_type}.csv')