In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import librosa
import librosa.display
from IPython.display import Audio
import warnings
warnings.filterwarnings('ignore')


In [24]:
paths = []
labels = []
for _, folders, _ in os.walk(r'data'):
    for folder in folders:
        for dirname, _, filenames in os.walk(f'data\{folder}'):
            for filename in filenames:
                paths.append(os.path.join(dirname, filename))
                label = folder
                label = label.split('_')[0] # remove to segregate by gender
                labels.append(label.lower())

df = pd.DataFrame()
df['speech'] = paths
df['label'] = labels

print('Dataset is Loaded')
df.head()

Dataset is Loaded


Unnamed: 0,speech,label
0,data\Angry_Female\anger_xn_1.wav,angry
1,data\Angry_Female\anger_xn_10.wav,angry
2,data\Angry_Female\anger_xn_11.wav,angry
3,data\Angry_Female\anger_xn_12.wav,angry
4,data\Angry_Female\anger_xn_13.wav,angry


# Feature Engineering

In [36]:
from sklearn.preprocessing import OneHotEncoder

def extract_mfcc(filename):
    y, sr = librosa.load(filename, duration=3, offset=0.5)
    mfcc = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40).T, axis=0)
    # print(mfcc)
    return mfcc

def extract_chroma(filename):
    y, sr = librosa.load(filename)
    chroma = np.mean(librosa.feature.chroma_stft(y=y, sr=sr).T, axis=0)
    return chroma

def extract_spectral_contrast(filename):
    y, sr = librosa.load(filename)
    contrast = np.mean(librosa.feature.spectral_contrast(y=y, sr=sr).T, axis=0)
    return contrast

def extract_tonnetz(filename):
    y, sr = librosa.load(filename)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr).T, axis=0)
    return tonnetz

def processData(df):
    # mfcc
    mfcc_columns = ['mfcc_' + str(i) for i in range(1, 41)]
    mfcc_df = pd.DataFrame(df['speech'].apply(lambda x: extract_mfcc(x)).tolist(), columns=mfcc_columns)
    new_df = pd.concat([df, mfcc_df], axis=1)

    # chroma 
    chroma_columns = ['chroma_' + str(i) for i in range(1, 13)]
    chroma_df = pd.DataFrame(df['speech'].apply(lambda x: extract_chroma(x)).tolist(), columns=chroma_columns)
    new_df = pd.concat([new_df, chroma_df], axis=1)

    # spectral_contrast
    spectral_columns = ['spectral_' + str(i) for i in range(1, 8)]
    spectral_df = pd.DataFrame(df['speech'].apply(lambda x: extract_spectral_contrast(x)).tolist(), columns=spectral_columns)
    new_df = pd.concat([new_df, spectral_df], axis=1)

    # tonnetz
    tonnetz_columns = ['tonnetz_' + str(i) for i in range(1, 8)]
    tonnetz_df = pd.DataFrame(df['speech'].apply(lambda x: extract_spectral_contrast(x)).tolist(), columns=tonnetz_columns)
    new_df = pd.concat([new_df, tonnetz_df], axis=1)
    # print(df)
    # enc = OneHotEncoder()
    # y = enc.fit_transform(df[['label']])
    # y = y.toarray()
    # print(y.shape)
    return new_df

In [40]:
data = processData(df)

In [66]:
correlation_matrix = data[data.columns[2:]].corr()
pairs = []
for i in range(correlation_matrix.shape[0]):
    for j in range(i+1, correlation_matrix.shape[1]):
        if correlation_matrix.iloc[i, j] > 0.75 or correlation_matrix.iloc[i, j] < -0.75:
            pairs.append((correlation_matrix.columns[i], correlation_matrix.columns[j], correlation_matrix.iloc[i, j]))

high_corr_df = pd.DataFrame(pairs, columns=['Feature 1', 'Feature 2', 'Correlation'])
high_corr_set = set(high_corr_df['Feature 2'])

dropped_data = data.drop(columns=[col for col in data.columns if col in high_corr_set])
dropped_data.head()

Unnamed: 0,speech,label,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_9,...,mfcc_34,mfcc_36,mfcc_37,mfcc_38,mfcc_39,mfcc_40,chroma_1,chroma_8,spectral_1,spectral_2
0,data\Angry_Female\anger_xn_1.wav,angry,-494.275452,134.563019,-25.692625,44.874004,-6.713496,-20.871096,13.515034,-5.726561,...,1.314803,-0.579906,1.490515,-0.378105,0.3677,2.949148,0.402734,0.503801,31.923484,17.95631
1,data\Angry_Female\anger_xn_10.wav,angry,-489.237396,95.241463,-6.124708,45.021778,-1.017477,-12.822159,15.77024,-8.326754,...,-4.694559,-1.933444,-1.629161,-1.426133,-0.34676,0.731342,0.409627,0.526004,29.46938,16.13576
2,data\Angry_Female\anger_xn_11.wav,angry,-520.552795,101.389832,-10.185467,41.562035,3.115262,-10.974776,11.411966,-1.241703,...,-2.491964,-2.160133,-1.386095,-2.235185,-1.890604,-2.381725,0.627573,0.540234,28.944638,13.118379
3,data\Angry_Female\anger_xn_12.wav,angry,-490.044342,127.468536,-25.104696,38.669411,-12.717003,-19.364574,20.204561,-6.294515,...,-2.95266,-3.929295,-4.234304,-3.584726,-1.760652,-0.776543,0.45455,0.499391,29.633752,15.092322
4,data\Angry_Female\anger_xn_13.wav,angry,-475.499969,120.513458,-31.929777,47.917557,-6.103209,-19.414783,9.694456,-2.136703,...,3.392644,-2.054071,-1.458658,0.357163,-0.206766,-0.764464,0.35028,0.550284,30.638892,15.346131


In [67]:
high_corr_df

Unnamed: 0,Feature 1,Feature 2,Correlation
0,mfcc_3,mfcc_8,0.892260
1,mfcc_3,mfcc_13,0.756054
2,mfcc_3,mfcc_18,0.794097
3,mfcc_5,mfcc_8,0.791473
4,mfcc_5,mfcc_13,0.761114
...,...,...,...
69,tonnetz_2,tonnetz_5,0.759097
70,tonnetz_3,tonnetz_4,0.799536
71,tonnetz_3,tonnetz_5,0.769417
72,tonnetz_4,tonnetz_5,0.813616


In [74]:
high_corr_df[high_corr_df['Feature 2'].str.contains('tonnetz')]

Unnamed: 0,Feature 1,Feature 2,Correlation
25,chroma_4,tonnetz_6,-0.780355
26,chroma_4,tonnetz_7,0.766721
31,chroma_5,tonnetz_5,-0.752563
32,chroma_5,tonnetz_6,-0.823989
33,chroma_5,tonnetz_7,0.816672
39,spectral_1,tonnetz_1,1.0
43,spectral_2,tonnetz_2,1.0
44,spectral_2,tonnetz_3,0.756604
45,spectral_2,tonnetz_4,0.776799
46,spectral_2,tonnetz_5,0.759097


In [73]:
high_corr_df[high_corr_df['Feature 2'].str.contains('chroma')]

Unnamed: 0,Feature 1,Feature 2,Correlation
18,chroma_1,chroma_2,0.878733
19,chroma_1,chroma_12,0.865792
20,chroma_2,chroma_3,0.885635
21,chroma_3,chroma_4,0.898515
22,chroma_4,chroma_5,0.901121
27,chroma_5,chroma_6,0.840049
34,chroma_6,chroma_7,0.867936
35,chroma_8,chroma_9,0.777899
36,chroma_9,chroma_10,0.758275
37,chroma_10,chroma_11,0.766005


In [75]:
high_corr_df[high_corr_df['Feature 2'].str.contains('spectral')]

Unnamed: 0,Feature 1,Feature 2,Correlation
23,chroma_4,spectral_6,-0.780355
24,chroma_4,spectral_7,0.766721
28,chroma_5,spectral_5,-0.752563
29,chroma_5,spectral_6,-0.823989
30,chroma_5,spectral_7,0.816672
40,spectral_2,spectral_3,0.756604
41,spectral_2,spectral_4,0.776799
42,spectral_2,spectral_5,0.759097
47,spectral_3,spectral_4,0.799536
48,spectral_3,spectral_5,0.769417


In [69]:
dropped_data.columns


Index(['speech', 'label', 'mfcc_1', 'mfcc_2', 'mfcc_3', 'mfcc_4', 'mfcc_5',
       'mfcc_6', 'mfcc_7', 'mfcc_9', 'mfcc_10', 'mfcc_12', 'mfcc_14',
       'mfcc_15', 'mfcc_17', 'mfcc_19', 'mfcc_20', 'mfcc_22', 'mfcc_23',
       'mfcc_24', 'mfcc_25', 'mfcc_26', 'mfcc_27', 'mfcc_28', 'mfcc_29',
       'mfcc_30', 'mfcc_31', 'mfcc_32', 'mfcc_33', 'mfcc_34', 'mfcc_36',
       'mfcc_37', 'mfcc_38', 'mfcc_39', 'mfcc_40', 'chroma_1', 'chroma_8',
       'spectral_1', 'spectral_2'],
      dtype='object')

### Tonnetz
It seems that tonnetz columns was completely dropped. They are all highly correlated to spectral features (>0.75). Spectral_X is highly correlated to Chroma_X feature

### Chroma
Chroma features are highly correlated to each other.

### Spectral
Spectral features are highly correlated to each other.


In [76]:
dropped_data.to_csv('IS4242 Data.csv', index=False)