In [10]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import librosa
import librosa.display
from IPython.display import Audio
import warnings
warnings.filterwarnings('ignore')


In [11]:
paths = []
labels = []

# Use os.walk to walk through the directory structure
for dirname, _, filenames in os.walk(r'data'):
    for filename in filenames:
        if filename == '.DS_Store':
            continue  # Skip this file
        # Construct the full file path
        full_path = os.path.join(dirname, filename)
        paths.append(full_path)
        
        # Extract label and gender from the directory name
        # Assumes the structure is data/label_gender/filename
        folder_name = os.path.basename(dirname)
        label = folder_name
        emotion = label.split('_')[0]
        labels.append(emotion.lower())

# Create a DataFrame with the collected data
df = pd.DataFrame({
    'speech': paths,
    'label': labels,
})

# Display the first few rows of the DataFrame to verify
df.head()

Unnamed: 0,speech,label
0,data/Neutral_Female/jt_neutral_21.wav,neutral
1,data/Neutral_Female/ws_neutral_25.wav,neutral
2,data/Neutral_Female/ws_neutral_31.wav,neutral
3,data/Neutral_Female/ws_neutral_19.wav,neutral
4,data/Neutral_Female/neutral_xn_1.wav,neutral


In [12]:
# paths = []
# labels = []
# for _, folders, _ in os.walk(r'data'):
#     for folder in folders:
#         for dirname, _, filenames in os.walk(f'data\{folder}'):
#             for filename in filenames:
#                 paths.append(os.path.join(dirname, filename))
#                 label = folder
#                 label = label.split('_')[0] # remove to segregate by gender
#                 labels.append(label.lower())

# df = pd.DataFrame()
# df['speech'] = paths
# df['label'] = labels

# print('Dataset is Loaded')
# df.head()

# Feature Engineering

In [13]:
from sklearn.preprocessing import OneHotEncoder

def extract_mfcc(filename):
    y, sr = librosa.load(filename, duration=3, offset=0.5)
    mfcc = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40).T, axis=0)
    # print(mfcc)
    return mfcc

def extract_chroma(filename):
    y, sr = librosa.load(filename)
    chroma = np.mean(librosa.feature.chroma_stft(y=y, sr=sr).T, axis=0)
    return chroma

def extract_spectral_contrast(filename):
    y, sr = librosa.load(filename)
    contrast = np.mean(librosa.feature.spectral_contrast(y=y, sr=sr).T, axis=0)
    return contrast

def extract_tonnetz(filename):
    y, sr = librosa.load(filename)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr).T, axis=0)
    return tonnetz

def processData(df):
    # mfcc
    mfcc_columns = ['mfcc_' + str(i) for i in range(1, 41)]
    mfcc_df = pd.DataFrame(df['speech'].apply(lambda x: extract_mfcc(x)).tolist(), columns=mfcc_columns)
    new_df = pd.concat([df, mfcc_df], axis=1)

    # chroma 
    chroma_columns = ['chroma_' + str(i) for i in range(1, 13)]
    chroma_df = pd.DataFrame(df['speech'].apply(lambda x: extract_chroma(x)).tolist(), columns=chroma_columns)
    new_df = pd.concat([new_df, chroma_df], axis=1)

    # spectral_contrast
    spectral_columns = ['spectral_' + str(i) for i in range(1, 8)]
    spectral_df = pd.DataFrame(df['speech'].apply(lambda x: extract_spectral_contrast(x)).tolist(), columns=spectral_columns)
    new_df = pd.concat([new_df, spectral_df], axis=1)

    # tonnetz
    tonnetz_columns = ['tonnetz_' + str(i) for i in range(1, 8)]
    tonnetz_df = pd.DataFrame(df['speech'].apply(lambda x: extract_spectral_contrast(x)).tolist(), columns=tonnetz_columns)
    new_df = pd.concat([new_df, tonnetz_df], axis=1)
    # print(df)
    # enc = OneHotEncoder()
    # y = enc.fit_transform(df[['label']])
    # y = y.toarray()
    # print(y.shape)
    return new_df

In [14]:
data = processData(df)

In [15]:
correlation_matrix = data[data.columns[2:]].corr()
pairs = []
for i in range(correlation_matrix.shape[0]):
    for j in range(i+1, correlation_matrix.shape[1]):
        if correlation_matrix.iloc[i, j] > 0.75 or correlation_matrix.iloc[i, j] < -0.75:
            pairs.append((correlation_matrix.columns[i], correlation_matrix.columns[j], correlation_matrix.iloc[i, j]))

high_corr_df = pd.DataFrame(pairs, columns=['Feature 1', 'Feature 2', 'Correlation'])
high_corr_set = set(high_corr_df['Feature 2'])

dropped_data = data.drop(columns=[col for col in data.columns if col in high_corr_set])
dropped_data.head()

Unnamed: 0,speech,label,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_9,...,mfcc_34,mfcc_36,mfcc_37,mfcc_38,mfcc_39,mfcc_40,chroma_1,chroma_8,spectral_1,spectral_2
0,data/Neutral_Female/jt_neutral_21.wav,neutral,-353.485931,123.602257,-61.569126,39.862431,-27.980495,-24.715317,-0.133174,-2.212441,...,4.061761,3.313922,5.554673,-2.422093,-1.752038,0.314916,0.247846,0.645633,34.458182,14.673575
1,data/Neutral_Female/ws_neutral_25.wav,neutral,-564.134766,126.06131,-4.919215,37.116371,0.208729,9.16078,21.036594,23.856047,...,5.463705,0.146079,4.786485,-2.631112,0.266556,-0.360581,0.571912,0.62703,21.557366,16.469228
2,data/Neutral_Female/ws_neutral_31.wav,neutral,-539.20575,138.167038,-16.331873,54.644619,0.998931,10.805608,24.811714,24.078247,...,3.68065,2.151067,4.704717,-1.963487,4.114218,-0.95665,0.47585,0.465181,22.596028,15.552894
3,data/Neutral_Female/ws_neutral_19.wav,neutral,-492.10907,154.77478,-30.476576,45.752804,0.734304,3.639598,8.156684,20.397802,...,7.989627,-2.455146,4.81043,-5.10688,-0.704052,1.789229,0.43987,0.546113,18.194261,14.768661
4,data/Neutral_Female/neutral_xn_1.wav,neutral,-548.042175,143.709137,-27.734171,78.299103,-5.085223,-10.435101,6.87877,15.428833,...,-0.55566,-4.141469,2.441621,0.422122,4.424841,2.749235,0.471181,0.370252,30.322829,16.262694


In [16]:
high_corr_df

Unnamed: 0,Feature 1,Feature 2,Correlation
0,mfcc_3,mfcc_8,0.892260
1,mfcc_3,mfcc_13,0.756054
2,mfcc_3,mfcc_18,0.794097
3,mfcc_5,mfcc_8,0.791473
4,mfcc_5,mfcc_13,0.761114
...,...,...,...
69,tonnetz_2,tonnetz_5,0.759097
70,tonnetz_3,tonnetz_4,0.799536
71,tonnetz_3,tonnetz_5,0.769417
72,tonnetz_4,tonnetz_5,0.813616


In [17]:
high_corr_df[high_corr_df['Feature 2'].str.contains('tonnetz')]

Unnamed: 0,Feature 1,Feature 2,Correlation
25,chroma_4,tonnetz_6,-0.780319
26,chroma_4,tonnetz_7,0.766419
31,chroma_5,tonnetz_5,-0.752506
32,chroma_5,tonnetz_6,-0.823902
33,chroma_5,tonnetz_7,0.815634
39,spectral_1,tonnetz_1,1.0
43,spectral_2,tonnetz_2,1.0
44,spectral_2,tonnetz_3,0.756604
45,spectral_2,tonnetz_4,0.776798
46,spectral_2,tonnetz_5,0.759097


In [18]:
high_corr_df[high_corr_df['Feature 2'].str.contains('chroma')]

Unnamed: 0,Feature 1,Feature 2,Correlation
18,chroma_1,chroma_2,0.878791
19,chroma_1,chroma_12,0.865777
20,chroma_2,chroma_3,0.885658
21,chroma_3,chroma_4,0.898538
22,chroma_4,chroma_5,0.901052
27,chroma_5,chroma_6,0.840091
34,chroma_6,chroma_7,0.867901
35,chroma_8,chroma_9,0.777947
36,chroma_9,chroma_10,0.758239
37,chroma_10,chroma_11,0.766145


In [19]:
high_corr_df[high_corr_df['Feature 2'].str.contains('spectral')]

Unnamed: 0,Feature 1,Feature 2,Correlation
23,chroma_4,spectral_6,-0.780319
24,chroma_4,spectral_7,0.766419
28,chroma_5,spectral_5,-0.752506
29,chroma_5,spectral_6,-0.823902
30,chroma_5,spectral_7,0.815634
40,spectral_2,spectral_3,0.756604
41,spectral_2,spectral_4,0.776798
42,spectral_2,spectral_5,0.759097
47,spectral_3,spectral_4,0.799536
48,spectral_3,spectral_5,0.769417


In [20]:
dropped_data.columns


Index(['speech', 'label', 'mfcc_1', 'mfcc_2', 'mfcc_3', 'mfcc_4', 'mfcc_5',
       'mfcc_6', 'mfcc_7', 'mfcc_9', 'mfcc_10', 'mfcc_12', 'mfcc_14',
       'mfcc_15', 'mfcc_17', 'mfcc_19', 'mfcc_20', 'mfcc_22', 'mfcc_23',
       'mfcc_24', 'mfcc_25', 'mfcc_26', 'mfcc_27', 'mfcc_28', 'mfcc_29',
       'mfcc_30', 'mfcc_31', 'mfcc_32', 'mfcc_33', 'mfcc_34', 'mfcc_36',
       'mfcc_37', 'mfcc_38', 'mfcc_39', 'mfcc_40', 'chroma_1', 'chroma_8',
       'spectral_1', 'spectral_2'],
      dtype='object')

### Tonnetz
It seems that tonnetz columns was completely dropped. They are all highly correlated to spectral features (>0.75). Spectral_X is highly correlated to Chroma_X feature

### Chroma
Chroma features are highly correlated to each other.

### Spectral
Spectral features are highly correlated to each other.


In [21]:
dropped_data.to_csv('IS4242 Data.csv', index=False)