In [None]:

#IMPORT THE LIBRARIES
import pandas as pd
import numpy as np

import os
import sys

# librosa is a Python library for analyzing audio and music; it is used below to load the audio files and extract features from them.
import librosa
import librosa.display
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# to play the audio files
import IPython.display as ipd
from IPython.display import Audio
import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM,BatchNormalization , GRU
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.layers import Input, Flatten, Dropout, Activation
from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import SGD



import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning) 
import tensorflow as tf 
print ("Done")

# Importing Data 

In [None]:

# Root folder of the TESS dataset, relative to the notebook's working directory.
# Keep the trailing "/": the loading cell appends sub-folder names directly to this value.
Tess = "tess toronto emotional speech set data/TESS Toronto emotional speech set data/"

# Preprocessing

**TESS**

In [None]:
# Build one (emotion, path) record per TESS clip.
# The label is the third "_"-separated token of the file name (extension dropped);
# the token "ps" is mapped to "surprise".
# Listings are sorted because os.listdir() returns entries in arbitrary order, which
# would otherwise make the row order (and therefore the seeded split) machine-dependent.
tess_directory_list = sorted(os.listdir(Tess))

file_emotion = []
file_path = []

for folder_name in tess_directory_list:
    folder_path = os.path.join(Tess, folder_name)
    if not os.path.isdir(folder_path):
        # Stray files next to the emotion folders are not part of the dataset.
        continue
    for file_name in sorted(os.listdir(folder_path)):
        name_parts = file_name.split('.')[0].split('_')
        if len(name_parts) < 3:
            # Names without an emotion token (e.g. hidden OS files) cannot be labelled.
            continue
        label = name_parts[2]
        file_emotion.append('surprise' if label == 'ps' else label)
        file_path.append(os.path.join(folder_path, file_name))

# dataframe for emotion of files
emotion_df = pd.DataFrame(file_emotion, columns=['Emotions'])

# dataframe for path of files
path_df = pd.DataFrame(file_path, columns=['Path'])

# Side-by-side table: one row per clip with its label and location on disk.
Tess_df = pd.concat([emotion_df, path_df], axis=1)
print(Tess_df.Emotions.value_counts())


**Integration**

In [None]:
# Only the TESS dataframe is built in this notebook, so the combined table is just Tess_df.
# Note: this binds a second name to the same DataFrame object (no copy is made).
data_path = Tess_df
# Persist the (emotion, path) index so it can be reused without re-scanning the folders.
data_path.to_csv("data_path.csv",index=False)
data_path.head()

In [None]:
# Number of clips per emotion label in the combined table.
print(data_path.Emotions.value_counts())


# Data Visualisation and Exploration

In [None]:
# Class-balance check: one bar per emotion label.
# The column is passed by keyword (data=/x=) because recent seaborn releases no longer
# accept the plotted variable as a bare positional argument.
# plt and sns are already imported in the first cell, so they are not re-imported here.
fig, ax = plt.subplots()
sns.countplot(data=data_path, x='Emotions', ax=ax)
ax.set_title('Count of Emotions', size=16)
ax.set_ylabel('Count', size=12)
ax.set_xlabel('Emotions', size=12)
sns.despine(ax=ax, top=True, right=True, left=False, bottom=False)
plt.show()

In [None]:
# Load the first listed clip as the running example for the cells below.
# No `sr` is passed, so librosa chooses the sampling rate; the value is displayed as the cell output.
data,sr = librosa.load(file_path[0])
sr

In [None]:
# Inline audio player for the example clip.
ipd.Audio(data,rate=sr)

In [None]:
# Visual sanity check of the two feature families on the example clip.
# The arrays are deliberately NOT named `mfcc` / `chroma`: a function called `mfcc`
# is defined further down, and re-running this cell afterwards would otherwise
# overwrite that function with an array.
mfcc_matrix = librosa.feature.mfcc(y=data, sr=sr, n_mfcc=30)
chroma_matrix = librosa.feature.chroma_stft(y=data, sr=sr)

# Two stacked panels (one per feature), each with its own colour bar.
fig, (ax_mfcc, ax_chroma) = plt.subplots(2, 1, figsize=(16, 10))

# MFCC
mfcc_img = librosa.display.specshow(mfcc_matrix, x_axis='time', ax=ax_mfcc)
ax_mfcc.set_ylabel('MFCC')
fig.colorbar(mfcc_img, ax=ax_mfcc)

# Chroma
chroma_img = librosa.display.specshow(chroma_matrix, x_axis='time', ax=ax_chroma)
ax_chroma.set_ylabel('Chroma')
fig.colorbar(chroma_img, ax=ax_chroma)

ipd.Audio(data, rate=sr)

# Train-Test Split BEFORE Augmentation

**Important: We split the data first, then apply augmentation only to training data**

In [None]:
# Hold out 20% of the clips for testing; stratifying on the label keeps the
# per-emotion proportions the same in both partitions, and the fixed seed makes
# the partition repeatable for a given row order.
emotion_labels = data_path['Emotions']
X_train_paths, X_test_paths, y_train, y_test = train_test_split(
    data_path['Path'],
    emotion_labels,
    stratify=emotion_labels,
    test_size=0.2,
    random_state=42,
)

# Report partition sizes and per-class counts.
print(f"Training set size: {len(X_train_paths)}")
print(f"Test set size: {len(X_test_paths)}")
print(f"Training emotion distribution:\n{y_train.value_counts()}")
print(f"Test emotion distribution:\n{y_test.value_counts()}")

# Data augmentation functions

In [None]:
# NOISE
def noise(data):
    """Return a noisy copy of `data` (the input array is not modified).

    Gaussian noise is scaled by a random fraction (uniform in [0, 1)) of
    3.5% of the signal's maximum sample value and added sample-wise.
    Uses NumPy's global random state.
    """
    scale = np.random.uniform()
    amplitude = 0.035 * scale * np.amax(data)
    jitter = np.random.normal(size=data.shape[0])
    return data + amplitude * jitter
    
# PITCH
def pitch(data, sampling_rate, pitch_factor=0.7):
    """Pitch-shift `data` by `pitch_factor` steps using librosa.

    Parameters
    ----------
    data : audio time series passed to librosa as `y`.
    sampling_rate : sampling rate of `data`, passed as `sr`.
    pitch_factor : number of steps to shift by, passed as `n_steps`.

    Arguments are passed by keyword: librosa 0.10 made `sr` and `n_steps`
    keyword-only, so the positional form raises TypeError there, while the
    keyword form is also accepted by older releases.
    """
    return librosa.effects.pitch_shift(y=data, sr=sampling_rate, n_steps=pitch_factor)

# STRETCH (time stretching)
def stretch(data, rate=0.8):
    """Time-stretch `data` by `rate` using librosa.

    Note that the returned signal generally has a different number of samples
    than the input, so callers that need fixed-length features must re-fit it.

    Arguments are passed by keyword: librosa 0.10 made `rate` keyword-only,
    so the positional form raises TypeError there.
    """
    return librosa.effects.time_stretch(y=data, rate=rate)

# SHIFT (time shifting)
def shift(data):
    """Circularly rotate `data` by a random whole number of samples.

    The offset is drawn uniformly from [-5, 5), multiplied by 1000 and
    truncated to an int, i.e. up to roughly 5000 samples in either direction.
    Samples pushed past one end wrap around to the other (np.roll).
    Uses NumPy's global random state.
    """
    offset = np.random.uniform(low=-5, high=5) * 1000
    return np.roll(data, int(offset))

In [None]:
# NORMAL AUDIO
# Waveform and player for the unmodified example clip, as a baseline for the
# augmented versions shown in the following cells.
fig, ax = plt.subplots(figsize=(12, 5))
librosa.display.waveshow(y=data, sr=sr, ax=ax)
ipd.Audio(data, rate=sr)

In [None]:
# AUDIO WITH NOISE
# Waveform and player for the example clip after noise().
x = noise(data)
fig, ax = plt.subplots(figsize=(12, 5))
librosa.display.waveshow(y=x, sr=sr, ax=ax)
ipd.Audio(x, rate=sr)

In [None]:
# STRETCHED AUDIO
# Waveform and player for the example clip after stretch().
x = stretch(data)
fig, ax = plt.subplots(figsize=(12, 5))
librosa.display.waveshow(y=x, sr=sr, ax=ax)
ipd.Audio(x, rate=sr)

In [None]:
# SHIFTED AUDIO
# Waveform and player for the example clip after shift().
x = shift(data)
fig, ax = plt.subplots(figsize=(12, 5))
librosa.display.waveshow(y=x, sr=sr, ax=ax)
ipd.Audio(x, rate=sr)

In [None]:
# AUDIO WITH PITCH
# Waveform and player for the example clip after pitch().
x = pitch(data, sr)
fig, ax = plt.subplots(figsize=(12, 5))
librosa.display.waveshow(y=x, sr=sr, ax=ax)
ipd.Audio(x, rate=sr)

# Feature extraction functions

**Combined MFCC and Chroma feature extraction for 1D CNN + Attention architecture**

In [None]:
def zcr(data,frame_length,hop_length):
    """Zero-crossing rate of `data`, with singleton dimensions removed.

    `frame_length` and `hop_length` are forwarded unchanged to
    librosa.feature.zero_crossing_rate.
    """
    rates = librosa.feature.zero_crossing_rate(
        data,
        frame_length=frame_length,
        hop_length=hop_length,
    )
    return np.squeeze(rates)

def rmse(data,frame_length=2048,hop_length=512):
    """Frame-wise RMS energy of `data`, with singleton dimensions removed.

    `frame_length` and `hop_length` are forwarded to librosa.feature.rms.
    The signal is passed as `y=`: librosa 0.10 made it keyword-only, so the
    positional form raises TypeError there.
    """
    energy = librosa.feature.rms(y=data, frame_length=frame_length, hop_length=hop_length)
    return np.squeeze(energy)

def mfcc(data,sr,frame_length=2048,hop_length=512,flatten:bool=True):
    """MFCCs of `data`, returned frame-major.

    Parameters
    ----------
    data : audio time series, passed to librosa as `y`.
    sr : sampling rate of `data`.
    frame_length : analysis window size, forwarded as `n_fft`.
    hop_length : hop between frames, forwarded as `hop_length`.
    flatten : if True (default) return the transposed matrix raveled to 1-D;
        otherwise return the transposed matrix with singleton dimensions removed.

    The signal is passed as `y=` because librosa 0.10 made it keyword-only.
    `frame_length` / `hop_length` were previously accepted but never used;
    they are now forwarded, which leaves results unchanged at the defaults
    (2048 / 512 are also librosa's defaults).
    """
    coeffs = librosa.feature.mfcc(y=data, sr=sr, n_fft=frame_length, hop_length=hop_length)
    frames = coeffs.T
    return np.ravel(frames) if flatten else np.squeeze(frames)

def calculate_chroma(data, sr, hop_length=512, flatten=True):
    """Chroma (STFT-based) features of `data`, returned frame-major.

    With `flatten=True` (default) the transposed chroma matrix is raveled to
    1-D; otherwise the transposed matrix is returned with singleton
    dimensions removed.
    """
    chroma_frames = librosa.feature.chroma_stft(y=data, sr=sr, hop_length=hop_length).T
    if flatten:
        return np.ravel(chroma_frames)
    return np.squeeze(chroma_frames)

def extract_features(data,sr=22050,frame_length=2048,hop_length=512):
    """Return one flat feature vector for `data`: MFCC values followed by chroma values.

    `sr`, `frame_length` and `hop_length` are handed to mfcc(); `sr` and
    `hop_length` are handed to calculate_chroma().
    """
    mfcc_vector = mfcc(data, sr, frame_length, hop_length)
    chroma_vector = calculate_chroma(data, sr, hop_length)
    # The empty float array in front is kept on purpose: it takes part in
    # hstack's dtype promotion exactly as the original accumulator did.
    return np.hstack((np.array([]), mfcc_vector, chroma_vector))

def _fit_length(signal, length):
    """Trim or zero-pad the end of a 1-D signal so it has exactly `length` samples."""
    if len(signal) >= length:
        return signal[:length]
    return np.pad(signal, (0, length - len(signal)), mode='constant')


def get_features_with_augmentation(path,duration=2.5, offset=0.6):
    """Extract features with augmentation - only for training data.

    Loads `duration` seconds of the file at `path`, starting `offset` seconds
    in, and returns a 2-D array with one feature row per variant, in this
    order: original, noise, pitch, stretch, shift, pitch + noise.

    Every variant is trimmed or zero-padded to the length of the loaded clip
    before feature extraction. Without this the time-stretched variant has a
    different number of samples, hence a different feature length, and the
    rows cannot be stacked (np.vstack raises ValueError).

    NOTE(review): clips of different loaded lengths still produce rows of
    different widths across files - confirm how downstream code handles that.
    """
    data, sr = librosa.load(path, duration=duration, offset=offset)
    n_samples = len(data)

    # pitch() involves no randomness, so it is computed once and reused for the
    # combined pitch + noise variant; the order of the random augmentations
    # (noise, shift, noise) is the same as before.
    noised_audio = noise(data)
    pitched_audio = pitch(data, sr)
    stretched_audio = stretch(data)
    shifted_audio = shift(data)
    pitched_noised_audio = noise(pitched_audio)

    variants = (
        data,
        noised_audio,
        pitched_audio,
        stretched_audio,
        shifted_audio,
        pitched_noised_audio,
    )
    rows = [extract_features(_fit_length(variant, n_samples), sr) for variant in variants]
    return np.vstack(rows)

def get_features_no_augmentation(path,duration=2.5, offset=0.6):
    """Extract features without augmentation - for test data.

    Loads `duration` seconds of the file at `path`, starting `offset` seconds
    in, and returns the feature vector of that clip as a NumPy array.
    """
    signal, _ = librosa.load(path, duration=duration, offset=offset)
    return np.array(extract_features(signal))

In [None]:
# Report how many CPU cores this machine exposes.
# NOTE(review): the extraction loops below run serially and `mp` is not used in any other visible cell.
import multiprocessing as mp
print("Number of processors: ", mp.cpu_count())

# Feature extraction for Training Data (WITH augmentation)

In [None]:
import timeit
from tqdm import tqdm

# Extract features for every training clip. Each clip yields several rows
# (the original plus its augmented variants), all sharing the clip's label.
print("Processing training data with augmentation...")
start = timeit.default_timer()
X_train_features, y_train_extended = [], []

# `total` is passed explicitly because zip() has no length, so tqdm could not
# otherwise show a percentage or an ETA.
train_pairs = zip(X_train_paths, y_train)
for index, (path, emotion) in enumerate(tqdm(train_pairs, total=len(X_train_paths))):
    features = get_features_with_augmentation(path)

    # Count the clip that has just finished, so the message matches reality.
    processed = index + 1
    if processed % 500 == 0:
        print(f'{processed} training audio files have been processed')

    # Each audio file generates multiple augmented versions
    X_train_features.extend(features)
    y_train_extended.extend([emotion] * len(features))

stop = timeit.default_timer()
print(f'Training data processing completed in {stop - start:.2f} seconds')
print(f'Training features shape: {len(X_train_features)} samples')

# Feature extraction for Test Data (WITHOUT augmentation)

In [None]:
# Extract features for every held-out clip: one row per clip, no augmentation.
# `timeit` and `tqdm` are imported in the training-extraction cell above.
print("Processing test data without augmentation...")
start = timeit.default_timer()
X_test_features, y_test_list = [], []

# `total` is passed explicitly because zip() has no length, so tqdm could not
# otherwise show a percentage or an ETA.
test_pairs = zip(X_test_paths, y_test)
for index, (path, emotion) in enumerate(tqdm(test_pairs, total=len(X_test_paths))):
    features = get_features_no_augmentation(path)

    # Count the clip that has just finished, so the message matches reality.
    processed = index + 1
    if processed % 100 == 0:
        print(f'{processed} test audio files have been processed')

    # Test data: only original features, no augmentation
    X_test_features.append(features)
    y_test_list.append(emotion)

stop = timeit.default_timer()
print(f'Test data processing completed in {stop - start:.2f} seconds')
print(f'Test features shape: {len(X_test_features)} samples')

# Data summary

In [None]:
# Sanity report: sizes before/after augmentation and the resulting row multiplier.
print(f"Original data size: {len(data_path)}")
print(f"Training set size (before augmentation): {len(X_train_paths)}")
print(f"Training set size (after augmentation): {len(X_train_features)}")
print(f"Test set size (no augmentation): {len(X_test_features)}")
# Rows produced per training clip; NOTE(review): divides by len(X_train_paths) without an empty-set guard.
print(f"Augmentation factor for training: {len(X_train_features) / len(X_train_paths):.1f}x")

# Check feature dimensions (length of the first training row only; other rows are not inspected here)
if len(X_train_features) > 0:
    print(f"Feature vector dimension: {len(X_train_features[0])}")

# Check emotion distribution in augmented training set and in the untouched test set
train_emotion_counts = pd.Series(y_train_extended).value_counts()
print(f"\nTraining emotion distribution (after augmentation):\n{train_emotion_counts}")
test_emotion_counts = pd.Series(y_test_list).value_counts()
print(f"\nTest emotion distribution:\n{test_emotion_counts}")

# Saving processed features

In [None]:
# Write both feature tables to CSV: one row per sample, feature columns first
# and the label in a trailing 'Emotions' column.

# Training set (augmented)
Train_features = pd.DataFrame(X_train_features).assign(Emotions=y_train_extended)
Train_features.to_csv('train_features_mfcc_chroma_aug.csv', index=False)
print("Training features saved to 'train_features_mfcc_chroma_aug.csv'")

# Test set (not augmented)
Test_features = pd.DataFrame(X_test_features).assign(Emotions=y_test_list)
Test_features.to_csv('test_features_mfcc_chroma_no_aug.csv', index=False)
print("Test features saved to 'test_features_mfcc_chroma_no_aug.csv'")

# Quick look at the first rows of each table
print("\nTraining features preview:")
print(Train_features.head())
print("\nTest features preview:")
print(Test_features.head())

# Loading saved features (for subsequent use)

In [None]:
# Example of loading the processed features for model training
# Train_features = pd.read_csv('train_features_mfcc_chroma_aug.csv')
# Test_features = pd.read_csv('test_features_mfcc_chroma_no_aug.csv')
# print("Features loaded successfully!")
# print(f"Training data shape: {Train_features.shape}")
# print(f"Test data shape: {Test_features.shape}")