In [53]:

#IMPORT THE LIBRARIES
import pandas as pd
import numpy as np

import os
import sys

# librosa is a Python library for analyzing audio and music; it is used below to load the audio files and extract features from them.
import librosa
import librosa.display
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# to play the audio files
import IPython.display as ipd
from IPython.display import Audio
import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM,BatchNormalization , GRU
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.layers import Input, Flatten, Dropout, Activation
from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import SGD



import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning) 
import tensorflow as tf 
print ("Done")

Done


Preparing Datasets

In [None]:
ravdess = "/kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/"
Crema = "/kaggle/input/cremad/AudioWAV/"
Tess = "/kaggle/input/toronto-emotional-speech-set-tess/tess toronto emotional speech set data/TESS Toronto emotional speech set data/"
Savee = "/kaggle/input/surrey-audiovisual-expressed-emotion-savee/ALL/"

**Pre Processing**

ravdess

In [None]:
# Walk the RAVDESS tree and record, for every recording, its numeric emotion
# code and its full path.
file_emotion = []
file_path = []
ravdess_directory_list = os.listdir(ravdess)
for i in ravdess_directory_list:
    # The dataset root holds one sub-directory per actor; list each actor's files.
    actor = os.listdir(ravdess + i)
    for f in actor:
        part = f.split('.')[0].split('-')
        # The third dash-separated field of the file name is the emotion code.
        file_emotion.append(int(part[2]))
        file_path.append(ravdess + i + '/' + f)


# dataframe for emotion of files
emotion_df = pd.DataFrame(file_emotion, columns=['Emotions'])
# dataframe for path of files.
path_df = pd.DataFrame(file_path, columns=['Path'])
ravdess_df = pd.concat([emotion_df, path_df], axis=1)

# Translate the integer codes into emotion names. Code 2 is deliberately
# folded into 'neutral' together with code 1.
# The result is assigned back to the column: calling
# `ravdess_df.Emotions.replace(..., inplace=True)` mutates a temporary Series
# and is a silent no-op when pandas Copy-on-Write is active.
ravdess_emotion_names = {
    1: 'neutral', 2: 'neutral', 3: 'happy', 4: 'sad',
    5: 'angry', 6: 'fear', 7: 'disgust', 8: 'surprise',
}
ravdess_df['Emotions'] = ravdess_df['Emotions'].replace(ravdess_emotion_names)

print(ravdess_df.head())
print("______________________________________________")
print(ravdess_df.tail())
print("_______________________________________________")
print(ravdess_df.Emotions.value_counts())


    

**CremaD**

In [None]:
# All CREMA-D clips are listed from one flat directory; the emotion is the
# third underscore-separated field of each file name.
crema_directory_list = os.listdir(Crema)

# File-name code -> emotion label. Codes not listed here become 'Unknown'.
crema_code_to_emotion = {
    'SAD': 'sad',
    'ANG': 'angry',
    'DIS': 'disgust',
    'FEA': 'fear',
    'HAP': 'happy',
    'NEU': 'neutral',
}

file_path = [Crema + wav_name for wav_name in crema_directory_list]
file_emotion = [
    crema_code_to_emotion.get(wav_name.split('_')[2], 'Unknown')
    for wav_name in crema_directory_list
]

# One row per clip: emotion label plus full path.
Crema_df = pd.DataFrame({'Emotions': file_emotion, 'Path': file_path})
print(Crema_df.Emotions.value_counts())


**TESS**

In [None]:
# TESS is organised as one sub-directory per group of clips; the emotion is
# the third underscore-separated field of the file name (extension removed).
tess_directory_list = os.listdir(Tess)

file_emotion = []
file_path = []

# The loop variable is named `emotion_dir` rather than `dir` so the builtin
# dir() is not shadowed for the rest of the notebook session.
for emotion_dir in tess_directory_list:
    for wav_name in os.listdir(Tess + emotion_dir):
        label = wav_name.split('.')[0].split('_')[2]
        # 'ps' is the file-name tag that this notebook maps to 'surprise';
        # every other tag is already the emotion name and is used as-is.
        file_emotion.append('surprise' if label == 'ps' else label)
        file_path.append(Tess + emotion_dir + '/' + wav_name)

# One row per clip: emotion label plus full path.
Tess_df = pd.DataFrame({'Emotions': file_emotion, 'Path': file_path})
print(Tess_df.Emotions.value_counts())


**SAVEE**

In [None]:
# SAVEE clips are listed from one flat directory. The emotion prefix is what
# remains of the second underscore-separated field after dropping its last
# six characters (presumably a two-digit index plus '.wav' -- verify against
# the actual file names).
savee_directory_list = os.listdir(Savee)

# Prefix -> emotion label. Any prefix not listed here is labelled 'surprise'.
savee_prefix_to_emotion = {
    'a': 'angry',
    'd': 'disgust',
    'f': 'fear',
    'h': 'happy',
    'n': 'neutral',
    'sa': 'sad',
}

file_emotion = []
file_path = []
for wav_name in savee_directory_list:
    file_path.append(Savee + wav_name)
    prefix = wav_name.split('_')[1][:-6]
    file_emotion.append(savee_prefix_to_emotion.get(prefix, 'surprise'))

# One row per clip: emotion label plus full path.
Savee_df = pd.DataFrame({'Emotions': file_emotion, 'Path': file_path})
print(Savee_df.Emotions.value_counts())


In [None]:
# Stack the four per-corpus frames into a single (Emotions, Path) table.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# every source frame contributes its own 0..k labels, so row labels repeat and
# label-based lookups (e.g. data_path.loc[0]) return several rows.
data_path = pd.concat(
    [ravdess_df, Savee_df, Tess_df, Crema_df],
    axis=0,
    ignore_index=True,
)
# Persist the file list so later runs can inspect it without rescanning.
data_path.to_csv("data_path.csv", index=False)
data_path.head()

The combined table stacks `ravdess_df`, `Savee_df`, `Tess_df`, and `Crema_df`.

In [None]:
print(data_path.Emotions.value_counts())
data_path.shape

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.title('Count of Emotions', size=16)
sns.countplot(x='Emotions', data=data_path)
plt.ylabel('Count', size=12)
plt.xlabel('Emotions', size=12)
plt.show()


In [None]:
# Load one sample clip for inspection.
# NOTE: `file_path` is not the combined table; it is whichever list the most
# recently executed dataset cell left behind (the SAVEE cell when the notebook
# is run top to bottom), so this loads the first file listed by that cell.
data,sr = librosa.load(file_path[0])
data.shape

In [None]:
mfcc = librosa.feature.mfcc(y=data, sr=sr, n_mfcc=30)
print(mfcc.shape)
# MFCC
plt.figure(figsize=(16, 10))
plt.subplot(3,1,1)
librosa.display.specshow(mfcc, x_axis='time')
plt.ylabel('MFCC')
plt.colorbar()

ipd.Audio(data,rate=sr)

# Data Augmentation

In [None]:
import numpy as np
import librosa
import scipy.signal

# Augmentation Functions
def add_noise(data, noise_factor=0.035):
    """Return ``data`` with random Gaussian noise added.

    The noise amplitude is ``noise_factor`` multiplied by one uniform draw
    from ``np.random.uniform()`` and by the largest value in ``data``; that
    amplitude scales a standard-normal array of the same shape as ``data``.
    The input array is not modified. Uses NumPy's global random state.
    """
    amplitude = noise_factor * np.random.uniform() * np.amax(data)
    gaussian = np.random.normal(size=data.shape)
    return data + amplitude * gaussian

def stretch(data, rate=0.8):
    """Time-stretch ``data`` by ``rate`` via ``librosa.effects.time_stretch``.

    ``data`` and ``rate`` are passed straight through as the ``y`` and
    ``rate`` keyword arguments; the stretched signal is returned.
    """
    stretched = librosa.effects.time_stretch(y=data, rate=rate)
    return stretched

def shift(data, max_shift=5):
    """Circularly shift ``data`` by a random number of samples.

    The offset is a uniform draw from ``[-max_shift, max_shift)`` multiplied
    by 1000 and truncated to an integer. ``np.roll`` wraps samples that fall
    off one end around to the other, so the length is unchanged. Uses NumPy's
    global random state.
    """
    offset = np.random.uniform(low=-max_shift, high=max_shift) * 1000
    return np.roll(data, int(offset))

def pitch(data, sampling_rate, pitch_factor=0.7):
    """Pitch-shift ``data`` via ``librosa.effects.pitch_shift``.

    ``pitch_factor`` is forwarded as librosa's ``n_steps`` argument and
    ``sampling_rate`` as ``sr``; the shifted signal is returned.
    """
    shifted = librosa.effects.pitch_shift(
        y=data,
        sr=sampling_rate,
        n_steps=pitch_factor,
    )
    return shifted
def equalize(data, sr, cutoff=3000):
    """High-pass filter ``data`` with a 6th-order Butterworth filter.

    Note that this is a high-pass filter (``btype='high'``): it attenuates
    content below ``cutoff`` Hz rather than boosting the high band.
    ``cutoff`` is normalised by the Nyquist frequency (``0.5 * sr``), so it
    must be below ``sr / 2``. The filter is applied with
    ``scipy.signal.filtfilt`` (a forward and a backward pass).
    """
    nyquist = 0.5 * sr
    normalised_cutoff = cutoff / nyquist
    numerator, denominator = scipy.signal.butter(
        6, normalised_cutoff, btype='high', analog=False
    )
    return scipy.signal.filtfilt(numerator, denominator, data)

# Feature Extraction

# basic feature extraction

In [None]:
# def mfcc(data, sr, frame_length=2048, hop_length=512, flatten: bool = True):
#     mfcc = librosa.feature.mfcc(y=data, sr=sr, n_mfcc=13, n_fft=frame_length, hop_length=hop_length)
#     return np.squeeze(mfcc.T) if not flatten else np.ravel(mfcc.T)

# def extract_features(data, sr=22050, frame_length=2048, hop_length=512):
#     return mfcc(data, sr, frame_length, hop_length)

# # Final get_features function
# def get_features(path, duration=2.5, offset=0.6):
#     data, sr = librosa.load(path, duration=duration, offset=offset)
#     audio = [extract_features(data, sr)]

#     # Apply augmentations
#     noised = add_noise(data)
#     audio.append(extract_features(noised, sr))

#     pitched = pitch(data, sr)
#     audio.append(extract_features(pitched, sr))

#     noised_pitched = add_noise(pitched)
#     audio.append(extract_features(noised_pitched, sr))

#     equalized = equalize(data, sr)
#     audio.append(extract_features(equalized, sr))

#     return np.vstack(audio)

# Keeping the 2d structure

In [None]:
# import numpy as np
# import librosa

# def mfcc(data, sr, frame_length=2048, hop_length=512):
#     """
#     Compute MFCCs and return as a 2D array (time_steps, n_mfcc).
#     """
#     m = librosa.feature.mfcc(
#         y=data, sr=sr,
#         n_mfcc=13,
#         n_fft=frame_length,
#         hop_length=hop_length
#     )
#     # m has shape (n_mfcc, time_steps) → transpose to (time_steps, n_mfcc)
#     return m.T

# def extract_features(data, sr=22050, frame_length=2048, hop_length=512):
#     """
#     Wrapper that just returns the 2D MFCC array.
#     """
#     return mfcc(data, sr, frame_length, hop_length)

# def get_features(path, duration=2.5, offset=0.6):
#     """
#     Loads the audio, applies augmentations, and returns
#     a single NumPy array of shape (num_augs, time_steps, n_mfcc).
#     """
#     # 1) Load and compute on clean signal
#     data, sr = librosa.load(path, duration=duration, offset=offset)
#     feats = [extract_features(data, sr)]

#     # 2) Noise
#     noised = add_noise(data)
#     feats.append(extract_features(noised, sr))

#     # 3) Pitch shift
#     pitched = pitch(data, sr)
#     feats.append(extract_features(pitched, sr))

#     # 4) Pitch + noise
#     noised_pitched = add_noise(pitched)
#     feats.append(extract_features(noised_pitched, sr))

#     # 5) Equalization
#     equalized = equalize(data, sr)
#     feats.append(extract_features(equalized, sr))

#     # Stack into shape (5, time_steps, n_mfcc)
#     return np.stack(feats, axis=0)

# Using DelMFCC and Del2MFCC

In [None]:
# import numpy as np
# import librosa

# def extract_mfcc_features(data, sr, frame_length=2048, hop_length=512, flatten=True):
#     # Base MFCCs
#     mfcc = librosa.feature.mfcc(y=data, sr=sr, n_mfcc=13, n_fft=frame_length, hop_length=hop_length)
    
#     # First-order derivative (delta)
#     mfcc_delta = librosa.feature.delta(mfcc)
    
#     # Second-order derivative (delta-delta)
#     mfcc_delta2 = librosa.feature.delta(mfcc, order=2)
    
#     # Stack: (n_mfcc * 3, time) → Transpose to (time, features)
#     features = np.vstack([mfcc, mfcc_delta, mfcc_delta2])
    
#     if flatten:
#         return np.ravel(features.T)
#     else:
#         return np.squeeze(features.T)

# def extract_features(data, sr=22050, frame_length=2048, hop_length=512):
#     return extract_mfcc_features(data, sr, frame_length, hop_length)

# def get_features(path, duration=2.5, offset=0.6):
#     data, sr = librosa.load(path, duration=duration, offset=offset)
#     audio = [extract_features(data, sr)]

#     # Apply augmentations
#     audio.append(extract_features(add_noise(data), sr))
#     audio.append(extract_features(pitch(data, sr), sr))
#     audio.append(extract_features(add_noise(pitch(data, sr)), sr))
#     audio.append(extract_features(equalize(data, sr), sr))

#     return np.vstack(audio)


keeping the 2d structure

In [None]:
import numpy as np
import librosa

def mfcc(data, sr, frame_length=2048, hop_length=512):
    """Build a per-frame feature matrix of MFCCs and their derivatives.

    Computes 13 MFCCs (``n_fft=frame_length``, ``hop_length=hop_length``),
    then ``librosa.feature.delta`` of them at the default order and at
    ``order=2``. The three (13, T) arrays are stacked row-wise into (39, T)
    and transposed, so the result has one row per frame and 39 columns.
    """
    base = librosa.feature.mfcc(
        y=data,
        sr=sr,
        n_mfcc=13,
        n_fft=frame_length,
        hop_length=hop_length,
    )
    first_order = librosa.feature.delta(base)
    second_order = librosa.feature.delta(base, order=2)

    # Rows: 13 MFCC, 13 delta, 13 delta-delta -> (39, T); transpose to (T, 39).
    stacked = np.concatenate((base, first_order, second_order), axis=0)
    return stacked.T

def extract_features(data, sr=22050, frame_length=2048, hop_length=512):
    """Return the feature matrix for ``data`` by delegating to ``mfcc``.

    All four arguments are forwarded unchanged.
    """
    return mfcc(data, sr, frame_length=frame_length, hop_length=hop_length)

def get_features(path, duration=2.5, offset=0.6):
    """Load one clip and return features for it and four augmented copies.

    The clip is read with ``librosa.load`` using ``duration`` and ``offset``.
    Features are extracted, in this order, from: the clean signal, a noisy
    copy, a pitch-shifted copy, a noisy pitch-shifted copy, and a filtered
    copy from ``equalize``. The five feature matrices are stacked along a new
    leading axis, giving shape (5, time_steps, 39).
    """
    data, sr = librosa.load(path, duration=duration, offset=offset)

    # Build the augmented signals first, in the same order the random draws
    # were consumed before, then extract features from each.
    noised = add_noise(data)
    pitched = pitch(data, sr)
    variants = [
        data,
        noised,
        pitched,
        add_noise(pitched),
        equalize(data, sr),
    ]

    return np.stack([extract_features(signal, sr) for signal in variants], axis=0)


In [None]:
import multiprocessing as mp
print("Number of processors: ", mp.cpu_count())

# Get Features

In [None]:
import timeit
from tqdm import tqdm

# Extract features for every clip listed in data_path.
# X collects one feature matrix per (clip, augmentation) pair and Y the
# matching emotion label, so the matrices of one clip sit back to back in X.
start = timeit.default_timer()
X, Y = [], []
clips = zip(data_path.Path, data_path.Emotions)
# zip() has no length, so the total is passed explicitly; without it tqdm can
# only show a running count instead of a real progress bar with an ETA.
for path, emotion in tqdm(clips, total=len(data_path), desc="Extracting features"):
    features = get_features(path)
    # Iterating the stacked array yields one 2-D matrix per augmentation.
    X.extend(features)
    Y.extend([emotion] * len(features))
print('Done')
stop = timeit.default_timer()

print('Time: ', stop - start)

In [None]:
len(X), len(Y), data_path.Path.shape

# Saving Features

In [None]:
# Emotions = pd.DataFrame(X)
# Emotions['Emotions'] = Y
# Emotions.to_csv('emotion.csv', index=False)
# Emotions.head()

In [None]:
# Emotions = pd.read_csv('./emotion.csv')
# Emotions.head()

In [None]:
# print(Emotions.isna().any())

In [None]:
# Emotions=Emotions.fillna(0)
# print(Emotions.isna().any())
# Emotions.shape

In [None]:
# np.sum(Emotions.isna())

# Data Preparation

In [None]:
# #taking all rows and all cols without last col for X which include features
# #taking last col for Y, which include the emotions


# X = Emotions.iloc[: ,:-1].values
# Y = Emotions['Emotions'].values
# print(Y)

In [None]:
# # As this is a multiclass classification problem onehotencoding our Y
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# encoder = OneHotEncoder()
# Y = encoder.fit_transform(np.array(Y).reshape(-1,1)).toarray()

# label_order = encoder.categories_[0].tolist()
# print(label_order)


In [None]:
# encoder.categories_[0].tolist()

In [None]:
# print(Y.shape)
# X.shape

In [None]:
# from sklearn.model_selection import train_test_split

# x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=42,test_size=0.2, shuffle=True)
# x_train.shape, y_train.shape, x_test.shape, y_test.shape

In [None]:
# #reshape for gru
# X_train = x_train.reshape(x_train.shape[0] , x_train.shape[1] , 1)
# X_test = x_test.reshape(x_test.shape[0] , x_test.shape[1] , 1)

In [None]:
# # scaling our data with sklearn's Standard scaler
# scaler = StandardScaler()
# x_train = scaler.fit_transform(x_train)
# x_test = scaler.transform(x_test)
# x_train.shape, y_train.shape, x_test.shape, y_test.shape

In [None]:
# import keras
# from keras.preprocessing import sequence
# from keras.models import Sequential
# from keras.layers import Dense, Embedding
# from keras.layers import LSTM,BatchNormalization , GRU
# from tensorflow.keras.utils import to_categorical
# from keras.layers import Input, Flatten, Dropout, Activation
# from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D
# from keras.models import Model
# from keras.callbacks import ModelCheckpoint
# from tensorflow.keras.optimizers import SGD
# from keras.saving import register_keras_serializable

In [None]:
# #Reshape for CNN_GRU MODEL
# x_traincnn = np.expand_dims(x_train, axis=-1)
# x_testcnn = np.expand_dims(x_test, axis=-1)
# x_traincnn.shape, y_train.shape, x_testcnn.shape, y_test.shape
# #x_testcnn[0]

# Model Code

In [None]:
# import tensorflow as tf
# from tensorflow.keras import layers as L
# from tensorflow.keras.utils import register_keras_serializable

# def res_se_block(x, filters, kernel_size, pool_size):
#     # --- First conv path ---
#     y = L.Conv1D(filters, kernel_size, padding='same', activation='relu')(x)
#     y = L.BatchNormalization()(y)

#     # --- Squeeze & Excitation ---
#     se = L.GlobalAveragePooling1D()(y)              # (batch, filters)
#     se = L.Dense(filters // 8, activation='relu')(se)
#     se = L.Dense(filters, activation='sigmoid')(se) # (batch, filters)
#     se = L.Reshape((1, filters))(se)                 # (batch, 1, filters)
#     y  = L.Multiply()([y, se])                      # broadcast to (batch, time, filters)

#     # --- Second conv & skip connection ---
#     y = L.Conv1D(filters, kernel_size, padding='same')(y)
#     y = L.BatchNormalization()(y)
#     if x.shape[-1] != filters:
#         # project skip to match filters
#         x = L.Conv1D(filters, 1, padding='same')(x)
#         x = L.BatchNormalization()(x)
#     y = L.Add()([x, y])
#     y = L.Activation('relu')(y)

#     # --- Pooling ---
#     y = L.MaxPool1D(pool_size, padding='same')(y)
#     return y

# # --- Lightweight Self-Attention Layer ---
# @register_keras_serializable()
# class SimpleAttention(L.Layer):
#     def __init__(self, **kwargs):
#         super().__init__(**kwargs)
#         self.att = L.Attention()
#     def call(self, x):
#         return self.att([x, x])
#     def get_config(self):
#         return super().get_config()

# # --- Build the model ---
# inp = L.Input(shape=(1404, 1))

# x = res_se_block(inp, filters=64,  kernel_size=5, pool_size=3)
# x = res_se_block(x,    filters=128, kernel_size=3, pool_size=3)
# x = res_se_block(x,    filters=256, kernel_size=3, pool_size=2)
# x = L.Dropout(0.3)(x)

# x = L.Dense(128, activation='relu')(x)
# x = SimpleAttention()(x)
# x = L.Dropout(0.3)(x)

# x = L.GlobalAveragePooling1D()(x)
# x = L.Dense(64, activation='relu')(x)
# x = L.Dropout(0.4)(x)

# out = L.Dense(7, activation='softmax')(x)

# model = tf.keras.Model(inp, out)
# model.compile(
#     optimizer='adam',
#     loss='categorical_crossentropy',
#     metrics=['accuracy']
# )
# model.summary()


In [None]:
# history=model.fit(x_traincnn, y_train, epochs=100, validation_data=(x_testcnn, y_test), batch_size=32)

In [None]:
# # Get number of epochs based on training history
# epochs = range(len(history.history['loss']))

# fig , ax = plt.subplots(1,2)
# train_acc = history.history['accuracy']
# train_loss = history.history['loss']
# test_acc = history.history['val_accuracy']
# test_loss = history.history['val_loss']

# fig.set_size_inches(20,6)

# # Plot Loss
# ax[0].plot(epochs, train_loss, label='Training Loss')
# ax[0].plot(epochs, test_loss, label='Testing Loss')
# ax[0].set_title('Training & Testing Loss')
# ax[0].legend()
# ax[0].set_xlabel("Epochs")

# # Plot Accuracy
# ax[1].plot(epochs, train_acc, label='Training Accuracy')
# ax[1].plot(epochs, test_acc, label='Testing Accuracy')
# ax[1].set_title('Training & Testing Accuracy')
# ax[1].legend()
# ax[1].set_xlabel("Epochs")

# plt.show()


In [None]:
# import tensorflow as tf
# import tensorflow.keras.layers as L

# # ====== Positional Embedding Layer ======
# @register_keras_serializable()
# class PositionalEmbedding(L.Layer):
#     def __init__(self, sequence_length, embed_dim, **kwargs):
#         super(PositionalEmbedding, self).__init__(**kwargs)
#         self.sequence_length = sequence_length
#         self.embed_dim = embed_dim
#         self.pos_emb = self.add_weight(
#             name="pos_emb",
#             shape=(1, sequence_length, embed_dim),
#             initializer="random_normal",
#             trainable=True,
#         )

#     def call(self, x):
#         return x + self.pos_emb

#     def get_config(self):
#         config = super(PositionalEmbedding, self).get_config()
#         config.update({
#             "sequence_length": self.sequence_length,
#             "embed_dim": self.embed_dim
#         })
#         return config

# # ====== Transformer Block ======
# @register_keras_serializable()
# class TransformerBlock(L.Layer):
#     def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
#         super(TransformerBlock, self).__init__(**kwargs)
#         self.embed_dim = embed_dim
#         self.num_heads = num_heads
#         self.ff_dim = ff_dim
#         self.rate = rate

#         self.att = L.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
#         self.ffn = tf.keras.Sequential([
#             L.Dense(ff_dim, activation="relu"),
#             L.Dense(embed_dim)
#         ])
#         self.layernorm1 = L.LayerNormalization(epsilon=1e-6)
#         self.layernorm2 = L.LayerNormalization(epsilon=1e-6)
#         self.dropout1 = L.Dropout(rate)
#         self.dropout2 = L.Dropout(rate)

#     def call(self, inputs, training=False):
#         attn_output = self.att(inputs, inputs)
#         attn_output = self.dropout1(attn_output, training=training)
#         out1 = self.layernorm1(inputs + attn_output)
#         ffn_output = self.ffn(out1)
#         ffn_output = self.dropout2(ffn_output, training=training)
#         return self.layernorm2(out1 + ffn_output)

#     def get_config(self):
#         config = super(TransformerBlock, self).get_config()
#         config.update({
#             "embed_dim": self.embed_dim,
#             "num_heads": self.num_heads,
#             "ff_dim": self.ff_dim,
#             "rate": self.rate
#         })
#         return config

# # ====== Build the model ======
# input_layer = L.Input(shape=(1404, 1))

# # CNN Feature Extractor
# x = L.Conv1D(256, kernel_size=5, strides=1, padding='same', activation='relu')(input_layer)
# x = L.BatchNormalization()(x)
# x = L.MaxPool1D(pool_size=3, strides=2, padding='same')(x)

# x = L.Conv1D(128, kernel_size=3, strides=1, padding='same', activation='relu')(x)
# x = L.BatchNormalization()(x)
# x = L.MaxPool1D(pool_size=3, strides=2, padding='same')(x)

# # ====== Added new CNN layer here ======
# x = L.Conv1D(64, kernel_size=3, strides=1, padding='same', activation='relu')(x)
# x = L.BatchNormalization()(x)
# x = L.MaxPool1D(pool_size=2, strides=2, padding='same')(x)

# x = L.Dropout(0.3)(x)

# # Project to smaller dimension
# x = L.Dense(64, activation='relu')(x)

# # Add Positional Embedding
# sequence_length = x.shape[1]
# x = PositionalEmbedding(sequence_length, 64)(x)

# # Transformer Blocks
# x = TransformerBlock(embed_dim=64, num_heads=4, ff_dim=256)(x)
# x = TransformerBlock(embed_dim=64, num_heads=4, ff_dim=256)(x)
# x = L.Dropout(0.3)(x)

# # BiGRU Layers
# x = L.Bidirectional(L.GRU(256, return_sequences=True, dropout=0.3))(x)
# x = L.Bidirectional(L.GRU(128, dropout=0.3))(x)

# # Dense Layers
# x = L.Dense(128, activation='relu')(x)
# x = L.BatchNormalization()(x)
# x = L.Dropout(0.4)(x)

# # Output Layer
# output_layer = L.Dense(7, activation='softmax')(x)

# # Define Model
# model = tf.keras.Model(inputs=input_layer, outputs=output_layer)

# # ====== Compile the model ======
# loss_fn = tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.1)  # Label smoothing
# model.compile(optimizer='adam', loss=loss_fn, metrics=['accuracy'])

# # ====== Summary ======
# model.summary()


In [None]:
# history=model.fit(x_traincnn, y_train, epochs=100, validation_data=(x_testcnn, y_test), batch_size=32)

In [None]:
# # Get number of epochs based on training history
# epochs = range(len(history.history['loss']))

# fig , ax = plt.subplots(1,2)
# train_acc = history.history['accuracy']
# train_loss = history.history['loss']
# test_acc = history.history['val_accuracy']
# test_loss = history.history['val_loss']

# fig.set_size_inches(20,6)

# # Plot Loss
# ax[0].plot(epochs, train_loss, label='Training Loss')
# ax[0].plot(epochs, test_loss, label='Testing Loss')
# ax[0].set_title('Training & Testing Loss')
# ax[0].legend()
# ax[0].set_xlabel("Epochs")

# # Plot Accuracy
# ax[1].plot(epochs, train_acc, label='Training Accuracy')
# ax[1].plot(epochs, test_acc, label='Testing Accuracy')
# ax[1].set_title('Training & Testing Accuracy')
# ax[1].legend()
# ax[1].set_xlabel("Epochs")

# plt.show()


# Combining both the models

data preparation

In [None]:
# Encode labels and split the dataset without leaking clips across the split.
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedGroupKFold, train_test_split

label_enc = LabelEncoder()
Y_encoded = label_enc.fit_transform(Y)  # convert emotion labels to integers

# X holds several augmented versions of every source file, stored back to
# back in file order. A plain random split would put versions of the same
# recording in both train and validation, inflating validation accuracy, so
# all versions of one file are assigned the same group id and kept together.
n_files = len(data_path)
augs_per_file, leftover = divmod(len(X), n_files)
assert augs_per_file > 0 and leftover == 0, (
    "X must contain the same number of feature matrices for every file"
)
groups = np.repeat(np.arange(n_files), augs_per_file)

# Taking the first fold of a 5-fold split gives an ~80/20 split that is
# stratified by emotion and never splits a group.
splitter = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
train_idx, test_idx = next(
    splitter.split(np.zeros(len(Y_encoded)), Y_encoded, groups)
)

# X is a list of variable-length matrices, so index it element by element.
X_train = [X[i] for i in train_idx]
X_test = [X[i] for i in test_idx]
y_train, y_test = Y_encoded[train_idx], Y_encoded[test_idx]

In [None]:
#create padded tensorflow dataset
import tensorflow as tf

# Number of feature columns per frame, read from the first training matrix
# (39 for the MFCC + delta + delta-delta features built above).
n_mfcc = X_train[0].shape[1]

def make_dataset(X, Y, batch_size=32, shuffle=True):
    """Wrap feature matrices and labels in a padded, batched tf.data pipeline.

    Each element is one ``(frames, n_mfcc)`` float32 matrix from ``X`` paired
    with its scalar label from ``Y``. When ``shuffle`` is true the whole
    dataset is shuffled with a buffer of ``len(X)``. Batches are built with
    ``padded_batch``, so shorter matrices in a batch are padded along the
    frame axis to the longest one. The width ``n_mfcc`` is read from the
    module-level variable of that name.
    """
    def sample_stream():
        for features, label in zip(X, Y):
            yield features.astype(np.float32), label

    signature = (
        tf.TensorSpec(shape=(None, n_mfcc), dtype=tf.float32),
        tf.TensorSpec(shape=(), dtype=tf.int32),
    )
    dataset = tf.data.Dataset.from_generator(
        sample_stream,
        output_signature=signature,
    )
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(X))
    batched = dataset.padded_batch(batch_size, padded_shapes=([None, n_mfcc], []))
    return batched.prefetch(tf.data.AUTOTUNE)

train_ds = make_dataset(X_train, y_train, batch_size=32, shuffle=True)
val_ds   = make_dataset(X_test,  y_test,  batch_size=32, shuffle=False)

In [None]:
# import tensorflow as tf
# from tensorflow.keras import layers as L

# def res_se_block(x, filters, kernel_size, pool_size):
#     shortcut = x
#     y = L.Conv1D(filters, kernel_size, padding='same', activation='relu')(x)
#     y = L.BatchNormalization()(y)

#     # Squeeze and Excitation
#     se = L.GlobalAveragePooling1D()(y)
#     se = L.Dense(filters // 8, activation='relu')(se)
#     se = L.Dense(filters, activation='sigmoid')(se)
#     se = L.Reshape((1, filters))(se)
#     y = L.Multiply()([y, se])

#     y = L.Conv1D(filters, kernel_size, padding='same')(y)
#     y = L.BatchNormalization()(y)

#     if shortcut.shape[-1] != filters:
#         shortcut = L.Conv1D(filters, 1, padding='same')(shortcut)
#         shortcut = L.BatchNormalization()(shortcut)

#     y = L.Add()([shortcut, y])
#     y = L.Activation('relu')(y)
#     return L.MaxPool1D(pool_size, padding='same')(y)

# class PositionalEmbedding(L.Layer):
#     def __init__(self, embed_dim, maxlen=500, **kw):
#         super().__init__(**kw)
#         self.embed_dim = embed_dim
#         self.maxlen = maxlen

#     def build(self, inp_shape):
#         self.pos_emb = self.add_weight(
#             shape=(1, self.maxlen, self.embed_dim),
#             initializer="random_normal",
#             trainable=True
#         )

#     def call(self, x):
#         length = tf.shape(x)[1]
#         return x + self.pos_emb[:, :length, :]

# class TransformerBlock(L.Layer):
#     def __init__(self, emb_dim, heads, ff_dim, rate=0.1, **kw):
#         super().__init__(**kw)
#         self.att = L.MultiHeadAttention(heads, key_dim=emb_dim)
#         self.ff = tf.keras.Sequential([
#             L.Dense(ff_dim, activation='relu'),
#             L.Dense(emb_dim),
#         ])
#         self.norm1 = L.LayerNormalization(epsilon=1e-6)
#         self.norm2 = L.LayerNormalization(epsilon=1e-6)
#         self.drop1 = L.Dropout(rate)
#         self.drop2 = L.Dropout(rate)

#     def call(self, x, training=False):
#         attn_output = self.att(x, x)
#         x = self.norm1(x + self.drop1(attn_output, training=training))
#         ff_output = self.ff(x)
#         return self.norm2(x + self.drop2(ff_output, training=training))

# # Input layer (MFCC + delta + delta-delta)
# inp = L.Input(shape=(None, 39))  # (time_steps, 39)

# # SE-ResNet blocks
# x = res_se_block(inp, 64, 5, 2)
# x = res_se_block(x, 128, 3, 2)
# x = res_se_block(x, 256, 3, 2)
# x = L.Dropout(0.3)(x)

# # Channel reduction + Transformer encoder
# x = L.Dense(128, activation='relu')(x)
# x = PositionalEmbedding(embed_dim=128, maxlen=500)(x)
# x = TransformerBlock(128, heads=4, ff_dim=256)(x)
# x = TransformerBlock(128, heads=4, ff_dim=256)(x)

# # Bidirectional GRU
# x = L.Bidirectional(L.GRU(128, return_sequences=True, dropout=0.3))(x)
# x = L.Bidirectional(L.GRU(64, dropout=0.3))(x)

# # Dense + regularization
# x = L.Dense(128, activation='relu')(x)
# x = L.BatchNormalization()(x)
# x = L.Dropout(0.5)(x)

# # Output
# out = L.Dense(7, activation='softmax')(x)

# # Build and compile model
# model = tf.keras.Model(inp, out)
# model.compile(
#     optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
#     loss='sparse_categorical_crossentropy',
#     metrics=['accuracy']
# )
# model.summary()


TCN Layer

In [None]:
import tensorflow as tf
from tensorflow.keras import layers as L

# Residual squeeze-and-excitation block
def res_se_block(x, filters, kernel_size, pool_size):
    """Apply a residual Conv1D block with channel gating, then max-pool.

    Main path: Conv1D(relu) -> BatchNorm -> channel gate -> Conv1D -> BatchNorm.
    The gate is computed by global-average-pooling the main path, passing it
    through Dense(filters // 8, relu) and Dense(filters, sigmoid), and
    multiplying it back onto the main path. The input is added as a shortcut
    (projected with a 1x1 Conv1D + BatchNorm when its channel count differs
    from ``filters``), followed by relu and MaxPool1D(pool_size).
    """
    shortcut = x

    # First convolution of the main path.
    main = L.Conv1D(filters, kernel_size, padding='same', activation='relu')(x)
    main = L.BatchNormalization()(main)

    # Per-channel gate in (0, 1), reshaped so it broadcasts over the time axis.
    gate = L.GlobalAveragePooling1D()(main)
    gate = L.Dense(filters // 8, activation='relu')(gate)
    gate = L.Dense(filters, activation='sigmoid')(gate)
    gate = L.Reshape((1, filters))(gate)
    main = L.Multiply()([main, gate])

    # Second convolution; no activation until after the residual sum.
    main = L.Conv1D(filters, kernel_size, padding='same')(main)
    main = L.BatchNormalization()(main)

    # Match the shortcut's channel count to the main path when needed.
    if shortcut.shape[-1] != filters:
        shortcut = L.Conv1D(filters, 1, padding='same')(shortcut)
        shortcut = L.BatchNormalization()(shortcut)

    merged = L.Add()([shortcut, main])
    merged = L.Activation('relu')(merged)
    return L.MaxPool1D(pool_size, padding='same')(merged)

# Positional Embedding
# Registered and given a get_config() so that a model saved with model.save()
# can be rebuilt by load_model(): without them Keras cannot reliably recover
# the constructor arguments of this custom layer.
@tf.keras.utils.register_keras_serializable()
class PositionalEmbedding(L.Layer):
    """Adds a learned position-dependent offset to every time step.

    Args:
        embed_dim: size of the last (feature) axis of the input.
        maxlen: number of positions for which an embedding is stored. Inputs
            longer than this along the time axis cannot be broadcast against
            the sliced embedding and will fail in ``call``.
    """

    def __init__(self, embed_dim, maxlen=500, **kw):
        super().__init__(**kw)
        self.embed_dim = embed_dim
        self.maxlen = maxlen

    def build(self, inp_shape):
        # One trainable vector per position, shared across the batch.
        self.pos_emb = self.add_weight(
            shape=(1, self.maxlen, self.embed_dim),
            initializer="random_normal", trainable=True)

    def call(self, x):
        # Use only as many positions as the current input has time steps.
        length = tf.shape(x)[1]
        return x + self.pos_emb[:, :length, :]

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "maxlen": self.maxlen,
        })
        return config

# Temporal convolution (TCN) block
def tcn_block(x, filters, kernel_size, dilation_rate, dropout_rate):
    """Apply two dilated causal Conv1D layers with a residual connection.

    Each convolution uses ``padding='causal'``, the given ``dilation_rate``
    and relu, and is followed by BatchNorm; SpatialDropout1D sits between the
    two. The block input is added back (through a 1x1 Conv1D when its channel
    count differs from ``filters``) and the sum goes through relu.
    """
    residual = x
    conv_options = dict(
        padding='causal',
        dilation_rate=dilation_rate,
        activation='relu',
    )

    out = L.Conv1D(filters, kernel_size, **conv_options)(x)
    out = L.BatchNormalization()(out)
    out = L.SpatialDropout1D(dropout_rate)(out)
    out = L.Conv1D(filters, kernel_size, **conv_options)(out)
    out = L.BatchNormalization()(out)

    # Project the shortcut only when the channel counts disagree.
    if residual.shape[-1] != filters:
        residual = L.Conv1D(filters, 1, padding='same')(residual)

    summed = L.Add()([residual, out])
    return L.Activation('relu')(summed)

# Model
# Input is a variable-length sequence of 39-dimensional frames; the time axis
# is left as None so batches padded to different lengths are accepted.
inp = L.Input(shape=(None, 39))  # MFCC + delta + delta-delta = 39

# SE-ResNet Frontend
# Three residual SE blocks with 64 -> 128 -> 256 filters. Each block ends in a
# max-pool (sizes 3, 3, 2), so the time axis shrinks at every stage.
x = res_se_block(inp, 64, 5, 3)
x = res_se_block(x, 128, 3, 3)
x = res_se_block(x, 256, 3, 2)
x = L.Dropout(0.3)(x)

# Dense + TCN block
# Dense reduces the 256 channels to 128 per time step; a learned positional
# embedding is added before two dilated causal conv blocks (dilation 2, then 4).
x = L.Dense(128, activation='relu')(x)
x = PositionalEmbedding(embed_dim=128)(x)
x = tcn_block(x, filters=128, kernel_size=3, dilation_rate=2, dropout_rate=0.2)
x = tcn_block(x, filters=128, kernel_size=3, dilation_rate=4, dropout_rate=0.2)

# BiGRU
# The first layer returns the full sequence for the second layer to consume;
# the second does not set return_sequences, collapsing the sequence to one vector.
x = L.Bidirectional(L.GRU(128, return_sequences=True, dropout=0.3))(x)
x = L.Bidirectional(L.GRU(64, dropout=0.3))(x)

# Classifier head
# NOTE(review): the output size is hard-coded to 7; it presumably must equal
# the number of distinct emotion labels in the training data -- confirm.
x = L.Dense(128, activation='relu')(x)
x = L.Dropout(0.4)(x)
out = L.Dense(7, activation='softmax')(x)

# Sparse categorical cross-entropy is used, so targets are integer class ids.
model = tf.keras.Model(inp, out)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)
model.summary()


In [None]:
history=model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=100,
)

In [None]:
# Save the full model (architecture + weights + optimizer) in the modern .keras format
model.save("CNN_model1.keras")
print("✅ Model saved to disk as CNN_model1.keras")

In [None]:
# Get number of epochs based on training history
epochs = range(len(history.history['loss']))

fig , ax = plt.subplots(1,2)
train_acc = history.history['accuracy']
train_loss = history.history['loss']
test_acc = history.history['val_accuracy']
test_loss = history.history['val_loss']

fig.set_size_inches(20,6)

# Plot Loss
ax[0].plot(epochs, train_loss, label='Training Loss')
ax[0].plot(epochs, test_loss, label='Testing Loss')
ax[0].set_title('Training & Testing Loss')
ax[0].legend()
ax[0].set_xlabel("Epochs")

# Plot Accuracy
ax[1].plot(epochs, train_acc, label='Training Accuracy')
ax[1].plot(epochs, test_acc, label='Testing Accuracy')
ax[1].set_title('Training & Testing Accuracy')
ax[1].legend()
ax[1].set_xlabel("Epochs")

plt.show()


# Validation Accuracy

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Training vs Validation Accuracy')
plt.legend()
plt.grid(True)
plt.show()


# Printing the confusion matrix

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Get predictions and true labels for the validation set.
# val_ds was built with shuffle=False, so two passes over it yield the samples
# in the same order and the labels line up with the predictions.
y_true = np.concatenate([y_batch.numpy() for _, y_batch in val_ds])

# One predict() call over the whole dataset replaces calling predict() once per
# batch inside a Python loop, which is slow and prints a progress bar per batch.
# predict() uses only the inputs of each (inputs, labels) element.
y_pred = np.argmax(model.predict(val_ds, verbose=0), axis=1)

In [None]:
# Class names in the order of the integer ids produced by the label encoder.
# Reading them from label_enc (instead of a hand-written list) keeps the tick
# labels correct if the set of emotions in the data ever differs.
emotion_labels = label_enc.classes_.tolist()

# Pin the label set so the matrix always has one row/column per class, even
# when a class is absent from both y_true and y_pred.
cm = confusion_matrix(y_true, y_pred, labels=np.arange(len(emotion_labels)))

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', xticklabels=emotion_labels, yticklabels=emotion_labels, cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

In [None]:
print(classification_report(y_true, y_pred, target_names=emotion_labels))