In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from sklearn.metrics import precision_recall_fscore_support
import joblib


In [2]:
features = pd.read_csv("C:/Nini/Capstone/CSV_Files/DataAugmentation_ravdess_extracted_features.csv")
features = features.drop(features.columns[0], axis=1)
temp = shuffle(features)
df = np.random.rand(len(temp)) < 0.8
train = temp[df]
test = temp[~df]

In [3]:
X_train = np.array(train.iloc[:,:-1])
y_train = np.array(train.iloc[:,-1])
X_test = np.array(test.iloc[:,:-1])
y_test = np.array(test.iloc[:,-1])

In [4]:
lb = LabelEncoder()
y_train = lb.fit_transform(y_train)
y_test = lb.transform(y_test)

In [5]:
X_train = torch.tensor(X_train, dtype=torch.float32).unsqueeze(1)
y_train = torch.tensor(y_train, dtype=torch.long)
X_test = torch.tensor(X_test, dtype=torch.float32).unsqueeze(1)
y_test = torch.tensor(y_test, dtype=torch.long)

In [6]:
joblib.dump(lb, "label_encoder.pkl")
print("Label Encoder saved successfully!")

Label Encoder saved successfully!


In [7]:
batch_size = 256
train_loader = DataLoader(TensorDataset(X_train,y_train), batch_size=batch_size,shuffle=True)
test_loader = DataLoader(TensorDataset(X_test,y_test), batch_size=batch_size, shuffle=False)

Test 1

In [8]:
class CNNModel(nn.Module):
    def __init__(self, input_size, num_classes):
        super(CNNModel, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=8)
        self.bn1 = nn.BatchNorm1d(64)
        self.conv2 = nn.Conv1d(64, 128, kernel_size=8)
        self.bn2 = nn.BatchNorm1d(128)
        self.pool1 = nn.MaxPool1d(kernel_size=4, padding=2)
        self.dropout1 = nn.Dropout(0.5)
        self.conv3 = nn.Conv1d(128, 128, kernel_size=8)
        self.bn3 = nn.BatchNorm1d(128)
        self.pool2 = nn.MaxPool1d(kernel_size=4, padding=2)
        self.dropout2 = nn.Dropout(0.5)
        self.conv4 = nn.Conv1d(128, 64, kernel_size=3, padding=2)
        self.bn4 = nn.BatchNorm1d(64)
        self.fc1 = nn.Linear(512, 256)
        self.dropout3 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(256, num_classes)

    # def _compute_flattened_size(self, input_size):
    #     with torch.no_grad():
    #         x = torch.randn(1, 1, input_size)  # Create a dummy input
    #         x = self.conv1(x)
    #         x = F.relu(x)
    #         x = self.conv2(x)
    #         x = F.relu(x)
    #         x = self.pool1(x)
    #         x = self.dropout1(x)
    #         x = self.conv3(x)
    #         x = F.relu(x)
    #         x = self.pool2(x)
    #         x = self.dropout2(x)
    #         x = self.conv4(x)
    #         x = F.relu(x)
    #         x = x.view(1, -1)  # Flatten
    #         self._to_linear = x.shape[1]

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.pool1(x)
        x = self.dropout1(x)
        x = F.relu(self.bn3(self.conv3(x)))
        x = self.pool2(x)
        x = self.dropout2(x)
        x = F.relu(self.bn4(self.conv4(x)))
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.dropout3(x)
        x = self.fc2(x)
        return x

Test 2

In [9]:
# class CNNModel(nn.Module):
#     def __init__(self, input_size, num_classes):
#         super(CNNModel, self).__init__()
#         self.conv1 = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=8)
#         self.bn1 = nn.BatchNorm1d(64)
#         self.conv2 = nn.Conv1d(64, 128, kernel_size=8)
#         self.bn2 = nn.BatchNorm1d(128)
#         self.pool1 = nn.MaxPool1d(kernel_size=6, padding=2)
#         self.dropout1 = nn.Dropout(0.5)
#         self.conv3 = nn.Conv1d(128, 128, kernel_size=8)
#         self.bn3 = nn.BatchNorm1d(128)
#         self.pool2 = nn.MaxPool1d(kernel_size=6, padding=2)
#         self.dropout2 = nn.Dropout(0.5)
#         # self.conv4 = nn.Conv1d(128, 64, kernel_size=4, padding=2)
#         # self.bn4 = nn.BatchNorm1d(64)
#         self.fc1 = nn.Linear(384, 256)
#         self.dropout3 = nn.Dropout(0.5)
#         self.fc2 = nn.Linear(256, num_classes)

#     # def _compute_flattened_size(self, input_size):
#     #     with torch.no_grad():
#     #         x = torch.randn(1, 1, input_size)  # Create a dummy input
#     #         x = self.conv1(x)
#     #         x = F.relu(x)
#     #         x = self.conv2(x)
#     #         x = F.relu(x)
#     #         x = self.pool1(x)
#     #         x = self.dropout1(x)
#     #         x = self.conv3(x)
#     #         x = F.relu(x)
#     #         x = self.pool2(x)
#     #         x = self.dropout2(x)
#     #         x = self.conv4(x)
#     #         x = F.relu(x)
#     #         x = x.view(1, -1)  # Flatten
#     #         self._to_linear = x.shape[1]

#     def forward(self, x):
#         x = F.relu(self.bn1(self.conv1(x)))
#         x = F.relu(self.bn2(self.conv2(x)))
#         x = self.pool1(x)
#         x = self.dropout1(x)
#         x = F.relu(self.bn3(self.conv3(x)))
#         x = self.pool2(x)
#         x = self.dropout2(x)
#         # x = F.relu(self.bn4(self.conv4(x)))
#         x = x.view(x.size(0), -1)
#         x = F.relu(self.fc1(x))
#         x = self.dropout3(x)
#         x = self.fc2(x)
#         return x

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CNNModel(input_size=X_train.shape[1], num_classes=8).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005, weight_decay=1e-4)

In [11]:
print("X_train shape:", X_train.shape)

X_train shape: torch.Size([2317, 1, 128])


In [12]:
# epochs = 1500
# for epoch in range(epochs):
#     model.train()
#     running_loss = 0
#     total_loss = 0
#     total = 0
#     correct = 0
#     for X_batch, y_batch in train_loader:
#         X_batch, y_batch = X_batch.to(device), y_batch.to(device)
#         optimizer.zero_grad()
#         outputs = model(X_batch)
#         loss = criterion(outputs, y_batch)
#         loss.backward()
#         optimizer.step()
#         running_loss += loss.item()
#         _, predicted = torch.max(outputs, 1)  # Get predicted class
#         correct += (predicted == y_batch).sum().item()
#         total += y_batch.size(0)
#     train_accuracy = correct / total
#     avg_train_loss = running_loss / len(train_loader)
#         # total_loss += loss.item()
#         # correct += (outputs.argmax(dim=1) == y_batch).sum().item()
#     model.eval()  # Set model to evaluation mode
#     val_correct = 0
#     val_total = 0
#     val_loss = 0.0

#     with torch.no_grad():  # Disable gradient computation for validation
#         for val_inputs, val_labels in test_loader:
#             val_inputs, val_labels = val_inputs.to(device), val_labels.to(device)

#             val_outputs = model(val_inputs)
#             loss = criterion(val_outputs, val_labels)
#             val_loss += loss.item()

#             _, val_predicted = torch.max(val_outputs, 1)
#             val_correct += (val_predicted == val_labels).sum().item()
#             val_total += val_labels.size(0)
    
#     val_accuracy = val_correct / val_total
#     avg_val_loss = val_loss / len(test_loader)
#     # accuracy = correct / len(train_loader.dataset)
#     print(f"Epoch [{epoch+1}/{epochs}]")
#     print(f"Train Loss: {avg_train_loss:.4f}, Train Acc: {train_accuracy:.4f}")
#     print(f"Val Loss: {avg_val_loss:.4f}, Val Acc: {val_accuracy:.4f}")
#     print("-" * 50)

With Precision, recall and f1-score

In [13]:
epochs = 1500
for epoch in range(epochs):
    model.train()
    running_loss = 0
    correct = 0
    total = 0

    # Training loop
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        _, predicted = torch.max(outputs, 1)  # Get predicted class
        correct += (predicted == y_batch).sum().item()
        total += y_batch.size(0)
    
    train_accuracy = correct / total
    avg_train_loss = running_loss / len(train_loader)

    # Validation loop
    model.eval()
    val_correct = 0
    val_total = 0
    val_loss = 0.0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for val_inputs, val_labels in test_loader:
            val_inputs, val_labels = val_inputs.to(device), val_labels.to(device)
            val_outputs = model(val_inputs)
            loss = criterion(val_outputs, val_labels)
            val_loss += loss.item()
            _, val_predicted = torch.max(val_outputs, 1)

            val_correct += (val_predicted == val_labels).sum().item()
            val_total += val_labels.size(0)

            # Store predictions and labels for evaluation
            all_preds.extend(val_predicted.cpu().numpy())
            all_labels.extend(val_labels.cpu().numpy())

    val_accuracy = val_correct / val_total
    avg_val_loss = val_loss / len(test_loader)

    # Compute Precision, Recall, F1-score
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted',zero_division=1)

    print(f"Epoch [{epoch+1}/{epochs}]")
    print(f"Train Loss: {avg_train_loss:.4f}, Train Acc: {train_accuracy:.4f}")
    print(f"Val Loss: {avg_val_loss:.4f}, Val Acc: {val_accuracy:.4f}")
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")
    print("-" * 50)

Epoch [1/1500]
Train Loss: 2.0381, Train Acc: 0.1972
Val Loss: 1.9939, Val Acc: 0.2398
Precision: 0.6007, Recall: 0.2398, F1-score: 0.1480
--------------------------------------------------
Epoch [2/1500]
Train Loss: 1.8265, Train Acc: 0.2836
Val Loss: 1.8986, Val Acc: 0.2487
Precision: 0.4873, Recall: 0.2487, F1-score: 0.1828
--------------------------------------------------
Epoch [3/1500]
Train Loss: 1.7933, Train Acc: 0.2866
Val Loss: 1.8591, Val Acc: 0.2416
Precision: 0.5091, Recall: 0.2416, F1-score: 0.1943
--------------------------------------------------
Epoch [4/1500]
Train Loss: 1.7343, Train Acc: 0.3060
Val Loss: 1.8060, Val Acc: 0.2948
Precision: 0.3651, Recall: 0.2948, F1-score: 0.2352
--------------------------------------------------
Epoch [5/1500]
Train Loss: 1.7033, Train Acc: 0.3457
Val Loss: 1.7928, Val Acc: 0.3108
Precision: 0.4397, Recall: 0.3108, F1-score: 0.2797
--------------------------------------------------
Epoch [6/1500]
Train Loss: 1.6920, Train Acc: 0.34

In [14]:
torch.save(model.state_dict(), "C:/Nini/Capstone/Models/DataAugmentation_cnn_model2.pth")

Inference on test set

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [16]:
num_classes = 8  # Update this based on your dataset
model = CNNModel(input_size=128, num_classes=num_classes)  # Update input_size accordingly
model.load_state_dict(torch.load("C:/Nini/Capstone/Models/DataAugmentation_cnn_model2.pth", map_location=device))  # Load saved weights
model.to(device)
model.eval()  # Set to evaluation mode
print("Model Loaded Successfully")

Model Loaded Successfully


In [17]:
print(model)

CNNModel(
  (conv1): Conv1d(1, 64, kernel_size=(8,), stride=(1,))
  (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv1d(64, 128, kernel_size=(8,), stride=(1,))
  (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool1): MaxPool1d(kernel_size=4, stride=4, padding=2, dilation=1, ceil_mode=False)
  (dropout1): Dropout(p=0.5, inplace=False)
  (conv3): Conv1d(128, 128, kernel_size=(8,), stride=(1,))
  (bn3): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool2): MaxPool1d(kernel_size=4, stride=4, padding=2, dilation=1, ceil_mode=False)
  (dropout2): Dropout(p=0.5, inplace=False)
  (conv4): Conv1d(128, 64, kernel_size=(3,), stride=(1,), padding=(2,))
  (bn4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc1): Linear(in_features=512, out_features=256, bias=True)
  (dropout3): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_fe

In [18]:
def predict(model, input_tensor, device):
    model.eval()  # Set model to evaluation mode
    input_tensor = input_tensor.to(device)
    
    with torch.no_grad():  # Disable gradient calculations for inference
        output = model(input_tensor)
        predicted_class = torch.argmax(output, dim=1)  # Get predicted class
        
    return predicted_class.cpu().numpy()  # Convert to numpy array for easy handling

# Example usage:
sample_input = X_test[:5].to(device)  # Take a few samples for inference
predictions = predict(model, sample_input, device)
print("Predicted Labels:", predictions)

Predicted Labels: [0 3 0 5 1]


In [19]:
actual_labels = y_test[:5].cpu().numpy()
for i in range(len(predictions)):
    print(f"Sample {i+1}: Predicted = {predictions[i]}, Actual = {actual_labels[i]}")

Sample 1: Predicted = 0, Actual = 0
Sample 2: Predicted = 3, Actual = 3
Sample 3: Predicted = 0, Actual = 0
Sample 4: Predicted = 5, Actual = 5
Sample 5: Predicted = 1, Actual = 1


In [20]:
lb = joblib.load("label_encoder.pkl")
decoded_predictions = lb.inverse_transform(predictions)
decoded_actual_labels = lb.inverse_transform(actual_labels)

In [21]:
for i in range(len(predictions)):
    print(f"Sample {i+1}: Predicted = {decoded_predictions[i]}, Actual = {decoded_actual_labels[i]}")


Sample 1: Predicted = angry, Actual = angry
Sample 2: Predicted = fearful, Actual = fearful
Sample 3: Predicted = angry, Actual = angry
Sample 4: Predicted = neutral, Actual = neutral
Sample 5: Predicted = calm, Actual = calm


Inference on unseen audio

In [22]:
import numpy as np
import librosa
import torch
import joblib
import soxr

In [23]:
mean = np.load("C:/Nini/Capstone/src/Data Preprocessing/mean.npy")[:,None]  # Load mean array
std = np.load("C:/Nini/Capstone/src/Data Preprocessing/std.npy") [:,None]

In [24]:
lb = joblib.load("label_encoder.pkl")

In [25]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNNModel(input_size=128, num_classes=8)  # Ensure the architecture matches
model.load_state_dict(torch.load("C:/Nini/Capstone/Models/DataAugmentation_cnn_model.pth", map_location=device))
model.to(device)
model.eval()

CNNModel(
  (conv1): Conv1d(1, 64, kernel_size=(8,), stride=(1,))
  (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv1d(64, 128, kernel_size=(8,), stride=(1,))
  (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool1): MaxPool1d(kernel_size=4, stride=4, padding=2, dilation=1, ceil_mode=False)
  (dropout1): Dropout(p=0.5, inplace=False)
  (conv3): Conv1d(128, 128, kernel_size=(8,), stride=(1,))
  (bn3): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool2): MaxPool1d(kernel_size=4, stride=4, padding=2, dilation=1, ceil_mode=False)
  (dropout2): Dropout(p=0.5, inplace=False)
  (conv4): Conv1d(128, 64, kernel_size=(3,), stride=(1,), padding=(2,))
  (bn4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc1): Linear(in_features=512, out_features=256, bias=True)
  (dropout3): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_fe

In [26]:
def preprocess_audio(file_path):
    X, sample_rate = librosa.load(file_path,res_type='kaiser_fast',duration=3,sr=44100,offset=0.5)
    audio_resampled = soxr.resample(X, sample_rate, 16000)
    spectrogram = librosa.feature.melspectrogram(y=audio_resampled,sr=16000,n_mels=128,fmax=8000)
    db_spec = librosa.power_to_db(spectrogram)
    mel_spectrogram = (db_spec - mean) / std
    log_spectrogram = np.mean(mel_spectrogram,axis=1)
    mel_spectrogram_tensor = torch.tensor(log_spectrogram, dtype=torch.float32).unsqueeze(0).unsqueeze(0)
    return mel_spectrogram_tensor

In [27]:
def predict_audio(file_path):
    input_tensor = preprocess_audio(file_path).to(device)
    print(input_tensor)
    
    with torch.no_grad():  # Disable gradients for inference
        output = model(input_tensor)
        predicted_class = torch.argmax(output, dim=1).cpu().numpy()[0]  # Convert to NumPy
        
    # Decode the label
    predicted_label = lb.inverse_transform([predicted_class])[0]
    return predicted_label

In [28]:
audio_file = "C:/Nini/Capstone/Test/01-01-06-02-01-02-01.wav"
prediction = predict_audio(audio_file)
print(f"Predicted Emotion: {prediction}")

tensor([[[-39.0506, -39.5870, -39.7154, -40.9100, -42.6148, -36.2505, -30.1936,
          -24.8767, -18.0850, -14.5041, -12.3833, -11.8852, -16.1793, -22.7990,
          -25.9747, -25.2175, -22.4984, -21.3819, -20.2342, -17.6569, -17.4210,
          -18.9434, -18.0588, -18.8115, -20.8730, -23.4603, -26.8653, -29.2984,
          -27.1497, -27.5133, -28.2432, -27.1207, -29.3671, -29.7387, -27.8602,
          -25.9646, -24.4105, -25.4388, -27.0234, -27.4708, -27.9303, -28.8141,
          -30.0984, -29.2751, -27.1728, -26.0809, -24.6129, -21.5107, -22.5599,
          -25.7683, -27.6894, -28.2929, -25.8600, -24.5037, -21.5814, -18.7496,
          -19.2529, -20.6802, -21.8183, -22.2989, -22.7439, -22.5519, -21.2937,
          -20.5533, -24.2446, -27.4240, -27.9722, -28.1946, -29.2111, -28.8527,
          -28.7333, -28.0296, -27.9280, -29.4251, -28.7732, -26.7698, -24.6226,
          -24.7869, -26.9146, -28.3228, -29.1971, -29.8335, -29.6409, -30.9195,
          -32.9055, -32.5348, -31.9851, 