In [11]:
import os
import glob
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# 3. Constants
SR = 16000                # target sample rate (Hz) every clip is resampled to
DURATION = 3              # seconds of audio kept per clip (was misspelled `DURATIO`,
                          # which made the next line raise NameError on a fresh kernel)
CLIP_LEN = SR * DURATION  # fixed clip length in samples
N_MELS = 64               # number of mel bands in the spectrogram

In [13]:
# 4. Feature extraction
def extract_mel(file_path):
    """Load an audio file and return its normalized log-mel spectrogram.

    The audio is loaded as mono at ``SR`` Hz (at most ``DURATION`` seconds),
    zero-padded on the right or truncated to exactly ``CLIP_LEN`` samples,
    converted to a mel power spectrogram with ``N_MELS`` bands, expressed in
    dB relative to its own maximum, and finally standardized to zero mean and
    (approximately) unit variance over the whole spectrogram.

    Args:
        file_path: Path of the audio file to read.

    Returns:
        ``np.float32`` array holding the standardized log-mel spectrogram.
    """
    waveform, rate = librosa.load(file_path, sr=SR, mono=True, duration=DURATION)

    # Force every clip to the same number of samples.
    shortfall = CLIP_LEN - len(waveform)
    if shortfall > 0:
        waveform = np.pad(waveform, (0, shortfall))
    else:
        waveform = waveform[:CLIP_LEN]

    power = librosa.feature.melspectrogram(y=waveform, sr=rate, n_mels=N_MELS)
    log_mel = librosa.power_to_db(power, ref=np.max)

    # The epsilon keeps the division finite when the spectrogram is constant.
    centered = log_mel - log_mel.mean()
    standardized = centered / (log_mel.std() + 1e-6)
    return standardized.astype(np.float32)

In [17]:
# 5. Collecting file paths with labels
# Emotion code (third dash-separated field of the filename) -> binary label.
EMOTION_TO_LABEL = {"02": 0, "03": 0, "04": 1}  # calm, happy -> 0; sad -> 1


def label_from_filename(file):
    """Return the binary label encoded in a recording's filename, or None.

    The base name is split on "-" and the third field is looked up in
    ``EMOTION_TO_LABEL``.

    Args:
        file: Path (or bare filename) of a recording.

    Returns:
        0 or 1 for a recognized emotion code; ``None`` when the name has fewer
        than three dash-separated fields or the code is not one we train on.
    """
    parts = os.path.basename(file).split("-")
    if len(parts) < 3:
        # Stray .wav that does not follow the naming scheme: skip, don't crash.
        return None
    return EMOTION_TO_LABEL.get(parts[2])


X, y = [], []
root = "data"  # adjust to actual folder name
# glob returns files in arbitrary, filesystem-dependent order; sorting makes the
# seeded train/test split downstream reproducible across machines.
for file in sorted(glob.glob(f"{root}/**/*.wav", recursive=True)):
    label = label_from_filename(file)
    if label is not None:
        X.append(file)
        y.append(label)

print("Samples loaded:", len(X), "labels:", set(y))

Samples loaded: 1152 labels: {0, 1}


In [18]:
# 6. Train-test split
# Hold out 20% of the files for testing. Stratifying on the labels keeps the
# class ratio the same in both splits; the fixed random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
    test_size=0.2,
)

In [None]:
# 7. Dataset class
class VocalEmotionDataset(Dataset):
    """Dataset that turns (audio path, label) pairs into model-ready tensors.

    Features are computed on access by calling ``extract_mel`` on the file
    path; nothing is cached between calls.
    """

    def __init__(self, files, labels):
        """Keep references to the parallel sequences of paths and labels."""
        self.files, self.labels = files, labels

    def __len__(self):
        """Number of samples, i.e. the number of file paths."""
        return len(self.files)

    def __getitem__(self, idx):
        """Return ``(mel, label)`` for sample ``idx``.

        ``mel`` is the spectrogram from ``extract_mel`` with a leading channel
        axis added -> (1, n_mels, time); ``label`` is a float32 scalar tensor.
        """
        spectrogram = torch.tensor(extract_mel(self.files[idx]))
        target = torch.tensor(self.labels[idx], dtype=torch.float32)
        return spectrogram.unsqueeze(0), target

# Wrap each split in a dataset, then batch it (16 samples per batch).
train_ds, test_ds = (
    VocalEmotionDataset(X_train, y_train),
    VocalEmotionDataset(X_test, y_test),
)

# Only the training batches are shuffled; the test loader keeps file order.
train_loader = DataLoader(train_ds, shuffle=True, batch_size=16)
test_loader = DataLoader(test_ds, batch_size=16)

In [20]:
# 8. Model definition
class CNNVoice(nn.Module):
    """Small 2-D CNN that maps a mel spectrogram to a probability in [0, 1].

    Two conv/ReLU/max-pool stages feed a two-layer classifier that ends in a
    sigmoid, so the output is meant to be paired with ``nn.BCELoss``.

    Args:
        n_mels: Height of the input spectrogram. Defaults to the notebook's
            ``N_MELS`` constant.
        clip_len: Clip length in samples used to derive the spectrogram width.
            Defaults to the notebook's ``CLIP_LEN`` constant.
        hop_length: STFT hop the spectrogram was computed with. 512 matches the
            default used by ``librosa.feature.melspectrogram``.
    """

    def __init__(self, n_mels=None, clip_len=None, hop_length=512):
        super().__init__()
        # Resolve defaults lazily so `CNNVoice()` keeps working exactly as before.
        n_mels = N_MELS if n_mels is None else n_mels
        clip_len = CLIP_LEN if clip_len is None else clip_len

        self.conv = nn.Sequential(
            nn.Conv2d(1, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),
        )

        # Size the first Linear layer by probing the conv stack with a dummy input.
        # A centered STFT yields 1 + clip_len // hop_length frames; the previous
        # `clip_len // 512` was one frame short and only matched the real input
        # because both widths happen to pool down to the same size.
        n_frames = 1 + clip_len // hop_length
        with torch.no_grad():  # shape probe only; no autograd graph needed
            flat_size = self.conv(torch.zeros(1, 1, n_mels, n_frames)).numel()

        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(flat_size, 64), nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(64, 1), nn.Sigmoid()
        )

    def forward(self, x):
        """Return per-sample probabilities of shape (batch, 1) for input (batch, 1, n_mels, time)."""
        return self.fc(self.conv(x))

In [21]:
# 9. Initialize training
# Seed torch's global RNG before anything random happens so weight init,
# DataLoader shuffling and dropout masks are repeatable under Restart & Run All.
torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNNVoice().to(device)
criterion = nn.BCELoss()  # the model already ends in a Sigmoid, so it outputs probabilities
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [22]:
# 10. Training loop
NUM_EPOCHS = 10  # single source of truth for the loop bound and the progress message

for epoch in range(NUM_EPOCHS):
    model.train()  # training mode (dropout active)
    total_loss = 0.0
    for Xb, yb in train_loader:
        # Labels arrive as shape (batch,); add a trailing axis to match the
        # model's (batch, 1) output.
        Xb, yb = Xb.to(device), yb.to(device).unsqueeze(1)

        optimizer.zero_grad()
        preds = model(Xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    # Report the mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1}/{NUM_EPOCHS} — Loss: {total_loss/len(train_loader):.4f}")

Epoch 1/10 — Loss: 0.6609
Epoch 2/10 — Loss: 0.6045
Epoch 3/10 — Loss: 0.5240
Epoch 4/10 — Loss: 0.4178
Epoch 5/10 — Loss: 0.2903
Epoch 6/10 — Loss: 0.1984
Epoch 7/10 — Loss: 0.1387
Epoch 8/10 — Loss: 0.0984
Epoch 9/10 — Loss: 0.0574
Epoch 10/10 — Loss: 0.0414


In [23]:
# 11. Evaluation
model.eval()  # inference mode (dropout disabled)
y_true, y_pred = [], []
with torch.no_grad():  # no gradients needed for scoring
    for Xb, yb in test_loader:
        Xb = Xb.to(device)
        probs = model(Xb).cpu().numpy()
        # Threshold the sigmoid outputs at 0.5 and flatten (batch, 1) -> (batch,).
        preds = (probs > 0.5).astype(int).ravel()
        y_true.extend(yb.numpy())
        y_pred.extend(preds)
print("Test Accuracy:", accuracy_score(y_true, y_pred))


Test Accuracy: 0.9437229437229437


In [24]:
# 12. Save the model
# Persist only the learned parameters (state dict), not the pickled module.
weights = model.state_dict()
torch.save(weights, "voice_depression_model.pth")
print("Model saved!")

Model saved!
