In [1]:
!pip install librosa scikit-learn matplotlib seaborn gradio



In [3]:
#Mount Google Drive to access dataset
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import os
import zipfile

zip_path = '/content/drive/MyDrive/baby_cry_data.zip'
extract_path = '/content/drive/MyDrive/baby_cry_data'

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

In [5]:
#Checking class distribution
import os
from collections import Counter

base_dir = '/content/drive/MyDrive/baby_cry_data'
classes = os.listdir(base_dir)

class_counts = {}
for cls in classes:
    if not cls.startswith('.'):
        files = os.listdir(os.path.join(base_dir, cls))
        class_counts[cls] = len(files)

print(class_counts)

{'BabyCryingSounds': 9, 'Baby Crying Sounds': 9}


In [11]:
#Convert audio to mel spectogram
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt

def audio_to_melspectrogram(file_path, max_len=128):
    y, sr = librosa.load(file_path, sr=22050)
    mels = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
    log_mels = librosa.power_to_db(mels, ref=np.max)

    if log_mels.shape[1] < max_len:
        pad_width = max_len - log_mels.shape[1]
        log_mels = np.pad(log_mels, pad_width=((0, 0), (0, pad_width)), mode='constant')
    else:
        log_mels = log_mels[:, :max_len]

    return log_mels

In [12]:
import tqdm

X = []
y = []
label_map = {label: idx for idx, label in enumerate(sorted(class_counts.keys()))}

audio_extensions = ('*.wav', '*.ogg')
for label in label_map:
    folder = os.path.join(base_dir, label)
    for file in tqdm.tqdm(base_dir, desc=f"Processing {label}"):
        file_path = os.path.join(folder, file)
        if not file_path.lower().endswith(audio_extensions):
            continue
        try:
            mel = audio_to_melspectrogram(file_path)
            X.append(mel)
            y.append(label_map[label])
        except Exception as e:
            print(f"Error processing {file_path}: {e}")

Processing Baby Crying Sounds: 100%|██████████| 53/53 [00:00<00:00, 106159.56it/s]


In [13]:
import os
import glob
from collections import defaultdict

#Path to the main dataset folder on Google Drive
base_dir = "/content/drive/MyDrive/baby_cry_data/BabyCryingSounds"

#Classes in the dataset
labels = ['belly pain', 'burping', 'cold_hot', 'discomfort',
                   'hungry', 'laugh', 'noise', 'silence', 'tired']

#Mapping: label name → list of audio file paths
data_by_class = defaultdict(list)

#Build class-to-files mapping
audio_extensions = ('*.wav', '*.ogg')

for label in labels:
    folder_path = os.path.join(base_dir, label)
    if os.path.exists(folder_path):
        audio_files = []
        for ext in audio_extensions:
            audio_files.extend(glob.glob(os.path.join(folder_path, ext)))
        data_by_class[label].extend(audio_files)
        print(f"✅ Loaded {len(audio_files)} files for class: {label}")
    else:
        print(f"⚠️ Folder not found for class: {label}")

print("\n📊 Class distribution:")
for label, files in data_by_class.items():
    print(f"{label:12s} → {len(files)} files")

✅ Loaded 124 files for class: belly pain
✅ Loaded 108 files for class: burping
✅ Loaded 108 files for class: cold_hot
✅ Loaded 135 files for class: discomfort
✅ Loaded 382 files for class: hungry
✅ Loaded 108 files for class: laugh
✅ Loaded 108 files for class: noise
✅ Loaded 108 files for class: silence
✅ Loaded 132 files for class: tired

📊 Class distribution:
belly pain   → 124 files
burping      → 108 files
cold_hot     → 108 files
discomfort   → 135 files
hungry       → 382 files
laugh        → 108 files
noise        → 108 files
silence      → 108 files
tired        → 132 files


In [14]:
#Convert Audio to Mel Spectrogram
import librosa
import numpy as np

IMG_HEIGHT = 128
IMG_WIDTH = 128

def audio_to_mel(file_path):
    y, sr = librosa.load(file_path, sr=22050)
    mels = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=IMG_HEIGHT)
    db = librosa.power_to_db(mels, ref=np.max)

    if db.shape[1] < IMG_WIDTH:
        pad = IMG_WIDTH - db.shape[1]
        db = np.pad(db, ((0, 0), (0, pad)), mode='constant')
    else:
        db = db[:, :IMG_WIDTH]

    return db

In [15]:
#Build Dataset
X = []
y = []

for label, file_paths in data_by_class.items():
    for path in file_paths:
        try:
            mel = audio_to_mel(path)
            X.append(mel)
            y.append(label)

        except Exception as e:
            print(f"Error: {path} - {e}")

X = np.array(X)
X = X[..., np.newaxis]

In [16]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

y = np.array(y)

le = LabelEncoder()
y_encoded = le.fit_transform(y)

y_cat = to_categorical(y_encoded)

y_labels = le.inverse_transform(y_encoded)

#First split: Train + Test (80/20)
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y_labels, test_size=0.2, random_state=42, stratify=y_labels
)

#Second split: Train + Val (80/20 of the remaining → 64/16 overall)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.2, random_state=42, stratify=y_train_val
)

In [17]:
import os
from tqdm import tqdm

output_base = '/content/baby_cry_split'

#Create directories for train, val, test
for split in ['train', 'val', 'test']:
    for class_name in le.classes_:
        os.makedirs(os.path.join(output_base, split, class_name), exist_ok=True)

def save_split(X_split, y_split, split_name):
    for idx, (mel, label) in enumerate(tqdm(zip(X_split, y_split), total=len(X_split), desc=f"Saving {split_name}")):
        save_path = os.path.join(output_base, split_name, label, f"mel_{idx}.npy")
        np.save(save_path, mel)

save_split(X_train, y_train, 'train')
save_split(X_val, y_val, 'val')
save_split(X_test, y_test, 'test')


Saving train: 100%|██████████| 840/840 [00:00<00:00, 1854.69it/s]
Saving val: 100%|██████████| 210/210 [00:00<00:00, 1976.93it/s]
Saving test: 100%|██████████| 263/263 [00:00<00:00, 1976.72it/s]


# Model training cnn resnet **18**

In [18]:
!pip install torch torchvision --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m39.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [19]:
!pip uninstall -y sympy
!pip install sympy==1.12

Found existing installation: sympy 1.13.1
Uninstalling sympy-1.13.1:
  Successfully uninstalled sympy-1.13.1
Collecting sympy==1.12
  Downloading sympy-1.12-py3-none-any.whl.metadata (12 kB)
Downloading sympy-1.12-py3-none-any.whl (5.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.7/5.7 MB[0m [31m39.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sympy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.6.0+cu124 requires sympy==1.13.1; python_version >= "3.9", but you have sympy 1.12 which is incompatible.[0m[31m
[0mSuccessfully installed sympy-1.12


# CNN Base Model

In [35]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from tqdm import tqdm
import numpy as np

# Hyperparameters
BATCH_SIZE = 64
EPOCHS = 50
LEARNING_RATE = 1e-3
PATIENCE = 5  #For early stopping

train_transforms = transforms.Compose([
    transforms.RandomResizedCrop(128),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

val_transforms = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

data_dir = '/content/baby_cry_split/'
try:
    train_dataset = CryDataset(f"{data_dir}/train", transform=train_transforms)
    val_dataset = CryDataset(f"{data_dir}/val", transform=val_transforms)

    #We check if datasets are empty before creating DataLoaders
    if len(train_dataset) == 0:
        print("Error: Training dataset is empty. Cannot create DataLoader.")
        train_loader = None
    else:
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, collate_fn=custom_collate_fn) # Use custom collate_fn
        print(f"Training dataset size: {len(train_dataset)}")


    if len(val_dataset) == 0:
        print("Error: Validation dataset is empty. Cannot create DataLoader.")
        val_loader = None
    else:
        val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, collate_fn=custom_collate_fn) # Use custom collate_fn
        print(f"Validation dataset size: {len(val_dataset)}")

    #Determine the number of classes from the training dataset
    if train_dataset is not None:
         NUM_CLASSES = len(train_dataset.classes)
         print(f"Number of classes: {NUM_CLASSES}")
    else:
         print("Error: Could not determine the number of classes from the training dataset.")
         NUM_CLASSES = 0

except NameError:
    print("Error: CryDataset or custom_collate_fn not defined. Please ensure the cell defining them is executed.")
    train_loader = None
    val_loader = None
    NUM_CLASSES = 0

model = None
if NUM_CLASSES > 0:
    class BaseCNN(nn.Module):
        def __init__(self, num_classes):
            super(BaseCNN, self).__init__()
            self.features = nn.Sequential(
                nn.Conv2d(3, 32, 3, padding=1),
                nn.BatchNorm2d(32),
                nn.ReLU(),
                nn.MaxPool2d(2),

                nn.Conv2d(32, 64, 3, padding=1),
                nn.BatchNorm2d(64),
                nn.ReLU(),
                nn.MaxPool2d(2),

                nn.Conv2d(64, 128, 3, padding=1),
                nn.BatchNorm2d(128),
                nn.ReLU(),
                nn.MaxPool2d(2),

                nn.Dropout(0.3)  #For regularization
            )
            self.classifier = nn.Sequential(
                nn.Flatten(),
                nn.Linear(128 * 16 * 16, 256),
                nn.ReLU(),
                nn.Dropout(0.5),  #For regularization
                nn.Linear(256, num_classes)
            )

        def forward(self, x):
            x = self.features(x)
            x = self.classifier(x)
            return x

    model = BaseCNN(num_classes=NUM_CLASSES).to(device)
    print("Base CNN model loaded.")
else:
    print("Base CNN model not loaded due to insufficient number of classes.")

criterion = None
optimizer = None

if model is not None:
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4)
    print("Loss and optimizer initialized.")
else:
    print("Loss and optimizer not initialized as model was not loaded.")

best_val_loss = float('inf')
patience_counter = 0

if train_loader is not None and val_loader is not None and model is not None:
    for epoch in range(EPOCHS):
        model.train()
        train_loss = 0
        correct = 0
        total = 0

        for images, labels in tqdm(train_loader, desc=f"Epoch {epoch+1} [Train]"):
            if images.size(0) == 0:
                 continue
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item() * images.size(0)
            _, preds = torch.max(outputs, 1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

        avg_train_loss = train_loss / total if total > 0 else 0
        train_acc = correct / total if total > 0 else 0


        #Validation phase
        model.eval()
        val_loss = 0
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            for images, labels in tqdm(val_loader, desc=f"Epoch {epoch+1} [Val]"):
                if images.size(0) == 0:
                     continue
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                loss = criterion(outputs, labels)

                val_loss += loss.item() * images.size(0)
                _, preds = torch.max(outputs, 1)
                val_correct += (preds == labels).sum().item()
                val_total += labels.size(0)

        avg_val_loss = val_loss / val_total if val_total > 0 else 0
        val_acc = val_correct / val_total if val_total > 0 else 0


        print(f"\n📊 Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Train Acc: {train_acc:.3f} | "
              f"Val Loss: {avg_val_loss:.4f} | Val Acc: {val_acc:.3f}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), 'best_cnn_model.pth')
            patience_counter = 0
            print("✅ Best model saved.")
        else:
            patience_counter += 1
            if patience_counter >= PATIENCE:
                print("⏹️ Early stopping triggered.")
                break

    try:
        model.load_state_dict(torch.load('best_cnn_model.pth'))
        print("✅ Best model loaded.")
    except FileNotFoundError:
        print("⚠️ Best model file not found. Skipping model loading.")

else:
    print("\nTraining skipped due to empty dataset(s) or model loading failure.")

🚀 Using device: cpu
Training dataset size: 840
Validation dataset size: 210
Number of classes: 9
Base CNN model loaded.
Loss and optimizer initialized.


Epoch 1 [Train]: 100%|██████████| 14/14 [00:47<00:00,  3.41s/it]
Epoch 1 [Val]: 100%|██████████| 4/4 [00:05<00:00,  1.32s/it]



📊 Epoch 1 | Train Loss: 7.6468 | Train Acc: 0.168 | Val Loss: 2.1678 | Val Acc: 0.305
✅ Best model saved.


Epoch 2 [Train]: 100%|██████████| 14/14 [00:46<00:00,  3.33s/it]
Epoch 2 [Val]: 100%|██████████| 4/4 [00:04<00:00,  1.20s/it]



📊 Epoch 2 | Train Loss: 2.1890 | Train Acc: 0.173 | Val Loss: 2.0315 | Val Acc: 0.371
✅ Best model saved.


Epoch 3 [Train]: 100%|██████████| 14/14 [00:44<00:00,  3.17s/it]
Epoch 3 [Val]: 100%|██████████| 4/4 [00:06<00:00,  1.52s/it]



📊 Epoch 3 | Train Loss: 2.0832 | Train Acc: 0.181 | Val Loss: 1.8510 | Val Acc: 0.371
✅ Best model saved.


Epoch 4 [Train]: 100%|██████████| 14/14 [00:45<00:00,  3.25s/it]
Epoch 4 [Val]: 100%|██████████| 4/4 [00:04<00:00,  1.18s/it]



📊 Epoch 4 | Train Loss: 2.0515 | Train Acc: 0.218 | Val Loss: 1.8149 | Val Acc: 0.371
✅ Best model saved.


Epoch 5 [Train]: 100%|██████████| 14/14 [00:46<00:00,  3.30s/it]
Epoch 5 [Val]: 100%|██████████| 4/4 [00:05<00:00,  1.38s/it]



📊 Epoch 5 | Train Loss: 2.0104 | Train Acc: 0.262 | Val Loss: 1.7788 | Val Acc: 0.371
✅ Best model saved.


Epoch 6 [Train]: 100%|██████████| 14/14 [00:44<00:00,  3.16s/it]
Epoch 6 [Val]: 100%|██████████| 4/4 [00:05<00:00,  1.29s/it]



📊 Epoch 6 | Train Loss: 2.0221 | Train Acc: 0.267 | Val Loss: 1.7430 | Val Acc: 0.371
✅ Best model saved.


Epoch 7 [Train]: 100%|██████████| 14/14 [00:45<00:00,  3.25s/it]
Epoch 7 [Val]: 100%|██████████| 4/4 [00:04<00:00,  1.21s/it]



📊 Epoch 7 | Train Loss: 1.9939 | Train Acc: 0.313 | Val Loss: 1.7691 | Val Acc: 0.371


Epoch 8 [Train]: 100%|██████████| 14/14 [00:44<00:00,  3.19s/it]
Epoch 8 [Val]: 100%|██████████| 4/4 [00:05<00:00,  1.46s/it]



📊 Epoch 8 | Train Loss: 1.9727 | Train Acc: 0.330 | Val Loss: 1.7554 | Val Acc: 0.371


Epoch 9 [Train]: 100%|██████████| 14/14 [00:45<00:00,  3.22s/it]
Epoch 9 [Val]: 100%|██████████| 4/4 [00:04<00:00,  1.19s/it]



📊 Epoch 9 | Train Loss: 2.0002 | Train Acc: 0.336 | Val Loss: 1.7227 | Val Acc: 0.371
✅ Best model saved.


Epoch 10 [Train]: 100%|██████████| 14/14 [00:45<00:00,  3.25s/it]
Epoch 10 [Val]: 100%|██████████| 4/4 [00:05<00:00,  1.46s/it]



📊 Epoch 10 | Train Loss: 1.9475 | Train Acc: 0.330 | Val Loss: 1.7627 | Val Acc: 0.367


Epoch 11 [Train]: 100%|██████████| 14/14 [00:45<00:00,  3.25s/it]
Epoch 11 [Val]: 100%|██████████| 4/4 [00:05<00:00,  1.32s/it]



📊 Epoch 11 | Train Loss: 1.9344 | Train Acc: 0.339 | Val Loss: 1.7519 | Val Acc: 0.371


Epoch 12 [Train]: 100%|██████████| 14/14 [00:45<00:00,  3.25s/it]
Epoch 12 [Val]: 100%|██████████| 4/4 [00:04<00:00,  1.22s/it]



📊 Epoch 12 | Train Loss: 1.9605 | Train Acc: 0.336 | Val Loss: 1.7455 | Val Acc: 0.371


Epoch 13 [Train]: 100%|██████████| 14/14 [00:44<00:00,  3.18s/it]
Epoch 13 [Val]: 100%|██████████| 4/4 [00:06<00:00,  1.51s/it]



📊 Epoch 13 | Train Loss: 1.9230 | Train Acc: 0.342 | Val Loss: 1.7243 | Val Acc: 0.371


Epoch 14 [Train]: 100%|██████████| 14/14 [00:44<00:00,  3.19s/it]
Epoch 14 [Val]: 100%|██████████| 4/4 [00:04<00:00,  1.20s/it]


📊 Epoch 14 | Train Loss: 1.9173 | Train Acc: 0.348 | Val Loss: 1.7897 | Val Acc: 0.371
⏹️ Early stopping triggered.
✅ Best model loaded.





EfficientNet_B0

In [30]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms
from torchvision.models import efficientnet_b0, EfficientNet_B0_Weights
from torch.utils.data import DataLoader
import numpy as np

BATCH_SIZE = 32
EPOCHS = 30
LR = 1e-3
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

data_transforms = {
    'train': transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5]*3, std=[0.5]*3)
    ]),
    'val': transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5]*3, std=[0.5]*3)
    ])
}

data_dir = '/content/baby_cry_split/'
datasets = {
    x: CryDataset(f"{data_dir}/{x}", data_transforms[x])
    for x in ['train', 'val']
}

dataloaders = {}
if len(datasets['train']) == 0:
    print("Error: Training dataset is empty. Cannot create DataLoader.")
    train_loader = None
else:
    dataloaders['train'] = DataLoader(datasets['train'], batch_size=BATCH_SIZE, shuffle=True, collate_fn=custom_collate_fn)
    print(f"Training dataset size: {len(datasets['train'])}")


if len(datasets['val']) == 0:
    print("Error: Validation dataset is empty. Cannot create DataLoader.")
    val_loader = None
else:
    dataloaders['val'] = DataLoader(datasets['val'], batch_size=BATCH_SIZE, shuffle=False, collate_fn=custom_collate_fn)
    print(f"Validation dataset size: {len(datasets['val'])}")


if 'train' in dataloaders and datasets['train'] is not None:
    NUM_CLASSES = len(datasets['train'].classes)
    print(f"Number of classes: {NUM_CLASSES}")
else:
    print("Error: Could not determine the number of classes from the training dataset.")
    NUM_CLASSES = 0


model = None
if NUM_CLASSES > 0:
    model = efficientnet_b0(weights=EfficientNet_B0_Weights.DEFAULT)
    model.classifier[1] = nn.Linear(model.classifier[1].in_features, NUM_CLASSES)
    model = model.to(DEVICE)

    #Unfreeze deeper layers for fine-tuning
    for name, param in model.features.named_parameters():
        #Unfreeze blocks 4, 5, 6, 7 and the classifier
        if any(block in name for block in ["4", "5", "6", "7"]) or 'classifier' in name:
             param.requires_grad = True
        else:
             param.requires_grad = False
    print("Model loaded and modified for fine-tuning.")
else:
    print("Model not loaded due to insufficient number of classes.")


criterion = None
optimizer = None
scheduler = None

if model is not None:
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=LR)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)
    print("Loss, Optimizer, and Scheduler initialized.")
else:
    print("Loss, Optimizer, and Scheduler not initialized as model was not loaded.")

def mixup_data(x, y, alpha=1.0):
    if x.size(0) == 0:
        return x, y, y, 0.0
    lam = np.random.beta(alpha, alpha)
    index = torch.randperm(x.size(0)).to(x.device)
    mixed_x = lam * x + (1 - lam) * x[index, :]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam

def mixup_criterion(criterion, pred, y_a, y_b, lam):
    if pred.size(0) == 0:
        return torch.tensor(0.0, device=pred.device, requires_grad=True)
    return lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)


best_val_acc = 0.0
if 'train' in dataloaders and 'val' in dataloaders and model is not None:
    for epoch in range(EPOCHS):
        print(f"\nEpoch {epoch+1:02d}")

        model.train()
        running_loss, running_corrects = 0.0, 0
        total_train_samples = 0

        for inputs, labels in dataloaders['train']:
            if inputs.size(0) == 0:
                continue
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)

            inputs, targets_a, targets_b, lam = mixup_data(inputs, labels)
            outputs = model(inputs)
            loss = mixup_criterion(criterion, outputs, targets_a, targets_b, lam)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            _, preds = torch.max(outputs, 1)
            running_loss += loss.item() * inputs.size(0)
            running_corrects += (lam * preds.eq(targets_a).sum().item() + (1 - lam) * preds.eq(targets_b).sum().item())
            total_train_samples += inputs.size(0)


        epoch_loss = running_loss / total_train_samples if total_train_samples > 0 else 0.0
        epoch_acc = running_corrects / total_train_samples if total_train_samples > 0 else 0.0
        print(f"Train Loss: {epoch_loss:.4f} | Train Acc: {epoch_acc:.3f}")

        model.eval()
        val_loss, val_corrects = 0.0, 0
        total_val_samples = 0

        with torch.no_grad():
            for inputs, labels in dataloaders['val']:
                if inputs.size(0) == 0:
                    continue
                inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                _, preds = torch.max(outputs, 1)
                val_loss += loss.item() * inputs.size(0)
                val_corrects += torch.sum(preds == labels).item()
                total_val_samples += inputs.size(0)


        val_epoch_loss = val_loss / total_val_samples if total_val_samples > 0 else 0.0
        val_epoch_acc = val_corrects / total_val_samples if total_val_samples > 0 else 0.0
        print(f"Val Loss: {val_epoch_loss:.4f} | Val Acc: {val_epoch_acc:.3f}")

        if total_val_samples > 0:
             scheduler.step(val_epoch_loss)

        if val_epoch_acc > best_val_acc:
            best_val_acc = val_epoch_acc
            torch.save(model.state_dict(), "best_model.pth")
            print("✅ Best model saved!")
    print("\nTraining complete.")

else:
    print("\nTraining skipped due to empty dataset(s) or model loading failure.")

Training dataset size: 840
Validation dataset size: 210
Number of classes: 9


Downloading: "https://download.pytorch.org/models/efficientnet_b0_rwightman-7f5810bc.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b0_rwightman-7f5810bc.pth
100%|██████████| 20.5M/20.5M [00:00<00:00, 86.8MB/s]


Model loaded and modified for fine-tuning.
Loss, Optimizer, and Scheduler initialized.

Epoch 01
Train Loss: 1.8262 | Train Acc: 0.381
Val Loss: 1.7669 | Val Acc: 0.395
✅ Best model saved!

Epoch 02
Train Loss: 1.6117 | Train Acc: 0.459
Val Loss: 1.7212 | Val Acc: 0.262

Epoch 03
Train Loss: 1.5225 | Train Acc: 0.487
Val Loss: 1.4301 | Val Acc: 0.510
✅ Best model saved!

Epoch 04
Train Loss: 1.4636 | Train Acc: 0.479
Val Loss: 1.3209 | Val Acc: 0.524
✅ Best model saved!

Epoch 05
Train Loss: 1.4399 | Train Acc: 0.508
Val Loss: 1.3187 | Val Acc: 0.505

Epoch 06
Train Loss: 1.4230 | Train Acc: 0.500
Val Loss: 1.3645 | Val Acc: 0.433

Epoch 07
Train Loss: 1.3712 | Train Acc: 0.523
Val Loss: 1.4016 | Val Acc: 0.481

Epoch 08
Train Loss: 1.3164 | Train Acc: 0.540
Val Loss: 1.4765 | Val Acc: 0.452

Epoch 09
Train Loss: 1.3281 | Train Acc: 0.525
Val Loss: 1.5206 | Val Acc: 0.471

Epoch 10
Train Loss: 1.2167 | Train Acc: 0.572
Val Loss: 1.6098 | Val Acc: 0.419

Epoch 11
Train Loss: 1.1563 | Tr

# mobile v3net

In [26]:
import os
import numpy as np
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
import torch.nn as nn
import torch.optim as optim
from sklearn.utils.class_weight import compute_class_weight
from tqdm import tqdm

torch.backends.cudnn.benchmark = True

class CryDataset(Dataset):
    def __init__(self, folder, transform=None):
        self.paths, self.labels = [], []
        for label in sorted(os.listdir(folder)):
            class_dir = os.path.join(folder, label)
            if os.path.isdir(class_dir):
                for file in os.listdir(class_dir):
                    if file.endswith(".npy"):
                        self.paths.append(os.path.join(class_dir, file))
                        self.labels.append(label)
        self.classes = sorted(list(set(self.labels)))
        self.class_to_idx = {cls: i for i, cls in enumerate(self.classes)}
        self.transform = transform

    def __len__(self): return len(self.paths)

    def __getitem__(self, idx):
        path = self.paths[idx]
        label_str = self.labels[idx]
        label = self.class_to_idx[label_str]
        try:
            mel_spec = np.load(path)
            mel_spec = np.squeeze(mel_spec)
            mel_norm = (mel_spec - mel_spec.min()) / (mel_spec.max() - mel_spec.min() + 1e-6)
            img = Image.fromarray((mel_norm * 255).astype(np.uint8)).resize((224, 224)).convert("RGB")
            if self.transform: img = self.transform(img)
            return img, label
        except Exception as e:
            print(f"Error loading {path}: {e}")
            return None, None

def custom_collate_fn(batch):
    batch = [b for b in batch if b[0] is not None]
    if len(batch) == 0: return torch.empty(0), torch.empty(0)
    imgs, labels = zip(*batch)
    return torch.stack(imgs), torch.tensor(labels)

train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.6, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.3, hue=0.02),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5]*3, std=[0.5]*3)
])
val_transform = transforms.Compose([
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5]*3, std=[0.5]*3)
])

train_data = CryDataset("/content/baby_cry_split/train", transform=train_transform)
val_data = CryDataset("/content/baby_cry_split/val", transform=val_transform)

batch_size = 16
num_workers = 2

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True,
                          collate_fn=custom_collate_fn, num_workers=num_workers)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False,
                        collate_fn=custom_collate_fn, num_workers=num_workers)

train_labels = [train_data.class_to_idx[label] for label in train_data.labels]
class_weights = torch.tensor(
    compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels),
    dtype=torch.float
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("🚀 Using device:", device)

model = models.mobilenet_v3_large(weights=models.MobileNet_V3_Large_Weights.DEFAULT)

for param in model.features.parameters():
    param.requires_grad = False

model.classifier = nn.Sequential(
    nn.Linear(model.classifier[0].in_features, 256),
    nn.ReLU(),
    nn.Dropout(0.6),
    nn.Linear(256, len(train_data.classes))
)
model = model.to(device)


criterion = nn.CrossEntropyLoss(weight=class_weights.to(device), label_smoothing=0.1)
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-4)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=3, factor=0.5, verbose=True)


best_val_loss = float("inf")
no_improve = 0
early_stop_patience = 6
max_epochs = 30

for epoch in range(1, max_epochs + 1):
    model.train()
    train_loss, correct, total = 0.0, 0, 0

    loop = tqdm(train_loader, desc=f"Epoch {epoch:02d} [Train]")
    for X, y in loop:
        if X.size(0) == 0: continue
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * X.size(0)
        correct += (outputs.argmax(1) == y).sum().item()
        total += X.size(0)
        loop.set_postfix(loss=loss.item())

    train_loss /= total
    train_acc = correct / total

    model.eval()
    val_loss, val_correct, val_total = 0.0, 0, 0
    with torch.no_grad():
        for X, y in tqdm(val_loader, desc=f"Epoch {epoch:02d} [Val]"):
            if X.size(0) == 0: continue
            X, y = X.to(device), y.to(device)
            outputs = model(X)
            loss = criterion(outputs, y)
            val_loss += loss.item() * X.size(0)
            val_correct += (outputs.argmax(1) == y).sum().item()
            val_total += X.size(0)

    val_loss /= val_total
    val_acc = val_correct / val_total

    print(f"\n📊 Epoch {epoch:02d} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.3f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.3f}")
    scheduler.step(val_loss)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "best_model.pth")
        no_improve = 0
    else:
        no_improve += 1
        if no_improve >= early_stop_patience:
            print("⏹️ Early stopping triggered.")
            break

try:
    model.load_state_dict(torch.load("best_model.pth"))
    model.eval()
    print("✅ Best model loaded.")
except FileNotFoundError:
    print("⚠️ No saved model found.")


🚀 Using device: cpu


Downloading: "https://download.pytorch.org/models/mobilenet_v3_large-5c1a4163.pth" to /root/.cache/torch/hub/checkpoints/mobilenet_v3_large-5c1a4163.pth
100%|██████████| 21.1M/21.1M [00:00<00:00, 60.0MB/s]
Epoch 01 [Train]: 100%|██████████| 53/53 [00:41<00:00,  1.29it/s, loss=2.2]
Epoch 01 [Val]: 100%|██████████| 14/14 [00:05<00:00,  2.51it/s]



📊 Epoch 01 | Train Loss: 2.2105 | Train Acc: 0.167 | Val Loss: 2.2786 | Val Acc: 0.276


Epoch 02 [Train]: 100%|██████████| 53/53 [00:35<00:00,  1.49it/s, loss=2.03]
Epoch 02 [Val]: 100%|██████████| 14/14 [00:05<00:00,  2.45it/s]



📊 Epoch 02 | Train Loss: 2.1419 | Train Acc: 0.218 | Val Loss: 2.2442 | Val Acc: 0.276


Epoch 03 [Train]: 100%|██████████| 53/53 [00:34<00:00,  1.54it/s, loss=2.21]
Epoch 03 [Val]: 100%|██████████| 14/14 [00:06<00:00,  2.15it/s]



📊 Epoch 03 | Train Loss: 2.0811 | Train Acc: 0.285 | Val Loss: 2.2022 | Val Acc: 0.343


Epoch 04 [Train]: 100%|██████████| 53/53 [00:34<00:00,  1.54it/s, loss=1.89]
Epoch 04 [Val]: 100%|██████████| 14/14 [00:06<00:00,  2.14it/s]



📊 Epoch 04 | Train Loss: 2.0087 | Train Acc: 0.299 | Val Loss: 2.1599 | Val Acc: 0.386


Epoch 05 [Train]: 100%|██████████| 53/53 [00:34<00:00,  1.54it/s, loss=1.87]
Epoch 05 [Val]: 100%|██████████| 14/14 [00:06<00:00,  2.16it/s]



📊 Epoch 05 | Train Loss: 1.9534 | Train Acc: 0.305 | Val Loss: 2.1218 | Val Acc: 0.371


Epoch 06 [Train]: 100%|██████████| 53/53 [00:34<00:00,  1.54it/s, loss=1.96]
Epoch 06 [Val]: 100%|██████████| 14/14 [00:06<00:00,  2.17it/s]



📊 Epoch 06 | Train Loss: 1.9112 | Train Acc: 0.311 | Val Loss: 2.0795 | Val Acc: 0.362


Epoch 07 [Train]: 100%|██████████| 53/53 [00:35<00:00,  1.51it/s, loss=1.82]
Epoch 07 [Val]: 100%|██████████| 14/14 [00:06<00:00,  2.30it/s]



📊 Epoch 07 | Train Loss: 1.8798 | Train Acc: 0.318 | Val Loss: 2.0498 | Val Acc: 0.324


Epoch 08 [Train]: 100%|██████████| 53/53 [00:35<00:00,  1.51it/s, loss=1.94]
Epoch 08 [Val]: 100%|██████████| 14/14 [00:05<00:00,  2.57it/s]



📊 Epoch 08 | Train Loss: 1.8386 | Train Acc: 0.340 | Val Loss: 2.0271 | Val Acc: 0.329


Epoch 09 [Train]: 100%|██████████| 53/53 [00:35<00:00,  1.49it/s, loss=2.2]
Epoch 09 [Val]: 100%|██████████| 14/14 [00:05<00:00,  2.60it/s]



📊 Epoch 09 | Train Loss: 1.8316 | Train Acc: 0.335 | Val Loss: 2.0153 | Val Acc: 0.286


Epoch 10 [Train]: 100%|██████████| 53/53 [00:35<00:00,  1.49it/s, loss=1.64]
Epoch 10 [Val]: 100%|██████████| 14/14 [00:05<00:00,  2.61it/s]



📊 Epoch 10 | Train Loss: 1.7928 | Train Acc: 0.350 | Val Loss: 2.0037 | Val Acc: 0.262


Epoch 11 [Train]: 100%|██████████| 53/53 [00:35<00:00,  1.49it/s, loss=1.69]
Epoch 11 [Val]: 100%|██████████| 14/14 [00:05<00:00,  2.58it/s]



📊 Epoch 11 | Train Loss: 1.7677 | Train Acc: 0.369 | Val Loss: 1.9934 | Val Acc: 0.252


Epoch 12 [Train]: 100%|██████████| 53/53 [00:35<00:00,  1.49it/s, loss=1.65]
Epoch 12 [Val]: 100%|██████████| 14/14 [00:05<00:00,  2.60it/s]



📊 Epoch 12 | Train Loss: 1.7655 | Train Acc: 0.357 | Val Loss: 1.9927 | Val Acc: 0.233


Epoch 13 [Train]: 100%|██████████| 53/53 [00:35<00:00,  1.51it/s, loss=1.91]
Epoch 13 [Val]: 100%|██████████| 14/14 [00:06<00:00,  2.33it/s]



📊 Epoch 13 | Train Loss: 1.7495 | Train Acc: 0.349 | Val Loss: 1.9770 | Val Acc: 0.252


Epoch 14 [Train]: 100%|██████████| 53/53 [00:35<00:00,  1.51it/s, loss=2.01]
Epoch 14 [Val]: 100%|██████████| 14/14 [00:06<00:00,  2.27it/s]



📊 Epoch 14 | Train Loss: 1.7229 | Train Acc: 0.381 | Val Loss: 1.9798 | Val Acc: 0.229


Epoch 15 [Train]: 100%|██████████| 53/53 [00:34<00:00,  1.53it/s, loss=1.73]
Epoch 15 [Val]: 100%|██████████| 14/14 [00:07<00:00,  1.76it/s]



📊 Epoch 15 | Train Loss: 1.7186 | Train Acc: 0.360 | Val Loss: 1.9684 | Val Acc: 0.243


Epoch 16 [Train]: 100%|██████████| 53/53 [00:34<00:00,  1.53it/s, loss=1.73]
Epoch 16 [Val]: 100%|██████████| 14/14 [00:06<00:00,  2.16it/s]



📊 Epoch 16 | Train Loss: 1.6962 | Train Acc: 0.371 | Val Loss: 1.9504 | Val Acc: 0.238


Epoch 17 [Train]: 100%|██████████| 53/53 [00:34<00:00,  1.54it/s, loss=1.85]
Epoch 17 [Val]: 100%|██████████| 14/14 [00:06<00:00,  2.15it/s]



📊 Epoch 17 | Train Loss: 1.6892 | Train Acc: 0.364 | Val Loss: 1.9015 | Val Acc: 0.295


Epoch 18 [Train]: 100%|██████████| 53/53 [00:34<00:00,  1.55it/s, loss=1.86]
Epoch 18 [Val]: 100%|██████████| 14/14 [00:05<00:00,  2.39it/s]



📊 Epoch 18 | Train Loss: 1.6902 | Train Acc: 0.376 | Val Loss: 1.8661 | Val Acc: 0.362


Epoch 19 [Train]: 100%|██████████| 53/53 [00:35<00:00,  1.51it/s, loss=1.48]
Epoch 19 [Val]: 100%|██████████| 14/14 [00:05<00:00,  2.47it/s]



📊 Epoch 19 | Train Loss: 1.6696 | Train Acc: 0.398 | Val Loss: 1.8273 | Val Acc: 0.371


Epoch 20 [Train]: 100%|██████████| 53/53 [00:35<00:00,  1.50it/s, loss=1.93]
Epoch 20 [Val]: 100%|██████████| 14/14 [00:05<00:00,  2.63it/s]



📊 Epoch 20 | Train Loss: 1.6812 | Train Acc: 0.379 | Val Loss: 1.7957 | Val Acc: 0.405


Epoch 21 [Train]: 100%|██████████| 53/53 [00:35<00:00,  1.49it/s, loss=1.65]
Epoch 21 [Val]: 100%|██████████| 14/14 [00:05<00:00,  2.61it/s]



📊 Epoch 21 | Train Loss: 1.6751 | Train Acc: 0.396 | Val Loss: 1.7623 | Val Acc: 0.424


Epoch 22 [Train]: 100%|██████████| 53/53 [00:35<00:00,  1.50it/s, loss=2.11]
Epoch 22 [Val]: 100%|██████████| 14/14 [00:05<00:00,  2.58it/s]



📊 Epoch 22 | Train Loss: 1.6624 | Train Acc: 0.400 | Val Loss: 1.7586 | Val Acc: 0.390


Epoch 23 [Train]: 100%|██████████| 53/53 [00:35<00:00,  1.50it/s, loss=1.73]
Epoch 23 [Val]: 100%|██████████| 14/14 [00:05<00:00,  2.60it/s]



📊 Epoch 23 | Train Loss: 1.6705 | Train Acc: 0.381 | Val Loss: 1.7350 | Val Acc: 0.443


Epoch 24 [Train]: 100%|██████████| 53/53 [00:35<00:00,  1.51it/s, loss=1.45]
Epoch 24 [Val]: 100%|██████████| 14/14 [00:05<00:00,  2.40it/s]



📊 Epoch 24 | Train Loss: 1.6422 | Train Acc: 0.412 | Val Loss: 1.7374 | Val Acc: 0.405


Epoch 25 [Train]: 100%|██████████| 53/53 [00:34<00:00,  1.53it/s, loss=2.33]
Epoch 25 [Val]: 100%|██████████| 14/14 [00:06<00:00,  2.23it/s]



📊 Epoch 25 | Train Loss: 1.6745 | Train Acc: 0.375 | Val Loss: 1.7325 | Val Acc: 0.414


Epoch 26 [Train]: 100%|██████████| 53/53 [00:34<00:00,  1.54it/s, loss=1.52]
Epoch 26 [Val]: 100%|██████████| 14/14 [00:06<00:00,  2.17it/s]



📊 Epoch 26 | Train Loss: 1.6578 | Train Acc: 0.379 | Val Loss: 1.7342 | Val Acc: 0.405


Epoch 27 [Train]: 100%|██████████| 53/53 [00:34<00:00,  1.54it/s, loss=1.74]
Epoch 27 [Val]: 100%|██████████| 14/14 [00:06<00:00,  2.14it/s]



📊 Epoch 27 | Train Loss: 1.6355 | Train Acc: 0.407 | Val Loss: 1.7219 | Val Acc: 0.410


Epoch 28 [Train]: 100%|██████████| 53/53 [00:34<00:00,  1.53it/s, loss=1.44]
Epoch 28 [Val]: 100%|██████████| 14/14 [00:06<00:00,  2.13it/s]



📊 Epoch 28 | Train Loss: 1.6239 | Train Acc: 0.390 | Val Loss: 1.7225 | Val Acc: 0.400


Epoch 29 [Train]: 100%|██████████| 53/53 [00:34<00:00,  1.54it/s, loss=1.46]
Epoch 29 [Val]: 100%|██████████| 14/14 [00:05<00:00,  2.37it/s]



📊 Epoch 29 | Train Loss: 1.6408 | Train Acc: 0.411 | Val Loss: 1.7232 | Val Acc: 0.376


Epoch 30 [Train]: 100%|██████████| 53/53 [00:35<00:00,  1.51it/s, loss=1.52]
Epoch 30 [Val]: 100%|██████████| 14/14 [00:05<00:00,  2.54it/s]


📊 Epoch 30 | Train Loss: 1.6066 | Train Acc: 0.406 | Val Loss: 1.7217 | Val Acc: 0.400
✅ Best model loaded.





latest resnet 50

In [24]:
import os
import numpy as np
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
import torch.nn as nn
import torch.optim as optim
from sklearn.utils.class_weight import compute_class_weight
from tqdm import tqdm

torch.backends.cudnn.benchmark = True

class CryDataset(Dataset):
    def __init__(self, folder, transform=None):
        self.paths, self.labels = [], []
        for label in sorted(os.listdir(folder)):
            class_dir = os.path.join(folder, label)
            if os.path.isdir(class_dir):
                for file in os.listdir(class_dir):
                    if file.endswith(".npy"):
                        self.paths.append(os.path.join(class_dir, file))
                        self.labels.append(label)
        self.classes = sorted(list(set(self.labels)))
        self.class_to_idx = {cls: i for i, cls in enumerate(self.classes)}
        self.transform = transform

    def __len__(self): return len(self.paths)

    def __getitem__(self, idx):
        path = self.paths[idx]
        label = self.class_to_idx[self.labels[idx]]
        try:
            mel_spec = np.load(path)
            mel_spec = np.squeeze(mel_spec)
            mel_norm = (mel_spec - mel_spec.min()) / (mel_spec.max() - mel_spec.min() + 1e-6)
            img = Image.fromarray((mel_norm * 255).astype(np.uint8)).resize((224, 224)).convert("RGB")
            if self.transform: img = self.transform(img)
            return img, label
        except Exception as e:
            print(f"Error loading {path}: {e}")
            return None, None

def custom_collate_fn(batch):
    batch = [b for b in batch if b[0] is not None]
    if len(batch) == 0: return torch.empty(0), torch.empty(0)
    imgs, labels = zip(*batch)
    return torch.stack(imgs), torch.tensor(labels)

train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomApply([transforms.ColorJitter(0.2, 0.2)], p=0.3),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5]*3, std=[0.5]*3)
])
val_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5]*3, std=[0.5]*3)
])

train_data = CryDataset("/content/baby_cry_split/train", transform=train_transform)
val_data = CryDataset("/content/baby_cry_split/val", transform=val_transform)

batch_size = 32
num_workers = 2

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True,
                          collate_fn=custom_collate_fn, num_workers=num_workers, pin_memory=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False,
                        collate_fn=custom_collate_fn, num_workers=num_workers, pin_memory=True)


train_labels = [train_data.class_to_idx[label] for label in train_data.labels]
class_weights = torch.tensor(
    compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels),
    dtype=torch.float
).to(device)


model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
model.fc = nn.Sequential(
    nn.Dropout(0.4),
    nn.Linear(model.fc.in_features, len(train_data.classes))
)
model = model.to(device)


criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=3e-4, weight_decay=1e-4)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=3, factor=0.5)


best_val_loss = float("inf")
no_improve = 0
early_stop_patience = 5
max_epochs = 25

for epoch in range(1, max_epochs + 1):
    model.train()
    train_loss, train_correct, train_total = 0.0, 0, 0
    for X, y in tqdm(train_loader, desc=f"Epoch {epoch:02d} [Train]"):
        if X.size(0) == 0: continue
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        out = model(X)
        loss = criterion(out, y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * X.size(0)
        train_correct += (out.argmax(1) == y).sum().item()
        train_total += X.size(0)
    train_loss /= train_total
    train_acc = train_correct / train_total

    model.eval()
    val_loss, val_correct, val_total = 0.0, 0, 0
    with torch.no_grad():
        for X, y in tqdm(val_loader, desc=f"Epoch {epoch:02d} [Val]"):
            if X.size(0) == 0: continue
            X, y = X.to(device), y.to(device)
            out = model(X)
            loss = criterion(out, y)
            val_loss += loss.item() * X.size(0)
            val_correct += (out.argmax(1) == y).sum().item()
            val_total += X.size(0)
    val_loss /= val_total
    val_acc = val_correct / val_total

    print(f"\n📊 Epoch {epoch:02d} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.3f} | "
          f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.3f}\n")

    scheduler.step(val_loss)
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "best_model.pth")
        no_improve = 0
    else:
        no_improve += 1
        if no_improve >= early_stop_patience:
            print("⏹️ Early stopping triggered.")
            break

model.load_state_dict(torch.load("best_model.pth"))
model.eval()
print("✅ Best model loaded.")


🚀 Using device: cpu


Epoch 01 [Train]: 100%|██████████| 27/27 [03:26<00:00,  7.64s/it]
Epoch 01 [Val]: 100%|██████████| 7/7 [00:17<00:00,  2.46s/it]



📊 Epoch 01 | Train Loss: 1.5597 | Train Acc: 0.352 | Val Loss: 1.2981 | Val Acc: 0.419



Epoch 02 [Train]: 100%|██████████| 27/27 [03:25<00:00,  7.62s/it]
Epoch 02 [Val]: 100%|██████████| 7/7 [00:17<00:00,  2.47s/it]



📊 Epoch 02 | Train Loss: 1.1991 | Train Acc: 0.442 | Val Loss: 1.3581 | Val Acc: 0.405



Epoch 03 [Train]: 100%|██████████| 27/27 [03:30<00:00,  7.81s/it]
Epoch 03 [Val]: 100%|██████████| 7/7 [00:18<00:00,  2.58s/it]



📊 Epoch 03 | Train Loss: 1.1412 | Train Acc: 0.482 | Val Loss: 1.2815 | Val Acc: 0.386



Epoch 04 [Train]: 100%|██████████| 27/27 [03:26<00:00,  7.65s/it]
Epoch 04 [Val]: 100%|██████████| 7/7 [00:18<00:00,  2.63s/it]



📊 Epoch 04 | Train Loss: 1.1367 | Train Acc: 0.471 | Val Loss: 1.2982 | Val Acc: 0.443



Epoch 05 [Train]: 100%|██████████| 27/27 [03:25<00:00,  7.62s/it]
Epoch 05 [Val]: 100%|██████████| 7/7 [00:18<00:00,  2.65s/it]



📊 Epoch 05 | Train Loss: 1.0277 | Train Acc: 0.500 | Val Loss: 1.4859 | Val Acc: 0.352



Epoch 06 [Train]: 100%|██████████| 27/27 [03:24<00:00,  7.58s/it]
Epoch 06 [Val]: 100%|██████████| 7/7 [00:18<00:00,  2.65s/it]



📊 Epoch 06 | Train Loss: 0.9690 | Train Acc: 0.523 | Val Loss: 1.2796 | Val Acc: 0.405



Epoch 07 [Train]: 100%|██████████| 27/27 [03:25<00:00,  7.60s/it]
Epoch 07 [Val]: 100%|██████████| 7/7 [00:18<00:00,  2.60s/it]



📊 Epoch 07 | Train Loss: 0.9116 | Train Acc: 0.536 | Val Loss: 1.3394 | Val Acc: 0.429



Epoch 08 [Train]: 100%|██████████| 27/27 [03:25<00:00,  7.62s/it]
Epoch 08 [Val]: 100%|██████████| 7/7 [00:17<00:00,  2.54s/it]



📊 Epoch 08 | Train Loss: 0.8672 | Train Acc: 0.536 | Val Loss: 1.3724 | Val Acc: 0.400



Epoch 09 [Train]: 100%|██████████| 27/27 [03:25<00:00,  7.62s/it]
Epoch 09 [Val]: 100%|██████████| 7/7 [00:17<00:00,  2.49s/it]



📊 Epoch 09 | Train Loss: 0.7299 | Train Acc: 0.610 | Val Loss: 1.4851 | Val Acc: 0.400



Epoch 10 [Train]: 100%|██████████| 27/27 [03:26<00:00,  7.65s/it]
Epoch 10 [Val]: 100%|██████████| 7/7 [00:17<00:00,  2.48s/it]



📊 Epoch 10 | Train Loss: 0.6986 | Train Acc: 0.625 | Val Loss: 1.7032 | Val Acc: 0.376



Epoch 11 [Train]: 100%|██████████| 27/27 [03:26<00:00,  7.63s/it]
Epoch 11 [Val]: 100%|██████████| 7/7 [00:17<00:00,  2.48s/it]



📊 Epoch 11 | Train Loss: 0.6216 | Train Acc: 0.637 | Val Loss: 1.4286 | Val Acc: 0.438

⏹️ Early stopping triggered.
✅ Best model loaded.


resnet 18

In [19]:
import os
import torch
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from torchvision import models
from PIL import Image
import numpy as np
from sklearn.utils.class_weight import compute_class_weight


class CryDataset(Dataset):
    def __init__(self, folder, transform=None):
        self.paths, self.labels = [], []

        for label in sorted(os.listdir(folder)):
            class_dir = os.path.join(folder, label)
            if os.path.isdir(class_dir):
                for file in os.listdir(class_dir):
                    if file.endswith(".npy"):
                        self.paths.append(os.path.join(class_dir, file))
                        self.labels.append(label)

        self.classes = sorted(set(self.labels))
        self.class_to_idx = {cls: i for i, cls in enumerate(self.classes)}
        self.transform = transform

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        try:
            path = self.paths[idx]
            label = self.class_to_idx[self.labels[idx]]
            mel = np.squeeze(np.load(path))

            if mel.ndim != 2:
                raise ValueError("Invalid mel shape")

            mel_norm = (mel - mel.min()) / (mel.max() - mel.min() + 1e-6)
            img = Image.fromarray((mel_norm * 255).astype(np.uint8)).resize((224, 224)).convert("RGB")
            if self.transform:
                img = self.transform(img)
            return img, label

        except:
            return None, None


def custom_collate_fn(batch):
    batch = [b for b in batch if b[0] is not None]
    if not batch:
        return torch.empty(0), torch.empty(0)
    imgs, labels = zip(*batch)
    return torch.stack(imgs), torch.tensor(labels)


train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(0.2, 0.2, 0.2, 0.1),
    transforms.ToTensor(),
    transforms.Normalize([0.5]*3, [0.5]*3),
    transforms.RandomErasing(p=0.25)
])

val_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.5]*3, [0.5]*3)
])


train_data = CryDataset("/content/baby_cry_split/train", transform=train_transform)
val_data   = CryDataset("/content/baby_cry_split/val", transform=val_transform)

train_loader = DataLoader(train_data, batch_size=16, shuffle=True, collate_fn=custom_collate_fn)
val_loader   = DataLoader(val_data, batch_size=16, shuffle=False, collate_fn=custom_collate_fn)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

base_model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
for param in base_model.parameters():
    param.requires_grad = False

in_features = base_model.fc.in_features
base_model.fc = torch.nn.Sequential(
    torch.nn.Dropout(0.5),
    torch.nn.Linear(in_features, len(train_data.classes))
)
model = base_model.to(device)


y_train_encoded = [train_data.class_to_idx[label] for label in train_data.labels]
class_weights = compute_class_weight('balanced', classes=np.unique(y_train_encoded), y=y_train_encoded)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

criterion = torch.nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.Adam(model.fc.parameters(), lr=3e-5, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, factor=0.5)


best_val_loss = float("inf")
early_stop_patience = 7
no_improve = 0

for epoch in range(1, 31):
    model.train()
    train_loss, correct, total = 0.0, 0, 0

    for X, y in train_loader:
        if X.size(0) == 0: continue
        X, y = X.to(device), y.to(device)

        optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * X.size(0)
        correct += (outputs.argmax(1) == y).sum().item()
        total += X.size(0)

    train_loss /= total
    train_acc = correct / total

    model.eval()
    val_loss, val_correct, val_total = 0.0, 0, 0
    with torch.no_grad():
        for X, y in val_loader:
            if X.size(0) == 0: continue
            X, y = X.to(device), y.to(device)
            outputs = model(X)
            loss = criterion(outputs, y)

            val_loss += loss.item() * X.size(0)
            val_correct += (outputs.argmax(1) == y).sum().item()
            val_total += X.size(0)

    val_loss /= val_total
    val_acc = val_correct / val_total

    print(f"Epoch {epoch:02d} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.3f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.3f}")
    scheduler.step(val_loss)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "best_model.pth")
        no_improve = 0
    else:
        no_improve += 1
        if no_improve >= early_stop_patience:
            print("⏹️ Early stopping triggered.")
            break

model.load_state_dict(torch.load("best_model.pth"))
model.eval()
print("✅ Best model loaded.")


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:01<00:00, 37.9MB/s]


Epoch 01 | Train Loss: 2.5808 | Train Acc: 0.106 | Val Loss: 2.3318 | Val Acc: 0.062
Epoch 02 | Train Loss: 2.4340 | Train Acc: 0.089 | Val Loss: 2.2956 | Val Acc: 0.081
Epoch 03 | Train Loss: 2.4018 | Train Acc: 0.100 | Val Loss: 2.2507 | Val Acc: 0.110
Epoch 04 | Train Loss: 2.3876 | Train Acc: 0.113 | Val Loss: 2.2438 | Val Acc: 0.114
Epoch 05 | Train Loss: 2.3891 | Train Acc: 0.120 | Val Loss: 2.2269 | Val Acc: 0.105
Epoch 06 | Train Loss: 2.3536 | Train Acc: 0.117 | Val Loss: 2.2132 | Val Acc: 0.133
Epoch 07 | Train Loss: 2.3443 | Train Acc: 0.130 | Val Loss: 2.1890 | Val Acc: 0.148
Epoch 08 | Train Loss: 2.2792 | Train Acc: 0.143 | Val Loss: 2.1716 | Val Acc: 0.176
Epoch 09 | Train Loss: 2.2843 | Train Acc: 0.131 | Val Loss: 2.1488 | Val Acc: 0.171
Epoch 10 | Train Loss: 2.2515 | Train Acc: 0.151 | Val Loss: 2.1341 | Val Acc: 0.195
Epoch 11 | Train Loss: 2.2792 | Train Acc: 0.123 | Val Loss: 2.1246 | Val Acc: 0.205
Epoch 12 | Train Loss: 2.2803 | Train Acc: 0.157 | Val Loss: 2.11

# Yamnet

In [None]:
import os
import librosa
import numpy as np
import tensorflow_hub as hub
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import random
import zipfile
from tensorflow.keras import layers, regularizers
from sklearn.metrics import classification_report, accuracy_score
import keras_tuner as kt
from tensorflow.keras import layers, regularizers
from keras_tuner.tuners import BayesianOptimization
!pip install keras_tuner

# Assuming the zip is uploaded to your Drive
zip_path = '/content/drive/MyDrive/Baby_Crying_Sounds.zip'
extract_path = '/content/drive/MyDrive/baby_cry_data'

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# Load YAMNet model
yamnet_model = hub.load('https://tfhub.dev/google/yamnet/1')

# Base path: folders like /content/baby_cry_data/BabyCryingSounds/hungry/*.wav
base_dir = "/content/drive/MyDrive/baby_cry_data/Baby Crying Sounds"

# Output base path to save embeddings
output_base = "/content/drive/MyDrive/baby_cry_embeddings"

# Step 1: Helper to load waveform at 16 kHz
def load_audio_16k(file_path):
    waveform, sr = librosa.load(file_path, sr=16000, mono=True)
    return waveform


def augment_audio(waveform, sr=16000):
    choice = random.choice(['pitch', 'stretch', 'none'])

    if choice == 'pitch':
        n_steps = random.choice([-2, -1, 1, 2])
        return librosa.effects.pitch_shift(waveform, sr=sr, n_steps=n_steps)

    elif choice == 'stretch':
        rate = random.uniform(0.8, 1.2)
        return librosa.effects.time_stretch(waveform, rate=rate)

    else:
        return waveform



def extract_yamnet_embedding(file_path, augment=True):
    waveform = load_audio_16k(file_path)

    if augment:
        waveform = augment_audio(waveform, sr=16000)

    waveform = tf.convert_to_tensor(waveform, dtype=tf.float32)
    scores, embeddings, spectrogram = yamnet_model(waveform)
    return tf.reduce_mean(embeddings, axis=0).numpy()



# Step 3: Loop through folders and collect data
X = []
y = []
file_paths = []

labels = sorted([label for label in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, label))])

for label in labels:
    folder = os.path.join(base_dir, label)
    for fname in tqdm(os.listdir(folder), desc=f"Processing {label}"):
        if not fname.lower().endswith(('.wav', '.mp3', '.ogg')):
            continue
        fpath = os.path.join(folder, fname)
        try:
            # Original sample
            emb = extract_yamnet_embedding(fpath, augment=False)
            X.append(emb)
            y.append(label)
            file_paths.append(fpath)

            # Augmented copies (e.g., 2 extra variants)
            for _ in range(2):
                emb_aug = extract_yamnet_embedding(fpath, augment=True)
                X.append(emb_aug)
                y.append(label)
                file_paths.append(fpath)  # or just fpath again
        except Exception as e:
            print(f"⚠️ Error processing {fpath}: {e}")


# Step 4: Encode labels
X = np.array(X)
y = np.array(y)
le = LabelEncoder()
y_encoded = le.fit_transform(y)
label_names = le.classes_

from collections import Counter

# Count occurrences of each class in the combined dataset
# y_encoded contains the integer encoded labels for all files
class_counts = Counter(y_encoded)

# Map integer indices back to class names
label_counts = {class_names[i]: count for i, count in class_counts.items()}

print("Number of audio files per category:")
for label, count in label_counts.items():
    print(f"{label}: {count}")


# Step 5: Split into train, val, test
X_train_val, X_test, y_train_val, y_test, f_train_val, f_test = train_test_split(
    X, y_encoded, file_paths, test_size=0.1, random_state=42, stratify=y_encoded)

X_train, X_val, y_train, y_val, f_train, f_val = train_test_split(
    X_train_val, y_train_val, f_train_val, test_size=0.1, random_state=42, stratify=y_train_val)


splits = {
    'train': (X_train, y_train),
    'val': (X_val, y_val),
    'test': (X_test, y_test)
}

# Step 6: Save to folders by split/class
for split in ['train', 'val', 'test']:
    for class_name in label_names:
        os.makedirs(os.path.join(output_base, split, class_name), exist_ok=True)

def save_embeddings(X_split, y_split, split_name):
    for i, (emb, label_idx) in enumerate(zip(X_split, y_split)):
        class_name = label_names[label_idx]
        save_path = os.path.join(output_base, split_name, class_name, f"yamnet_{i}.npy")
        np.save(save_path, emb)

# Save all splits
save_embeddings(X_train, y_train, "train")
save_embeddings(X_val, y_val, "val")
save_embeddings(X_test, y_test, "test")

print("✅ Done: Embeddings extracted, split, and saved by class.")

def load_embeddings(data_dir):
    X, y = [], []
    class_names = sorted(os.listdir(data_dir))  # ['belly pain', 'burping', ...]
    label_to_idx = {name: idx for idx, name in enumerate(class_names)}

    for label in class_names:
        label_dir = os.path.join(data_dir, label)
        for fname in os.listdir(label_dir):
            if fname.endswith('.npy'):
                path = os.path.join(label_dir, fname)
                emb = np.load(path)
                X.append(emb)
                y.append(label_to_idx[label])
    return np.array(X), np.array(y), class_names


train_dir = '/content/drive/MyDrive/baby_cry_embeddings/train'
val_dir   = '/content/drive/MyDrive/baby_cry_embeddings/val'
test_dir  = '/content/drive/MyDrive/baby_cry_embeddings/test'

X_train, y_train, class_names = load_embeddings(train_dir)
X_val, y_val, _ = load_embeddings(val_dir)
X_test, y_test, _ = load_embeddings(test_dir)

print(f"✅ Loaded: {X_train.shape[0]} train | {X_val.shape[0]} val | {X_test.shape[0]} test")


#Build the model with best hyperparameters
model = tf.keras.Sequential([
    layers.Input(shape=(1024,)),

    # Layer 1
    layers.Dense(526, use_bias=False,
                 kernel_regularizer=regularizers.l2(0.0001897)),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.Dropout(0.4),

    # Layer 2
    layers.Dense(256, use_bias=False,
                 kernel_regularizer=regularizers.l2(0.0002384)),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.Dropout(0.4),


    layers.Dense(128, use_bias=False,
    kernel_regularizer=regularizers.l2(0.0002384)),
    layers.BatchNormalization(),
    layers.Activation('relu'),

    # Output layer
    layers.Dense(len(np.unique(y_train)), activation="softmax")
])

# Compile
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001342)

model.compile(
    optimizer=optimizer,
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

early_stop = tf.keras.callbacks.EarlyStopping(
    patience=15, restore_best_weights=True
)

reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    factor=0.5, patience=7, verbose=1, min_lr=1e-6
)

checkpoint = tf.keras.callbacks.ModelCheckpoint(
    "best_yamnet_model.h5", save_best_only=True, verbose=1
)

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=32,
    callbacks=[early_stop, reduce_lr, checkpoint],
    # class_weight=class_weight_dict
)

# Get predictions as class indices
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"\n✅ Final Test Accuracy: {test_acc:.2%}")

y_pred = model.predict(X_test).argmax(axis=1)

print(classification_report(y_test, y_pred, target_names=class_names))

def build_model(hp):
    model = tf.keras.Sequential()
    model.add(layers.Input(shape=(1024,)))

    for i in range(hp.Int("num_layers", 1, 3)):
        units = hp.Int(f"units_{i}", min_value=64, max_value=512, step=64)
        l2 = hp.Float(f"l2_{i}", 1e-4, 1e-2, sampling="log")

        model.add(layers.Dense(units, use_bias=False,
                               kernel_regularizer=regularizers.l2(l2)))
        model.add(layers.BatchNormalization())
        model.add(layers.Activation('relu'))

        dropout = hp.Float(f"dropout_{i}", 0.3, 0.6, step=0.1)
        model.add(layers.Dropout(dropout))

    model.add(layers.Dense(len(np.unique(y_train)), activation="softmax"))

    lr = hp.Float("lr", 1e-4, 1e-2, sampling="log")
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

    model.compile(
        optimizer=optimizer,
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"]
    )

    return model

tuner = BayesianOptimization(
    build_model,
    objective="val_accuracy",
    max_trials=20,
    directory="yamnet_tuning",
    project_name="bayesian_final"
)

early_stop = tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
checkpoint = tf.keras.callbacks.ModelCheckpoint("best_model.h5", save_best_only=True)
tuner.search(X_train, y_train, epochs=30, validation_data=(X_val, y_val), callbacks=[early_stop, checkpoint])


best_model = tuner.get_best_models(num_models=1)[0]
best_hp = tuner.get_best_hyperparameters(1)[0]

print("Best hyperparameters:")
print(best_hp.values)

loss, acc = best_model.evaluate(X_test, y_test)
print(f"Test Accuracy: {acc:.2f}")

from IPython.display import Audio

true_labels = [class_names[i] for i in y_test]
pred_labels = [class_names[i] for i in y_pred]

for i in range(10):  # show first 10
    print(f"File: {f_test[i]}")
    print(f"True label: {true_labels[i]}")
    print(f"Predicted:  {pred_labels[i]}")
    print("-" * 40)

i = 212  # index of interest
print(f"True: {true_labels[i]}, Predicted: {pred_labels[i]}")
Audio(filename=f_test[i])  #Use filename for file paths