In [5]:
conda install -c conda-forge sox

Channels:
 - conda-forge
 - defaults
 - anaconda
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /anaconda/envs/jupyter_env

  added / updated specs:
    - sox


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2025.1.31          |     pyhd8ed1ab_0         159 KB  conda-forge
    gettext-0.23.1             |       h5888daf_0         473 KB  conda-forge
    gettext-tools-0.23.1       |       h5888daf_0         2.8 MB  conda-forge
    lame-3.100                 |    h166bdaf_1003         496 KB  conda-forge
    libasprintf-0.23.1         |       h8e693c7_0          42 KB  conda-forge
    libasprintf-devel-0.23.1   |       h8e693c7_0          33 KB  conda-forge
    libflac-1.4.3              |       h59595ed_0         385 KB  conda-forge
    libgettextpo-0.23.1        |       h5888daf_0       

In [10]:
pip install torch torchaudio pandas numpy sklearn librosa efficientnet-pytorch

Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  Preparing metadata (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[15 lines of output][0m
  [31m   [0m The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
  [31m   [0m rather than 'sklearn' for pip commands.
  [31m   [0m 
  [31m   [0m Here is how to fix this error in the main use cases:
  [31m   [0m - use 'pip install scikit-learn' rather than 'pip install sklearn'
  [31m   [0m - replace 'sklearn' by 'scikit-learn' in your pip requirements files
  [31m   [0m   (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
  [31m   [0m - if the 'sklearn' package is used by one of your dependencies,
  [31m   [0m   it would be great if you take some time to track which package uses
  [31m   [0m   'sklearn' instead of 'scikit-lea

In [13]:
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
from torch.utils.data import Dataset, DataLoader
import glob
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score
import torchaudio.transforms as T
from torchvision.models import resnet18, resnet34, efficientnet_b0
import torch.nn.functional as F

# Alternative implementation for time stretching and pitch shifting
def time_stretch(waveform, sample_rate, rate):
    """Time stretch without changing pitch using torchaudio's speed effect"""
    effect = [["speed", str(rate)]]
    return torchaudio.sox_effects.apply_effects_tensor(waveform.unsqueeze(0), sample_rate, effect)[0]

def pitch_shift(waveform, sample_rate, n_steps):
    """Pitch shift using torchaudio's pitch effect"""
    effect = [["pitch", str(n_steps * 100)]]
    return torchaudio.sox_effects.apply_effects_tensor(waveform.unsqueeze(0), sample_rate, effect)[0]

# Enhanced Audio Dataset with Advanced Augmentations
class AudioDataset(Dataset):
    def __init__(self, annotations_file, audio_dir, target_length=16000, augmentations=False, use_spectrogram=False):
        self.annos = pd.read_csv(annotations_file)
        self.audio_dir = audio_dir
        self.target_length = target_length
        self.augmentations = augmentations
        self.use_spectrogram = use_spectrogram
        self.spec_aug = nn.Sequential(
            T.TimeMasking(time_mask_param=40),
            T.FrequencyMasking(freq_mask_param=27)
        ) if augmentations else None

        self.valid_data = []
        for idx in range(len(self.annos)):
            clip_name = self.annos.iloc[idx, 0]
            audio_path = os.path.join(self.audio_dir, f"{clip_name}.wav")
            if os.path.exists(audio_path):
                label = self._process_label(self.annos.iloc[idx, 1])
                self.valid_data.append((audio_path, label))

        print(f"Found {len(self.valid_data)}/{len(self.annos)} valid files.")

    def __len__(self):
        return len(self.valid_data)

    def __getitem__(self, idx):
        audio_path, label = self.valid_data[idx]
        try:
            waveform, sample_rate = torchaudio.load(audio_path)
            waveform = waveform.mean(dim=0)  # Convert to mono
            waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)

            if waveform.shape[0] > self.target_length:
                waveform = waveform[:self.target_length]
            else:
                padding = self.target_length - waveform.shape[0]
                waveform = torch.nn.functional.pad(waveform, (0, padding))

            if self.use_spectrogram:
                spectrogram = T.MelSpectrogram(
                    sample_rate=16000,
                    n_mels=256,
                    n_fft=2048,
                    hop_length=512
                )(waveform)
                spectrogram = T.AmplitudeToDB()(spectrogram)

                if self.augmentations:
                    spectrogram = self.spec_aug(spectrogram)
                    if torch.rand(1) < 0.3:
                        waveform = time_stretch(waveform, 16000, 1.2)
                    if torch.rand(1) < 0.3:
                        waveform = pitch_shift(waveform, 16000, 4)

                spectrogram = spectrogram.unsqueeze(0)
                return spectrogram.float(), torch.tensor(label)
            else:
                if self.augmentations:
                    waveform = self._enhanced_waveform_augmentations(waveform)
                return waveform.float(), torch.tensor(label)
        except Exception as e:
            print(f"Error loading {audio_path}: {e}")
            return None, None

    def _enhanced_waveform_augmentations(self, waveform):
        if torch.rand(1) < 0.5:
            time_mask = T.TimeMasking(time_mask_param=100)
            waveform = time_mask(waveform.unsqueeze(0)).squeeze(0)
        if torch.rand(1) < 0.3:
            noise = torch.randn_like(waveform) * 0.01
            waveform += noise
        return waveform

    def _process_label(self, label_str):
        str_label = str(label_str).strip().lower()
        return 0 if any(k in str_label for k in ['truth', '0']) else 1

# Collate Function with Mixup Support
def audio_collate_fn(batch):
    batch = [item for item in batch if item[0] is not None]
    if not batch:
        return torch.zeros(1, 16000), torch.tensor([0])
    inputs, labels = zip(*batch)
    if isinstance(inputs[0], torch.Tensor):
        inputs = torch.stack(inputs)
    labels = torch.stack(labels)
    return inputs, labels

def mixup_data(x, y, alpha=0.4):
    if alpha > 0:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1

    batch_size = x.size()[0]
    index = torch.randperm(batch_size)

    mixed_x = lam * x + (1 - lam) * x[index, :]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam

# Model Architectures
class ASTModel(nn.Module):
    def __init__(self, patch_size=16, num_layers=6, num_heads=8, embed_dim=768):
        super().__init__()
        self.patch_size = patch_size
        self.patch_embed = nn.Conv2d(1, embed_dim, kernel_size=patch_size, stride=patch_size)
        
        # Calculate dynamic dimensions based on input
        self.dynamic_pos_embed = True
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads), 
            num_layers=num_layers
        )
        self.classifier = nn.Linear(embed_dim, 2)

    def forward(self, x):
        # x shape: [batch, 1, n_mels, time_steps]
        x = self.patch_embed(x)  # [batch, embed_dim, n_patches_mel, n_patches_time]
        
        # Calculate the number of patches
        b, c, h, w = x.shape
        x = x.flatten(2).transpose(1, 2)  # [batch, n_patches, embed_dim]
        
        # Dynamic position embedding
        if self.dynamic_pos_embed:
            pos_embed = nn.Parameter(torch.zeros(1, x.shape[1] + 1, c)).to(x.device)
            nn.init.trunc_normal_(pos_embed, std=0.02)
        else:
            pos_embed = self.pos_embed
            
        cls_tokens = self.cls_token.expand(b, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        x = x + pos_embed
        
        x = self.transformer(x)
        pooled = x[:, 0]  # CLS token
        return self.classifier(pooled)

class EfficientNetModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.effnet = efficientnet_b0(pretrained=True)
        self.effnet.features[0][0] = nn.Conv2d(1, 32, kernel_size=3, stride=2, padding=1, bias=False)
        self.effnet.classifier[1] = nn.Linear(self.effnet.classifier[1].in_features, 2)

    def forward(self, x):
        return self.effnet(x)

class CRNNModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 64, 3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, 3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
        )
        self.gru = nn.GRU(128 * 64, 256, bidirectional=True, dropout=0.3)
        self.fc = nn.Linear(512, 2)

    def forward(self, x):
        x = self.cnn(x)
        b, c, h, w = x.size()
        x = x.view(b, c * h, w).permute(0, 2, 1)
        x, _ = self.gru(x)
        x = x.mean(dim=1)
        return self.fc(x)

class PANNModel(nn.Module):
    def __init__(self, num_classes=2):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 64, 3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, 3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(128, 256, 3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(256, 512, 3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
        )
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512, num_classes)

    def forward(self, x):
        x = self.features(x)
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)

# Advanced Loss Functions
class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        loss = self.alpha * (1 - pt) ** self.gamma * ce_loss
        return loss.mean()

# Training Function with Advanced Features
def train_model(config, fold):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using {device} for fold {fold}")
    
    # Get fold-specific file paths
    train_file = os.path.join(config['protocols_dir'], f"train_fold{fold}.csv")
    test_file = os.path.join(config['protocols_dir'], f"test_fold{fold}.csv")
    
    # Model Initialization
    if config['model_type'] == 'ast':
        model = ASTModel(
            patch_size=config.get('patch_size', 16),
            num_layers=config.get('num_layers', 6),
            num_heads=config.get('num_heads', 8),
            embed_dim=config.get('embed_dim', 768)
        ).to(device)
    elif config['model_type'] == 'effnet':
        model = EfficientNetModel().to(device)
    elif config['model_type'] == 'crnn':
        model = CRNNModel().to(device)
    elif config['model_type'] == 'pann':
        model = PANNModel().to(device)
    else:
        raise ValueError("Invalid model type")

    # Optimizer and Loss
    optimizer = optim.AdamW(
        model.parameters(),
        lr=config['learning_rate'],
        weight_decay=config.get('weight_decay', 0.01)
    )

    if config.get('loss_type') == 'focal':
        criterion = FocalLoss().to(device)
    else:
        class_weights = compute_class_weights(train_file, config['audio_dir']).to(device)
        criterion = nn.CrossEntropyLoss(weight=class_weights if config['class_weighting'] else None)

    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='min',
        factor=0.5,
        patience=2,
        verbose=True
    )

    # Fold-specific checkpoint directory
    checkpoint_dir = os.path.join(config['checkpoint_dir'], f'fold_{fold}')
    os.makedirs(checkpoint_dir, exist_ok=True)
    metrics_file = os.path.join(checkpoint_dir, 'metrics.csv')
    
    if not os.path.exists(metrics_file):
        pd.DataFrame(columns=['epoch', 'train_loss', 'train_acc', 'train_f1', 'train_recall', 'train_auc',
                              'val_loss', 'val_acc', 'val_f1', 'val_recall', 'val_auc']).to_csv(metrics_file, index=False)

    # Initialize datasets
    train_dataset = AudioDataset(
        train_file,
        config['audio_dir'],
        target_length=config['target_length'],
        augmentations=config['augmentations'],
        use_spectrogram=config['use_spectrogram']
    )
    val_dataset = AudioDataset(
        test_file,
        config['audio_dir'],
        target_length=config['target_length'],
        augmentations=False,
        use_spectrogram=config['use_spectrogram']
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size=config['batch_size'],
        collate_fn=audio_collate_fn,
        shuffle=True,
        num_workers=4
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=config['batch_size'],
        collate_fn=audio_collate_fn,
        num_workers=4
    )

    best_val_loss = float('inf')
    patience_counter = 0

    for epoch in range(config['num_epochs']):
        model.train()
        epoch_loss = 0
        train_preds, train_labels = [], []

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            if config.get('use_mixup', False):
                inputs, targets_a, targets_b, lam = mixup_data(inputs, labels)
                outputs = model(inputs)
                loss = lam * criterion(outputs, targets_a) + (1 - lam) * criterion(outputs, targets_b)
            else:
                outputs = model(inputs)
                loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            train_preds.append(outputs.detach())
            train_labels.append(labels)

        # Training metrics
        train_preds = torch.cat(train_preds)
        train_labels = torch.cat(train_labels)
        train_acc, train_f1, train_recall, train_auc = compute_metrics(train_preds, train_labels)
        train_loss = epoch_loss / len(train_loader)

        # Validation
        model.eval()
        val_loss = 0
        val_preds, val_labels = [], []
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                val_loss += criterion(outputs, labels).item()
                val_preds.append(outputs)
                val_labels.append(labels)

        val_preds = torch.cat(val_preds)
        val_labels = torch.cat(val_labels)
        val_acc, val_f1, val_recall, val_auc = compute_metrics(val_preds, val_labels)
        val_loss = val_loss / len(val_loader)

        # Update scheduler
        scheduler.step(val_loss)

        # Save metrics
        metrics = {
            'epoch': epoch + 1,
            'train_loss': train_loss,
            'train_acc': train_acc,
            'train_f1': train_f1,
            'train_recall': train_recall,
            'train_auc': train_auc,
            'val_loss': val_loss,
            'val_acc': val_acc,
            'val_f1': val_f1,
            'val_recall': val_recall,
            'val_auc': val_auc
        }
        pd.DataFrame([metrics]).to_csv(metrics_file, mode='a', header=False, index=False)

        # Save best model
        if val_loss < best_val_loss - config['min_delta']:
            best_val_loss = val_loss
            patience_counter = 0
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_loss': val_loss,
            }, os.path.join(checkpoint_dir, 'best_model.pth'))
        else:
            patience_counter += 1
            if patience_counter >= config['patience']:
                print(f"Early stopping at epoch {epoch+1}")
                break

        print(f"Fold {fold} | Epoch {epoch+1}: "
              f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val AUC: {val_auc:.4f}")

def compute_class_weights(train_file, audio_dir):
    annos = pd.read_csv(train_file)
    labels = []
    for idx in range(len(annos)):
        clip_name = annos.iloc[idx, 0]
        audio_path = os.path.join(audio_dir, f"{clip_name}.wav")
        if os.path.exists(audio_path):
            label = 0 if any(k in str(annos.iloc[idx, 1]).lower() for k in ['truth', '0']) else 1
            labels.append(label)
    labels = np.array(labels)
    class_counts = np.bincount(labels)
    total = len(labels)
    weights = total / (2.0 * class_counts)
    return torch.tensor(weights, dtype=torch.float)

def compute_metrics(preds, labels):
    preds = preds.argmax(dim=1).cpu().numpy()
    labels = labels.cpu().numpy()
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds)
    recall = recall_score(labels, preds)
    auc = roc_auc_score(labels, preds)
    return acc, f1, recall, auc

# Main Execution
if __name__ == "__main__":
    base_config = {
        'batch_size': 16,
        'num_epochs': 50,
        'learning_rate': 1e-3,
        'weight_decay': 0.01,
        'audio_dir': '/home/azureuser/cloudfiles/code/Users/yashika22csu235/research/audio_files',
        'protocols_dir': '/home/azureuser/cloudfiles/code/Users/yashika22csu235/research/train_protocol',
        'base_checkpoint_dir': '/home/azureuser/cloudfiles/code/Users/yashika22csu235/research/train_new/experiment1',
        'target_length': 44100,
        'augmentations': True,
        'class_weighting': True,
        'use_spectrogram': True,
        'use_mixup': True,
        'patience': 5,
        'min_delta': 0.001
    }

    experiments = [
        {
            'name': 'effnet_b0_specaug',
            'model_type': 'effnet',
            'learning_rate': 2e-4,
            'batch_size': 32
        },
        {
            'name': 'crnn_advanced',
            'model_type': 'crnn',
            'loss_type': 'focal',
            'target_length': 88200
        },
        {
        'name': 'ast_dynamic',
        'model_type': 'ast',
        'patch_size': 16,  # Can try 8 or 32
        'num_layers': 6,
        'num_heads': 8,
        'embed_dim': 768,
        'target_length': 44100  # Should match your audio length
        },
        {
            'name': 'pann_advanced',
            'model_type': 'pann',
            'batch_size': 32
        }
    ]

    for exp in experiments:
        config = base_config.copy()
        config.update(exp)
        
        # Create parent experiment directory
        exp_checkpoint_dir = os.path.join(base_config['base_checkpoint_dir'], exp['name'])
        os.makedirs(exp_checkpoint_dir, exist_ok=True)
        
        # Run for all 3 folds
        for fold in [1, 2, 3]:
            print(f"\nStarting experiment: {exp['name']} - Fold {fold}")
            config['checkpoint_dir'] = exp_checkpoint_dir
            train_model(config, fold)
            print(f"Completed fold {fold} for experiment: {exp['name']}\n")


Starting experiment: effnet_b0_specaug - Fold 1
Using cpu for fold 1


Found 826/1098 valid files.
Found 435/548 valid files.
Error loading /home/azureuser/cloudfiles/code/Users/yashika22csu235/research/audio_files/YW_WILTY_EP54_truth4.wav: Input tensor has to be 2D.
Error loading /home/azureuser/cloudfiles/code/Users/yashika22csu235/research/audio_files/BRI_WILTY_EP64_lie_13.wav: Input tensor has to be 2D.
Error loading /home/azureuser/cloudfiles/code/Users/yashika22csu235/research/audio_files/SB_WILTY_EP42_lie5.wav: Input tensor has to be 2D.
Error loading /home/azureuser/cloudfiles/code/Users/yashika22csu235/research/audio_files/BRI_WILTY_EP57_truth_24.wav: Input tensor has to be 2D.
Error loading /home/azureuser/cloudfiles/code/Users/yashika22csu235/research/audio_files/YW_WILTY_EP50_truth7.wav: Input tensor has to be 2D.
Error loading /home/azureuser/cloudfiles/code/Users/yashika22csu235/research/audio_files/YW_WILTY_EP49_truth8.wav: Input tensor has to be 2D.
Error loading /home/azureuser/cloudfiles/code/Users/yashika22csu235/research/audio_files/SB