<a href="https://colab.research.google.com/github/mpaoloo/SpeechTechnologiesHW/blob/main/KeyWordSpottingHomeAssignment_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Key Word Spotting Home Assignment №3


### Основные импорты

In [5]:
import json
import os
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader
from torchaudio.transforms import MelSpectrogram, AmplitudeToDB
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import random
from collections import Counter
from torch.utils.tensorboard import SummaryWriter
import numpy as np

%pip install torchcodec
import torchcodec.decoders

random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

test_tar_path = '/content/drive/MyDrive/SpeechTechnologies/test_data.tar'
train_tar_path = '/content/drive/MyDrive/SpeechTechnologies/train_data.tar'
destination_folder = '/content/drive/MyDrive/SpeechTechnologies/'



In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Разархивируем датасеты и закинем в ту же директорию в google drive

In [None]:
import tarfile
import os

if not os.path.exists(destination_folder):
    os.makedirs(destination_folder)

try:
    with tarfile.open(test_tar_path, "r:*") as tar:
        tar.extractall(path=destination_folder)
except Exception as e:
    print(f"An error occurred: {e}")

  tar.extractall(path=destination_folder)


In [None]:
try:
    with tarfile.open(train_tar_path, "r:*") as tar:
        tar.extractall(path=destination_folder)
except Exception as e:
    print(f"An error occurred: {e}")

  tar.extractall(path=destination_folder)


In [7]:
# Задаем новые пути для файлов
train_audio_path = '/content/drive/MyDrive/SpeechTechnologies/train_opus/audio'
test_audio_path = '/content/drive/MyDrive/SpeechTechnologies/test_opus/audio'
train_bounds_path = '/content/drive/MyDrive/SpeechTechnologies/train_opus/word_bounds.json'

### Подготовка датасета

In [8]:
# Формирование таргета для обучения
train_paths = []
train_labels = []

with open(train_bounds_path, "r") as f:
    train_bounds = json.load(f)

for audio_path in tqdm(os.listdir(train_audio_path)):
    if audio_path.startswith('.'):
        continue
    audio_id = audio_path.split('.')[0]
    label = 1 if audio_id in train_bounds else 0
    train_labels.append(label)
    train_paths.append(os.path.join(train_audio_path, audio_path))

100%|██████████| 150389/150389 [00:00<00:00, 537560.44it/s]


In [9]:
train_paths, val_paths, train_labels, val_labels = train_test_split(
    train_paths,
    train_labels,
    test_size=0.15,
    stratify=train_labels,
    random_state=42
)

In [10]:
class LazyAudioDataset(Dataset):
    def __init__(self, audio_paths, audio_labels, transform=None, sr=16000):
        self.audio_paths = audio_paths
        self.audio_labels = audio_labels
        self.transform = transform
        self.sr = sr


    def __len__(self):
        return len(self.audio_paths)

    def __getitem__(self, idx):
        waveform, sample_rate = torchaudio.load(self.audio_paths[idx])

        if sample_rate != self.sr:
            resampler = torchaudio.transforms.Resample(sample_rate, self.sr)
            waveform = resampler(waveform)

        if self.transform:
            waveform = self.transform(waveform)

        return waveform, self.audio_labels[idx]

### Реализация DS-CNN small в 1D варианте

In [12]:
class SimpleDS_CNN(nn.Module):
    def __init__(self, num_classes=1):
        super().__init__()

        # Depthwise Separable Conv блоки
        self.block1 = nn.Sequential(
            # Depthwise
            nn.Conv1d(64, 64, kernel_size=3, padding=1, groups=64, bias=False),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            # Pointwise
            nn.Conv1d(64, 64, kernel_size=1, bias=False),
            nn.BatchNorm1d(64),
            nn.ReLU(),
        )

        self.block2 = nn.Sequential(
            nn.Conv1d(64, 64, kernel_size=3, padding=1, groups=64, bias=False),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Conv1d(64, 64, kernel_size=1, bias=False),
            nn.BatchNorm1d(64),
            nn.ReLU(),
        )

        self.block3 = nn.Sequential(
            nn.Conv1d(64, 64, kernel_size=3, padding=1, groups=64, bias=False),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Conv1d(64, 64, kernel_size=1, bias=False),
            nn.BatchNorm1d(64),
            nn.ReLU(),
        )
        self.gap = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Linear(64, num_classes)

    def forward(self, x):
        # (batch, 1, 64, time)
        x = x.squeeze(1)

        x = self.block1(x)
        x = self.block2(x)
        x = self.block3(x)

        x = self.gap(x)
        x = x.squeeze(-1)  # (batch, 64)

        x = self.fc(x)
        return x.squeeze(-1)

model = SimpleDS_CNN()
print('Количество парамметров в DS_CNN:', sum(p.numel() for p in model.parameters()))

Количество парамметров в DS_CNN: 13697


In [40]:
@torch.no_grad()
def get_metrics(preds, targets, threshold=0.5):
    preds = preds.squeeze()
    targets = targets.squeeze().float()

    preds_proba = torch.sigmoid(preds)
    preds_binary = (preds_proba > threshold).float()

    tp = ((preds_binary == 1) & (targets == 1)).sum().item()
    tn = ((preds_binary == 0) & (targets == 0)).sum().item()
    fp = ((preds_binary == 1) & (targets == 0)).sum().item()
    fn = ((preds_binary == 0) & (targets == 1)).sum().item()

    total = tp + tn + fp + fn
    positive_total = tp + fn
    negative_total = fp + tn

    # для бинарной классификации
    accuracy = (tp + tn) / total if total > 0 else 0.0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

    # KWS метрики
    far = fp / negative_total if negative_total > 0 else 0.0
    frr = fn / positive_total if positive_total > 0 else 0.0
    detection_score_harmonic = 2 * ((1 - frr) * (1 - far)) / ((1 - frr) + (1 - far)) if ((1 - frr) + (1 - far)) > 0 else 0.0
    detection_score_avg = 1 - (far + frr) / 2

    return {
        'threshold': threshold,
        # Основные метрики
        'Accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        # KWS специфичные метрики
        'far': far,
        'frr': frr,
        'score_harmonic': detection_score_harmonic,
    }

def eval_step(model, test_dataloader, criterion, device='cuda', max_steps=None):
    model.eval()
    valid_loss = 0.0
    steps = 0
    all_preds, all_targets = [], []
    with torch.no_grad():
        for i, batch in enumerate(tqdm(test_dataloader,
                                       desc="Validation",
                                       leave=False)):

            if max_steps is not None and i >= max_steps:
                break
            mels, labels = batch
            mels = mels.to(device)

            if isinstance(labels, (int, float)):
                labels = torch.tensor([labels], dtype=torch.float32)
            else:
                labels = labels.float()

            labels = labels.to(device)

            outputs = model(mels)

            loss = criterion(outputs.squeeze(-1), labels.squeeze(-1))

            all_preds.append(outputs)
            all_targets.append(labels)

            valid_loss += loss.item()
            steps += 1

    if steps > 0:
        valid_loss /= steps

    if all_preds:
        all_preds_tensor = torch.cat(all_preds, dim=0)
        all_targets_tensor = torch.cat(all_targets, dim=0)
        metrics = get_metrics(all_preds_tensor, all_targets_tensor)
    else:
        metrics = get_metrics(torch.tensor([]), torch.tensor([]))

    metrics['loss'] = valid_loss
    return metrics

def train_loop(model, train_dataloader, val_dataloader, epochs,
               optimizer, criterion, val_every=500, device='cuda',
               best_path=None, val_steps=None):

    model.to(device)
    best_val_score = 0.0

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        batch_count = 0

        train_metrics = {'Accuracy': 0.0, 'f1': 0.0, 'score_harmonic': 0.0}

        pbar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs}")
        for i, batch in enumerate(pbar):
            mels, labels = batch

            if labels.dim() > 1:
                labels = labels.squeeze(-1)

            mels = mels.to(device, non_blocking=True)
            labels = labels.float().to(device, non_blocking=True)

            optimizer.zero_grad(set_to_none=True)
            outputs = model(mels)
            loss = criterion(outputs, labels)

            outputs = model(mels)
            loss = criterion(outputs.squeeze(-1), labels.squeeze(-1))

            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            batch_count += 1

            if i % 50 == 0:
                with torch.no_grad():
                    metrics = get_metrics(outputs, labels)
                    train_metrics = metrics

            pbar.set_postfix({
                'loss': f"{loss.item():.4f}",
                'acc': f"{train_metrics.get('Accuracy', 0.0):.3f}",
                'score': f"{train_metrics.get('score_harmonic', 0.0):.3f}"
            })

            if val_every and i % val_every == 0 and i > 0:
                val_metrics = eval_step(model, val_dataloader, criterion,
                                        device=device, max_steps=val_steps)

                # Сохраняем лучшую модель
                if best_path and val_metrics['score_harmonic'] > best_val_score:
                    best_val_score = val_metrics['score_harmonic']
                    torch.save(model.state_dict(), best_path)
                    print(f"\n New best Score: {best_val_score:.4f})")

        train_loss /= len(train_dataloader)
        val_metrics = eval_step(model, val_dataloader, criterion, device=device)

        print(f"Epoch {epoch+1}/{epochs}")
        print(f"Train Loss: {train_loss:.4f}")
        print(f"Val Loss:   {val_metrics['loss']:.4f}")
        print(f"Val Acc:    {val_metrics['Accuracy']:.4f}")
        print(f"Val F1:     {val_metrics['f1']:.4f}")
        print(f"Val Score:  {val_metrics['score_harmonic']:.4f}")
        print(f"Val FAR:    {val_metrics['far']:.4f}")
        print(f"Val FRR:    {val_metrics['frr']:.4f}")

        # Сохраняем лучшую модель после эпохи
        if best_path and val_metrics['score_harmonic'] > best_val_score:
            best_val_score = val_metrics['score_harmonic']
            torch.save(model.state_dict(), best_path)
            print(f"Score: {best_val_score:.4f}")

    print(f"\n Best score: {best_val_score:.4f}")
    return model

In [31]:
mel_spectrogram_transform = nn.Sequential(
    MelSpectrogram(sample_rate=16000, n_mels=64),
    AmplitudeToDB()
)

In [18]:
# Загружаем датасет
train_data = LazyAudioDataset(train_paths,
                             train_labels,
                             transform=mel_spectrogram_transform)

val_data = LazyAudioDataset(val_paths,
                           val_labels,
                           transform=mel_spectrogram_transform)


# train_dataset = LazyAudioDataset(
#     train_paths[:5000],
#     train_labels[:5000],
#     transform=mel_spectrogram_transform
# )

# val_dataset = LazyAudioDataset(
#     val_paths[:1000],
#     val_labels[:1000],
#     transform=mel_spectrogram_transform
# )

### Обучение модели

In [26]:
# Задаем параметры

epochs = 2
lr = 1e-3

device = 'cuda'
batch_size = 64

In [28]:

train_loader = DataLoader(
    train_data,
    batch_size=batch_size,
    shuffle=True,
    num_workers=6,
    pin_memory=False,
    prefetch_factor=None,
    persistent_workers=False
)

val_loader = DataLoader(
    val_data,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2,
    pin_memory=False,
    prefetch_factor=None,
    persistent_workers=False
)

criterion = nn.BCEWithLogitsLoss()

model = SimpleDS_CNN().to(device)
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=lr,
    weight_decay=0.0001
)

train_loop(
    model=model,
    train_dataloader=train_loader,
    val_dataloader=val_loader,
    epochs=epochs,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
    best_path="modelKWS.pth",
    val_every=None,
    val_steps=None
)

Epoch 1/2: 100%|██████████| 999/999 [1:37:55<00:00,  5.88s/it, loss=0.4541, acc=0.781, score=0.784]


ValueError: Target size (torch.Size([64, 1])) must be the same as input size (torch.Size([64]))

In [47]:
train_data = LazyAudioDataset(train_paths,
                             train_labels,
                             transform=mel_spectrogram_transform)


val_dataset = LazyAudioDataset(
    val_paths[:1000],
    val_labels[:1000],
    transform=mel_spectrogram_transform
)

train_loader = DataLoader(
    train_data,
    batch_size=32,
    shuffle=True,
    num_workers=4,
)


train_loop(
    model=model,
    train_dataloader=train_loader,
    val_dataloader=val_dataset,
    epochs=1,
    optimizer=optimizer,
    criterion=criterion,
    device=device,
    best_path="model.pth",
    val_every=300,
)

Epoch 1/1:  15%|█▌        | 300/1998 [02:49<15:24,  1.84it/s, loss=0.4497, acc=0.875, score=0.874]
Validation:   0%|          | 0/1000 [00:00<?, ?it/s][A
Validation:   0%|          | 2/1000 [00:00<01:27, 11.45it/s][A
Validation:   0%|          | 4/1000 [00:00<01:26, 11.49it/s][A
Validation:   1%|          | 6/1000 [00:00<01:20, 12.35it/s][A
Validation:   1%|          | 8/1000 [00:00<01:19, 12.47it/s][A
Validation:   1%|          | 10/1000 [00:00<01:18, 12.63it/s][A
Validation:   1%|          | 12/1000 [00:01<01:23, 11.86it/s][A
Validation:   1%|▏         | 14/1000 [00:01<01:25, 11.59it/s][A
Validation:   2%|▏         | 16/1000 [00:01<01:20, 12.26it/s][A
Validation:   2%|▏         | 18/1000 [00:01<01:28, 11.08it/s][A
Validation:   2%|▏         | 20/1000 [00:01<01:31, 10.69it/s][A
Validation:   2%|▏         | 22/1000 [00:01<01:27, 11.12it/s][A
Validation:   2%|▏         | 24/1000 [00:02<01:21, 11.96it/s][A
Validation:   3%|▎         | 26/1000 [00:02<01:17, 12.63it/s][A
Vali


 New best Score: 0.7480)


Epoch 1/1:  30%|██▉       | 598/1998 [05:53<13:29,  1.73it/s, loss=0.5154, acc=0.812, score=0.812]
Validation:   0%|          | 0/1000 [00:00<?, ?it/s][A
Validation:   0%|          | 2/1000 [00:00<01:23, 11.91it/s][A
Validation:   0%|          | 4/1000 [00:00<01:34, 10.53it/s][A
Validation:   1%|          | 6/1000 [00:00<01:35, 10.45it/s][A
Validation:   1%|          | 8/1000 [00:00<01:32, 10.77it/s][A
Validation:   1%|          | 10/1000 [00:00<01:26, 11.43it/s][A
Validation:   1%|          | 12/1000 [00:01<01:25, 11.57it/s][A
Validation:   1%|▏         | 14/1000 [00:01<01:30, 10.89it/s][A
Validation:   2%|▏         | 16/1000 [00:01<01:30, 10.91it/s][A
Validation:   2%|▏         | 18/1000 [00:01<01:38,  9.95it/s][A
Validation:   2%|▏         | 20/1000 [00:01<01:46,  9.17it/s][A
Validation:   2%|▏         | 21/1000 [00:02<01:51,  8.79it/s][A
Validation:   2%|▏         | 22/1000 [00:02<02:05,  7.82it/s][A
Validation:   2%|▏         | 23/1000 [00:02<02:11,  7.41it/s][A
Vali


 New best Score: 0.7488)


Epoch 1/1: 100%|██████████| 1998/1998 [20:38<00:00,  1.61it/s, loss=0.5266, acc=0.688, score=0.682]
                                                               

Epoch 1/1
Train Loss: 0.5344
Val Loss:   0.5328
Val Acc:    0.7340
Val F1:     0.7560
Val Score:  0.7242
Val FAR:    0.3469
Val FRR:    0.1874

 Best score: 0.7488




SimpleDS_CNN(
  (block1): Sequential(
    (0): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,), groups=64, bias=False)
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Conv1d(64, 64, kernel_size=(1,), stride=(1,), bias=False)
    (4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
  )
  (block2): Sequential(
    (0): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,), groups=64, bias=False)
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Conv1d(64, 64, kernel_size=(1,), stride=(1,), bias=False)
    (4): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
  )
  (block3): Sequential(
    (0): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,), groups=64, bias=False)
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running

### Инференс и сохранение результатов

In [None]:

import torch
import pandas as pd
import os
from tqdm import tqdm

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model_path =
model.load_state_dict(torch.load(model_path, map_location=device))

test_path = "/content/drive/MyDrive/SpeechTechnologies/test_opus/audio"
test_paths = []
test_ids = []

sorted_files = sorted(os.listdir(test_audio_path))

for audio_path in tqdm(sorted_files):
    if audio_path.startswith('.'):
        continue
    audio_id = audio_path.split('.')[0]
    test_ids.append(audio_id)
    test_paths.append(os.path.join(test_path, audio_path))

test_labels = [0] * len(test_paths)

test_dataset = LazyAudioDataset(test_paths,
                                test_labels,
                                transform=mel_spectrogram_transform)
test_loader = DataLoader(test_dataset, batch_size=batch_size,
                         shuffle=False, num_workers=2)

model.eval()
predictions = []

with torch.no_grad():
    for batch in tqdm(test_loader):
        mels, _ = batch
        mels = mels.to(device)
        outputs = model(mels)
        preds = (torch.sigmoid(outputs) > 0.5).int().cpu().numpy().flatten()
        predictions.extend(preds)

df = pd.DataFrame({
    'id': test_audio_ids,
    'label': predictions
})

df.to_csv('submission.csv', index=False)