In [None]:
%%capture
!pip install torchaudio --quiet
!pip install soundfile --quiet
!pip install pydub --quiet
!apt-get install -y ffmpeg > /dev/null 2>&1

In [None]:
import json
import os
import subprocess
import tempfile
from pathlib import Path

import numpy as np
import scipy.signal
import soundfile
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [None]:
# Audio / feature configuration.
SAMPLE_RATE = 16000  # passed to ffmpeg as the output sample rate (`-ar`) in load_opus_from_bytes
WINDOW_SIZE = 20  # NOTE(review): not referenced in the visible cells; meaning/unit unknown — confirm
HOP_SIZE = 1  # NOTE(review): not referenced in the visible cells; meaning/unit unknown — confirm
N_MELS = 80  # passed to KWSModel and used as the third dimension of the smoke-test input
EPS = 1e-9  # NOTE(review): not referenced in the visible cells; presumably a numerical-stability epsilon — confirm

# Training hyper-parameters (presumably batch size, number of epochs and learning rate).
BATCH_SIZE = 32
EPOCHS = 5
LR = 3e-4

In [None]:
def load_opus_from_bytes(audio_bytes):
    """Decode in-memory encoded audio into a mono waveform using ffmpeg.

    The bytes are written to a temporary ``.opus`` file, converted by the
    ``ffmpeg`` executable into a single-channel WAV at ``SAMPLE_RATE`` and
    loaded with ``torchaudio.load``. Both intermediate files live in a private
    temporary directory that is removed on success and on failure alike.

    Args:
        audio_bytes: Bytes-like object holding the encoded audio.

    Returns:
        The ``(waveform, sample_rate)`` pair returned by ``torchaudio.load``.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    # A dedicated directory gives fixed, collision-free file names and a single
    # cleanup point, instead of deriving the wav path by string replacement and
    # duplicating os.remove() calls across the success and error paths.
    with tempfile.TemporaryDirectory() as tmp_dir:
        opus_path = os.path.join(tmp_dir, 'input.opus')
        wav_path = os.path.join(tmp_dir, 'output.wav')

        with open(opus_path, 'wb') as opus_file:
            opus_file.write(audio_bytes)

        cmd = [
            'ffmpeg', '-y', '-i', opus_path,
            '-ar', str(SAMPLE_RATE),  # resample to the project-wide rate
            '-ac', '1',               # downmix to mono
            '-f', 'wav',
            wav_path
        ]
        # check=True turns a failed conversion into CalledProcessError;
        # capture_output keeps ffmpeg's console chatter out of the notebook.
        subprocess.run(cmd, capture_output=True, check=True)

        # The samples are loaded before the directory is deleted on exit.
        wav, sr = torchaudio.load(wav_path)
        return wav, sr

### Создаём датасет

In [None]:
class KWSDataset(Dataset):
    """Dataset of ``*.opus`` clips served as dB-scaled mel spectrograms with a target.

    Args:
        data_path: Directory scanned (non-recursively) for ``*.opus`` files.
        labels_json: Optional path to a JSON file. A clip whose file stem is a
            member of the loaded JSON object is treated as a positive example.
        is_training: When True, targets are 1 (stem found in the labels) or 0;
            when False, every target is -1.
        sr: Target sample rate; audio read at a different rate is resampled to it.
    """

    def __init__(self, data_path, labels_json=None, is_training=True, sr=16000):
        self.data_path = Path(data_path)
        self.sr = sr
        self.is_training = is_training

        # n_fft=400 / hop_length=160 samples, i.e. 25 ms / 10 ms at the default 16 kHz.
        self.spectrogram_converter = torchaudio.transforms.MelSpectrogram(
            sample_rate=sr,
            n_fft=400,
            hop_length=160,
            n_mels=80
        )
        # Built once here rather than on every compute_spectrogram() call.
        self.db_converter = torchaudio.transforms.AmplitudeToDB()

        # Sorted so that an index maps to the same file on every run and filesystem
        # (glob order is otherwise unspecified).
        self.audio_files = sorted(self.data_path.glob('*.opus'))

        if labels_json:
            with open(labels_json, 'r') as file:
                self.label_dict = json.load(file)
        else:
            self.label_dict = {}

    def __len__(self):
        """Return the number of ``*.opus`` files found in ``data_path``."""
        return len(self.audio_files)

    def compute_spectrogram(self, audio_signal):
        """Convert a 1-D waveform into a dB-scaled mel spectrogram.

        Args:
            audio_signal: 1-D array of samples.

        Returns:
            Float tensor with the two trailing axes of the mel spectrogram
            swapped, i.e. ``(frames, n_mels)`` for a 1-D input.
        """
        # A leading axis of size 1 is added for the transform and removed again
        # below, so no averaging over that axis is ever required.
        waveform = torch.tensor(audio_signal[None, :], dtype=torch.float32)
        spec = self.db_converter(self.spectrogram_converter(waveform))
        return spec.squeeze(0).transpose(0, 1)

    def _load_audio(self, audio_path):
        """Read one file as a mono 1-D array sampled at ``self.sr``."""
        audio_signal, original_sr = soundfile.read(str(audio_path))

        # Multi-channel reads are 2-D with channels on axis 1; average them to mono.
        if audio_signal.ndim > 1:
            audio_signal = audio_signal.mean(axis=1)

        if original_sr != self.sr:
            ratio = self.sr / original_sr
            target_len = int(len(audio_signal) * ratio)
            audio_signal = scipy.signal.resample(audio_signal, target_len)

        return audio_signal

    def __getitem__(self, index):
        """Return ``(spectrogram, target)`` for the clip at ``index``.

        ``target`` is a scalar ``torch.long`` tensor: 1 / 0 in training mode
        depending on whether the file stem is in the labels, -1 otherwise.
        """
        audio_path = self.audio_files[index]
        audio_id = audio_path.stem

        audio_signal = self._load_audio(audio_path)
        # The original code called a non-existent `_compute_spectrogram` here,
        # which raised AttributeError on the first item access.
        spec = self.compute_spectrogram(audio_signal)

        if not self.is_training:
            target = -1
        elif audio_id in self.label_dict:
            target = 1
        else:
            target = 0

        return spec, torch.tensor(target, dtype=torch.long)

### Создаём модель

In [None]:
class ResidualBlock(nn.Module):
    """Residual unit: three 3x3 conv + batch-norm layers added to a shortcut branch.

    Args:
        in_channels: Number of channels of the input feature map.
        out_channels: Number of channels produced by every convolution.
        stride: Stride of the first convolution and of the projection shortcut.
    """

    def __init__(self, in_channels, out_channels, stride=(1, 1)):
        super().__init__()

        # Main branch; only the first convolution may change channels/resolution.
        main_layers = [
            nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
        ]
        self.conv = nn.Sequential(*main_layers)

        self.relu = nn.ReLU(inplace=True)

        # Shortcut branch: a 1x1 projection when the main branch changes the
        # shape, otherwise an empty Sequential (which returns its input as is).
        needs_projection = stride != (1, 1) or in_channels != out_channels
        if needs_projection:
            self.skip = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride),
                nn.BatchNorm2d(out_channels),
            )
        else:
            self.skip = nn.Sequential()

    def forward(self, x):
        """Return ``relu(conv(x) + skip(x))``."""
        shortcut = self.skip(x)
        return self.relu(self.conv(x) + shortcut)

In [None]:
class KWSModel(nn.Module):
    """Convolutional network that maps a single-channel 2-D input to one logit.

    Layout: a 3x3 conv stem (1 -> 32 channels), three stride-2 residual blocks
    (32 -> 64 -> 128 -> 256 channels), global average pooling and a two-layer
    classifier head with dropout.

    Args:
        n_mels: Accepted for the caller's convenience; the constructor does not
            read it (global pooling removes any dependence on the input size).
    """

    def __init__(self, n_mels=80):
        super().__init__()

        stem_layers = [
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
        ]
        self.stem = nn.Sequential(*stem_layers)

        # Each stage doubles the channel count and uses stride 2 on both spatial axes.
        self.res1 = ResidualBlock(32, 64, stride=(2, 2))
        self.res2 = ResidualBlock(64, 128, stride=(2, 2))
        self.res3 = ResidualBlock(128, 256, stride=(2, 2))

        # Global pooling followed by the classification head.
        head_layers = [
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
            nn.Dropout(0.3),
            nn.Linear(256, 64),
            nn.ReLU(inplace=True),
            nn.Dropout(0.2),
            nn.Linear(64, 1),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Run the network and drop the trailing size-1 axis of the head output."""
        features = x
        for stage in (self.stem, self.res1, self.res2, self.res3):
            features = stage(features)
        logits = self.classifier(features)
        return logits.squeeze(-1)

# Instantiate the network defined above; the smoke-test cell below calls this `model`.
model = KWSModel(n_mels=N_MELS)

In [None]:
# Smoke test: a random batch of 4 single-channel inputs of size (N_MELS, 200)
# must produce exactly one value per example.
test_input = torch.randn(4, 1, N_MELS, 200)
with torch.no_grad():  # a shape check needs no autograd graph
    test_output = model(test_input)
assert test_output.shape == (4,), f"unexpected output shape: {tuple(test_output.shape)}"

In [None]:
def inference(net, test_data):
    """Predict a binary label for every example yielded by ``test_data``.

    Args:
        net: ``nn.Module`` returning logits for a batch of inputs.
        test_data: Iterable of batches. A batch is either the input tensor
            itself or a tuple/list whose first element is the input tensor
            (e.g. ``(specs, targets)`` pairs); any further elements are ignored.

    Returns:
        List with one entry per example: the thresholded prediction
        (``sigmoid(logit) > 0.5`` cast to int) as a NumPy value.
    """
    dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    net.to(dev)
    net.eval()

    results = []

    with torch.no_grad():
        for batch in test_data:
            # Loaders built on (spec, target) datasets yield pairs; calling
            # `.to()` on such a pair would fail, so keep only the inputs.
            specs = batch[0] if isinstance(batch, (tuple, list)) else batch
            specs = specs.to(dev)

            out = net(specs)
            preds = torch.sigmoid(out)
            preds = (preds > 0.5).int()

            results.extend(preds.cpu().numpy())

    return results

In [None]:
# Count positive (1) and negative (0) targets over all training batches and
# derive the negative/positive ratio as a one-element tensor on DEVICE.
# NOTE(review): `train_loader` and `DEVICE` are not defined in the visible
# cells — confirm they exist before this cell runs.
total_pos = total_neg = 0

for specs, targets in tqdm(train_loader, desc="Counting"):
    total_pos += int(targets.eq(1).sum())
    total_neg += int(targets.eq(0).sum())

# Fall back to a neutral weight of 1.0 when there are no positives at all.
pos_weight = torch.tensor(
    [total_neg / total_pos if total_pos > 0 else 1.0]
).to(DEVICE)

In [None]:
def save_results(files_list, preds, out_file='submission.csv'):
    """Write predictions to a two-column ``id,label`` CSV file.

    Args:
        files_list: Iterable of path objects; each row's id is the path's ``stem``.
        preds: Iterable of predictions, one per entry of ``files_list``, in the
            same order. Each value is written using its ``str()`` form.
        out_file: Destination path of the CSV file.

    Raises:
        ValueError: If ``files_list`` and ``preds`` differ in length.
    """
    files_list = list(files_list)
    preds = list(preds)
    # zip() would silently drop the surplus entries and yield an incomplete
    # submission, so fail loudly before anything is written.
    if len(files_list) != len(preds):
        raise ValueError(
            f'Got {len(files_list)} files but {len(preds)} predictions'
        )

    with open(out_file, 'w') as f:
        f.write('id,label\n')
        f.writelines(f'{fpath.stem},{p}\n' for fpath, p in zip(files_list, preds))
    print(f'Results saved to {out_file}')

### Обучение модели

In [None]:
# NOTE(review): SoundDataset, batch_collate, SimpleCNN and run_training are not
# defined in the visible cells (which define KWSDataset / KWSModel) — confirm
# they exist in the kernel before running this cell.
train_ds = SoundDataset('train_opus/audio', 'train_opus/word_bounds.json', is_training=True)
test_ds = SoundDataset('test_opus/audio', is_training=False)

# 80/20 train/validation split; the generator is seeded so the same indices
# are drawn on every run instead of a different partition each time.
tr_size = int(0.8 * len(train_ds))
v_size = len(train_ds) - tr_size
split_generator = torch.Generator().manual_seed(42)
tr_subset, v_subset = torch.utils.data.random_split(
    train_ds, [tr_size, v_size], generator=split_generator
)

# Hyper-parameters come from the config cell rather than repeated literals.
tr_loader = DataLoader(tr_subset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=batch_collate)
v_loader = DataLoader(v_subset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=batch_collate)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=batch_collate)

net = SimpleCNN(n_classes=2)

print('Training network...')
trained_net = run_training(net, tr_loader, v_loader, num_epochs=EPOCHS, learning_rate=LR)

print('Running inference...')
preds = inference(trained_net, test_loader)

# Rows are written in the order of test_ds.audio_files, which must match the
# (unshuffled) order in which test_loader produced the predictions.
save_results(test_ds.audio_files, preds)

Training network...


Epoch 1/5 [Train]: 100%|██████████| 2250/2250 [31:26<00:00,  1.19it/s, Loss=0.6911, Acc=58.21%]
Epoch 1/5 [Val]: 100%|██████████| 563/563 [06:01<00:00,  1.56it/s, Loss=0.6621, Acc=61.28%]


Epoch 1/5:
Train Loss: 0.6735, Train Acc: 58.21%
Val Loss: 0.6523, Val Acc: 61.28%
Metric: 0.5870, FR: 0.2592, FA: 0.5139
--------------------------------------------------


Epoch 2/5 [Train]: 100%|██████████| 2250/2250 [32:03<00:00,  1.17it/s, Loss=0.6252, Acc=61.32%]
Epoch 2/5 [Val]: 100%|██████████| 563/563 [06:01<00:00,  1.56it/s, Loss=0.6596, Acc=63.56%]


Epoch 2/5:
Train Loss: 0.6551, Train Acc: 61.32%
Val Loss: 0.6372, Val Acc: 63.56%
Metric: 0.6267, FR: 0.2872, FA: 0.4408
--------------------------------------------------


Epoch 3/5 [Train]: 100%|██████████| 2250/2250 [32:06<00:00,  1.17it/s, Loss=0.6615, Acc=63.45%]
Epoch 3/5 [Val]: 100%|██████████| 563/563 [05:59<00:00,  1.57it/s, Loss=0.5762, Acc=61.26%]


Epoch 3/5:
Train Loss: 0.6401, Train Acc: 63.45%
Val Loss: 0.6576, Val Acc: 61.26%
Metric: 0.5273, FR: 0.1561, FA: 0.6166
--------------------------------------------------


Epoch 4/5 [Train]: 100%|██████████| 2250/2250 [31:17<00:00,  1.20it/s, Loss=0.6898, Acc=65.23%]
Epoch 4/5 [Val]: 100%|██████████| 563/563 [21:12<00:00,  2.26s/it, Loss=0.5179, Acc=60.58%]   


Epoch 4/5:
Train Loss: 0.6243, Train Acc: 65.23%
Val Loss: 0.6639, Val Acc: 60.58%
Metric: 0.4497, FR: 0.0835, FA: 0.7021
--------------------------------------------------


Epoch 5/5 [Train]: 100%|██████████| 2250/2250 [51:42<00:00,  1.38s/it, Loss=0.6074, Acc=66.79%]    
Epoch 5/5 [Val]: 100%|██████████| 563/563 [07:05<00:00,  1.32it/s, Loss=0.6180, Acc=68.20%]

Epoch 5/5:
Train Loss: 0.6108, Train Acc: 66.79%
Val Loss: 0.5975, Val Acc: 68.20%
Metric: 0.6789, FR: 0.3626, FA: 0.2738
--------------------------------------------------
Running inference...
Results saved to submission.csv





In [None]:
# Re-run inference with the network returned by run_training. The previous
# version passed `model`, the KWSModel instance that no visible cell trains,
# which would overwrite submission.csv with predictions of an untrained network.
print('Running inference...')
preds = inference(trained_net, test_loader)
save_results(test_ds.audio_files, preds)
