In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import soundfile as sf
import librosa
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

In [2]:
# !pip install torchmetrics

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!unzip '/content/drive/MyDrive/clips.zip'

In [2]:
f = open('data.json')
data = json.load(f)

In [3]:
SAMPLE_RATE = 16000

In [93]:
class KeywordSpottingDataset(Dataset):
    def __init__(self, file_paths, labels):
        self.file_paths = file_paths
        self.labels = labels

    def __len__(self):
        return len(self.file_paths) - 1

    def emphasis(self, audio, pre_emphasis = 0.97):
        audio = np.append(audio[0], audio[1:] - pre_emphasis * audio[:-1])
        return audio

    def crop_audio(self, audio, sr=16000):
        # Get length of audio in samples
        audio_len = audio.shape[0]

        silence_length = np.random.uniform(0.3, 0.5)
        # print("silence_length", silence_length)
        # Calculate max number of samples to replace with silence
        silence_len = int(silence_length * sr) # 10% of audio length

        in_start = np.random.choice([True, False])
        # print('instart', in_start)
        if in_start:
            start_idx = 0
            end_idx = start_idx + silence_len
        else:
            start_idx = audio_len - silence_len - 1
            end_idx = audio_len - 1

        # Replace audio segment with silence
        augmented_audio = audio.copy()
        augmented_audio[start_idx:end_idx] = 0.0

        return augmented_audio

    def add_noise(self, audio, noise_level=0.01):
        noise = np.random.normal(scale=noise_level, size=len(audio))
        return audio + noise

    def pitch_shift(self, audio, sr=16000):
        steps = [-3, -2, -1, 1, 2, 3]
        choice = np.random.choice(steps, 1)
        return librosa.effects.pitch_shift(audio, sr=sr, n_steps=choice)

    def augment_audio(self, audio):
        # Randomly apply one or more augmentation methods
        methods = []
        if np.random.random() < 0.5:
            methods.append(self.crop_audio)
        if np.random.random() < 0.5:
            methods.append(self.add_noise)
        if np.random.random() < 0.5:
            methods.append(self.pitch_shift)

        # Apply selected augmentation methods to audio
        for method in methods:
            audio = method(audio)

        return audio

    def get_log_mel_spectrogram(self, audio):
        mel_spec = librosa.feature.melspectrogram(y=audio, sr=SAMPLE_RATE, n_fft=4096, hop_length=512, n_mels=128)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
        mel_spec_db = mel_spec_db.astype(np.float32)
        return mel_spec_db


    def normalize_spectrogram(self, mel_spec_db):
        mel_spec_db = librosa.util.normalize(mel_spec_db)
        return mel_spec_db

    def get_mfcc(self, audio):
        mfcc = librosa.feature.mfcc(y=audio, sr=SAMPLE_RATE, n_mfcc=40, n_fft=4096, hop_length=512)
        mfcc = mfcc.astype(np.float32)
        return mfcc

    def pad_mfccs(self, mfccs):
        # mfccs shape: (40, 63)
        pad_width = ((0, 128 - mfccs.shape[0]), (0, 0))
        padded_mfccs = np.pad(mfccs, pad_width, mode='constant')
        return padded_mfccs

    def merge_spec_and_mfcc(self, mel_spec_db, mfccs):
        spectrograms = torch.stack([torch.from_numpy(mel_spec_db), torch.from_numpy(mfccs)])
        return spectrograms

    def padding(self, batch, seq_len):
        if len(batch[0][0]) < seq_len:
            m = torch.nn.ConstantPad1d((0, seq_len - len(batch[0][0])), 0)
            batch = m(batch)
        return batch

    def __getitem__(self, idx):
        if model_name == 'bcresnet':
            audio_path1 = self.file_paths[idx]
            audio_path2 = self.file_paths[idx + 1]
            label = 1 if self.labels[idx] == 1 or self.labels[idx + 1]  == 1 else 0
            # Load audio file and extract mel spectrogram
            audio1, _ = librosa.load(audio_path1, sr=SAMPLE_RATE)
            audio2, _ = librosa.load(audio_path2, sr=SAMPLE_RATE)
            audio = np.concatenate([audio1, audio2])
            # augment audio
            audio = self.emphasis(audio)
            audio = self.augment_audio(audio)
            # built spectrograms
            mel_spec_db = self.get_log_mel_spectrogram(audio)
            mfccs = self.get_mfcc(audio)
            mfccs = self.pad_mfccs(mfccs)
            tensor = self.merge_spec_and_mfcc(mel_spec_db, mfccs)
        elif model_name == 'matrixnet':
            audio_path = self.file_paths[idx]
            label = self.labels[idx]
            audio, _ = librosa.load(audio_path, sr=SAMPLE_RATE)
            # augment audio
            audio = self.emphasis(audio)
            audio = self.augment_audio(audio)
            mfcc = librosa.feature.mfcc(y=audio, sr=SAMPLE_RATE, n_mfcc=64)
            inputs = self.padding(torch.from_numpy(mfcc.reshape(1, 64, -1)), 128)
            tensor = inputs.reshape(64, -1)
        label = torch.tensor(label, dtype=torch.long)
        label.numpy()
        return tensor, label

In [88]:
# file_paths = [path for path in data.keys()]
# labels = [label for label in data.values()]
# X_train_paths, X_val_paths, y_train, y_val = train_test_split(file_paths, labels, test_size=0.2, random_state=42, shuffle=True, stratify=labels)
#
# # create train and validation datasets
# train_dataset = KeywordSpottingDataset(X_train_paths, y_train)
# val_dataset = KeywordSpottingDataset(X_val_paths, y_val)
#
# # create train and validation dataloaders
# batch_size = 16
# train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [87]:
# import IPython
#
# dataset_audio = None
# for x, y in train_dataloader:
#     print(x.shape)
#     plt.figure(figsize=(10, 4))
#     dataloader_audio = x[0].numpy()
#     librosa.display.specshow(dataloader_audio, x_axis='time', y_axis='mel', sr=SAMPLE_RATE)
#     plt.colorbar(format='%+2.0f dB')
#     plt.title('Mel spectrogram')
#     plt.show()
#     IPython.display.display(IPython.display.Audio(dataset_audio, rate=SAMPLE_RATE))
#     break

In [37]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class BCResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super(BCResidualBlock, self).__init__()

        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)

        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels)
            )

    def forward(self, x):
        out = self.bn1(self.conv1(x))
        out = F.relu(out)
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out


class BCResNet(nn.Module):
    def __init__(self, num_classes=2):
        super(BCResNet, self).__init__()

        self.in_channels = 64

        self.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.layer1 = self._make_layer(64, 2, stride=1)
        self.layer2 = self._make_layer(128, 2, stride=2)
        self.layer3 = self._make_layer(256, 2, stride=2)
        self.layer4 = self._make_layer(512, 2, stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512, num_classes)

    def _make_layer(self, out_channels, num_blocks, stride):
        layers = []
        layers.append(BCResidualBlock(self.in_channels, out_channels, stride=stride))
        self.in_channels = out_channels
        for i in range(1, num_blocks):
            layers.append(BCResidualBlock(out_channels, out_channels))

        return nn.Sequential(*layers)

    def forward(self, x):
        out = self.bn1(self.conv1(x))
        out = F.relu(out)
        out = self.maxpool(out)

        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)

        out = self.avgpool(out)
        out = out.view(out.size(0), -1)
        out = self.fc(out)

        return out

    def emphasis(self, audio, pre_emphasis = 0.97):
        audio = np.append(audio[0], audio[1:] - pre_emphasis * audio[:-1])
        return audio

    def get_log_mel_spectrogram(self, audio):
        mel_spec = librosa.feature.melspectrogram(y=audio, sr=SAMPLE_RATE, n_fft=4096, hop_length=512, n_mels=128)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
        mel_spec_db = mel_spec_db.astype(np.float32)
        return mel_spec_db



In [94]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [95]:
file_paths = [path for path in data.keys()]
labels = [label for label in data.values()]
X_train_paths, X_val_paths, y_train, y_val = train_test_split(file_paths, labels, test_size=0.2, random_state=42, shuffle=True, stratify=labels)

In [96]:
# create train and validation datasets
train_dataset = KeywordSpottingDataset(X_train_paths, y_train)
val_dataset = KeywordSpottingDataset(X_val_paths, y_val)

In [97]:
# create train and validation datasets
train_dataset = KeywordSpottingDataset(X_train_paths, y_train)
val_dataset = KeywordSpottingDataset(X_val_paths, y_val)

# create train and validation dataloaders
batch_size = 16
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [98]:
model_name = "matrixnet"

if model_name == "matrixnet":
    model = MatchboxNet(B=3, R=2, C=64, bins=64, NUM_CLASSES=2)
    model.load_state_dict(torch.load('./model_9_60_1.pth'))
    model.float().to("cpu")
elif model_name == "bcresnet":
    model = BCResNet(num_classes=2)
    model.load_state_dict(torch.load('./model6.pth'))
    model.to("cpu")

In [100]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision.transforms import Compose
from torchmetrics import Accuracy

# Define the transform to apply to the input data
transform = Compose([])

# Initialize the datasets and dataloaders

# Initialize the Conformer model and modify the output layer to output 2 classes

model.to(device)


# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Define the training loop
def train(model, loader, criterion, optimizer):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    acc = 0
    count = 0
    for i, (inputs, targets) in enumerate(loader):
        inputs = inputs.to(device)
        targets = targets.to(device)
        # print("targets", targets)
        optimizer.zero_grad()
        #outputs = model(inputs.unsqueeze(1))
        #print("targets", targets)
        if model_name == "bcresnet":
            outputs = model(inputs.unsqueeze(1))
        elif model_name == "matrixnet":
            outputs = model(inputs.unsqueeze(1))
        #print("outputs", nn.functional.softmax(outputs, dim=0))
        # print("outputs", outputs)
        # probs = nn.functional.softmax(outputs, dim=1)
        # print("probs", probs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        running_acc = Accuracy(num_classes=2, compute_on_step=False, dist_sync_on_step=False, task='binary')(predicted.cpu(), targets.cpu())
        print("training: batch accuracy", running_acc)
        acc += running_acc
        count = i
    accuracy = acc / (count + 1)
    return running_loss / len(loader), accuracy

# Define the validation loop
def validate(model, loader, criterion):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    acc = 0
    count = 0
    with torch.no_grad():
        for i, (inputs, targets) in enumerate(loader):
            inputs = inputs.to(device)
            targets = targets.to(device)
            #print("targets", targets)
            if model_name == "bcresnet":
                outputs = model(inputs.unsqueeze(1))
            elif model_name == "matrixnet":
                outputs = model(inputs.unsqueeze(1))
            #print("outputs", nn.functional.softmax(outputs, dim=1))
            loss = criterion(outputs, targets)
            running_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            # print("validate predicted", predicted)
            # print("validate targets", targets)
            running_acc = Accuracy(num_classes=2, compute_on_step=False, dist_sync_on_step=False, task='binary')(predicted.cpu(), targets.cpu())
            print("validation: batch accuracy", running_acc)
            acc += running_acc
            count = i
        accuracy = acc / (count + 1)
    return running_loss / len(loader), accuracy

# Train the model
for epoch in range(10):
    train_loss, train_acc = train(model, train_dataloader, criterion, optimizer)
    print(f"epoch-{epoch}: trained")
    val_loss, val_acc = validate(model, val_dataloader, criterion)
    print(f"Epoch {epoch+1}: Train Loss={train_loss:.4f}, Train Acc={train_acc:.2f}%, Val Loss={val_loss:.4f}, Val Acc={val_acc:.2f}%")
    # torch.save(model.state_dict(), './model.pth')
    print("model saved")

RuntimeError: Expected 2D (unbatched) or 3D (batched) input to conv1d, but got input of size: [16, 1, 64, 128]

In [104]:
def predict(model, device, audio):
    print(model_name)
    # Set model to evaluate mode
    model.eval()
    model.to(device)
    # Disable gradient calculation to speed up inference
    with torch.no_grad():
        audio = model.emphasis(audio)
        # Move inputs to device

        if model_name=="matrixnet":
            mfcc = librosa.feature.mfcc(y=audio, sr=SAMPLE_RATE, n_mfcc=64)
            inputs = model.padding(torch.from_numpy(mfcc.reshape(1, 64, -1)), 128)
            tensor = inputs.reshape(1, 64, -1)
        else:
            mel_spec = librosa.feature.melspectrogram(y=audio, sr=SAMPLE_RATE, n_fft=4096, hop_length=512, n_mels=128)
            mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
            mel_spec_db = mel_spec_db.astype(np.float32)
            tensor = torch.from_numpy(mel_spec_db).unsqueeze(0).unsqueeze(1)

        # Forward pass through the model
        outputs = model(tensor)

        # Convert logits to probabilities
        probs = F.softmax(outputs, dim=1)

        # Get predicted class indices
        preds = torch.argmax(probs, dim=1)

    return preds

In [105]:
# hop
# 2006 ++
# 3412 ++
# 6849 ++
# 7223 ++
# 7744 -+

# тест
audio_path1 = './clips/7744.wav'
audio_path2 = './clips/7745.wav'

# Load audio file and extract mel spectrogram
audio1, _ = librosa.load(audio_path1, sr=SAMPLE_RATE)
audio2, _ = librosa.load(audio_path2, sr=SAMPLE_RATE)

audio = np.concatenate([audio1, audio2])

In [106]:
predict(model, 'cpu', audio).item()

matrixnet


1

In [39]:
# import os
# import librosa
# import torch
# import numpy as np
#
# def detect_keywords(model, audio_file_path):
#     # Load audio file
#     y, sr = librosa.load(audio_file_path, sr=16000)
#
#     # Set parameters
#     window_size = 2 * sr  # 2 seconds
#     hop_size = sr // 2  # 50% overlap
#     min_time_between_keywords = 2 * sr  # 2 seconds
#     detection_threshold = 0.5  # adjust as needed
#
#     # Iterate over audio file in overlapping windows
#     keywords_detected = []
#     i = 0
#     while i < len(y) - window_size:
#         window = y[i:i + window_size]
#
#         # Compute mel spectrogram
#         mel_spectrogram = librosa.feature.melspectrogram(window, sr=sr, n_mels=128, hop_length=hop_size, n_fft=2048)
#         log_mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)
#
#         # Normalize spectrogram
#         mean = log_mel_spectrogram.mean()
#         std = log_mel_spectrogram.std()
#         normalized_spectrogram = (log_mel_spectrogram - mean) / std
#
#         # Convert to PyTorch tensor
#         tensor = torch.from_numpy(normalized_spectrogram).unsqueeze(0).unsqueeze(1).float()
#
#         # Make prediction
#         with torch.no_grad():
#             output = model(tensor)
#             prediction = torch.sigmoid(output).item()
#
#         # Check if keyword was detected
#         if prediction > detection_threshold:
#             detection_time = i / sr
#             if not keywords_detected or detection_time - keywords_detected[-1] >= min_time_between_keywords:
#                 start_time = max(0, i - sr) / sr
#                 end_time = min(len(y), i + 4 * sr) / sr
#                 cut_name = f"{detection_time:.2f}.wav"
#                 cut_path = os.path.join(os.path.dirname(audio_file_path), cut_name)
#                 librosa.output.write_wav(cut_path, y[int(start_time * sr):int(end_time * sr)], sr=sr)
#                 keywords_detected.append(detection_time)
#
#         # Move to next window
#         i += hop_size
#
#     return keywords_detected

In [11]:
import os
import librosa
import torch
import numpy as np

def detect_wake_words(model, audio_file_path, model_name):
    res = []
    # Set up the parameters for the audio file and the sliding window
    sr = SAMPLE_RATE
    window_size = 1 # 1 second
    overlap_length = 0.1 # overlap in seconds
    hop_length = int(overlap_length * sr)  # 0.5 second

    if not os.path.exists("./outputs"):
        os.mkdir("./outputs")

    # Set up the output file
    output_file = f'./outputs/{os.path.basename(audio_file_path).replace(".wav", ".txt")}'
    with open(output_file, 'w') as f:
        pass  # clear the file

    # Load the input .wav file
    y, sr = librosa.load(audio_file_path, sr=sr)

    # Pad the input signal to ensure that we get predictions for the entire file
    n_pad = window_size - (len(y) % window_size)
    y_padded = np.concatenate((y, np.zeros(n_pad)))

    sec = overlap_length

    frames = int(y.shape[0] / hop_length)
    # Loop through the sliding windows of the input signal

    model.eval()
    with torch.no_grad():
        for i in range(frames):

        # for i in range(0, len(y_padded) - window_size + 1, hop_length):

            # Get the current window
            window = y_padded[int(sec * SAMPLE_RATE): int(sec * SAMPLE_RATE + (SAMPLE_RATE * window_size)) ]

            window = model.emphasis(window)

            if model_name == "matrixnet":
                mfcc = librosa.feature.mfcc(y=window, sr=SAMPLE_RATE, n_mfcc=64)
                inputs = model.padding(torch.from_numpy(mfcc.reshape(1, 64, -1)), 128)
                tensor = inputs.reshape(1, 64, -1).float()
            elif model_name == "bcresnet":
                mel_spec_db = model.get_log_mel_spectrogram(window)
                # Convert the mel spectrogram to a PyTorch tensor
                tensor = torch.from_numpy(mel_spec_db).unsqueeze(0).unsqueeze(1).float()
            else:
                raise NotImplementedError("no model with such name")

            # Make a prediction with the PyTorch model
            prediction = model(tensor)

            probs = F.softmax(prediction, dim=1)

            # Get predicted class indices
            preds = torch.argmax(probs, dim=1).item()

            # Get predicted class indices
            print(f"i: {i}; sec {sec}:", preds)

            # Check if the prediction is above a certain threshold
            threshold = 0.5
            if preds > threshold:
                res.append(sec)
                # If the prediction is above the threshold, log the timing
                time_start = i/sr - 1  # start 1 sec earlier
                time_end = time_start + 4  # end 4 sec after the detection
                with open(output_file, 'a') as f:
                    f.write(f'{os.path.basename(audio_file_path)}:{sec+1:.0f}\n')

                # Cut the .wav file with that detection and save it
                cut_name = f'./outputs/{os.path.basename(audio_file_path).replace(".wav", f"_{sec+1:.0f}.wav")}'

                # такую запись при радио
                sf.write(cut_name, y_padded[int(sec * SAMPLE_RATE) - SAMPLE_RATE: int(sec * SAMPLE_RATE + (SAMPLE_RATE * 3)) ], samplerate=SAMPLE_RATE)

                # sf.write(cut_name, y_padded[int(sec * SAMPLE_RATE) + (1 * SAMPLE_RATE): int(sec * SAMPLE_RATE + (SAMPLE_RATE * 2.5)) ], samplerate=SAMPLE_RATE)
            sec += overlap_length
    print(f'Keywords detected on {", ".join([str(x) for x in res])} seconds')
    return res

In [44]:
%%time
# in cli use 0.5 sec overlap
res = detect_wake_words(model, './data/thanos_message.wav', model_name=model_name)

i: 0; sec 0.1: 0
i: 1; sec 0.2: 0
i: 2; sec 0.30000000000000004: 0
i: 3; sec 0.4: 0
i: 4; sec 0.5: 0
i: 5; sec 0.6: 0
i: 6; sec 0.7: 0
i: 7; sec 0.7999999999999999: 0
i: 8; sec 0.8999999999999999: 0
i: 9; sec 0.9999999999999999: 0
i: 10; sec 1.0999999999999999: 0
i: 11; sec 1.2: 0
i: 12; sec 1.3: 0
i: 13; sec 1.4000000000000001: 0
i: 14; sec 1.5000000000000002: 0
i: 15; sec 1.6000000000000003: 0
i: 16; sec 1.7000000000000004: 0
i: 17; sec 1.8000000000000005: 0
i: 18; sec 1.9000000000000006: 0
i: 19; sec 2.0000000000000004: 0
i: 20; sec 2.1000000000000005: 0
i: 21; sec 2.2000000000000006: 0
i: 22; sec 2.3000000000000007: 0
i: 23; sec 2.400000000000001: 0
i: 24; sec 2.500000000000001: 0
i: 25; sec 2.600000000000001: 0
i: 26; sec 2.700000000000001: 0
i: 27; sec 2.800000000000001: 0
i: 28; sec 2.9000000000000012: 0
i: 29; sec 3.0000000000000013: 0
i: 30; sec 3.1000000000000014: 0
i: 31; sec 3.2000000000000015: 0
i: 32; sec 3.3000000000000016: 0
i: 33; sec 3.4000000000000017: 0
i: 34; sec 3

KeyboardInterrupt: 

In [17]:
%%time
# in cli use 0.5 sec overlap
res = detect_wake_words(model, './data/radio.wav', model_name=model_name)

i: 0; sec 0.1: 0
i: 1; sec 0.2: 0
i: 2; sec 0.30000000000000004: 0
i: 3; sec 0.4: 0
i: 4; sec 0.5: 0
i: 5; sec 0.6: 0
i: 6; sec 0.7: 0
i: 7; sec 0.7999999999999999: 0
i: 8; sec 0.8999999999999999: 0
i: 9; sec 0.9999999999999999: 0
i: 10; sec 1.0999999999999999: 0
i: 11; sec 1.2: 0
i: 12; sec 1.3: 0
i: 13; sec 1.4000000000000001: 0
i: 14; sec 1.5000000000000002: 0
i: 15; sec 1.6000000000000003: 0
i: 16; sec 1.7000000000000004: 0
i: 17; sec 1.8000000000000005: 0
i: 18; sec 1.9000000000000006: 0
i: 19; sec 2.0000000000000004: 0
i: 20; sec 2.1000000000000005: 0
i: 21; sec 2.2000000000000006: 0
i: 22; sec 2.3000000000000007: 0
i: 23; sec 2.400000000000001: 0
i: 24; sec 2.500000000000001: 0
i: 25; sec 2.600000000000001: 0
i: 26; sec 2.700000000000001: 0
i: 27; sec 2.800000000000001: 0
i: 28; sec 2.9000000000000012: 0
i: 29; sec 3.0000000000000013: 0
i: 30; sec 3.1000000000000014: 0
i: 31; sec 3.2000000000000015: 0
i: 32; sec 3.3000000000000016: 0
i: 33; sec 3.4000000000000017: 0
i: 34; sec 3

KeyboardInterrupt: 

In [108]:
# model_6_val_2sec + 0.5 - all right
# model_70_1 + 0.5 - all right
# model6/9 - 40 errors

res

[6205.600000005044, 6205.700000005045, 6205.800000005045, 6205.9000000050455]

In [467]:
y, sr = librosa.load('./data/thanos_message.wav', sr=SAMPLE_RATE)

In [41]:
model2 = BCResNet()
#model2.load_state_dict(torch.load('./model6.pth'))

In [None]:
model2 = BCResNet()
model2.load_state_dict(torch.load('./model_4.pth'))

In [494]:
%%time

res1 = detect_wake_words(model2, './data/thanos_message.wav')

Keywords detected on 35.599999999999966, 369.99999999998755, 1422.2000000001863, 1455.2000000001938, 1492.2000000002022, 1805.6000000002734, 1895.2000000002938, 1997.600000000317, 2004.8000000003187, 2005.800000000319, 2006.000000000319, 2032.600000000325, 2065.6000000003123, 2083.400000000296, 2084.400000000295, 2085.0000000002947, 2149.2000000002363, 2274.6000000001222, 2283.2000000001144, 2445.9999999999663, 2446.9999999999654, 2494.3999999999223, 2513.1999999999052, 2739.1999999996997, 2943.199999999514, 2944.1999999995132, 3277.1999999992104, 3410.599999999089, 3411.599999999088, 3450.9999999990523, 3451.9999999990514, 3515.5999999989936, 3516.5999999989926, 3518.599999998991, 3652.7999999988688, 4016.7999999985377, 4017.799999998537, 4363.199999998223, 4413.199999998177, 4777.599999997846, 5016.799999997628, 5024.399999997621, 5323.999999997349, 5324.999999997348, 5335.599999997338, 5445.599999997238, 5519.599999997171, 5520.59999999717, 5709.399999996998, 5710.399999996997, 6023

In [495]:
len(res1)

81

In [501]:
model2 = BCResNet()
model2.load_state_dict(torch.load('./model_9.pth'))

<All keys matched successfully>

In [502]:
%%time

res1 = detect_wake_words(model2, './data/thanos_message.wav')

Keywords detected on 35.599999999999966, 369.99999999998755, 1422.2000000001863, 1455.2000000001938, 1492.2000000002022, 1805.6000000002734, 1997.600000000317, 2004.8000000003187, 2005.800000000319, 2006.000000000319, 2032.600000000325, 2065.6000000003123, 2083.400000000296, 2084.400000000295, 2149.2000000002363, 2274.6000000001222, 2283.2000000001144, 2445.9999999999663, 2446.9999999999654, 2513.1999999999052, 2739.1999999996997, 2943.199999999514, 2944.1999999995132, 3277.1999999992104, 3278.1999999992095, 3410.599999999089, 3411.599999999088, 3450.9999999990523, 3451.9999999990514, 3515.5999999989936, 3516.5999999989926, 3518.599999998991, 4016.7999999985377, 4017.799999998537, 4363.199999998223, 4413.199999998177, 4777.599999997846, 5016.799999997628, 5323.999999997349, 5324.999999997348, 5445.599999997238, 5519.599999997171, 5520.59999999717, 5709.399999996998, 5710.399999996997, 6023.799999996712, 6024.799999996711, 6071.9999999966685, 6288.399999996472, 6289.399999996471, 6463.7

In [503]:
len(res1)

73

In [500]:
# hop
# 2006 +-
# 3411 ++
# 6849 +-
# 7223 +-
# 7744 +-

sec = 2005.8
audio = y[int(sec * SAMPLE_RATE): int(sec * SAMPLE_RATE + (SAMPLE_RATE * 2)) ]
import IPython
IPython.display.display(IPython.display.Audio(audio, rate=SAMPLE_RATE))

emphased = model2.emphasis(audio)
mel_spec_db = model2.get_log_mel_spectrogram(emphased)
print(mel_spec_db.shape)
predict(model2, 'cpu', mel_spec_db).item()

(128, 63)


1

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchaudio.transforms import MFCC

class TCSConv(nn.Module):
    '''
    An implementation of Time-channel Seperable Convolution
    **Arguments**
    in_channels : int
      The number of input channels to the layers
    out_channels : int
      The requested number of output channels of the layers
    kernel_size : int
      The size of the convolution kernel
    Example
    -------
    >>> inputs = torch.randn(1, 64, 400)
    >>> tcs_layer = TCSConv(64, 128, 11)
    >>> features = tcs_layer(inputs)
    >>> features.shape
    torch.Size([1, 128, 400])
    '''

    def __init__(self, in_channels, out_channels, kernel_size):
        super(TCSConv, self).__init__()

        self.depthwise_conv = nn.Conv1d(in_channels, in_channels, kernel_size, groups=in_channels,
                                        padding='same')  # effectively performing a depthwise convolution
        self.pointwise_conv = nn.Conv1d(in_channels, out_channels,
                                        kernel_size=1)  # effectively performing a pointwise convolution

    def forward(self, x):
        x = self.depthwise_conv(x)
        x = self.pointwise_conv(x)

        return x


class SubBlock(nn.Module):
    '''
    An implementation of a sub-block that is repeated R times
    **Arguments**
    in_channels : int
      The number of input channels to the layers
    out_channels : int
      The requested number of output channels of the layers
    kernel_size : int
      The size of the convolution kernel

    residual : None or torch.Tensor
      Only applicable for the final sub-block. If not None, will add 'residual' after batchnorm layer
    Example
    -------
    >>> inputs = torch.randn(1, 128, 600)

    >>> subblock = SubBlock(128, 64, 13)
    >>> outputs = subblock(inputs)
    >>> outputs.shape
    torch.Size([1, 64, 600])
    '''

    def __init__(self, in_channels, out_channels, kernel_size):
        super(SubBlock, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size

        self.tcs_conv = TCSConv(self.in_channels, self.out_channels, self.kernel_size)
        self.bnorm = nn.BatchNorm1d(self.out_channels)
        self.dropout = nn.Dropout()

    def forward(self, x, residual=None):
        x = self.tcs_conv(x)
        x = self.bnorm(x)

        # apply the residual if passed
        if residual is not None:
            x = x + residual

        x = F.relu(x)
        x = self.dropout(x)

        return x


class MainBlock(nn.Module):
    '''
    An implementation of the residual block containing R repeating sub-blocks
    **Arguments**
    in_channels : int
      The number of input channels to the residual block
    out_channels : int
      The requested number of output channels of the sub-blocks
    kernel_size : int
      The size of the convolution kernel
    R : int
      The number of repeating sub-blocks contained within this residual block

    residual : None or torch.Tensor
      Only applicable for the final sub-block. If not None, will add 'residual' after batchnorm layer
    Example
    -------
    >>> inputs = torch.randn(1, 128, 300)

    >>> block = MainBlock(128, 64, 13, 3)
    >>> outputs = block(inputs)
    >>> outputs.shape
    torch.Size([1, 64, 300])
    '''

    def __init__(self, in_channels, out_channels, kernel_size, R=1):
        super(MainBlock, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size

        self.residual_pointwise = nn.Conv1d(self.in_channels, self.out_channels, kernel_size=1)
        self.residual_batchnorm = nn.BatchNorm1d(self.out_channels)

        self.sub_blocks = nn.ModuleList()

        # Initial sub-block. If this is MainBlock 1, our input will be 128 channels which may not necessarily == out_channels
        self.sub_blocks.append(
            SubBlock(self.in_channels, self.out_channels, self.kernel_size)
        )

        # Other sub-blocks. Output of all of these blocks will be the same
        for i in range(R - 1):
            self.sub_blocks.append(
                SubBlock(self.out_channels, self.out_channels, self.kernel_size)
            )

    def forward(self, x):
        residual = self.residual_pointwise(x)
        residual = self.residual_batchnorm(residual)

        for i, layer in enumerate(self.sub_blocks):
            if (i + 1) == len(self.sub_blocks):  # compute the residual in the final sub-block
                x = layer(x, residual)
            else:
                x = layer(x)

        return x


class MatchboxNet(nn.Module):
    '''
    An implementation of MatchboxNet (https://arxiv.org/abs/2004.08531)
    The input is expected to be 64 channel MFCC features
    **Arguments**
    B : int
      The number of residual blocks in the model
    R : int
      The number of sub-blocks within each residual block
    C : int
      The size of the output channels within a sub-block
    kernel_sizes : None or list
      If None, kernel sizes will be assigned to values used in the paper. Otherwise kernel_sizes will be used
      len(kernel_sizes) must equal the number of blocks (B)
    NUM_CLASSES : int
      The number of classes in the dataset (i.e. number of keywords.) Defaults to 30 to match the Google Speech Commands Dataset
    Example
    -------
    >>> inputs = torch.randn(1, 64, 500)

    >>> model = MatchboxNet(B=3, R=2, C=64,bins=64, NUM_CLASSES=30)
    >>> outputs = model(inputs)
    >>> outputs.shape
    torch.Size([1, 30])
    '''

    def __init__(self, B, R, C, bins=64, kernel_sizes=None, NUM_CLASSES=30):
        super(MatchboxNet, self).__init__()
        if not kernel_sizes:
            kernel_sizes = [k * 2 + 11 for k in range(1, 5 + 1)]  # incrementing kernel size by 2 starting at 13

        # the prologue layers
        self.prologue_conv1 = nn.Conv1d(bins, 128, kernel_size=11, stride=2)
        self.prologue_bnorm1 = nn.BatchNorm1d(128)

        # the intermediate blocks
        self.blocks = nn.ModuleList()

        self.blocks.append(
            MainBlock(128, C, kernel_sizes[0], R=R)
        )

        for i in range(1, B):
            self.blocks.append(
                MainBlock(C, C, kernel_size=kernel_sizes[i], R=R)
            )

        # the epilogue layers
        self.epilogue_conv1 = nn.Conv1d(C, 128, kernel_size=29, dilation=2)
        self.epilogue_bnorm1 = nn.BatchNorm1d(128)

        self.epilogue_conv2 = nn.Conv1d(128, 128, kernel_size=1)
        self.epilogue_bnorm2 = nn.BatchNorm1d(128)

        self.epilogue_conv3 = nn.Conv1d(128, NUM_CLASSES, kernel_size=1)

        # Pool the timesteps into a single dimension using simple average pooling
        self.epilogue_adaptivepool = nn.AdaptiveAvgPool1d(1)

    def padding(self, batch, seq_len):
        if len(batch[0][0]) < seq_len:
            m = torch.nn.ConstantPad1d((0, seq_len - len(batch[0][0])), 0)
            batch = m(batch)
        return batch

    def emphasis(self, audio, pre_emphasis = 0.97):
        audio = np.append(audio[0], audio[1:] - pre_emphasis * audio[:-1])
        return audio

    def forward(self, x):
        # prologue block
        x = self.prologue_conv1(x)
        x = self.prologue_bnorm1(x)
        x = F.relu(x)

        # intermediate blocks
        for layer in self.blocks:
            x = layer(x)

        # epilogue blocks
        x = self.epilogue_conv1(x)
        x = self.epilogue_bnorm1(x)

        x = self.epilogue_conv2(x)
        x = self.epilogue_bnorm2(x)

        x = self.epilogue_conv3(x)
        x = self.epilogue_adaptivepool(x)
        x = x.squeeze(2)  # (N, 30, 1) > (N, 30)
        x = F.softmax(x, dim=1)  # softmax across classes and not batch

        return x


class MFCC_MatchboxNet(nn.Module):
    def __init__(self, bins: int, B: int, R: int, n_channels, kernel_sizes=None, num_classes=12):
        super(MFCC_MatchboxNet, self).__init__()
        self.sampling_rate = 16000
        self.bins = bins
        self.num_classes = num_classes
        self.mfcc_layer = MFCC(sample_rate=self.sampling_rate, n_mfcc=self.bins, log_mels=True)
        self.matchboxnet = MatchboxNet(B, R, n_channels, bins=self.bins, kernel_sizes=kernel_sizes,
                                       NUM_CLASSES=num_classes)

    def forward(self, waveform):
        mel_sepctogram = self.mfcc_layer(waveform)
        mel_sepctogram = mel_sepctogram.squeeze(1)
        mel_sepctogram = self.matchboxnet.padding(mel_sepctogram, 128)
        logits = self.matchboxnet(mel_sepctogram)
        return logits

In [13]:
model = MatchboxNet(B=3, R=2, C=64, bins=64, NUM_CLASSES=2)
model.load_state_dict(torch.load('./model_9_60_1.pth'))
model.float().to("cpu")
model_name = "matrixnet"

In [14]:
import logging
import os.path

# base level of logging
LOGGING_LEVEL = logging.INFO

# name of the file where logs will be store in
# 'logs.log' by default
LOG_FILE_NAME = 'logs.log'

# full path to logs file
LOG_FILE_PATH = './' + LOG_FILE_NAME


# logger BasicConfig that uses two handlers
# FileHandler - to write log-unit in file
# and StreamHadler to write in console
logging.basicConfig(level=LOGGING_LEVEL,
                    format='%(levelname)s::%(asctime)s::%(module)s::%(funcName)s::%(filename)s::%(lineno)d %(message)s',
                    handlers=[logging.FileHandler(LOG_FILE_PATH, mode='a')],
                    datefmt='%d-%b-%y %H:%M:%S'
                    )

# logger instance to import to another modules
logger = logging.getLogger(__name__)

In [15]:
import requests
import time
import numpy as np
import librosa
import soundfile as sf

hop_length = 0.5

def process_radio_stream(model, radio_url):
    try:
        if not os.path.exists("./outputs"):
            os.mkdir("./outputs")

        output_file = f'./outputs/radio.txt'
        with open(output_file, 'w') as f:
            f.write(f'Listening to {radio_url}:\n')

        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36'
        }

        SAVE_LOCATION = './outputs/temp.mp3'

        start_flag = True
        while not start_flag:
            try:
                audio = requests.get(radio_url, stream=True, headers=headers)
                audio.raise_for_status()
                start_flag = True
            except:
                time.sleep(10)

        audio = requests.get(radio_url, stream=True, headers=headers)
        audio.raise_for_status()

        signal = np.array([])
        time_line = 0
        next_predict_ind = 0
        found_flag = False
        found_ind = 0

        model.eval()
        for chunk in audio.iter_content(chunk_size=8192):
            if chunk:
                audio_file = open(SAVE_LOCATION, 'wb+')
                audio_file.write(chunk)
                audio_file.close()

                try:
                    my_signal, sample_rate = librosa.load(SAVE_LOCATION, sr=SAMPLE_RATE)
                    # print("my_signal,sample_rate", sample_rate)
                except Exception as e:
                    print(e)
                    continue

                time_line += my_signal.shape[0] / sample_rate
                signal = np.concatenate([signal, my_signal])

                # получаем первую секунду - потом начинаем предсказывать
                while next_predict_ind + 1 < time_line:
                    print(f"listned {next_predict_ind + 1} seconds", end=": ")
                    sample = signal[int((next_predict_ind) * sample_rate):int((next_predict_ind + 1) * sample_rate)]

                    sample = model.emphasis(sample)

                    if model_name == "matrixnet":
                        mfcc = librosa.feature.mfcc(y=sample, sr=SAMPLE_RATE, n_mfcc=64)
                        inputs = model.padding(torch.from_numpy(mfcc.reshape(1, 64, -1)), 128)
                        tensor = inputs.reshape(1, 64, -1).float()
                    elif model_name == "bcresnet":
                        mel_spec_db = model.get_log_mel_spectrogram(sample)
                        # Convert the mel spectrogram to a PyTorch tensor
                        tensor = torch.from_numpy(mel_spec_db).unsqueeze(0).unsqueeze(1).float()

                    with torch.no_grad():
                        prediction = model(tensor)
                        probs = F.softmax(prediction, dim=1)
                        pred = torch.argmax(probs, dim=1).item()

                    threshold = 0.5
                    if pred > threshold:
                        print(f"found on {found_ind} second; pred: {pred}")
                        found_flag = True
                        found_ind = next_predict_ind
                    else:
                        print(f"nothing found; pred: {pred}")

                    # сдвигаем окно на 0.5
                    next_predict_ind += hop_length
                if found_flag and (found_ind + 3 < time_line):
                    print(f"DETECTED!!!")
                    logger.info(f"DETECTED ON {found_ind} second")
                    with open(output_file, 'a') as f:
                        f.write(f'{radio_url}:{next_predict_ind+1:.0f}\n')

                    cut_name = f'./outputs/radio_{found_ind}.wav'
                    found_signal = signal[int((found_ind - 1) * sample_rate):int((found_ind + 5) * sample_rate)]
                    sf.write(cut_name, found_signal, samplerate=SAMPLE_RATE)
                    found_flag = False

    except KeyboardInterrupt:
        print("removed cache")


In [16]:
process_radio_stream(model, 'https://radio.maslovka-home.ru/soundcheck')

listned 1 seconds: nothing found; pred: 0
listned 1.5 seconds: nothing found; pred: 0
listned 2.0 seconds: nothing found; pred: 0
listned 2.5 seconds: nothing found; pred: 0
listned 3.0 seconds: nothing found; pred: 0
listned 3.5 seconds: nothing found; pred: 0
listned 4.0 seconds: nothing found; pred: 0
listned 4.5 seconds: nothing found; pred: 0
listned 5.0 seconds: nothing found; pred: 0
listned 5.5 seconds: nothing found; pred: 0
listned 6.0 seconds: nothing found; pred: 0
listned 6.5 seconds: nothing found; pred: 0
listned 7.0 seconds: nothing found; pred: 0
listned 7.5 seconds: nothing found; pred: 0
listned 8.0 seconds: nothing found; pred: 0
listned 8.5 seconds: nothing found; pred: 0
listned 9.0 seconds: nothing found; pred: 0
removed cache
