# Pipeline

## Imports

In [1]:
import os
import tensorflow as tf
import random
import wandb
from wandb.keras import WandbCallback

from preprocessing import preprocess
from dataset import make_dataset
from scaling import get_train_mean_std
from model import CRNN
from util import *

## Paths

In [2]:
# Root of all data, given relative to the notebook's working directory.
DATA_DIR = "../../data/"
# Raw input locations; the cells below join these onto DATA_DIR.
TRAIN_DIR = "train-clean-360/LibriSpeech/train-clean-360/"
VALID_DIR = "dev-clean/LibriSpeech/dev-clean/"
TEST_DIR = "test-clean/LibriSpeech/test-clean/"
NOISE_DIR = "noise/"

# Output of the preprocessing step and input to dataset creation.
PROCESSED_DIR = os.path.join(DATA_DIR, "processed/")
# Per-split datasets read by the statistics and training cells.
DATASET_DIR = os.path.join(DATA_DIR, "dataset/")
# Destination for checkpoints and the final weights file.
MODEL_DIR = "../models/"

## Parameters

#### Dataset Parameters

In [3]:
# Passed to preprocess() as its third argument.
# NOTE(review): presumably a clip length in seconds — confirm against preprocessing.py.
initial_s = 10
# Passed to the data generators as `s`.
# NOTE(review): presumably a segment length in seconds — confirm against util.py.
s = 5
# Passed to make_dataset, the data generators and the CRNN constructor.
# NOTE(review): looks like the maximum number of speakers — confirm.
max_k = 10
# Sample rate handed to process_noise when the noise clips are saved.
sample_rate = 16000
# Stage switches: the preprocessing and dataset-creation cells only run when True.
preprocessing = False
create_dataset = False
# Number of samples per split, in the same order as `datasets`.
samples = [25, 10, 10] # NOTE(review): [1820, 520, 520] looks like the full-size setting — confirm
# Split names; also used as sub-directory names under PROCESSED_DIR and DATASET_DIR.
datasets = ["train", "valid", "test"]

#### Training Parameters

In [4]:
# Learning rate given to the Adam optimizer when the model is compiled.
lr = 1e-3
# Number of epochs passed to model.fit.
epochs = 1
# Batch size of the training generator (the validation generator uses 1).
batch_size = 32

## Weights & Biases

In [5]:
# Start the Weights & Biases run and record the hyperparameters on it.
# The values go through `config=` because assigning a dict to `wandb.config`
# after init only rebinds the module attribute and does not update the run.
# The result is bound to a name so the cell does not echo the run object.
wandb_run = wandb.init(
    project="test-project",
    entity="speaker-estimation",
    config={
        "learning_rate": lr,
        "epochs": epochs,
        "batch_size": batch_size,
    },
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmarlousnijman[0m ([33mspeaker-estimation[0m). Use [1m`wandb login --relogin`[0m to force relogin


## Preprocessing

In [6]:
# Run preprocess() on each raw split and write the result under PROCESSED_DIR.
# Skipped entirely unless the `preprocessing` switch is enabled.
if preprocessing:
    input_dirs = [TRAIN_DIR, VALID_DIR, TEST_DIR]

    # Pair each split name with its raw directory instead of indexing by position.
    for dataset, raw_dir in zip(datasets, input_dirs):
        print(f"Processing {dataset} set")

        input_dir = os.path.join(DATA_DIR, raw_dir)
        output_dir = os.path.join(PROCESSED_DIR, f"{dataset}/")

        # exist_ok avoids the check-then-create race and keeps re-runs idempotent.
        os.makedirs(output_dir, exist_ok=True)

        preprocess(input_dir, output_dir, initial_s)

## Creating Dataset

In [7]:
# Build the dataset for every split from its preprocessed files.
# Skipped entirely unless the `create_dataset` switch is enabled.
if create_dataset:
    # `samples` and `datasets` are parallel lists, so walk them together.
    for dataset, n_samples in zip(datasets, samples):
        print(f"Processing {dataset} set")

        input_dir = os.path.join(PROCESSED_DIR, f"{dataset}/")
        output_dir = os.path.join(DATASET_DIR, f"{dataset}/")

        # exist_ok avoids the check-then-create race and keeps re-runs idempotent.
        os.makedirs(output_dir, exist_ok=True)

        # Only the "train" split is passed to make_dataset with train=True.
        train = dataset == "train"

        make_dataset(input_dir, output_dir, max_k, n_samples, train)


Processing train set


100%|██████████| 10/10 [00:08<00:00,  1.23it/s]


Processing valid set


 70%|███████   | 7/10 [00:12<00:05,  1.94s/it]

## Add Noise for k = 0

In [None]:
# Add noise-only recordings to every split.
# Skipped entirely unless the `create_dataset` switch is enabled.
if create_dataset:
    noise_files = get_files(os.path.join(DATA_DIR, NOISE_DIR, "audio/"), ".wav")

    # Filter the candidates using these categories and the metadata file.
    # NOTE(review): presumably drops scenes that contain speech — confirm in util.py.
    speech_categories = ["cafe/restaurant", "grocery_store", "metro_station"]
    noise_files = remove_speech_noise(noise_files, os.path.join(DATA_DIR, NOISE_DIR, "meta.txt"), speech_categories)

    for dataset, n_samples in zip(datasets, samples):
        print(f"Processing {dataset} set")

        # Select the same number of noise samples as
        # k speaker samples in each dataset
        noise_samples = random.sample(noise_files, n_samples)

        # Remove used samples from the pool so later splits cannot draw them again
        for used in noise_samples:
            noise_files.remove(used)

        # Downsample and save noise in dataset directories
        process_noise(DATASET_DIR, dataset, noise_samples, sample_rate)


## Compute Train Set Mean and Standard Deviation

In [None]:
# Statistics are computed from the "train" split only; the same values are
# later handed to both the training and the validation generator.
train_dataset_dir = os.path.join(DATASET_DIR, "train/")
mean, std = get_train_mean_std(train_dataset_dir)
print(f"Dataset mean: {mean}", f"Dataset std: {std}", sep="\n")

## Train

### Data Generators

In [None]:
# Collect the .wav file names of the train and validation splits.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# give the generators the same file order on every run and machine.
train_files = sorted(f for f in os.listdir(os.path.join(DATASET_DIR, "train/")) if f.endswith(".wav"))
valid_files = sorted(f for f in os.listdir(os.path.join(DATASET_DIR, "valid/")) if f.endswith(".wav"))

In [None]:
# Keyword arguments that are identical for both generators.
shared_generator_kwargs = dict(
    dim=(500, 201),
    max_k=max_k,
    mean=mean,
    std=std,
    s=s,
)

# Training generator: full batches, shuffled, train mode.
train_generator = CustomDataGenerator(
    os.path.join(DATASET_DIR, "train/"),
    train_files,
    batch_size=batch_size,
    train=True,
    shuffle=True,
    **shared_generator_kwargs,
)

# Validation generator: one sample per batch, fixed order, evaluation mode.
valid_generator = CustomDataGenerator(
    os.path.join(DATASET_DIR, "valid/"),
    valid_files,
    batch_size=1,
    train=False,
    shuffle=False,
    **shared_generator_kwargs,
)

### Model

In [None]:
# The first two dimensions match the `dim` given to the data generators.
# NOTE(review): the trailing 1 is presumably a channel axis — confirm in model.py.
input_shape = (500, 201, 1)

model = CRNN(input_shape, max_k)
model.summary()

### Compile

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss and
# categorical accuracy as the reported metric.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=lr, beta_1=0.9, beta_2=0.999, epsilon=1e-8),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    # `metrics` is documented as a list; a bare metric object is not accepted
    # by every Keras version.
    metrics=[tf.keras.metrics.CategoricalAccuracy()],
)

### Callbacks

In [None]:
# File name template; the epoch number and validation loss are filled in at save time.
checkpoint_path = os.path.join(MODEL_DIR, 'weights.{epoch:02d}-{val_loss:.2f}.h5')

# Save weights only, and only when validation categorical accuracy reaches a new maximum.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_categorical_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
)

# Stop once validation loss has gone 10 epochs without improving.
# Note that this watches a different quantity than the checkpoint callback.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=10)

# Log training to Weights & Biases without uploading the model itself.
wandb_callback = WandbCallback(save_model=False)

### Model Directory

In [None]:
# Ensure the output directory for checkpoints and final weights exists.
# exist_ok avoids the check-then-create race and keeps re-runs idempotent.
os.makedirs(MODEL_DIR, exist_ok=True)

### Train

In [None]:
# Train with checkpointing, early stopping and W&B logging, then store the
# weights of the model as it stands after the last epoch that ran.
training_callbacks = [checkpoint, early_stopping, wandb_callback]

model.fit(
    train_generator,
    validation_data=valid_generator,
    epochs=epochs,
    callbacks=training_callbacks,
)

final_weights_path = os.path.join(MODEL_DIR, 'final_weights.h5')
model.save_weights(final_weights_path)