# Estimating the Number of Speakers Using CountNet

## Imports

In [1]:
import numpy as np
import os
import random
import soundfile as sf
import webrtcvad

## Paths

In [2]:
# Root of the dataset; the two working directories below live inside it.
data_path = "../data/dev-clean/LibriSpeech/"
# dev_data_path: directory scanned for the input recordings.
# processed_data_path: directory the processed clips are written to.
dev_data_path, processed_data_path = [
    os.path.join(data_path, subdir)
    for subdir in ("dev-clean/", "dev_clean_processed/")
]

In [3]:
# Create the output directory if it is missing. exist_ok=True avoids the
# check-then-create race of testing os.path.exists() first.
os.makedirs(processed_data_path, exist_ok=True)

## Load Data

In [4]:
def get_files(path):
    """Recursively collect every ``.flac`` file found below ``path``.

    Returns a list of ``(filepath, filename)`` tuples, where ``filepath`` is
    the containing directory joined with the file name and ``filename`` is
    the bare file name. Entries appear in the order ``os.walk`` yields them.
    """
    found = []
    for root, _subdirs, names in os.walk(path):
        found.extend(
            (os.path.join(root, name), name)
            for name in names
            if name.endswith(".flac")
        )
    return found

In [5]:
def get_frames(audio, sample_rate, ms):
    """Split ``audio`` into consecutive, non-overlapping frames.

    Args:
        audio: Sliceable sequence with a length (e.g. a NumPy array).
        sample_rate: Sampling rate in Hz.
        ms: Nominal frame duration in milliseconds.

    Returns:
        A list of slices of ``audio``, each
        ``int(sample_rate * (ms / 1000.0) * 2)`` items long. A trailing
        remainder shorter than one frame is dropped.

    Raises:
        ValueError: If the computed frame length is not positive.
    """
    # NOTE(review): the factor of 2 looks like it was written for 16-bit PCM
    # byte strings (2 bytes per sample). For a sequence of samples every
    # frame therefore spans 2 * ms milliseconds -- confirm before changing,
    # since callers may rely on the resulting frame length.
    n = int(sample_rate * (ms / 1000.0) * 2)
    if n <= 0:
        # A zero-length frame would never advance through the audio.
        raise ValueError("frame length must be positive, got %d" % n)

    # The "+ 1" keeps a final frame that ends exactly at len(audio).
    return [audio[offset:offset + n]
            for offset in range(0, len(audio) - n + 1, n)]

In [6]:
def get_start_frames(frame_ids, start_id=0):
    """Collect the ids that form the run ``start_id, start_id + 1, ...``.

    ``frame_ids`` is scanned once from front to back. Whenever the current id
    equals the next expected value it is recorded and the expected value
    advances by one; every other id is skipped.

    Returns the list of matched ids in the order they were found.
    """
    expected = start_id
    matched = []
    for current in frame_ids:
        if current != expected:
            continue
        matched.append(current)
        expected += 1

    return matched

In [7]:
def get_end_frames(frame_ids, end_id):
    """Collect the ids that form the run ``end_id, end_id - 1, ...``.

    ``frame_ids`` is scanned from back to front. Whenever the current id
    equals the next expected value it is recorded and the expected value
    decreases by one; every other id is skipped.

    Args:
        frame_ids: Sequence of frame ids. It is not modified.
        end_id: Id at which the run is expected to end.

    Returns:
        The matched ids in descending order.
    """
    end_frame_ids = []
    end = end_id

    # reversed() iterates backwards without reversing the caller's list in
    # place, which list.reverse() would do.
    for frame_id in reversed(frame_ids):
        if frame_id == end:
            end_frame_ids.append(frame_id)
            end -= 1

    return end_frame_ids

In [8]:
def select_audio(audio, sample_rate, s):
    """Return a randomly positioned window of ``s`` seconds from ``audio``.

    Args:
        audio: Sliceable sequence of samples with a length.
        sample_rate: Sampling rate in Hz.
        s: Window duration in seconds; may be fractional.

    Returns:
        ``audio[start:start + samples]`` with ``samples =
        int(sample_rate * s)``. If ``audio`` is not longer than the window,
        the slice starts at 0 and so covers all of ``audio``.
    """
    # int() keeps the sample count usable for slicing when s is a float.
    samples = int(sample_rate * s)
    if len(audio) <= samples:
        return audio[:samples]

    # randint is inclusive at both ends, so the last valid start position
    # (len(audio) - samples) can be drawn as well.
    start = random.randint(0, len(audio) - samples)
    return audio[start:start + samples]

In [9]:
# Trim leading and trailing non-speech from every recording and store a
# 10-second excerpt of each recording that is still long enough.
for filepath, filename in get_files(dev_data_path):
    audio, sample_rate = sf.read(filepath)
    # With ms=10, get_frames returns frames holding 2 * 10 ms worth of
    # samples, i.e. 20 ms of audio per frame.
    frames = get_frames(audio, sample_rate, 10)
    # A new detector is created for every file so that nothing is shared
    # between recordings. Mode 0 is webrtcvad's least aggressive setting.
    vad = webrtcvad.Vad(0)

    no_voice_frame_ids = []
    for i, frame in enumerate(frames):
        # webrtcvad expects 16-bit mono PCM bytes, whereas sf.read returns
        # floating-point samples by default. Convert a copy for the detector
        # only, so the audio written out below keeps its original samples.
        # NOTE(review): assumes mono input scaled to [-1.0, 1.0] at a sample
        # rate webrtcvad supports -- confirm when switching datasets.
        pcm = (np.clip(frame, -1.0, 1.0) * 32767).astype(np.int16).tobytes()
        if not vad.is_speech(pcm, sample_rate):
            no_voice_frame_ids.append(i)

    # The leading silent run is 0, 1, ..., so its length is the index of the
    # first frame to keep. The trailing run ends at len(frames) - 1, so
    # subtracting its length gives the exclusive end index.
    leading = get_start_frames(no_voice_frame_ids)
    trailing = get_end_frames(no_voice_frame_ids, len(frames) - 1)
    start = len(leading)
    end = len(frames) - len(trailing)
    frames = frames[start:end]

    # np.concatenate rejects an empty list, so fall back to an empty array.
    new_audio = np.concatenate(frames) if frames else np.asarray([])
    duration = len(new_audio) / sample_rate

    if duration >= 10:
        sel_new_audio = select_audio(new_audio, sample_rate, 10)
        out_name = filename.replace(".flac", ".wav")
        sf.write(os.path.join(processed_data_path, out_name), sel_new_audio, sample_rate)

## Get Samples

In [10]:
def select_speakers(path, k):
    """Pick ``k`` distinct directory entries of ``path`` at random.

    Returns a list of entry names as reported by ``os.listdir``, drawn
    without replacement via ``random.sample``.
    """
    # TODO: keep track of which entries were already used
    return random.sample(os.listdir(path), k)

In [12]:
# Draw 10 random entries from the processed-data directory; each entry is a
# name relative to processed_data_path.
speakers = select_speakers(path=processed_data_path, k=10)

In [17]:
# Load every selected clip from the processed-data directory.
all_audio = []
for speaker in speakers:
    clip_path = os.path.join(processed_data_path, speaker)
    audio, sample_rate = sf.read(clip_path)
    all_audio.append(audio)

# Mix the clips by summing them element-wise along the clip axis. The output
# is written with the sample rate of the last clip that was read.
# NOTE(review): the element-wise sum presumes all clips have equal length.
test = np.asarray(all_audio).sum(axis=0)
sf.write("test.wav", test, sample_rate)

In [None]:
# For a given value of k in {0, ..., 10}?
# [x] Collect random samples from different speakers
# [x] Remove leading and trailing silence
# [x] Add (sum) the individual samples
# [ ] Peak-normalize the result
# [ ] Transform the result into a time-frequency matrix
#
# To compute the ground-truth k:
# Determine the locations where there is voice activity
# Compute the maximum number of concurrent speakers in the sample