In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

import hickle
import os.path
import ntpath

import tensorflow as tf
import numpy as np
import librosa
import librosa.display

from numpy.lib import stride_tricks

from grog.audioreader import AudioReader
from grog.models.model import Model
from grog.config import Config
from grog.fft import istft, istft1, stft_default
from grog.models.local_cluster import LocalCluster
from scipy.signal.windows import boxcar
import IPython.display as ipd
import matplotlib.pyplot as plt
import scipy

In [2]:
# Load the saved snapshot. The context manager guarantees the file handle is
# closed even if hickle.load raises; the original bare open() call was never
# explicitly closed.
with open("../../workspace/istft_problem_timit.pkl", "rb") as snapshot_file:
    snapshot = hickle.load(snapshot_file)

# Pieces of the snapshot used by the reconstruction experiments below.
speech_mix = snapshot['mix']          # mixture signal fed to the STFT
global_mask = snapshot['mask']        # per-batch source masks (layout assumed below — TODO confirm)
embeddings = snapshot['embeddings']   # only its length / iteration count is used in this notebook

In [3]:
# STFT / reconstruction settings shared by every option below.
window_size        = 256    # window length handed to the STFT and the inverse STFTs
hop_length         = 64     # hop between consecutive frames
windows_per_sample = 100    # frames per batch when Option B stitches its output together
amp_fac            = 10000  # scale applied to the magnitudes before the dB conversion

In [4]:
# Complex STFT of the mixture; the later cells treat it as (frame, frequency bin).
spec0 = stft_default(speech_mix, window_size, hop_length)
spec = np.abs(spec0)
# Unit-magnitude phase term. The previous `spec0 / spec` evaluated 0/0 = NaN in
# any bin whose magnitude is exactly zero, and that NaN then leaks into every
# inverse STFT below. exp(j*angle) matches spec0/|spec0| everywhere else and
# yields 1 for silent bins, so the masked product there is a clean 0.
phase = np.exp(1j * np.angle(spec0))

# Magnitude in dB; the floor keeps log10 away from zero.
log_spec = 20. * np.log10(np.maximum(spec, 1e-10) * amp_fac)
max_mag = np.max(log_spec)
# Voice-activity mask: 1 for bins within 40 dB of the loudest bin, 0 otherwise.
speech_VAD = (log_spec > (max_mag - 40)).astype(int)

# Swap the first two axes, flatten to (2, frames, 129) and trim to the number
# of STFT frames so the masks line up with `spec`.
# NOTE(review): assumes global_mask is (batch, source, frame, bin) with
# 2 sources and 129 bins — confirm against the code that wrote the snapshot.
reshaped_mask = global_mask.transpose(1, 0, 2, 3).reshape(2, -1, 129)[:, :spec.shape[0], :]

In [11]:
# Option A: reconstruct both sources with librosa's inverse STFT.

window = np.hanning(window_size)

# The spectrograms are stored as (frame, bin); they are transposed before
# being handed to librosa. Only the second source is gated by the VAD mask.
masked_stft1 = np.transpose(spec * reshaped_mask[0] * phase)
masked_stft2 = np.transpose(spec * reshaped_mask[1] * speech_VAD * phase)

# Both reconstructions share the same inverse-STFT settings.
librosa_istft_kwargs = dict(
    hop_length=hop_length,
    win_length=window_size,
    center=False,
    window=window,
)
Aout_audio1 = librosa.istft(masked_stft1, **librosa_istft_kwargs)
Aout_audio2 = librosa.istft(masked_stft2, **librosa_istft_kwargs)

print(Aout_audio1.shape)
for reconstructed in (Aout_audio1, Aout_audio2):
    display(ipd.Audio(reconstructed, rate=8000))

(59648,)


In [15]:
# Option C: reconstruct the first source with SciPy's inverse STFT.
# NOTE: `window` is reused from the Option A cell, so that cell must run first.

# Derive the overlap from the shared STFT settings instead of repeating the
# literals 256 / 192, so this cell stays consistent if the config cell changes.
noverlap = window_size - hop_length
print(scipy.signal.check_NOLA(window, window_size, noverlap))

# scipy.signal.istft returns a (times, samples) pair; index 1 is the audio.
Cout_audio1 = scipy.signal.istft(
    spec * reshaped_mask[0] * phase,
    fs=8000,
    window=window,
    noverlap=noverlap,
    boundary=True,
    time_axis=0,
    freq_axis=1,
)
print(Cout_audio1[1].shape)
display(ipd.Audio(Cout_audio1[1], rate=8000))

True
(59392,)


In [78]:
# Option B: rebuild the audio batch by batch with the project's own istft and
# add each batch's output into one long buffer.
N_samples = len(embeddings)
# Buffer length: (frames - 1) * hop_length + window_size. With a single frame
# this reduces to 0 * hop_length + window_size = window_size.
total_length = (N_samples * windows_per_sample - 1) * hop_length + window_size
Bout_audio1 = np.zeros([total_length], dtype=np.float32)
Bout_audio2 = np.zeros([total_length], dtype=np.float32)

config = Config()
config.load_json("../../workspace/models/timit-4/config.json")
linear_reader = AudioReader(speech_mix, config, False)

start = 0
for mask, (_, data_batch) in zip(global_mask, embeddings):
    # Per-batch names are deliberately distinct from the notebook-level
    # `spec` / `phase`: the original loop overwrote those globals, so re-running
    # Option A or C after this cell silently used a single batch's data.
    batch_spec, batch_phase, batch_vad = linear_reader.get_tf_next()
    batch_spec = batch_spec[0]
    batch_phase = batch_phase[0]
    batch_vad = batch_vad[0]

    # Same masking as Option A: only the second source is gated by the VAD.
    out_data1 = mask[0] * batch_spec
    out_data2 = mask[1] * batch_vad * batch_spec

    sample_out1, sample_out2 = istft(windows_per_sample, window_size, hop_length, amp_fac, out_data1, out_data2, batch_phase, batch_phase)

    # Accumulate into the buffer; the write position advances by one batch of hops.
    Bout_audio1[start:(start + len(sample_out1))] += sample_out1
    Bout_audio2[start:(start + len(sample_out2))] += sample_out2
    start += hop_length * windows_per_sample

display(ipd.Audio(Bout_audio1, rate=8000))

In [63]:
# Total absolute difference between Option B and Option A over the span that
# Option A covers. The length comes from Aout_audio1 itself rather than the
# literal 59648, so the comparison still works for a different input.
np.sum(np.abs(Bout_audio1[:len(Aout_audio1)] - Aout_audio1))

204.45415