In [None]:
import sys, os, json, argparse, glob

import tensorflow as tf
import numpy as np
import librosa as lr
from tqdm import tqdm
import ntpath
import librosa

def audio_from_file(path, sr=None, ext=''):
    """Load an audio file as a mono float32 signal via librosa.

    Args:
        path: Path of the file to read (without its suffix when `ext` is used).
        sr: Sample rate forwarded to `librosa.load`; the default `None` is
            passed through unchanged.
        ext: Text appended verbatim to `path` to form the file name.

    Returns:
        The value returned by `librosa.load` (a `(samples, sample_rate)` pair).
    """
    filename = '{}{}'.format(path, ext)
    return lr.load(
        filename,
        sr=sr,
        mono=True,
        offset=0.0,
        duration=None,
        dtype=np.float32,
        res_type='kaiser_best',
    )

def audio_to_file(path, x, sr):
    """Write `x`, flattened to one dimension, to `path` at sample rate `sr`.

    The samples are written without normalisation (`norm=False`).

    NOTE(review): `librosa.output.write_wav` was removed in librosa 0.8, so
    this helper only works with older librosa releases.
    """
    flat_samples = x.reshape(-1)
    lr.output.write_wav(path, flat_samples, sr, norm=False)

import soundfile as sf
def convert_to_16k(in_path,out_path):
    """Load `in_path` at 16 kHz, upsample it to 48 kHz and write a 16-bit PCM WAV.

    NOTE(review): despite its name, the file written to `out_path` is sampled
    at 48 kHz; the audio only passes through 16 kHz on the way. Confirm this is
    the intended behaviour before renaming or changing the target rate.

    Args:
        in_path: Audio file to read.
        out_path: Destination path of the WAV file.
    """
    y, s = librosa.load(in_path, sr=16000)
    # Rates are passed by keyword: newer librosa releases reject positional
    # orig_sr/target_sr, while the keyword form works on old and new versions.
    y_48k = librosa.resample(y, orig_sr=s, target_sr=48000)
    sf.write(out_path, y_48k, 48000, format='WAV', subtype='PCM_16')

def audio_to_frames(x, n_frame, n_step=None):
    """Slice a signal into (possibly overlapping) fixed-length frames.

    Args:
        x: 1-D array of samples, or a 2-D array with one sample per row.
        n_frame: Number of samples per frame.
        n_step: Hop between the starts of consecutive frames; defaults to
            `n_frame` (no overlap).

    Returns:
        Array of shape `(n_frames, n_frame)` built with `as_strided`. It is a
        view onto the sample buffer (rows share memory when `n_step < n_frame`),
        so treat it as read-only. When the signal is too short for a single
        frame an empty `(0, n_frame)` array is returned.
    """
    # Accept int-like sizes (e.g. a tensor dimension object) uniformly.
    n_frame = int(n_frame)
    n_step = n_frame if n_step is None else int(n_step)

    if len(x.shape) == 1:
        # Build a column view instead of assigning to `x.shape`, which would
        # reshape the caller's array in place. Making the data contiguous first
        # guarantees the element stride used below matches the sample spacing.
        x = np.ascontiguousarray(x).reshape(-1, 1)

    n_overlap = n_frame - n_step
    # Clamp at zero so inputs shorter than the overlap yield no frames
    # rather than a negative dimension error.
    n_frames = max(0, (x.shape[0] - n_overlap) // n_step)
    n_keep = n_frames * n_step + n_overlap

    strides = list(x.strides)
    strides[0] = strides[1] * n_step

    return np.lib.stride_tricks.as_strided(x[0:n_keep, :], (n_frames, n_frame), strides)


In [None]:
import json

# Read the dataset index; its keys are used as file names under real_audios/.
with open('../full_data/dataset.json') as dataset_file:
    data = json.load(dataset_file)

files = data.keys()
paths = []
for name in files:
    paths.append('../full_data/real_audios/' + name)

In [None]:
# Load one test recording with the default sr=None; `y` and `s` receive the
# two values returned by audio_from_file (librosa.load's samples and rate).
# NOTE(review): absolute, machine-specific path.
y , s = audio_from_file('/media/sa47/Study/PHD/output.wav')

In [None]:
import time

# Time how long a 'kaiser_fast' resample of `y` from rate `s` to 48 kHz takes.
# NOTE: this overwrites `y` while `s` keeps the old rate, so running the cell
# twice resamples already-resampled audio; reload `y` before re-running.
start = time.time()
# Rates are passed by keyword: newer librosa releases reject positional
# orig_sr/target_sr, while the keyword form works on old and new versions.
y = librosa.resample(y, orig_sr=s, target_sr=48000, res_type='kaiser_fast')
print(time.time() - start)

In [None]:
# Scratch check: join a block of zeros and a block of ones column-wise, then
# display the flattened result.
a = np.zeros((3, 4), dtype=float)
b = np.ones((3, 4), dtype=float)
c = np.concatenate((a, b), axis=1)

c.reshape(-1)

In [None]:
def chunks(l, n):
    """Split the sequence `l` into consecutive slices of length `n`.

    The final slice is shorter when `len(l)` is not a multiple of `n`.
    Returns a list of slices; an empty `l` gives an empty list.
    """
    return [l[offset:offset + n] for offset in range(0, len(l), n)]

In [None]:
# import samplerate
# from scipy.io import wavfile
# sr, x = wave.read('/media/sa47/Study/PHD/output.wav')  # 48 khz file
# Resample `y` by the ratio 44100/16000 with the 'sinc_best' converter.
# NOTE(review): `samplerate` is only imported in the commented-out line above,
# so this raises NameError on a fresh kernel unless it was imported elsewhere.
y_z = samplerate.resample(y, 44100 * 1.0 / 16000, 'sinc_best')

In [None]:
# Replace `files` with a single test recording for the inference loop below.
# NOTE(review): absolute, machine-specific path.
files = ['/media/sa47/Study/PHD/output.wav']

In [None]:
# Locate the VAD checkpoint, read its vocabulary (tensor names + sample rate),
# import the graph and restore the weights into a session that stays open for
# the following cells.
path = 'models/vad'
# files = paths
n_batch=256

print('load model from {}'.format(path))

if os.path.isdir(path):
    candidates = glob.glob(os.path.join(path, 'model.ckpt-*.meta'))
    if not candidates:
        # Previously this fell through with `checkpoint_path` undefined and
        # crashed with a NameError; report the real problem instead.
        print('ERROR: could not load model')
        raise FileNotFoundError('no model.ckpt-*.meta found in {}'.format(path))
    # NOTE(review): lexicographic sort, so e.g. "ckpt-999" sorts after "ckpt-1000".
    candidates.sort()
    checkpoint_path, _ = os.path.splitext(candidates[-1])
else:
    checkpoint_path = path

if not all([os.path.exists(checkpoint_path + x) for x in ['.data-00000-of-00001', '.index', '.meta']]):
    print('ERROR: could not load model')
    raise FileNotFoundError

# The vocabulary sits next to the checkpoint, either as <checkpoint>.json or
# as vocab.json in the same directory.
vocabulary_path = checkpoint_path + '.json'
if not os.path.exists(vocabulary_path):
    vocabulary_path = os.path.join(os.path.dirname(checkpoint_path), 'vocab.json')
if not os.path.exists(vocabulary_path):
    print('ERROR: could not load vocabulary')
    raise FileNotFoundError

with open(vocabulary_path, 'r') as fp:
    vocab = json.load(fp)

graph = tf.Graph()

# Filled by the inference cell: file basename -> list of [start, end] index pairs.
segments = {}

with graph.as_default():

    saver = tf.train.import_meta_graph(checkpoint_path + '.meta')

    # Look up the tensors/ops by the names stored in the vocabulary.
    x = graph.get_tensor_by_name(vocab['x'])
    y = graph.get_tensor_by_name(vocab['y'])
    init = graph.get_operation_by_name(vocab['init'])
    logits = graph.get_tensor_by_name(vocab['logits'])
    ph_n_shuffle = graph.get_tensor_by_name(vocab['n_shuffle'])
    ph_n_repeat = graph.get_tensor_by_name(vocab['n_repeat'])
    ph_n_batch = graph.get_tensor_by_name(vocab['n_batch'])
    sr = vocab['sample_rate']

    # The session is intentionally left open for later cells.
    sess = tf.Session()
    try:
        saver.restore(sess, checkpoint_path)
    except Exception:
        # Do not leak the session when the checkpoint cannot be restored.
        sess.close()
        raise

In [None]:
import time

In [None]:
# Run the restored VAD model over every path in `files`.
# Each file is resampled to 48 kHz and processed in hops of 24000 samples
# (0.5 s); every hop is zero-padded to 48000 samples before framing.
start = time.time()
with graph.as_default():
    for file in tqdm(files):
        old_file = file

        if os.path.exists(file):
            print(time.time() - start)
            sound, sr = audio_from_file(file, sr=sr)
            # Rates are passed by keyword: newer librosa releases reject
            # positional orig_sr/target_sr; keywords work on old and new versions.
            sound = librosa.resample(sound, orig_sr=sr, target_sr=48000, res_type='zero_order_hold')

            s_count = 0
            for x_data in range(0, len(sound), 24000):
                # Progress: offset of the current hop in seconds.
                print(s_count)
                s_count += 0.5
                select_data = sound[x_data:x_data+24000]
                select_data = np.concatenate((select_data, np.zeros(48000 - len(select_data))))
                # `frames` (formerly `input`) no longer shadows the builtin.
                frames = audio_to_frames(select_data, x.shape[1])
                labels = np.zeros((frames.shape[0],), dtype=np.int32)
                sess.run(init, feed_dict = { x : frames, y : labels, ph_n_shuffle : 1, ph_n_repeat : 1, ph_n_batch : n_batch })
                count = 0
                n_total = frames.shape[0]
                # Pull batches of logits until the input pipeline is exhausted.
                while True:
                    try:
                        output = sess.run(logits)
                        labels[count:count+output.shape[0]] = np.argmax(output, axis=1)
                        count += output.shape[0]
                        print('{:.2f}%\r'.format(100 * (count/n_total)), end='', flush=True)
                    except tf.errors.OutOfRangeError:
                        break
                noise = frames[np.argwhere(labels==0),:].reshape(-1,1)
                speech = frames[np.argwhere(labels==1),:].reshape(-1,1)
                name, ext = os.path.splitext(file)

                # Collapse runs of label 1 into [start, end] frame-index pairs.
                # `end` is the index of the first 0 after the run, or the last
                # index when the run reaches the end of `labels`.
                start_index = -1
                segs = []
                for idx_ in range(0,len(labels)):

                    if labels[idx_] == 1 and start_index == -1:
                        start_index = idx_

                    if labels[idx_] == 0 or (labels[idx_] == 1 and idx_ == len(labels) - 1):
                        if start_index != -1:
                            segs.append([start_index,idx_])
                        start_index = -1

                print(segs)
                # NOTE(review): this entry is overwritten on every hop, so only
                # the last hop's pairs survive for a file - confirm the intent.
                segments[ntpath.basename(old_file)] = segs

            print(time.time() - start)
        else:
            # This branch used to be attached to the inner `for` loop (a
            # for/else), so the message was printed after every processed file
            # and never for a file that was actually missing.
            print('skip [file not found]')

In [None]:
class VAD:
    """Holds a restored voice-activity-detection TensorFlow graph and session.

    Construction locates a checkpoint, reads the vocabulary JSON that names the
    graph's tensors, imports the meta graph and restores the weights.

    Attributes set by `__init__`:
        graph, sess: the imported `tf.Graph` and the open `tf.Session`.
        x, y, init, logits, ph_n_shuffle, ph_n_repeat, ph_n_batch: tensors/ops
            looked up by the names stored in the vocabulary.
        sample_rate: value of the vocabulary's 'sample_rate' entry.
        n_batch: batch size given to the constructor.
    """

    def __init__(self, model_path='vad/models/vad', n_batch=256):
        """Load the model.

        Args:
            model_path: Checkpoint prefix, or a directory containing
                `model.ckpt-*.meta` files (the last one in sorted order is used).
            n_batch: Batch size kept on the instance for later inference.

        Raises:
            FileNotFoundError: if no checkpoint, checkpoint part or vocabulary
                file can be found.
        """
        path = model_path

        print('load model from {}'.format(path))

        if os.path.isdir(path):
            candidates = glob.glob(os.path.join(path, 'model.ckpt-*.meta'))
            if not candidates:
                # Previously `checkpoint_path` stayed undefined here and the
                # constructor crashed with an UnboundLocalError.
                print('ERROR: could not load model')
                raise FileNotFoundError('no model.ckpt-*.meta found in {}'.format(path))
            # NOTE(review): lexicographic sort, so e.g. "ckpt-999" sorts after "ckpt-1000".
            candidates.sort()
            checkpoint_path, _ = os.path.splitext(candidates[-1])
        else:
            checkpoint_path = path

        if not all([os.path.exists(checkpoint_path + x) for x in ['.data-00000-of-00001', '.index', '.meta']]):
            print('ERROR: could not load model')
            raise FileNotFoundError

        # The vocabulary is either <checkpoint>.json or vocab.json next to it.
        vocabulary_path = checkpoint_path + '.json'
        if not os.path.exists(vocabulary_path):
            vocabulary_path = os.path.join(os.path.dirname(checkpoint_path), 'vocab.json')
        if not os.path.exists(vocabulary_path):
            print('ERROR: could not load vocabulary')
            raise FileNotFoundError

        with open(vocabulary_path, 'r') as fp:
            vocab = json.load(fp)

        graph = tf.Graph()

        with graph.as_default():

            saver = tf.train.import_meta_graph(checkpoint_path + '.meta')

            # Keep the handles on the instance; they used to be discarded,
            # leaving nothing for `predict` to run against.
            self.x = graph.get_tensor_by_name(vocab['x'])
            self.y = graph.get_tensor_by_name(vocab['y'])
            self.init = graph.get_operation_by_name(vocab['init'])
            self.logits = graph.get_tensor_by_name(vocab['logits'])
            self.ph_n_shuffle = graph.get_tensor_by_name(vocab['n_shuffle'])
            self.ph_n_repeat = graph.get_tensor_by_name(vocab['n_repeat'])
            self.ph_n_batch = graph.get_tensor_by_name(vocab['n_batch'])
            self.sample_rate = vocab['sample_rate']

            sess = tf.Session()
            try:
                saver.restore(sess, checkpoint_path)
            except Exception:
                # Do not leak the session when the checkpoint cannot be restored.
                sess.close()
                raise

        self.n_batch = n_batch
        self.graph = graph
        self.sess = sess

    def predict(self):
        """Placeholder: inference is not implemented yet; returns None."""
        pass

In [None]:
# Smoke test: build the VAD wrapper (prints the model path and loads the checkpoint).
a = VAD()

In [None]:
import json

# Load the annotations; the context manager closes the file handle, which the
# previous bare open() call left open.
# NOTE: this rebinds `data`, replacing the dataset index loaded earlier.
with open('full.json') as full_json:
    data = json.load(full_json)
data

In [None]:
from pydub import AudioSegment
from tqdm import tqdm
count = 0

# For every annotated file that also has VAD output, print the gaps longer
# than 0.5 between the furthest 'end' seen so far and the next 'start'.
for file in tqdm(data):
    if file not in segments:
        continue

    print(file)
    # audio = AudioSegment.from_wav('/media/sa47/Intertainment/vnlp/speech/full_data/real_audios/' + file)

    # A zero-length sentinel at 0.0 lets the gap before the first annotation
    # be measured like any other gap.
    sentinel = {'start':0.0,'end':0.0,'speaker':[0]}
    segs = [sentinel] + sorted(data[file], key=lambda seg: seg['start'])

    cur_max = 0

    for current_seg, next_seg in zip(segs, segs[1:]):
        # Annotations may overlap, so track the furthest end seen so far.
        cur_max = max(cur_max, current_seg['end'])
        if next_seg['start'] - cur_max <= 0.5:
            continue

        start_noise = cur_max
        end_noise = next_seg['start']

        print(start_noise,end_noise)
        # noise = audio[int(start_noise*1000) : int(end_noise*1000)]
        # noise.export('EEND/noises/noise_' + str(count) + '.wav',format='wav')
        # count += 1

In [None]:
# Display the dict filled by the inference loop (file basename -> index pairs).
segments

In [None]:
# Collapse runs of label 1 in `labels` into [start, end] index pairs.
# `end` is the index of the first 0 after the run, or the last index when the
# run reaches the end of `labels`.
start_index = -1
segs = []
last_idx = len(labels) - 1
for idx_, label in enumerate(labels):
    run_is_open = start_index != -1

    if label == 1 and not run_is_open:
        start_index = idx_

    closes_run = label == 0 or (label == 1 and idx_ == last_idx)
    if closes_run:
        if start_index != -1:
            segs.append([start_index, idx_])
        start_index = -1

In [None]:
# NOTE(review): this import rebinds `VAD`, shadowing the class defined earlier
# in this notebook; the imported one takes frame_duration and model_path.
from vad import VAD
import numpy as np
detector = VAD(frame_duration = 0.5, model_path = 'models/vad')
SAMPLING_RATE = 44100

In [None]:
import time
import numpy as np

# Read a WAV file by hand: skip the first 44 bytes as the header, then time how
# long it takes to turn the next 44100 bytes into float32 samples.
# NOTE(review): assumes a 44-byte header and 16-bit PCM data - not verified here.
# The context manager closes the handle, which the bare open() never did.
with open('/home/ubuntu/vad/test_wavs/LJ002-0292.wav','rb') as a:
    header = a.read(44)

    start = time.time()
    # for i in range(0,10):
    frames = a.read(44100)

array_frames = np.frombuffer(frames,dtype=np.int16)
# Scale int16 values into [-1.0, 1.0).
array_frames = array_frames.astype(np.float32, order='C') / 32768.0
print(time.time() - start)

In [None]:
# Scratch arithmetic: 44100 doubled.
44100 * 2

In [None]:
import soundfile
# Read a logged recording; the second return value (sample rate) is discarded.
# NOTE(review): absolute, machine-specific path.
arr , _ = soundfile.read('/home/ubuntu/chatweb/audio_logs/27-21-07.wav')

In [None]:
from pydub import AudioSegment
# Open the same logged recording with pydub; the result is shown as cell output.
AudioSegment.from_wav('/home/ubuntu/chatweb/audio_logs/27-21-07.wav')

In [None]:
import numpy

# Scratch check: repeat a 6-element vector as 3 identical rows.
a = numpy.arange(1, 7)
np.tile(a, reps=(3, 1))

In [None]:
import soundfile
# Read another logged recording into `a`; the sample rate is discarded.
# NOTE(review): absolute, machine-specific path.
a, _ = soundfile.read('/home/ubuntu/chatweb/audio_logs/27-00-29.wav')

In [None]:
# Length of `a` divided by 48000.
# NOTE(review): reads as a duration in seconds only if `a` is 48 kHz audio - confirm.
len(a) / 48000

In [None]:
# Scratch check: a 5-element array of zeros.
np.zeros(5)

In [None]:
# Scratch check: 48000 zeros as a single row of shape (1, 48000), then show it
# flattened back to one dimension.
a = numpy.zeros(48000).reshape(-1, 48000)

a.reshape(-1)

In [None]:
# Save `silent` to disk. Passing the path lets np.save open and close the file
# itself; the previous open(...) handle was never closed.
# NOTE(review): `silent` is not defined anywhere in this notebook as shown.
np.save('/home/ubuntu/silent.npy', silent)

In [None]:

# Transcode the response audio with ffmpeg: once to the WAV path and once to
# the MP3 path at 44100 Hz.
# Arguments are passed as a list so paths containing spaces are not split, and
# communicate() drains stdout/stderr: wait() on a child with PIPE'd output can
# deadlock once the pipe buffer fills up.
# NOTE(review): Popen, PIPE and the response_audio* names come from outside this cell.
convert_cmd = ['ffmpeg', '-i', response_audio, response_audio_wav]
Popen(convert_cmd, stdout=PIPE, stderr=PIPE).communicate()

convert_cmd = ['ffmpeg', '-i', response_audio, '-ar', '44100', response_audio_mp3]
Popen(convert_cmd, stdout=PIPE, stderr=PIPE).communicate()

# Copy the stream parameters parsed from the header onto the user's writer
# and remember the frame rate as the user's sampling rate.
nchannels, sampwidth, framerate = extract_audio_info(binaryHeader)
cur_user['writer'].setnchannels(nchannels)
cur_user['writer'].setsampwidth(sampwidth)
cur_user['writer'].setframerate(framerate)
cur_user['sampling_rate'] = framerate