In [116]:
import audiosegment

In [33]:
import webrtcvad

In [110]:
# `display` and `HTML` are taken from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated in newer IPython.
from IPython.display import display, HTML
# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))
import numpy as np
# Print floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)
import matplotlib
# Select the Tk backend; this is done before pyplot is imported below.
matplotlib.use('TkAgg')
from matplotlib import pyplot as plt
from collections.abc import Iterable
import noisereduce



In [102]:
# Audio framing constants shared by the helpers below.
SAMPLE_RATE = 32000                                # samples per second
SAMPLE_WIDTH = 1.0 / SAMPLE_RATE                   # seconds covered by one sample
FRAME_DURATION = 0.01                              # seconds per analysis frame
FRAME_WIDTH = int(SAMPLE_RATE * FRAME_DURATION)    # samples per analysis frame
MIN_FREQ = 1.0 / FRAME_DURATION                    # Hz between adjacent rfft bins of one frame

def FREQ_BY_NUM(i):
    """Return the frequency obtained by scaling `MIN_FREQ` by the bin index `i`."""
    frequency = MIN_FREQ * i
    return frequency

In [103]:
def FFT(frame):
    """Return the magnitude spectrum of a real-valued frame.

    Parameters
    ----------
    frame : array-like
        Time-domain samples of one frame.

    Returns
    -------
    numpy.ndarray
        Non-negative magnitudes of the one-sided FFT (`np.fft.rfft`) of `frame`.
    """
    # Take the modulus of the complex coefficients. Using only the real part
    # (as the previous implementation did) makes the result depend on the
    # phase of each component: a pure sine would read as ~0 at its own bin.
    spectrum = np.fft.rfft(frame)
    return np.abs(spectrum)

def CUTFFT(fft, max_freq):
    """Keep only the leading `int(max_freq * FRAME_DURATION)` entries of `fft`."""
    bin_count = int(max_freq * FRAME_DURATION)
    return fft[:bin_count]

def SIGNAL_MEAN(signal):
    """Return the arithmetic mean of `signal` as computed by `np.mean`."""
    average = np.mean(signal)
    return average

def SIGNAL_GEOM_MEAN(signal):
    """Return the geometric mean of `signal`, computed in the log domain.

    A small offset (1e-11) is added to every element before the logarithm
    so that zero entries do not produce `-inf`.
    """
    log_values = np.log(1e-11 + signal)
    mean_log = np.sum(log_values) / len(signal)
    return np.exp(mean_log)

def FFT_MAX_FREQ(fft):
    """Return the frequency (via `FREQ_BY_NUM`) of the largest entry in `fft`."""
    peak_bin = np.argmax(fft)
    return FREQ_BY_NUM(peak_bin)

def SFM(fft):
    """Return 10*log10 of the ratio of geometric mean to arithmetic mean of `fft`.

    The 1e-11 offsets keep both the division and the logarithm away from zero.
    """
    geometric = SIGNAL_GEOM_MEAN(fft)
    arithmetic = SIGNAL_MEAN(fft)
    flatness = geometric / (1e-11 + arithmetic)
    return 10 * np.log10(1e-11 + flatness)
    
def ENERGY(signal):
    """Return `SAMPLE_WIDTH` times the sum of squares of the real part of `signal`."""
    squared = signal.real ** 2
    return SAMPLE_WIDTH * np.sum(squared)

def IDENTITY(fft):
    """Return the argument unchanged (the very same object, not a copy)."""
    unchanged = fft
    return unchanged

In [104]:
class Characteristic:
    """A named track of per-frame values.

    Parameters
    ----------
    name : str
        Label for this track.
    **kwargs
        Optional display options stored verbatim in `self.kwargs`
        (for example ``threshold=0.5``). Consumers that look up
        ``characteristic.kwargs`` therefore always find a dict, even when
        no options were given.

    Attributes
    ----------
    name : the label passed to the constructor.
    value : list of frames, in the order they were added.
    kwargs : dict of the extra keyword options.
    """

    def __init__(self, name, **kwargs):
        self.name = name
        self.value = []
        # Stored so plotting code can read options such as 'threshold'
        # without raising AttributeError.
        self.kwargs = kwargs

    def add_frame(self, frame):
        """Append one frame's value to the track."""
        self.value.append(frame)

In [105]:
class Silence:
    """Wrapper around a `webrtcvad.Vad` that records every per-frame decision.

    Attributes
    ----------
    vad : the `webrtcvad.Vad` instance created with the given `agr` argument.
    values : list of floats, one per call to `is_voice` (1.0 or 0.0).
    chars : the `chars` collection passed to the constructor, stored as-is.
    """

    def __init__(self, agr=1, chars=()):
        self.chars = chars
        self.values = []
        self.vad = webrtcvad.Vad(agr)

    def is_voice(self, frame):
        """Classify `frame` with the VAD at `SAMPLE_RATE`, log and return the result."""
        speech = self.vad.is_speech(frame, SAMPLE_RATE)
        # Store the decision as a float so it can be plotted directly.
        self.values.append(1. * speech)
        return speech

In [106]:
class RTPlot:
    """Live-updating figure: one panel per characteristic plus a voice/silence panel.

    Parameters
    ----------
    silence : object
        Must expose `chars` (a sequence of objects with `name` and `value`,
        and optionally a `kwargs` dict) and `values` (per-frame VAD results).
    frame_duration : float
        Seconds per frame; used to convert between time and frame index.
    max_duration : float
        Length, in seconds, of the sliding window that is displayed.
    """

    def __init__(self, silence, frame_duration=FRAME_DURATION, max_duration=10):
        self.silence = silence
        self.frame_duration = frame_duration
        self.max_duration = max_duration
        # squeeze=False makes subplots() always return a 2-D array of Axes.
        # Without it a single-panel figure (no characteristics) yields a bare
        # Axes object and the indexing in plot() fails.
        self.fig, axes = plt.subplots(len(silence.chars) + 1, 1, sharex=True, squeeze=False)
        self.ax = axes[:, 0]
        self.fig.show()

    def plot(self, current_time):
        """Redraw every panel for the window ending at `current_time` (seconds)."""
        start_time = max(0., current_time - self.max_duration)
        frame_start = int(start_time / self.frame_duration)
        frame_end = int(current_time / self.frame_duration)

        for i, v in enumerate(self.silence.chars):
            axis = self.ax[i]
            axis.cla()
            value = v.value[frame_start:frame_end]
            if len(value) == 0:
                continue
            if isinstance(value[0], Iterable):
                # Vector-valued frames: draw as an image with time on the x axis.
                extent = [start_time, current_time, 0, 1]
                axis.imshow(np.swapaxes(np.array(value), 0, 1), cmap=plt.get_cmap('hot'), aspect='auto', origin='lower', interpolation=None, extent=extent, vmin=0)
            else:
                # Scalar-valued frames: draw as a line over time.
                x = [start_time + self.frame_duration * j for j in range(len(value))]
                axis.plot(x, value, 'b-')
                # Not every characteristic object carries a `kwargs` dict;
                # treat a missing one as "no options".
                thr = getattr(v, 'kwargs', {}).get('threshold', None)
                if thr is not None:
                    axis.plot([x[0], x[-1]], [thr, thr], 'r-')
            axis.set_xlabel("time, s")
            axis.set_ylabel(v.name)

        # Last panel: the recorded voice/silence decisions.
        vad_axis = self.ax[-1]
        vad_axis.cla()
        vad_axis.set_xlabel('voice (=1), silence(=0)')
        value = self.silence.values[frame_start:frame_end]
        vad_axis.plot([start_time + self.frame_duration * j for j in range(len(value))], value, 'g-')

        self.fig.canvas.draw()
        self.fig.canvas.flush_events()

In [None]:
import pyaudio
import time
import wave


# --- Configuration -----------------------------------------------------------
DURATION = 10            # seconds of audio to capture
PLOT_EVERY = 1           # seconds between figure refreshes
ESTIMATION_TIME = 0.3
MAX_FREQ = 8000          # upper frequency passed to CUTFFT
VOICE_FREQS = (100, 1500)

# --- Noise profile -----------------------------------------------------------
# Read the reference noise recording once and close the file right away.
with wave.open("noise.wav", 'rb') as noise_wav:
    noise_data = noise_wav.readframes(-1)
# Decode the noise clip a single time; it is the same for every frame.
noise_clip = np.array(np.frombuffer(noise_data, dtype=np.int16), dtype='float32')

silence = Silence(agr=3, chars=[Characteristic("FFT")])
plot = RTPlot(silence, frame_duration=FRAME_DURATION, max_duration=10)

# Guard against a zero modulus when PLOT_EVERY is shorter than one frame.
frames_per_plot = max(1, int(PLOT_EVERY/FRAME_DURATION))

# --- Capture loop ------------------------------------------------------------
p=pyaudio.PyAudio()
try:
    stream=p.open(format=pyaudio.paInt16, channels=1, rate=SAMPLE_RATE, input=True, frames_per_buffer=FRAME_WIDTH)
    try:
        # The context manager closes tmp.wav even if the loop raises, which
        # is what finalizes the WAV header.
        with wave.open("tmp.wav", "wb") as wavefile:
            wavefile.setnchannels(1)
            wavefile.setsampwidth(p.get_sample_size(pyaudio.paInt16))
            wavefile.setframerate(SAMPLE_RATE)

            for i in range(int(DURATION/FRAME_DURATION)):
                current_time = i * FRAME_DURATION
                rawdata = stream.read(FRAME_WIDTH)

                wavefile.writeframes(rawdata)
                data = np.array(np.frombuffer(rawdata, dtype=np.int16), dtype='float32')
                reduced_voice = noisereduce.reduce_noise(audio_clip=data, noise_clip=noise_clip, verbose=False)
                # NOTE(review): prints one array per frame; consider removing
                # once the noise-reduced signal is consumed elsewhere.
                print(reduced_voice)
                fft = FFT(data)
                signal = CUTFFT(fft, MAX_FREQ)

                silence.is_voice(rawdata)
                for c in silence.chars:
                    c.add_frame(signal)

                if i % frames_per_plot == 0:
                    plot.plot(current_time)
    finally:
        # Always release the input stream, even on error or interrupt.
        stream.stop_stream()
        stream.close()
finally:
    p.terminate()