In [2]:
!pip install -r requirement.txt



In [4]:
# Imports and global configuration for the voice-authentication notebook.
# NOTE(review): tensorflow, keras.models.load_model, glob, genfromtxt, Audio
# and display are not referenced by the cells visible here — confirm whether
# other cells need them before pruning.
import tensorflow as tf
import numpy as np
import os
import glob
import pickle
import pyaudio
import time
from numpy import genfromtxt
from keras import backend as K
from keras.models import load_model
import sys
    
# Tell the Keras backend to use the channels-first image data format.
K.set_image_data_format('channels_first')
# np.set_printoptions(threshold=np.nan)
# Effectively disable NumPy's summarisation ("...") when printing large arrays.
np.set_printoptions(threshold=sys.maxsize)

# NOTE(review): pyaudio is already imported above; this second import is redundant.
import pyaudio
from IPython.display import Audio, display, clear_output
import wave
from scipy.io.wavfile import read

# GaussianMixture is aliased to GMM so the rest of the notebook can keep
# using the older class name (see the commented-out legacy imports below).
from sklearn.mixture import GaussianMixture as GMM
# from sklearn.mixture import GMM 
# from sklearn import mixture

import warnings

# Silences every warning for the whole kernel session.
warnings.filterwarnings("ignore")

from sklearn import preprocessing
# for converting audio to mfcc
import python_speech_features as mfcc

In [5]:
def calculate_delta(array):
    """Compute first-order delta (differential) features for every frame.

    For frame ``i`` the delta is::

        ((a[i+1] - a[i-1]) + 2 * (a[i+2] - a[i-2])) / 10

    Neighbour indices that fall outside ``[0, rows - 1]`` are clamped to the
    nearest valid frame, so edge frames reuse the first/last row.

    Args:
        array: 2-D numpy array of shape ``(rows, cols)``, one feature vector
            per row.

    Returns:
        Float numpy array of shape ``(rows, cols)`` with the delta of each
        feature. The output width follows the input width (it was previously
        hard-coded to 20, which broke any input that was not 20 columns wide).
    """
    rows, cols = array.shape
    frame = np.arange(rows)
    last = rows - 1

    # Clamped neighbour indices at distance 1 and 2 from each frame.
    prev1 = np.maximum(frame - 1, 0)
    next1 = np.minimum(frame + 1, last)
    prev2 = np.maximum(frame - 2, 0)
    next2 = np.minimum(frame + 2, last)

    deltas = np.zeros((rows, cols))
    # Same arithmetic order as the per-row formula above; 10 == 2 * (1**2 + 2**2).
    deltas[:] = (
        (array[next1] - array[prev1]) + (2 * (array[next2] - array[prev2]))
    ) / 10
    return deltas

#convert audio to mfcc features
def extract_features(audio,rate):
    """Turn an audio signal into scaled MFCCs stacked with their deltas.

    Args:
        audio: Sample array as returned by ``scipy.io.wavfile.read``.
        rate: Sampling rate of ``audio`` in Hz.

    Returns:
        2-D array with one row per analysis frame: the scaled cepstral
        coefficients followed, column-wise, by their deltas.
    """
    # Positional arguments 0.025, 0.01 and 20 are presumably winlen (s),
    # winstep (s) and numcep of python_speech_features.mfcc — verify against
    # the installed version.
    # NOTE(review): callers in this notebook record with CHANNELS = 2, so
    # `audio` is likely 2-D here — confirm mfcc treats that as intended.
    cepstra = preprocessing.scale(
        mfcc.mfcc(audio, rate, 0.025, 0.01, 20, appendEnergy=True, nfft=1103)
    )

    # Place each frame's deltas to the right of its (scaled) coefficients.
    return np.hstack((cepstra, calculate_delta(cepstra)))

In [12]:
def _record_clip(wav_path, sample_format, channels, rate, chunk, seconds):
    """Record ``seconds`` of microphone input and save it to ``wav_path``.

    The PyAudio instance and its input stream are always released, even when
    reading from the device fails or the kernel is interrupted mid-recording.
    """
    audio = pyaudio.PyAudio()
    try:
        sample_width = audio.get_sample_size(sample_format)
        stream = audio.open(format=sample_format, channels=channels,
                            rate=rate, input=True,
                            frames_per_buffer=chunk)
        try:
            print("recording...")
            frames = [stream.read(chunk)
                      for _ in range(int(rate / chunk * seconds))]
        finally:
            stream.stop_stream()
            stream.close()
    finally:
        audio.terminate()

    with wave.open(wav_path, 'wb') as wave_file:
        wave_file.setnchannels(channels)
        wave_file.setsampwidth(sample_width)
        wave_file.setframerate(rate)
        wave_file.writeframes(b''.join(frames))


def add_user():
    """Enrol a new speaker: record three clips and train a GMM on all of them.

    Prompts for a name, records three 3-second clips into
    ``./voice_database/<name>/{1,2,3}.wav``, extracts MFCC+delta features from
    every clip, fits a 16-component diagonal-covariance Gaussian mixture on
    the stacked features and pickles it to ``./gmm_models/<name>.GMM``.

    Returns early (after printing a message) when the name is empty, is not a
    plain file name, or is already enrolled.

    Returns:
        None.
    """
    FORMAT = pyaudio.paInt16
    CHANNELS = 2
    RATE = 44100
    CHUNK = 1024
    RECORD_SECONDS = 3
    NUM_CLIPS = 3

    name = input("Enter Name:").strip()

    # The name is used as a directory and file name, so reject anything that
    # is empty or could point outside the database folders (e.g. "../x").
    if not name or name in (os.curdir, os.pardir) or os.path.basename(name) != name:
        print("Invalid Name! Try Another Name...")
        return

    source = "./voice_database/" + name
    dest = "./gmm_models/"
    model_path = dest + name + '.GMM'
    db_path = './voice_database/embeddings.pickle'

    # check for existing database
    if os.path.exists(db_path):
        # NOTE: unpickling executes arbitrary code contained in the file —
        # only load a database that was created locally.
        with open(db_path, 'rb') as database:
            db = pickle.load(database)
        if name in db:
            print("Name Already Exists! Try Another Name...")
            return

    # Nothing in this function updates the pickle database, so also treat an
    # existing recordings folder or model file as "already enrolled" instead
    # of crashing in mkdir or silently overwriting the model.
    if os.path.exists(source) or os.path.exists(model_path):
        print("Name Already Exists! Try Another Name...")
        return

    os.makedirs(source)
    os.makedirs(dest, exist_ok=True)

    clip_paths = []
    for i in range(NUM_CLIPS):
        if i == 0:
            # Countdown 3..0, one line per second, replacing the previous one.
            for remaining in range(3, -1, -1):
                time.sleep(1.0)
                print("Speak your name in {} seconds".format(remaining))
                clear_output(wait=True)
        elif i == 1:
            print("Speak your name one more time")
            time.sleep(0.5)
        else:
            print("Speak your name one last time")
            time.sleep(0.5)

        # saving wav file of speaker
        clip_path = source + '/' + str(i + 1) + '.wav'
        _record_clip(clip_path, FORMAT, CHANNELS, RATE, CHUNK, RECORD_SECONDS)
        clip_paths.append(clip_path)
        print("Done")

    # Extract features from every recorded clip. They are collected first and
    # stacked once so the model is trained on all clips (previously the
    # accumulator was reset per file, so only the last clip was used).
    clip_features = []
    for clip_path in clip_paths:
        sr, signal = read(clip_path)
        clip_features.append(extract_features(signal, sr))
    features = np.vstack(clip_features)

    gmm = GMM(n_components = 16, max_iter=200,covariance_type='diag',n_init = 3)
    gmm.fit(features)

    # saving the trained gaussian model
    with open(model_path, 'wb') as model_file:
        pickle.dump(gmm, model_file)
    print(name + ' added successfully')

# Run the enrolment flow when this cell (or the exported script) is executed
# as the main module.
if __name__ == "__main__":
    add_user()

recording...
Done
Speak your name one more time
recording...
Done
Speak your name one last time
recording...
Done
c added successfully


In [13]:
def _load_speaker_models(modelpath):
    """Load every ``*.GMM`` pickle found in ``modelpath``.

    Returns:
        ``(speakers, models)``: parallel lists holding each speaker's name
        (the file name without its ``.GMM`` suffix) and the unpickled model.
        Both lists are empty when the directory is missing or contains no
        model files.
    """
    suffix = '.GMM'
    speakers, models = [], []
    if not os.path.isdir(modelpath):
        return speakers, models

    for fname in sorted(os.listdir(modelpath)):
        if not fname.endswith(suffix):
            continue
        # NOTE: unpickling executes arbitrary code contained in the file —
        # only keep model files that were created locally in this folder.
        with open(os.path.join(modelpath, fname), 'rb') as model_file:
            models.append(pickle.load(model_file))
        # Strip only the trailing suffix so names containing ".GMM" survive.
        speakers.append(fname[:-len(suffix)])
    return speakers, models


def recognize():
    """Continuously record 3-second clips and print the best-matching speaker.

    The enrolled models in ``./gmm_models/`` are loaded once up front; if
    there are none, "No Users Authorized!" is printed and the function
    returns without recording. Each iteration records to ``./test.wav``,
    extracts features, scores them against every model and reports the
    speaker with the highest score. A speaker literally named ``unknown`` is
    reported as "Not Recognized!".

    The loop runs until the kernel is interrupted (``KeyboardInterrupt``);
    the audio device is released even when the interrupt arrives while
    recording.

    Returns:
        None.
    """
    # Voice Authentication
    FORMAT = pyaudio.paInt16
    CHANNELS = 2
    RATE = 44100
    CHUNK = 1024
    RECORD_SECONDS = 3
    FILENAME = "./test.wav"

    speakers, models = _load_speaker_models("./gmm_models/")
    if len(models) == 0:
        print("No Users Authorized!")
        return

    try:
        while True:
            recorder = pyaudio.PyAudio()
            try:
                sample_width = recorder.get_sample_size(FORMAT)

                # start Recording
                stream = recorder.open(format=FORMAT, channels=CHANNELS,
                                       rate=RATE, input=True,
                                       frames_per_buffer=CHUNK)
                try:
                    print("recording...")
                    frames = [stream.read(CHUNK)
                              for _ in range(int(RATE / CHUNK * RECORD_SECONDS))]
                    print("finished recording")
                finally:
                    # stop Recording
                    stream.stop_stream()
                    stream.close()
            finally:
                recorder.terminate()

            # saving wav file
            with wave.open(FILENAME, 'wb') as wave_file:
                wave_file.setnchannels(CHANNELS)
                wave_file.setsampwidth(sample_width)
                wave_file.setframerate(RATE)
                wave_file.writeframes(b''.join(frames))

            # read test file
            sr, signal = read(FILENAME)

            # extract mfcc features
            vector = extract_features(signal, sr)

            # Score the clip against each model. np.sum keeps this working
            # whether score() returns one number or per-sample values.
            log_likelihood = np.array(
                [np.sum(gmm.score(vector)) for gmm in models]
            )
            identity = speakers[np.argmax(log_likelihood)]

            # if voice not recognized then record again
            if identity == 'unknown':
                print("Not Recognized! Try again...")
                continue

            print( "Recognized as - ", identity)
    except KeyboardInterrupt:
        print("Stopped")

In [None]:
# Start the continuous recognition loop; it keeps recording until the kernel
# is interrupted (KeyboardInterrupt) or no enrolled models are found.
recognize()

recording...
finished recording
Recognized as -  c
recording...
finished recording
Recognized as -  c
recording...
finished recording
Recognized as -  c
recording...
finished recording
Recognized as -  c
recording...
finished recording
Recognized as -  c
recording...
