In [13]:
import pandas as pd
import numpy as np
import os
import tqdm
import pickle
import librosa
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout,Conv1D
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping
from sklearn.model_selection import train_test_split

In [14]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Building the Model**

In [15]:
def create_model(vector_length=128):
    """Build and compile the feed-forward binary classifier.

    The network consumes a flat feature vector of ``vector_length`` values and
    stacks five Dense layers (256, 256, 128, 128, 64 units), each followed by
    a Dropout of 0.3, before a single sigmoid output unit.

    Args:
        vector_length: Number of input features per sample.

    Returns:
        A compiled ``Sequential`` model using binary cross-entropy loss, the
        Adam optimizer and the accuracy metric.
    """
    dropout_rate = 0.3
    # Widths of the ReLU-activated layers that follow the input layer.
    relu_layer_units = (256, 128, 128, 64)

    net = Sequential()

    # Input layer: no activation is specified here, unlike the layers below.
    net.add(Dense(256, input_shape=(vector_length,)))
    net.add(Dropout(dropout_rate))

    for units in relu_layer_units:
        net.add(Dense(units, activation="relu"))
        net.add(Dropout(dropout_rate))

    # Single sigmoid unit -> one probability per sample.
    net.add(Dense(1, activation="sigmoid"))

    net.compile(loss="binary_crossentropy", metrics=["accuracy"], optimizer="adam")
    return net

**Testing with voice**

In [16]:
def extract_feature(file_name, **kwargs):
    """Extract a 1-D feature vector from an audio file.

    Every enabled feature is computed with librosa, averaged over its time
    frames and appended to the result in this fixed order:
    mfcc, chroma, mel, contrast, tonnetz.

    Args:
        file_name: Path of the audio file, loaded with ``librosa.load``
            (librosa's default sample rate is used).
        **kwargs: Truthy switches selecting the features to compute:
            ``mfcc``, ``chroma``, ``mel``, ``contrast``, ``tonnetz``.
            Switches that are absent are treated as disabled.

    Returns:
        A 1-D ``numpy.ndarray`` holding the concatenated, time-averaged
        features; empty when no switch is enabled.
    """
    use_mfcc = kwargs.get("mfcc")
    use_chroma = kwargs.get("chroma")
    use_mel = kwargs.get("mel")
    use_contrast = kwargs.get("contrast")
    use_tonnetz = kwargs.get("tonnetz")

    X, sample_rate = librosa.load(file_name)

    # The magnitude STFT is only needed by the chroma and contrast features.
    if use_chroma or use_contrast:
        stft = np.abs(librosa.stft(X))

    result = np.array([])
    if use_mfcc:
        mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
        result = np.hstack((result, mfccs))
    if use_chroma:
        chroma_mean = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
        result = np.hstack((result, chroma_mean))
    if use_mel:
        # The signal must be passed as ``y=``: recent librosa releases make the
        # audio argument keyword-only, so a positional call raises TypeError.
        mel_mean = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
        result = np.hstack((result, mel_mean))
    if use_contrast:
        contrast_mean = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
        result = np.hstack((result, contrast_mean))
    if use_tonnetz:
        tonnetz_mean = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T, axis=0)
        result = np.hstack((result, tonnetz_mean))
    return result

In [17]:


import os
import librosa

def resize_spectrogram(spec, length, fact=-80):
    """Pad or crop a 2-D spectrogram to exactly ``length`` columns.

    Args:
        spec: 2-D array; the second axis is the one being resized.
        length: Number of columns in the returned array.
        fact: Fill value used for columns beyond the end of ``spec``.

    Returns:
        A new float array of shape ``(len(spec), length)``. The leading
        columns are copied from ``spec``; if ``spec`` is narrower than
        ``length`` the remainder is filled with ``fact``, and if it is wider
        the extra columns are dropped.
    """
    n_rows = len(spec)
    n_cols = spec.shape[1]

    # Start from a canvas made entirely of the fill value.
    resized = np.full((n_rows, length), fact, dtype=float)

    # Copy as many leading columns as fit; this covers both padding and cropping.
    keep = min(n_cols, length)
    resized[:, :keep] = spec[:, :keep]
    return resized

def compute_mel_spec(filename, sr=16_000, hop_length=512, duration=3.0):
    """Compute a fixed-width dB mel spectrogram and its per-band mean.

    Args:
        filename: Path of the audio file to load.
        sr: Sample rate the audio is resampled to on load.
        hop_length: Hop size (in samples) between spectrogram frames. It is
            used both for the spectrogram itself and for converting
            ``duration`` into a frame count, so the two stay consistent.
        duration: Length in seconds that the returned spectrogram is padded
            or cropped to.

    Returns:
        A tuple ``(x_mel, mel_strength)`` where ``x_mel`` is the dB-scaled mel
        spectrogram resized to ``int(duration * sr / hop_length)`` frames
        (padded with -80) and ``mel_strength`` is the mean of the *unresized*
        dB spectrogram over its frame axis (one value per mel band).
    """
    # Load (and resample) the audio.
    y, sr = librosa.load(filename, sr=sr)

    # Mel power spectrogram; hop_length is passed explicitly so that a
    # non-default value matches the frame count computed below.
    x_mel = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=hop_length)

    # Convert power to decibels relative to the spectrogram's maximum.
    x_mel = librosa.power_to_db(x_mel, ref=np.max)

    # Average over frames before resizing, so padding does not skew the mean.
    mel_strength = np.mean(x_mel, axis=1)

    # Number of frames corresponding to the requested duration.
    length = int(duration * sr / hop_length)

    # Pad with -80 or crop to the fixed number of frames.
    x_mel = resize_spectrogram(x_mel, length, fact=-80)

    return x_mel, mel_strength

In [18]:
# Default audio path read by detect_audio() through the `file` global.
file="audio.wav"

# Rebuild the network architecture, then restore its trained weights.
model = create_model()
model.load_weights("/content/drive/MyDrive/ML project/model.h5")

# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load model files that come from a trusted source.
# The context manager makes sure the file handle is closed after loading.
with open("/content/drive/MyDrive/ML project/finalized_model_10000_valid.sav", 'rb') as model_file:
    Loaded_model = pickle.load(model_file)

In [19]:
def detect_audio(audio_path=None):
    """Print the predicted gender and age group for an audio clip.

    The gender estimate comes from the global Keras ``model`` applied to the
    time-averaged mel features of the clip; its single output is interpreted
    as the probability of "male". The age group comes from
    ``Loaded_model.best_estimator_`` applied to the per-band mel strengths
    returned by ``compute_mel_spec``.

    Args:
        audio_path: Path of the clip to analyse. When omitted (the default),
            the module-level ``file`` variable is used, which keeps existing
            ``detect_audio()`` calls working unchanged.

    Returns:
        None. Results are printed: the gender and its probabilities, the raw
        age-class prediction, and the matching age label (nothing is printed
        for a class outside 0-8).
    """
    path = file if audio_path is None else audio_path

    # --- Gender -----------------------------------------------------------
    features = extract_feature(path, mel=True).reshape(1, -1)

    male_prob = model.predict(features)[0][0]
    female_prob = 1 - male_prob
    gender = "male" if male_prob > female_prob else "female"

    print("Result:", gender)
    print(f"Probabilities::: Male: {male_prob*100:.2f}%    Female: {female_prob*100:.2f}%")

    # --- Age group --------------------------------------------------------
    # Only the per-band mean (second return value) is fed to the classifier.
    _, mel_strength = compute_mel_spec(path, sr=16_000)
    best_clf = Loaded_model.best_estimator_

    # Predict once and reuse the result for both the raw and labelled output.
    value = best_clf.predict([mel_strength])
    print(value)

    age_labels = {
        0: "Teen",
        1: "Twenties",
        2: "Thirties",
        3: "Fourties",
        4: "Fifties",
        5: "Sixties",
        6: "Seventies",
        7: "Eighties",
        8: "Ninties",
    }
    label = age_labels.get(value[0])
    if label is not None:
        print(label)

In [22]:
# Point the `file` global (read by detect_audio) at a sample clip on Drive,
# then run both classifiers on it.
file = "/content/drive/MyDrive/ML project/countdwn-23609.mp3"
detect_audio()



Result: male
Probabilities::: Male: 96.64%    Female: 3.36%




[4]
Fifties


In [20]:
# all imports
from IPython.display import Javascript
from google.colab import output
from base64 import b64decode

# JavaScript injected into the notebook output to capture microphone audio.
# It defines a global `record(time)` that records for `time` milliseconds and
# resolves with the recording encoded as a base64 data URL.
RECORD = """
const sleep  = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader()
  reader.onloadend = () => resolve(reader.result)
  reader.readAsDataURL(blob)
})
var record = time => new Promise(async resolve => {
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  const recorder = new MediaRecorder(stream)
  const chunks = []
  recorder.ondataavailable = e => chunks.push(e.data)
  recorder.onstop = async () => {
    // Release the microphone so the browser stops capturing after recording.
    stream.getTracks().forEach(track => track.stop())
    const blob = new Blob(chunks)
    const text = await b2text(blob)
    resolve(text)
  }
  recorder.start()
  await sleep(time)
  recorder.stop()
})
"""

def record(sec=3):
    """Record audio from the browser microphone and save it to disk.

    Injects the ``RECORD`` JavaScript into the cell output, asks it to record
    for ``sec`` seconds, decodes the base64 payload of the returned data URL
    and writes the bytes to ``audio.wav`` in the working directory.

    Args:
        sec: Recording length in seconds.

    Returns:
        The path of the written file, ``'audio.wav'``.
    """
    print("Start Speaking")
    display(Javascript(RECORD))

    # The JavaScript side expects the duration in milliseconds.
    duration_ms = sec * 1000
    data_url = output.eval_js('record(%d)' % duration_ms)

    # A data URL looks like "<header>,<base64 payload>"; keep the payload only.
    encoded_audio = data_url.split(',')[1]
    audio_bytes = b64decode(encoded_audio)

    out_path = 'audio.wav'
    with open(out_path, 'wb') as audio_file:
        audio_file.write(audio_bytes)
    return out_path

In [21]:
# Bind the recording's path to the `file` global that detect_audio() reads, so
# the fresh recording is analysed even if `file` was repointed by another cell.
file = record()
detect_audio()

Start Speaking


<IPython.core.display.Javascript object>



Result: female
Probabilities::: Male: 1.01%    Female: 98.99%
[0]
Teen


