In [23]:
import os
import librosa
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import soundfile as sf


In [24]:
DATA_PATH_TRAIN = './dataset/train'
DATA_PATH_TEST = './dataset/test'

SAMPLE_RATE = 22050
MFCC_COUNT = 40
TOP_DB = 20


In [25]:
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift

def remove_silence(audio, sr, top_db=TOP_DB):
    y_trimmed, _ = librosa.effects.trim(audio, top_db=top_db)
    return y_trimmed

def augment_audio(audio, sr):
    augmenter = Compose([
        AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.3),
        TimeStretch(min_rate=0.8, max_rate=1.25, p=0.3),
        PitchShift(min_semitones=-4, max_semitones=4, p=0.3),
        Shift(min_fraction=-0.5, max_fraction=0.5, p=0.3),
    ])
    audio_augmented = augmenter(samples=audio, sample_rate=sr)
    return audio_augmented

def extract_features(audio, sr):
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=MFCC_COUNT)
    chroma_stft = librosa.feature.chroma_stft(y=audio, sr=sr)
    spectral_contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)
    tonnetz = librosa.feature.tonnetz(y=audio, sr=sr)
    return np.concatenate((np.mean(mfccs.T, axis=0), 
                           np.mean(chroma_stft.T, axis=0),
                           np.mean(spectral_contrast.T, axis=0),
                           np.mean(tonnetz.T, axis=0)))


In [26]:
def load_data(data_path, augment=False):
    features = []
    labels = []
    for label in ["human", "other"]:
        folder_path = os.path.join(data_path, label)
        for filename in os.listdir(folder_path):
            if not (filename.endswith('.mp3') or filename.endswith('.wav') or filename.endswith('.flac')):
                continue
            file_path = os.path.join(folder_path, filename)
            audio, _ = librosa.load(file_path, sr=SAMPLE_RATE, mono=True)
            audio = remove_silence(audio, SAMPLE_RATE)
            if augment:
                audio = augment_audio(audio, SAMPLE_RATE)
            feature = extract_features(audio, SAMPLE_RATE)
            features.append(feature)
            labels.append(label)
    return np.array(features), np.array(labels)


In [27]:
X, y = load_data(DATA_PATH_TRAIN, augment=True)

le = LabelEncoder()
y = le.fit_transform(y)


  return pitch_tuning(


In [33]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [34]:
model = Sequential([
    Dense(100, activation='relu', kernel_initializer="he_normal", input_shape=(X_train.shape[1],)),
    Dense(50, activation='relu', kernel_initializer="he_normal"),
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [35]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=50, batch_size=32, callbacks=[es])


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 40: early stopping


<keras.callbacks.History at 0x1bc2e7bd240>

In [36]:
y_val_pred_prob = model.predict(X_val)
y_val_pred = (y_val_pred_prob > 0.5).astype("int32")

cm = confusion_matrix(y_val, y_val_pred)
print("Confusion Matrix:")
print(cm)

cr = classification_report(y_val, y_val_pred)
print("Classification Report:")
print(cr)

loss, accuracy = model.evaluate(X_val, y_val, verbose=0)
print("Model Accuracy: {:.2f}%".format(accuracy*100))
print("Model Loss: {:.2f}".format(loss))


Confusion Matrix:
[[ 309   14]
 [  38 1365]]
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.96      0.92       323
           1       0.99      0.97      0.98      1403

    accuracy                           0.97      1726
   macro avg       0.94      0.96      0.95      1726
weighted avg       0.97      0.97      0.97      1726

Model Accuracy: 96.99%
Model Loss: 0.11


In [37]:
def predict_audio_class(audio_file_path, model, le):
    audio, _ = librosa.load(audio_file_path, sr=SAMPLE_RATE, mono=True)
    audio = remove_silence(audio, SAMPLE_RATE)
    feature = extract_features(audio, SAMPLE_RATE)
    feature = np.expand_dims(feature, axis=0)  # because the model expects 2D array
    prediction_prob = model.predict(feature)
    prediction = (prediction_prob > 0.5).astype("int32")
    prediction_label = le.inverse_transform(prediction)[0]
    return prediction_label


In [43]:
audio_file_path = "./dataset/test/roy.wav"  # Adjust if necessary
prediction = predict_audio_class(audio_file_path, model, le)
print(f"The audio is predicted as: {prediction}")


The audio is predicted as: human


  y = column_or_1d(y, warn=True)


In [46]:
import joblib

# Save your trained model
model.save("./model-human-speech-detection/model.h5")

# Save your label encoder
joblib.dump(le, "./model-human-speech-detection/label_encoder.joblib")

['./model-human-speech-detection/label_encoder.joblib']