In [127]:
import os
import librosa
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [128]:
DATA_PATH_TRAIN = "C:/Users/KB/OneDrive/Desktop/Identify-your-Own-Voice-main/dataset/train"
DATA_PATH_TEST = "C:/Users/KB/OneDrive/Desktop/Identify-your-Own-Voice-main/dataset/test"
SAMPLE_RATE = 22050
MFCC_COUNT = 13

In [143]:
def extract_mfccs(file_path):
    audio, _ = librosa.load(file_path, sr=SAMPLE_RATE, mono=True)
    mfccs = librosa.feature.mfcc(y=audio, sr=SAMPLE_RATE, n_mfcc=MFCC_COUNT)
    return np.mean(mfccs.T, axis=0)




In [144]:
def load_data(data_path):
    mfccs = []
    labels = []
    for label in ["human", "other"]:
        folder_path = os.path.join(data_path, label)
        for filename in os.listdir(folder_path):
            file_path = os.path.join(folder_path, filename)
            mfcc = extract_mfccs(file_path)
            mfccs.append(mfcc)
            labels.append(label)
    return np.array(mfccs), np.array(labels)


In [145]:
def load_test_data(data_path):
    mfccs = []
    filenames = []
    for filename in os.listdir(data_path):
        file_path = os.path.join(data_path, filename)
        mfcc = extract_mfccs(file_path)
        mfccs.append(mfcc)
        filenames.append(filename)
    return np.array(mfccs), filenames


In [146]:
# Load the training data
X, y = load_data(DATA_PATH_TRAIN)

# Convert labels from text to integers
le = LabelEncoder()
y = le.fit_transform(y)


  audio, _ = librosa.load(file_path, sr=SAMPLE_RATE, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


In [147]:
# Divide the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [148]:
# # Define the model
# model = Sequential()
# model.add(Dense(32, activation='relu', input_shape=(MFCC_COUNT,)))
# model.add(Dense(1, activation='sigmoid'))

# Build a simple model
model = Sequential([
    Dense(100, activation='selu', kernel_initializer="lecun_normal", input_shape=(MFCC_COUNT,)),
    Dense(50, activation='selu', kernel_initializer="lecun_normal"),
    Dense(1, activation='sigmoid'),
])


In [149]:
# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [150]:
# Define EarlyStopping
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)


In [151]:
# Train the model
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=32, callbacks=[es])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1fe4dd3dae0>

In [152]:
# Make predictions
y_val_pred_prob = model.predict(X_val)
y_val_pred = (y_val_pred_prob > 0.5).astype("int32")

# Calculate the confusion matrix
cm = confusion_matrix(y_val, y_val_pred)
print("Confusion Matrix:")
print(cm)

# Calculate the classification report
cr = classification_report(y_val, y_val_pred)
print("Classification Report:")
print(cr)

# Evaluate the model
loss, accuracy = model.evaluate(X_val, y_val, verbose=0)
print("Model Accuracy: {:.2f}%".format(accuracy*100))
print("Model Loss: {:.2f}".format(loss))


Confusion Matrix:
[[ 292   33]
 [  98 1304]]
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.90      0.82       325
           1       0.98      0.93      0.95      1402

    accuracy                           0.92      1727
   macro avg       0.86      0.91      0.88      1727
weighted avg       0.93      0.92      0.93      1727

Model Accuracy: 92.41%
Model Loss: 0.22


In [154]:
def predict_audio_class(audio_file_path, model, le):
    mfcc = extract_mfccs(audio_file_path)
    mfcc = np.expand_dims(mfcc, axis=0)  # because the model expects 2D array
    prediction_prob = model.predict(mfcc)
    prediction = (prediction_prob > 0.5).astype("int32")
    prediction_label = le.inverse_transform(prediction)[0]
    return prediction_label


In [155]:
# Make prediction on a single file
audio_file_path = "C:/Users/KB/OneDrive/Desktop/Identify-your-Own-Voice-main/dataset/test/testaudio (1987).wav"  # Adjust if necessary
prediction = predict_audio_class(audio_file_path, model, le)
print(f"The audio is predicted as: {prediction}")

The audio is predicted as: other


  y = column_or_1d(y, warn=True)


In [153]:
model.save ("./my_model/ASR")



INFO:tensorflow:Assets written to: ./my_model/ASR\assets


INFO:tensorflow:Assets written to: ./my_model/ASR\assets
