# Speaker Recognition with Naïve Bayes and Bagging Ensemble
This notebook performs speaker recognition using audio data from two speakers: **George** and **Jackson**. It includes:
- Audio preprocessing using `librosa`
- Feature extraction (per-file means of 13 MFCCs, spectral rolloff, and zero-crossing rate, ZCR)
- Implementation of a **Naïve Bayes classifier from scratch**
- Evaluation against `scikit-learn`'s `GaussianNB`
- Implementation of **Bagging Ensemble** with Naïve Bayes and Logistic Regression

# Import Libraries

In [19]:
import os
import librosa
import numpy as np
import pandas as pd
import librosa.effects
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from collections import Counter
from sklearn.naive_bayes import GaussianNB


In [5]:
# Root directory that holds one sub-folder of recordings per speaker.
# Set the SPEAKER_DATA_DIR environment variable to point at your own copy of the data;
# the fallback is the original hard-coded location, so existing setups keep working.
DATA_DIR = os.environ.get(
    'SPEAKER_DATA_DIR',
    '/Users/hassa/Desktop/GDG/Supervised-Learning/Task_3',
)
george_folder = os.path.join(DATA_DIR, 'george')
jackson_folder = os.path.join(DATA_DIR, 'jackson')

# Audio Preprocessing

In [6]:
def preprocess_audio(file_path):
    """Load one audio file and return it trimmed and normalized.

    The file is read with ``librosa.load`` (default arguments), passed through
    ``librosa.effects.trim`` and then ``librosa.util.normalize``.

    Parameters
    ----------
    file_path : path of the audio file to read.

    Returns
    -------
    (signal, sr) on success, where ``sr`` is the sample rate reported by
    ``librosa.load``. If any step raises, the error is printed and
    ``(None, None)`` is returned instead of propagating the exception.
    """
    try:
        raw_audio, sample_rate = librosa.load(file_path)
        # trim() returns (trimmed_signal, index_interval); only the signal is needed.
        trimmed_audio = librosa.effects.trim(raw_audio)[0]
        normalized_audio = librosa.util.normalize(trimmed_audio)
    except Exception as e:
        # Best-effort: report the unreadable file and let the caller skip it.
        print(f"Error processing {file_path}: {e}")
        return None, None
    return normalized_audio, sample_rate


# Feature Extraction

In [7]:
def extract_features(file_path):
    """Turn one audio file into a fixed-length feature vector.

    The vector is the concatenation of:
      * the mean over frames of each of 13 MFCC coefficients,
      * the mean spectral rolloff,
      * the mean zero-crossing rate.

    Parameters
    ----------
    file_path : path of the audio file to process.

    Returns
    -------
    1-D NumPy array (13 MFCC means + 2 scalars), or ``None`` when
    ``preprocess_audio`` could not load the file.
    """
    audio, sample_rate = preprocess_audio(file_path)
    if audio is None:
        return None

    # Frame-level descriptors; each is collapsed to a per-file average below.
    mfcc_frames = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=13)
    rolloff_frames = librosa.feature.spectral_rolloff(y=audio, sr=sample_rate)[0]
    zcr_frames = librosa.feature.zero_crossing_rate(y=audio)[0]

    return np.hstack([
        np.mean(mfcc_frames.T, axis=0),
        rolloff_frames.mean(),
        zcr_frames.mean(),
    ])


# Load and Process Data

In [8]:
# Build the feature table: one row of features per audio file that could be loaded.
data = []
labels = []

# Both speaker folders are handled identically, so a single loop covers them.
for folder, speaker in ((george_folder, 'george'), (jackson_folder, 'jackson')):
    # os.listdir() returns entries in arbitrary order; sorting makes the row order
    # (and therefore everything downstream) independent of the filesystem.
    for filename in sorted(os.listdir(folder)):
        file_path = os.path.join(folder, filename)
        if not os.path.isfile(file_path):
            continue
        features = extract_features(file_path)
        # extract_features returns None for files that failed to load; skip those.
        if features is not None:
            data.append(features)
            labels.append(speaker)

feature_columns = [f"mfcc_{i+1}" for i in range(13)] + ["spectral_rolloff", "zero_crossing_rate"]

# Naming the columns at construction time also keeps this cell from failing
# with a column-count mismatch when no file could be loaded.
df = pd.DataFrame(data, columns=feature_columns)
df['label'] = labels

# Shuffle with a fixed seed so that re-running the notebook yields the same row order,
# and hence the same train/test split and metrics.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

X = df.drop(columns='label')
y = df['label']

df.head()


Unnamed: 0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,mfcc_13,spectral_rolloff,zero_crossing_rate,label
0,-246.137878,216.687088,-38.796513,8.815818,13.374378,-47.711304,-13.927596,-8.828659,-29.201393,-2.4096,-1.546351,-13.657865,-7.672847,1478.991057,0.044742,jackson
1,-272.25119,166.416351,-99.52153,16.647449,36.567478,-48.509678,-21.056341,-36.550667,-49.181263,7.645572,-18.115288,-42.182991,4.148391,2659.350586,0.083696,george
2,-251.647247,160.244904,-44.033691,10.318113,5.280222,-44.314095,4.977806,-4.780592,-45.944427,-14.767281,-19.791658,-44.414085,-17.324881,1939.334106,0.037842,george
3,-274.095917,175.837616,-111.480713,13.345282,32.131676,-52.038898,-15.786862,-38.824608,-55.34314,11.196049,-17.315891,-47.920212,5.41144,2849.919434,0.104761,george
4,-193.464005,186.775986,-85.782654,-2.21306,-1.627923,-74.069504,-9.857377,-6.227119,-53.334595,-18.015398,-14.073372,-32.152668,-5.531947,2535.534668,0.058419,george


In [9]:
# Sanity-check the assembled table: dimensions, row count, schema and class balance.
print("df shape:", df.shape)
print("number of samples:", df.shape[0])
print("df columns:", df.columns)
print("df labels:", df['label'].value_counts())

df shape: (1000, 16)
number of samples: 1000
df columns: Index(['mfcc_1', 'mfcc_2', 'mfcc_3', 'mfcc_4', 'mfcc_5', 'mfcc_6', 'mfcc_7',
       'mfcc_8', 'mfcc_9', 'mfcc_10', 'mfcc_11', 'mfcc_12', 'mfcc_13',
       'spectral_rolloff', 'zero_crossing_rate', 'label'],
      dtype='object')
df labels: label
jackson    500
george     500
Name: count, dtype: int64


# Naïve Bayes Classifier from Scratch

In [None]:
class NaïveBayes:
    """Gaussian Naïve Bayes classifier implemented with NumPy.

    For every class, each feature is modelled as an independent Gaussian whose
    mean and variance are estimated from the training rows of that class.
    Prediction picks the class with the largest log-posterior
    ``log P(class) + sum_j log N(x_j | mean_j, var_j)``.
    """

    def __init__(self):
        self.classes = None   # unique labels seen by fit() (output of np.unique)
        self.priors = {}      # class label -> fraction of training rows with that label
        self.mean_dict = {}   # class label -> per-feature mean
        self.std_dict = {}    # class label -> per-feature VARIANCE (+1e-9); name kept for compatibility

    def mean(self, X):
        """Return the column-wise mean of the 2-D array ``X``."""
        return np.sum(X, axis=0) / X.shape[0]

    def std(self, X, mean):
        """Return the column-wise population variance of ``X`` around ``mean``.

        Despite the name, no square root is taken: the result is a variance,
        which is what ``gaussian_probability`` expects as its ``var`` argument.
        """
        return np.sum((X - mean) ** 2, axis=0) / X.shape[0]

    def fit(self, X, Y):
        """Estimate class priors and per-class feature means/variances.

        Parameters
        ----------
        X : array-like, expected shape (n_samples, n_features); converted to float64.
        Y : array-like of labels, one per row of ``X``.
        """
        X = np.array(X, dtype=np.float64)
        Y = np.array(Y)

        # Start from a clean slate so a re-fit never keeps statistics of classes
        # that only existed in a previous training set.
        self.priors = {}
        self.mean_dict = {}
        self.std_dict = {}

        self.classes = np.unique(Y)
        for Class in self.classes:
            X_class = X[Y == Class]
            self.priors[Class] = X_class.shape[0] / X.shape[0]
            self.mean_dict[Class] = self.mean(X_class)
            # The small constant keeps the variance strictly positive for constant features.
            self.std_dict[Class] = self.std(X_class, self.mean_dict[Class]) + 1e-9

    def gaussian_probability(self, X, mean, var):
        """Return the Gaussian density of ``X`` for the given mean and variance.

        Note: for points far from ``mean`` the density underflows to 0.0, so
        taking its log yields ``-inf``. Use ``gaussian_log_probability`` when a
        log-density is needed.
        """
        exponent = np.exp(-((X - mean) ** 2) / (2 * var))
        return (1 / np.sqrt(2 * np.pi * var)) * exponent

    def gaussian_log_probability(self, X, mean, var):
        """Return the log of the Gaussian density, computed directly in log space.

        Algebraically equal to ``np.log(self.gaussian_probability(...))`` but it
        never evaluates ``exp`` of a large negative number, so it stays finite
        for samples far from the class mean.
        """
        return -0.5 * (np.log(2 * np.pi * var) + ((X - mean) ** 2) / var)

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Parameters
        ----------
        X : array-like, expected shape (n_samples, n_features); converted to float64.

        Returns
        -------
        NumPy array with one predicted label per row.
        """
        X = np.array(X, dtype=np.float64)
        return np.array([self.predict_single(x) for x in X])

    def predict_single(self, X):
        """Predict the label of a single sample ``X`` (1-D feature vector)."""
        posteriors = []
        for Class in self.classes:
            prior = np.log(self.priors[Class])
            # Sum per-feature log-densities (naïve independence assumption).
            # Working in log space avoids the log(0) = -inf that made every class
            # tie (and the first class win) for samples far from all class means.
            class_conditional = np.sum(
                self.gaussian_log_probability(X, self.mean_dict[Class], self.std_dict[Class])
            )
            posteriors.append(prior + class_conditional)
        return self.classes[np.argmax(posteriors)]

    def evaluate(self, y_true, y_pred):
        """Compute accuracy and macro-averaged precision, recall and F1.

        Parameters
        ----------
        y_true, y_pred : array-likes of equal length (lists, NumPy arrays or
            pandas Series); compared position by position.

        Returns
        -------
        dict with keys 'accuracy', 'precision', 'recall', 'f1_score'. The last
        three are unweighted means over the classes present in ``y_true``.
        """
        # Coerce to arrays: comparing a plain list with a label (`list == cls`)
        # evaluates to a single False rather than an element-wise mask.
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)

        accuracy = np.mean(y_true == y_pred)

        classes = np.unique(y_true)
        precision, recall, f1 = [], [], []

        for cls in classes:
            TP = np.sum((y_pred == cls) & (y_true == cls))
            FP = np.sum((y_pred == cls) & (y_true != cls))
            FN = np.sum((y_pred != cls) & (y_true == cls))

            # The 1e-9 terms guard against division by zero when a class is never predicted.
            p = TP / (TP + FP + 1e-9)
            r = TP / (TP + FN + 1e-9)
            f = 2 * p * r / (p + r + 1e-9)

            precision.append(p)
            recall.append(r)
            f1.append(f)

        return {
            'accuracy': accuracy,
            'precision': np.mean(precision),
            'recall': np.mean(recall),
            'f1_score': np.mean(f1)
        }
        

# Split Data

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state pins the split for a given row order.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

Training set size: 800
Testing set size: 200


# Evaluate Naïve Bayes Classifier

In [12]:
# Fit the from-scratch classifier on the training split and score it on the held-out split.
model = NaïveBayes()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
results = model.evaluate(y_test, y_pred)

# One print call, one line per metric.
print(
    "Evaluation Results:",
    f"Accuracy : {results['accuracy']:.3f}",
    f"Precision: {results['precision']:.3f}",
    f"Recall   : {results['recall']:.3f}",
    f"F1 Score : {results['f1_score']:.3f}",
    sep="\n",
)

Evaluation Results:
Accuracy : 0.965
Precision: 0.966
Recall   : 0.965
F1 Score : 0.965


In [20]:
# From-scratch implementation: train, predict, and score with its own evaluate().
custom_model = NaïveBayes()
custom_model.fit(X_train, y_train)
y_pred_custom = custom_model.predict(X_test)
custom_results = custom_model.evaluate(y_test, y_pred_custom)

# Reference implementation from scikit-learn, trained on the same split.
sklearn_model = GaussianNB().fit(X_train, y_train)
y_pred_sklearn = sklearn_model.predict(X_test)

# Same four keys, in the same order, as the dict returned by NaïveBayes.evaluate();
# precision/recall/F1 are macro-averaged to match it.
sklearn_results = {
    'accuracy': accuracy_score(y_test, y_pred_sklearn),
    **{
        metric: scorer(y_test, y_pred_sklearn, average='macro')
        for metric, scorer in (
            ('precision', precision_score),
            ('recall', recall_score),
            ('f1_score', f1_score),
        )
    },
}

def print_results(name, results):
    """Print a titled block of metrics, one ``Metric: value`` line each.

    Parameters
    ----------
    name : label shown in the heading line.
    results : mapping of metric name -> numeric value; values are printed
        with three decimals and metric names are capitalized.
    """
    lines = [f"\n{name} Results:"]
    lines.extend(f"{metric.capitalize()}: {value:.3f}" for metric, value in results.items())
    print("\n".join(lines))

# Show both metric sets back to back for a side-by-side comparison.
print_results(name="Custom Naïve Bayes", results=custom_results)
print_results(name="Scikit-learn GaussianNB", results=sklearn_results)


Custom Naïve Bayes Results:
Accuracy: 0.965
Precision: 0.966
Recall: 0.965
F1_score: 0.965

Scikit-learn GaussianNB Results:
Accuracy: 0.965
Precision: 0.966
Recall: 0.965
F1_score: 0.965


## Bagging Ensemble
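Train two bagging ensembles, one with Naïve Bayes and one with Logistic Regression as the base model. Each ensemble member is fitted on a bootstrap resample of the training set, and the final prediction is the majority vote of the members.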

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from collections import Counter

def baggingEnsamble(baseModel , x_train , y_train, estimators = 10, random_state=None):
    """Train a bagging ensemble: one model per bootstrap resample of the training data.

    Parameters
    ----------
    baseModel : zero-argument callable (class or factory) returning a new,
        unfitted model that exposes ``fit(X, y)``.
    x_train, y_train : training features and labels; resampled together so
        rows and labels stay aligned.
    estimators : number of models to train (default 10).
    random_state : int or None, optional
        ``None`` (default) keeps the previous behaviour: ``resample`` is called
        without a seed. An integer seeds one private ``RandomState`` that is
        shared by all bootstrap draws, which makes the whole ensemble
        reproducible while still giving every model a different sample.

    Returns
    -------
    list of fitted models, in training order.
    """
    # A single generator is advanced across iterations; re-using the same *integer*
    # seed for every call would hand every model the identical bootstrap sample.
    rng = None if random_state is None else np.random.RandomState(random_state)

    models = []
    for _ in range(estimators):
        x_sample, y_sample = resample(x_train, y_train, random_state=rng)
        model = baseModel()
        model.fit(x_sample, y_sample)
        models.append(model)
    return models

def predictEnsamble(models , X):
    """Combine the predictions of several fitted models by majority vote.

    Parameters
    ----------
    models : iterable of fitted models exposing ``predict(X)``.
    X : samples to classify; passed unchanged to every model.

    Returns
    -------
    list with one label per sample: the label predicted by the most models.
    On a tie, ``Counter.most_common`` decides (the label that appears first
    among the tied ones, i.e. from the earliest model).
    """
    # One row per model, transposed so that each row holds all votes for one sample.
    votes_per_sample = np.array([member.predict(X) for member in models]).T

    finalPredictions = []
    for votes in votes_per_sample:
        winner, _count = Counter(votes).most_common(1)[0]
        finalPredictions.append(winner)
    return finalPredictions

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(y_true, y_pred):
    """Print accuracy and macro-averaged precision, recall and F1, rounded to 4 decimals.

    Parameters
    ----------
    y_true : ground-truth labels.
    y_pred : predicted labels, aligned with ``y_true``.

    Nothing is returned; the four scores are only printed.
    """
    # Shared options for the three per-class metrics; zero_division=0 is forwarded to sklearn.
    macro = dict(average='macro', zero_division=0)

    scores = (
        ("Accuracy:", accuracy_score(y_true, y_pred)),
        ("Precision:", precision_score(y_true, y_pred, **macro)),
        ("Recall:", recall_score(y_true, y_pred, **macro)),
        ("F1 Score:", f1_score(y_true, y_pred, **macro)),
    )
    for heading, score in scores:
        print(heading, round(score, 4))

In [18]:
# Bagged Naïve Bayes: the class itself serves as the zero-argument model factory.
NB_models = baggingEnsamble(baseModel=NaïveBayes, x_train=X_train, y_train=y_train)
y_pred_NB = predictEnsamble(models=NB_models, X=X_test)

# Bagged Logistic Regression: the lambda builds a fresh estimator with max_iter=1000 per member.
LR_models = baggingEnsamble(
    baseModel=lambda: LogisticRegression(max_iter=1000),
    x_train=X_train,
    y_train=y_train,
)
y_pred_LR = predictEnsamble(models=LR_models, X=X_test)

print('Naïve Bayes:')
evaluate_model(y_test, y_pred_NB)
print('---------------------\nLogistic Regression:')
evaluate_model(y_test, y_pred_LR)

Naïve Bayes:
Accuracy: 0.965
Precision: 0.9658
Recall: 0.9645
F1 Score: 0.9649
---------------------
Logistic Regression:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
