In [1]:
import os
import librosa
import numpy as np
import pandas as pd
import librosa.effects

# Root directory holding one sub-folder of recordings per speaker.
# Override it per machine with the TASK3_DATA_DIR environment variable; the
# fallback is the original hard-coded location, so existing setups keep working.
DATA_DIR = os.environ.get(
    'TASK3_DATA_DIR',
    '/Users/hassa/Desktop/GDG/Supervised-Learning/Task_3',
)
george_folder = os.path.join(DATA_DIR, 'george')
jackson_folder = os.path.join(DATA_DIR, 'jackson')

In [2]:
def preprocess_audio(file_path):
    """Load one audio file, trim it, and normalize it.

    The file is read with librosa's default loading settings, passed through
    ``librosa.effects.trim`` and then ``librosa.util.normalize``.

    Returns:
        ``(signal, sr)`` on success. If any step raises, the error is printed
        and ``(None, None)`` is returned so callers can skip the file.
    """
    try:
        raw_audio, sample_rate = librosa.load(file_path)
        # trim() returns (trimmed_signal, interval); only the signal is needed
        trimmed_audio = librosa.effects.trim(raw_audio)[0]
        return librosa.util.normalize(trimmed_audio), sample_rate
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None, None


In [3]:
def extract_features(file_path):
    """Turn one audio file into a flat feature vector.

    The vector is the time-averaged 13 MFCCs followed by the mean spectral
    roll-off and the mean zero-crossing rate (15 values in total).

    Returns:
        A 1-D NumPy array, or ``None`` when ``preprocess_audio`` could not
        produce a signal for the file.
    """
    audio, sample_rate = preprocess_audio(file_path)
    if audio is None:
        return None

    # Per-frame descriptors; [0] picks the single row librosa returns for the scalar features
    mfcc_frames = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=13)
    rolloff_frames = librosa.feature.spectral_rolloff(y=audio, sr=sample_rate)[0]
    zcr_frames = librosa.feature.zero_crossing_rate(y=audio)[0]

    # Collapse the time axis so every file yields a vector of the same length
    mfcc_means = np.mean(mfcc_frames.T, axis=0)
    return np.hstack([mfcc_means, rolloff_frames.mean(), zcr_frames.mean()])


In [4]:

# Column layout of the vector produced by extract_features:
# 13 MFCC means followed by the two scalar spectral descriptors.
feature_columns = [f"mfcc_{i+1}" for i in range(13)] + ["spectral_rolloff", "zero_crossing_rate"]

data = []
labels = []

# One loop over (label, folder) pairs replaces the two copy-pasted per-speaker loops.
for label, folder in (('george', george_folder), ('jackson', jackson_folder)):
    # os.listdir returns entries in arbitrary order; sorting keeps the row order
    # (and therefore the seeded shuffle below) identical from run to run.
    for filename in sorted(os.listdir(folder)):
        file_path = os.path.join(folder, filename)
        if not os.path.isfile(file_path):
            continue
        features = extract_features(file_path)
        # extract_features returns None for files it could not process; skip those
        if features is not None:
            data.append(features)
            labels.append(label)

# Naming the columns at construction keeps the frame well-formed even when no
# file could be read (assigning df.columns afterwards raises a length mismatch then).
df = pd.DataFrame(data, columns=feature_columns)
df['label'] = labels

# Seeded shuffle: without random_state the row order changes on every run, so the
# later train_test_split(random_state=42) would still produce a different split each time.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

X = df.drop(columns='label')
y = df['label']

df.head()


Unnamed: 0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,mfcc_13,spectral_rolloff,zero_crossing_rate,label
0,-266.633453,204.262238,-3.221847,8.541867,10.596744,-38.084026,-12.778536,5.193983,-21.292635,-18.602156,-6.878771,-10.424617,-9.013573,1004.370117,0.034133,jackson
1,-249.892548,180.811737,-30.749771,36.82407,45.62402,-27.611139,-16.735493,-13.680727,-27.383505,-3.470161,-16.029514,-32.115929,-7.893184,1633.293457,0.031152,jackson
2,-225.29895,209.843521,-52.314709,28.46933,30.290186,-38.201267,11.379951,2.348416,-33.057568,6.141239,-0.5273,-22.610695,-3.61581,2044.75708,0.028625,jackson
3,-314.429199,157.61261,-90.622612,11.607381,27.772308,-41.582893,-12.292714,-35.225574,-47.567688,12.056046,-16.593351,-46.65662,2.663131,2823.989868,0.10262,george
4,-233.638489,224.178879,-49.212692,5.595926,2.856314,-53.649235,-8.151447,-14.896815,-33.523003,7.558509,-2.305782,-17.214939,-0.324689,1580.297852,0.059191,jackson


In [5]:
# Sanity-check the assembled table: dimensions, row count, column names, and class balance
print("df shape:", df.shape)
print("number of samples:", len(df))
print("df columns:", df.columns)
# value_counts() tallies rows per label, so an imbalance between the two speakers would show up here
print("df labels:", df['label'].value_counts())

df shape: (1000, 16)
number of samples: 1000
df columns: Index(['mfcc_1', 'mfcc_2', 'mfcc_3', 'mfcc_4', 'mfcc_5', 'mfcc_6', 'mfcc_7',
       'mfcc_8', 'mfcc_9', 'mfcc_10', 'mfcc_11', 'mfcc_12', 'mfcc_13',
       'spectral_rolloff', 'zero_crossing_rate', 'label'],
      dtype='object')
df labels: label
jackson    500
george     500
Name: count, dtype: int64


# Naïve Bayes Classifier from Scratch

In [15]:
class NaïveBayes:
    """Gaussian Naïve Bayes classifier implemented with NumPy.

    Every feature is modelled as an independent normal distribution per class.
    After ``fit`` the instance holds:

    * ``classes``   - sorted array of the distinct training labels
    * ``priors``    - ``{label: fraction of training rows with that label}``
    * ``mean_dict`` - ``{label: per-feature mean}``
    * ``std_dict``  - ``{label: per-feature variance + 1e-9}``; the attribute
      name is historical, the stored values are variances
    """

    def __init__(self):
        self.classes = None
        self.priors = {}
        self.mean_dict = {}
        self.std_dict = {}

    def mean(self, X):
        """Return the per-column mean of the 2-D array ``X``."""
        return np.sum(X, axis=0) / X.shape[0]

    def std(self, X, mean):
        """Return the per-column population *variance* of ``X`` around ``mean``.

        Despite its name this is not a standard deviation: no square root is
        taken. The name is kept so existing callers keep working.
        """
        return np.sum((X - mean) ** 2, axis=0) / X.shape[0]

    def fit(self, X, Y):
        """Estimate class priors and per-class feature means/variances.

        Args:
            X: array-like of shape (n_samples, n_features), convertible to float64.
            Y: array-like of n_samples labels.
        """
        X = np.array(X, dtype=np.float64)
        Y = np.array(Y)

        # Start from a clean slate so a refit does not keep statistics for
        # labels that are absent from the new training data.
        self.priors, self.mean_dict, self.std_dict = {}, {}, {}

        self.classes = np.unique(Y)
        for Class in self.classes:
            X_class = X[Y == Class]
            self.priors[Class] = X_class.shape[0] / X.shape[0]
            self.mean_dict[Class] = self.mean(X_class)
            # The small constant keeps constant features from producing a zero variance
            self.std_dict[Class] = self.std(X_class, self.mean_dict[Class]) + 1e-9

    def gaussian_probability(self, X, mean, var):
        """Return the normal density of ``X`` for the given mean and variance.

        Kept for callers that want linear-domain densities. Prediction uses
        ``_log_gaussian`` instead, because this value underflows to 0 for
        points far from the mean.
        """
        exponent = np.exp(-((X - mean) ** 2) / (2 * var))
        return (1 / np.sqrt(2 * np.pi * var)) * exponent

    def _log_gaussian(self, X, mean, var):
        """Return the log of the normal density without ever exponentiating."""
        return -0.5 * (np.log(2 * np.pi * var) + ((X - mean) ** 2) / var)

    def predict(self, X):
        """Return an array with the predicted label for every row of ``X``."""
        X = np.array(X, dtype=np.float64)
        return np.array([self.predict_single(x) for x in X])

    def predict_single(self, X):
        """Return the label with the highest log-posterior for one sample.

        The log-likelihood is computed directly in the log domain. Taking
        ``np.log`` of the density instead yields ``-inf`` for every class once
        the density underflows, and ``argmax`` then always picks the first class.
        """
        log_posteriors = [
            np.log(self.priors[Class])
            + np.sum(self._log_gaussian(X, self.mean_dict[Class], self.std_dict[Class]))
            for Class in self.classes
        ]
        return self.classes[np.argmax(log_posteriors)]

    def evaluate(self, y_true, y_pred):
        """Compute accuracy and macro-averaged precision, recall and F1.

        Args:
            y_true: array-like of ground-truth labels.
            y_pred: array-like of predicted labels, same length as ``y_true``.

        Returns:
            dict with keys ``accuracy``, ``precision``, ``recall``, ``f1_score``.

        Raises:
            ValueError: if the two inputs do not have the same shape.
        """
        # Plain arrays make the comparisons positional and element-wise; lists
        # would compare as a single bool and pandas objects could align on index.
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)
        if y_true.shape != y_pred.shape:
            raise ValueError(
                f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
            )

        accuracy = np.mean(y_true == y_pred)

        classes = np.unique(y_true)
        precision, recall, f1 = [], [], []

        for cls in classes:
            TP = np.sum((y_pred == cls) & (y_true == cls))
            FP = np.sum((y_pred == cls) & (y_true != cls))
            FN = np.sum((y_pred != cls) & (y_true == cls))

            # The epsilon avoids 0/0 when a class is never predicted or never correct
            p = TP / (TP + FP + 1e-9)
            r = TP / (TP + FN + 1e-9)
            f = 2 * p * r / (p + r + 1e-9)

            precision.append(p)
            recall.append(r)
            f1.append(f)

        return {
            'accuracy': accuracy,
            'precision': np.mean(precision),
            'recall': np.mean(recall),
            'f1_score': np.mean(f1)
        }
        


In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state pins the split for a given row order of X and y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Report the resulting partition sizes as a quick check on the split
print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")


Training set size: 800
Testing set size: 200


In [17]:
# Train the hand-written classifier on the training split and score it on the held-out rows
model = NaïveBayes()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
results = model.evaluate(y_test, y_pred)

# Assemble the whole report first, then emit it in one go
report_lines = [
    "Evaluation Results:",
    f"Accuracy : {results['accuracy']:.3f}",
    f"Precision: {results['precision']:.3f}",
    f"Recall   : {results['recall']:.3f}",
    f"F1 Score : {results['f1_score']:.3f}",
]
print("\n".join(report_lines))



Evaluation Results:
Accuracy : 0.965
Precision: 0.965
Recall   : 0.966
F1 Score : 0.965


In [22]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


def print_results(title, results):
    """Print a titled summary of accuracy, precision, recall and F1, each to three decimals."""
    print(f"\n📊 {title}")
    print(f"✅ Accuracy : {results['accuracy']:.3f}")
    print(f"🎯 Precision: {results['precision']:.3f}")
    print(f"📥 Recall   : {results['recall']:.3f}")
    print(f"📈 F1 Score : {results['f1_score']:.3f}")


# Hand-written classifier: fit, predict, and score with its own evaluate()
custom_model = NaïveBayes()
custom_model.fit(X_train, y_train)
y_pred_custom = custom_model.predict(X_test)
custom_results = custom_model.evaluate(y_test, y_pred_custom)

# scikit-learn reference model trained on the same split (fit() returns the estimator itself)
sklearn_model = GaussianNB().fit(X_train, y_train)
y_pred_sklearn = sklearn_model.predict(X_test)

# Macro averaging matches what the custom evaluate() computes: an unweighted mean over classes
sklearn_results = {
    'accuracy': accuracy_score(y_test, y_pred_sklearn),
    **{
        metric_name: scorer(y_test, y_pred_sklearn, average='macro')
        for metric_name, scorer in (
            ('precision', precision_score),
            ('recall', recall_score),
            ('f1_score', f1_score),
        )
    },
}

print_results("Custom Naïve Bayes", custom_results)
print_results("Scikit-learn GaussianNB", sklearn_results)



📊 Custom Naïve Bayes
✅ Accuracy : 0.965
🎯 Precision: 0.965
📥 Recall   : 0.966
📈 F1 Score : 0.965

📊 Scikit-learn GaussianNB
✅ Accuracy : 0.965
🎯 Precision: 0.965
📥 Recall   : 0.966
📈 F1 Score : 0.965


In [18]:
from sklearn.ensemble import RandomForestClassifier

# NOTE: this rebinds `model`, which an earlier cell used for the NaïveBayes instance
# random_state fixes the forest's internal randomness so repeated fits on the same data agree
model = RandomForestClassifier(random_state=42)

# Kept as the cell's last expression, so its return value is what the notebook displays
model.fit(X_train, y_train)


In [19]:
from sklearn.metrics import accuracy_score

# Predict with whatever estimator `model` currently refers to (the preceding cell binds a RandomForestClassifier)
# NOTE: this overwrites the `y_pred` produced earlier by the custom NaïveBayes cell
y_pred = model.predict(X_test)

# Fraction of held-out rows whose predicted label equals the true label
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")


Accuracy: 0.9900


In [20]:
from sklearn.metrics import confusion_matrix

# scikit-learn convention: rows are true labels, columns are predicted labels, in sorted label order
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)


Confusion Matrix:
[[102   2]
 [  0  96]]


In [10]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 with support counts, returned as a preformatted text table
# NOTE(review): the output saved with this cell (support 107/93) does not match the saved
# confusion matrix (104/96) - it appears to come from a different run; re-run top to bottom to confirm
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)


Classification Report:
              precision    recall  f1-score   support

      george       0.99      0.99      0.99       107
     jackson       0.99      0.99      0.99        93

    accuracy                           0.99       200
   macro avg       0.99      0.99      0.99       200
weighted avg       0.99      0.99      0.99       200



In [21]:
# Side-by-side table of true vs. predicted labels for eyeballing individual rows
# (the saved output shows the rows carrying y_test's index; assumes y_pred is a plain array of the same length)
comparison_df = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred
})

# Show only the first five rows
print(comparison_df.head())


      Actual Predicted
521  jackson   jackson
737   george    george
740  jackson   jackson
660  jackson   jackson
411   george    george
