In [1]:
import os
import librosa
import numpy as np
import pandas as pd
import librosa.effects

george_folder = '/Users/mac/Desktop/Projects/Supervised Third Year/Task_3/george'
jackson_folder = '/Users/mac/Desktop/Projects/Supervised Third Year/Task_3/jackson'


In [2]:
def preprocess_audio(file_path):
    try:
        signal, sr = librosa.load(file_path)
        
        signal, _ = librosa.effects.trim(signal)  
        
        signal = librosa.util.normalize(signal)
        
        return signal, sr
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None, None


In [3]:
def extract_features(file_path):
    signal, sr = preprocess_audio(file_path)
    if signal is None:
        return None

    mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=13)  
    mfccs = np.mean(mfccs.T, axis=0)  
    
    spectral_rolloff = librosa.feature.spectral_rolloff(y=signal, sr=sr)[0]  
    zcr = librosa.feature.zero_crossing_rate(y=signal)[0]  
    
    features = np.hstack([mfccs, spectral_rolloff.mean(), zcr.mean()])
    return features


In [4]:

data = []
labels = []


# Process the George folder
for filename in os.listdir(george_folder):
    file_path = os.path.join(george_folder, filename)
    if os.path.isfile(file_path):
        features = extract_features(file_path)
        if features is not None:
            data.append(features)
            labels.append('george')

# Process the Jackson folder
for filename in os.listdir(jackson_folder):
    file_path = os.path.join(jackson_folder, filename)
    if os.path.isfile(file_path):
        features = extract_features(file_path)
        if features is not None:
            data.append(features)
            labels.append('jackson')


df = pd.DataFrame(data)
df['label'] = labels


feature_columns = [f"mfcc_{i+1}" for i in range(13)] + ["spectral_rolloff", "zero_crossing_rate"]
df.columns = feature_columns + ['label']


df = df.sample(frac=1).reset_index(drop=True)

X = df.drop(columns='label')
y = df['label']

df.head()


Unnamed: 0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,mfcc_13,spectral_rolloff,zero_crossing_rate,label
0,-245.0811,223.108887,-43.605213,21.952164,19.386084,-52.142738,-0.840359,7.428283,-27.554871,-0.669427,-5.016583,-26.104843,-6.545174,1609.991455,0.040336,jackson
1,-249.708603,156.77092,-76.142441,-3.588093,9.995524,-50.471012,-7.238127,-7.410536,-45.083729,-14.577906,-12.124034,-30.12993,-8.10509,2260.986328,0.075747,george
2,-245.802826,130.122406,-103.586411,41.38253,42.605679,-47.899086,-8.10101,-22.411518,-45.224514,-0.389926,-30.855272,-43.325623,6.006252,3226.615906,0.133484,george
3,-233.975189,206.794434,-14.344894,-7.068472,-6.940811,-33.51926,-2.431716,-8.353852,-21.628042,-6.718249,-20.801037,-13.348879,8.378547,1179.481201,0.041772,jackson
4,-225.183075,211.39679,-24.976473,-8.217365,-3.713726,-32.735058,2.691828,-6.54157,-25.232832,-4.207321,-18.426573,-16.177134,10.712893,1246.875,0.045759,jackson


In [5]:
print("df shape:", df.shape)
print("number of samples:", len(df))
print("df columns:", df.columns)
print("df labels:", df['label'].value_counts())

df shape: (1000, 16)
number of samples: 1000
df columns: Index(['mfcc_1', 'mfcc_2', 'mfcc_3', 'mfcc_4', 'mfcc_5', 'mfcc_6', 'mfcc_7',
       'mfcc_8', 'mfcc_9', 'mfcc_10', 'mfcc_11', 'mfcc_12', 'mfcc_13',
       'spectral_rolloff', 'zero_crossing_rate', 'label'],
      dtype='object')
df labels: label
jackson    500
george     500
Name: count, dtype: int64


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")


Training set size: 800
Testing set size: 200


In [7]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)

model.fit(X_train, y_train)


In [8]:
from sklearn.metrics import accuracy_score

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")


Accuracy: 0.9900


In [9]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)


Confusion Matrix:
[[106   1]
 [  1  92]]


In [10]:
from sklearn.metrics import classification_report

report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)


Classification Report:
              precision    recall  f1-score   support

      george       0.99      0.99      0.99       107
     jackson       0.99      0.99      0.99        93

    accuracy                           0.99       200
   macro avg       0.99      0.99      0.99       200
weighted avg       0.99      0.99      0.99       200



In [11]:
comparison_df = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred
})

print(comparison_df.head())


      Actual Predicted
521  jackson   jackson
737   george    george
740  jackson   jackson
660   george    george
411   george    george
