#### **Train the Model**

We use a simple yet powerful Random Forest Classifier to identify speakers based on their voice features.

In [None]:
# Import libraries

import os

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

#### **Load processed audio CSV**

In [6]:
# Location of the pre-extracted audio feature table, relative to this notebook.
csv_path = r"../raw/audio_data/processed_audio_features.csv"
df = pd.read_csv(csv_path)

# Report the table dimensions, then preview the first rows.
n_rows, n_cols = df.shape
print(f" Loaded data: {n_rows} rows, {n_cols} columns")
print(" Audio features loaded successfully!")
print(df.head())

 Loaded data: 112 rows, 69 columns
 Audio features loaded successfully!
                                      sample_id person  phrase  \
0      JD_Confirm_Transaction3.WAV.ogg_original     JD     NaN   
1      JD_Confirm_Transaction3.WAV.ogg_pitch_up     JD     NaN   
2  JD_Confirm_Transaction3.WAV.ogg_time_stretch     JD     NaN   
3         JD_Confirm_Transaction3.WAV.ogg_noise     JD     NaN   
4              JD_Yes_Approve3.WAV.ogg_original     JD     NaN   

        augmentation  sample_rate  mfcc1_mean   mfcc1_std  mfcc1_min  \
0           Original        16000  -421.62290  117.957740 -603.38245   
1     Pitch Shift +2        16000  -440.98620  127.579640 -647.51404   
2  Time Stretch 1.1x        16000  -441.73760  123.142660 -648.08734   
3   Background Noise        16000  -156.31702   44.884453 -239.05902   
4           Original        16000  -449.03710  126.793560 -629.15845   

   mfcc1_max  mfcc2_mean  ...  rolloff_min  rolloff_max  energy_mean  \
0 -231.42770  103.924740  

#### **Fill missing 'person' from 'sample_id'**

In [7]:
# Recover speaker labels that are missing from 'person'. The label is taken as
# the text before the first '_' in 'sample_id' (e.g. the previewed ids start
# with "JD_").
missing_person = df['person'].isna()
if missing_person.any():
    if missing_person.all():
        print(" 'person' column empty. Extracting from 'sample_id'...")
    else:
        print(f" {int(missing_person.sum())} 'person' values missing. Extracting from 'sample_id'...")
    # Fill only the gaps: combine_first keeps every label that is already
    # present and takes the extracted prefix where the label is missing, so a
    # partially filled column is repaired instead of having its rows dropped.
    person_from_id = df['sample_id'].str.split('_').str[0]
    df['person'] = df['person'].combine_first(person_from_id)

# Drop rows with missing 'person' (only rows whose label could not be recovered remain missing)
df = df.dropna(subset=['person'])
print(f" Data after filling 'person': {df.shape[0]} rows")

 Data after filling 'person': 112 rows


#### **Prepare features (X) and labels (y)**

In [8]:
# Columns that identify or describe a sample and must not be used as model inputs.
exclude_cols = ['sample_id', 'person', 'phrase', 'augmentation', 'sample_rate']

# errors='ignore' skips any listed column that is absent from the frame.
X = df.drop(columns=exclude_cols, errors='ignore')
# The speaker name is the classification target.
y = df['person']

print(f"Features shape: {X.shape}")
print(f"Number of classes: {y.nunique(dropna=False)}")

Features shape: (112, 64)
Number of classes: 3


#### **Split Data for Training and Testing**

We divide the dataset into training (80%) and testing (20%) sets. This helps evaluate how well the model generalizes.

In [9]:
# Hold out 20% of the rows for testing. stratify=y keeps the per-speaker
# proportions the same in both parts, and the fixed random_state makes the
# split repeatable.
# NOTE(review): the previewed sample_ids differ only by an augmentation suffix
# (_original, _pitch_up, _time_stretch, _noise), so variants of one recording
# may end up on both sides of this row-level split - TODO confirm whether a
# group-aware split is needed before trusting the test score.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)
print(f"Training samples: {len(X_train)}, Testing samples: {len(X_test)}")

Training samples: 89, Testing samples: 23


#### **Train Random Forest Classifier**

In [10]:
# Random Forest configured with 600 trees, a depth cap of 50 and
# class_weight="balanced"; random_state is fixed so repeated runs build the
# same forest, and n_jobs=-1 lets scikit-learn parallelize across all cores.
# NOTE(review): the next code cell rebinds `model` to a differently configured
# forest, so that later estimator is the one evaluated and saved - confirm
# which configuration is intended.
model = RandomForestClassifier(
    n_estimators=600,
    max_depth=50,
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)
# Constructing the estimator does not train it; fit on the training split so
# the completion message below is true.
model.fit(X_train, y_train)
print(" Model training complete!")

 Model training complete!


In [12]:
# Hyperparameters for the forest that is evaluated and saved further down:
# 100 trees, no depth limit, fixed seed, parallel training.
forest_params = {
    "n_estimators": 100,
    "random_state": 42,
    "max_depth": None,
    "n_jobs": -1,
}
model = RandomForestClassifier(**forest_params)

# Train on the training split only; the test split stays untouched for evaluation.
model.fit(X_train, y_train)
print(" Model training complete!")

 Model training complete!


#### **Evaluate on test set**

In [14]:
# Predict the speaker for every held-out row and compare with the true labels.
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)

# Headline accuracy first, then the per-speaker precision/recall/F1 table.
summary_lines = [
    f"Test accuracy: {acc*100:.2f}%",
    "\nClassification Report:",
]
for line in summary_lines:
    print(line)
print(classification_report(y_test, y_pred))

Test accuracy: 100.00%

Classification Report:
              precision    recall  f1-score   support

          JD       1.00      1.00      1.00         4
      Mariam       1.00      1.00      1.00        14
      Noella       1.00      1.00      1.00         5

    accuracy                           1.00        23
   macro avg       1.00      1.00      1.00        23
weighted avg       1.00      1.00      1.00        23



### **Step 8: Save model and metadata**

In [None]:
# Persist the trained model together with the metadata needed to reuse it.
# `os` is imported in the notebook's import cell rather than here, so all
# dependencies are declared in one place.
save_dir = r"../../models"
os.makedirs(save_dir, exist_ok=True)  # no error if the directory already exists

# File name -> object to serialize. The feature names record the column order
# the model was trained on; class_names holds the labels in model.classes_ order.
artifacts = {
    'audio_model.pkl': model,
    'feature_names.pkl': X.columns.tolist(),
    'class_names.pkl': model.classes_,
}
for file_name, payload in artifacts.items():
    joblib.dump(payload, os.path.join(save_dir, file_name))

print(f" Model and metadata saved in {save_dir}")

 Model and metadata saved in C:\users\LENOVO\Documents\formative2-mlp\Audios
