In [1]:
dataset = 'archive (1)\dataset\audio\speakers'


In [40]:
import librosa
import numpy as np

SAMPLE_RATE = 16000
DURATION = 2
SAMPLES = SAMPLE_RATE * DURATION
N_MFCC = 40

def preprocess_audio(file_path):
    audio, sr = librosa.load(file_path, sr=SAMPLE_RATE, mono=True)
    audio, _ = librosa.effects.trim(audio)

    if len(audio) < SAMPLES:
        audio = np.pad(audio, (0, SAMPLES - len(audio)))
    else:
        audio = audio[:SAMPLES]

    return audio, sr


In [41]:
file_path = r"E:\voice_windows_ai\archive (1)\dataset\audio\speakers\2BqVo8kVB2Skwgyb\1d9f3920-4474-11e9-a9a5-5dbec3b8816a.wav"

audio, sr = preprocess_audio(file_path)

print(audio.shape)
print(sr)


(32000,)
16000


In [25]:
import IPython.display as ipd
ipd.Audio(audio, rate=sr)

In [42]:
def extract_features(audio, sr):
    mfcc = librosa.feature.mfcc(
        y=audio,
        sr=sr,
        n_mfcc=N_MFCC
    )

    # statistical pooling (speaker independent)
    mfcc_mean = np.mean(mfcc, axis=1)
    mfcc_std = np.std(mfcc, axis=1)

    return np.concatenate([mfcc_mean, mfcc_std])  # shape: (80,)


In [43]:
import os

DATASET_PATH = r"archive (1)\dataset\audio\speakers"

def build_unlabeled_dataset(dataset_path):
    X = []
    meta = []

    for speaker_id in os.listdir(dataset_path):
        speaker_path = os.path.join(dataset_path, speaker_id)

        if not os.path.isdir(speaker_path):
            continue

        for file in os.listdir(speaker_path):
            if not file.endswith(".wav"):
                continue

            file_path = os.path.join(speaker_path, file)

            try:
                audio, sr = preprocess_audio(file_path)
                features = extract_features(audio, sr)

                X.append(features)
                meta.append({
                    "speaker_id": speaker_id,
                    "file": file,
                    "path": file_path
                })

            except Exception as e:
                print("Error:", file_path, e)

    return np.array(X), meta


In [44]:
X, meta = build_unlabeled_dataset(DATASET_PATH)

print("Feature matrix shape:", X.shape)
print("First sample metadata:", meta[0])


Feature matrix shape: (11309, 80)
First sample metadata: {'speaker_id': '2BqVo8kVB2Skwgyb', 'file': '03592c80-447c-11e9-a9a5-5dbec3b8816a.wav', 'path': 'archive (1)\\dataset\\audio\\speakers\\2BqVo8kVB2Skwgyb\\03592c80-447c-11e9-a9a5-5dbec3b8816a.wav'}


In [47]:
print('Nans', np.isnan(X).sum())
print('Infs', np.isinf(X).sum())

Nans 0
Infs 0


In [62]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [67]:
NUM_COMMANDS = 6

In [68]:
from sklearn.cluster import KMeans

kmeans = KMeans(
    n_clusters=NUM_COMMANDS,
    random_state=42,
    n_init=10
)

cluster_ids = kmeans.fit_predict(X_scaled)


In [69]:
for cid in range(NUM_COMMANDS):
    print(f"\nðŸ”¹ Cluster {cid}")
    samples = [meta[i]["path"] for i in range(len(meta)) if cluster_ids[i] == cid][:5]
    for s in samples:
        print(" ", s)



ðŸ”¹ Cluster 0
  archive (1)\dataset\audio\speakers\2BqVo8kVB2Skwgyb\03592c80-447c-11e9-a9a5-5dbec3b8816a.wav
  archive (1)\dataset\audio\speakers\2BqVo8kVB2Skwgyb\120de470-447a-11e9-a9a5-5dbec3b8816a.wav
  archive (1)\dataset\audio\speakers\2BqVo8kVB2Skwgyb\1d9f3920-4474-11e9-a9a5-5dbec3b8816a.wav
  archive (1)\dataset\audio\speakers\2BqVo8kVB2Skwgyb\230c5690-447e-11e9-a9a5-5dbec3b8816a.wav
  archive (1)\dataset\audio\speakers\2BqVo8kVB2Skwgyb\256b2060-4479-11e9-a9a5-5dbec3b8816a.wav

ðŸ”¹ Cluster 1
  archive (1)\dataset\audio\speakers\2BqVo8kVB2Skwgyb\34e73840-447d-11e9-a9a5-5dbec3b8816a.wav
  archive (1)\dataset\audio\speakers\2BqVo8kVB2Skwgyb\78895af0-4479-11e9-a9a5-5dbec3b8816a.wav
  archive (1)\dataset\audio\speakers\2BqVo8kVB2Skwgyb\7ee96810-447c-11e9-a9a5-5dbec3b8816a.wav
  archive (1)\dataset\audio\speakers\2BqVo8kVB2Skwgyb\9e5f9b10-447c-11e9-a9a5-5dbec3b8816a.wav
  archive (1)\dataset\audio\speakers\2BqVo8kVB2Skwgyb\ce762960-447a-11e9-a9a5-5dbec3b8816a.wav

ðŸ”¹ Cluster 2
  

In [70]:
cluster_to_command = {
    0: "open_browser",
    1: "close_browser",
    2: "volume_up",
    3: "volume_down",
    4: "play_music",
    5: "stop_music"
}


In [71]:
y = np.array([cluster_to_command[c] for c in cluster_ids])



In [72]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print("Classes:", label_encoder.classes_)


Classes: ['close_browser' 'open_browser' 'play_music' 'stop_music' 'volume_down'
 'volume_up']


In [76]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="multi:softprob",
    eval_metric="mlogloss",
    random_state=42
)

model.fit(X_train, y_train)

print("Train accuracy:", model.score(X_train, y_train))
print("Test accuracy:", model.score(X_test, y_test))


Train accuracy: 1.0
Test accuracy: 0.9363395225464191


In [77]:
with open("voice_command_xgb.pkl", "rb") as f:
    data = pickle.load(f)


FileNotFoundError: [Errno 2] No such file or directory: 'voice_command_xgb.pkl'