In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import os

# Load the train/test data of a single batch
def load_data_by_batch(csv_path, batch_id):
    """Load the train/test arrays of one batch listed in the extraction log.

    The CSV at ``csv_path`` is read in full and filtered to the rows whose
    batch column equals ``batch_id``.  The ``.npy`` files referenced by the
    "train" and "test" rows of that batch are then loaded with ``np.load``.

    Args:
        csv_path: Path to the extraction-log CSV.
        batch_id: Integer batch identifier to select.

    Returns:
        ``(X_train, y_train, X_test, y_test, n_fft, hop_length)`` on success,
        where ``n_fft`` and ``hop_length`` are taken from the first row of the
        batch.  ``None`` (after printing a message) when the batch cannot be
        loaded: the batch column is missing, the batch has no rows, a required
        column or split row is missing, a referenced file does not exist, or
        loading fails.

    Raises:
        Whatever ``pd.read_csv`` raises when the CSV itself cannot be read.
    """
    df = pd.read_csv(csv_path)

    if "B·ªô" not in df.columns:
        print("‚ö†Ô∏è L·ªói: C·ªôt 'B·ªô' kh√¥ng c√≥ trong file CSV!")
        return None

    # Everything that depends on the log's contents runs under one handler so
    # that a malformed log (missing column, non-numeric batch id, missing
    # train/test row, unreadable file) is reported and skipped instead of
    # raising past the caller's ``None`` check.
    try:
        df["B·ªô"] = df["B·ªô"].astype(int)
        batch_df = df[df["B·ªô"] == batch_id]

        if batch_df.empty:
            print(f"‚ö†Ô∏è Batch {batch_id} kh√¥ng c√≥ d·ªØ li·ªáu trong CSV!")
            return None

        n_fft = batch_df["n_fft"].values[0]
        hop_length = batch_df["hop_length"].values[0]

        train_row = batch_df[batch_df["T·∫≠p"] == "train"]
        test_row = batch_df[batch_df["T·∫≠p"] == "test"]

        # ``.values[0]`` raises IndexError when a split row is absent; the
        # handler below turns that into a skipped batch.
        file_paths = (
            train_row["X_file_path"].values[0],
            train_row["y_file_path"].values[0],
            test_row["X_file_path"].values[0],
            test_row["y_file_path"].values[0],
        )

        if not all(os.path.exists(path) for path in file_paths):
            print(f"‚ö†Ô∏è L·ªói: M·ªôt trong c√°c file .npy c·ªßa batch {batch_id} kh√¥ng t·ªìn t·∫°i!")
            return None

        X_train, y_train, X_test, y_test = (np.load(path) for path in file_paths)

    except Exception as e:
        # Deliberately broad: loading is best-effort per batch and the caller
        # simply skips batches for which ``None`` is returned.
        print(f"‚ö†Ô∏è L·ªói khi load d·ªØ li·ªáu batch {batch_id}: {e}")
        return None

    return X_train, y_train, X_test, y_test, n_fft, hop_length

# Run the KNN experiment at several MFCC counts
def run_knn_experiment(X_train, y_train, X_test, y_test, n_fft, hop_length, mfcc_steps):
    """Tune and evaluate a KNN classifier on growing prefixes of the feature columns.

    Both feature arrays are flattened to one row per sample and standardised
    with a ``StandardScaler`` fitted on the training set.  For every ``i`` in
    ``mfcc_steps`` only the first ``i`` scaled columns are kept, a
    ``RandomizedSearchCV`` (10 candidates, 3-fold CV, ``random_state=42``)
    selects KNN hyper-parameters on the training subset, and the search's
    predictions on the test subset are scored.

    Args:
        X_train: Training feature array; its first axis indexes samples.
        y_train: Training labels.
        X_test: Test feature array; its first axis indexes samples.
        y_test: Test labels.
        n_fft: Value copied unchanged into the ``n_fft`` result column.
        hop_length: Value copied unchanged into the ``Hop_length`` result column.
        mfcc_steps: Iterable of column counts to evaluate.

    Returns:
        A ``pd.DataFrame`` with one row per entry of ``mfcc_steps`` and the
        columns ``numofMFCC``, ``n_fft``, ``Hop_length``,
        ``Best_hyperparameters``, ``test_accuracy``, ``macro_f1`` and ``time``
        (seconds spent on search, prediction, scoring and printing).
    """
    scaler = StandardScaler()

    # Scale all features at once.  Only the sample count is read from the
    # shape: unpacking ``X_train.shape`` into two names would raise for any
    # input that is not already 2-D, defeating the flattening reshape below.
    # NOTE(review): the scaler sees the whole training set before the search's
    # internal cross-validation splits it, so the CV folds are not scaled
    # independently.
    X_train_scaled = scaler.fit_transform(X_train.reshape(X_train.shape[0], -1))
    X_test_scaled = scaler.transform(X_test.reshape(X_test.shape[0], -1))

    knn = KNeighborsClassifier()
    knn_params = {
        'n_neighbors': list(range(1, 21)),  # search k from 1 to 20
        'weights': ['uniform', 'distance'],  # neighbour weighting scheme
        # NOTE(review): with scikit-learn's default p=2, 'minkowski' should be
        # the same distance as 'euclidean' -- confirm whether the duplicate
        # entry is intended.
        'metric': ['euclidean', 'manhattan', 'minkowski']  # distance metric
    }

    results = []
    for i in mfcc_steps:
        print(f"‚ñ∂ ƒêang th·ª≠ nghi·ªám v·ªõi {i} MFCC...")

        # Slicing silently yields fewer than ``i`` columns when the flattened
        # array is narrower than ``i``.
        # NOTE(review): assumes the first ``i`` flattened columns correspond
        # to the first ``i`` MFCC coefficients -- TODO confirm against the
        # feature extractor.
        X_train_subset = X_train_scaled[:, :i]
        X_test_subset = X_test_scaled[:, :i]

        start_time = time.time()

        # Random search with few folds to keep the run time down
        random_search = RandomizedSearchCV(knn, knn_params, n_iter=10, cv=3, random_state=42, n_jobs=-1)
        random_search.fit(X_train_subset, y_train)

        best_params = random_search.best_params_
        y_pred = random_search.predict(X_test_subset)

        test_accuracy = accuracy_score(y_test, y_pred)
        macro_f1 = f1_score(y_test, y_pred, average='macro')
        confusion = confusion_matrix(y_test, y_pred)

        print(f"üîπ MFCC: {i}, Best K: {best_params['n_neighbors']}")
        print("Confusion Matrix:\n", confusion)

        elapsed_time = time.time() - start_time

        # Record the outcome of this step
        results.append({
            'numofMFCC': i,
            'n_fft': n_fft,
            'Hop_length': hop_length,
            'Best_hyperparameters': best_params,
            'test_accuracy': test_accuracy,
            'macro_f1': macro_f1,
            'time': elapsed_time
        })

    return pd.DataFrame(results)

# Run the experiments: for each batch id, load its data, evaluate KNN at every
# feature-count step, print the result table and save it as a CSV.
mfcc_steps = [13, 20, 40, 80]  # feature-column counts evaluated per batch
# Machine-specific absolute path to the extraction log; adjust when running elsewhere.
csv_path = r"C:\Users\manhm\Desktop\BeeSoundClassifier\data\extracted_features\mfcc\mfcc_extraction_log.csv"

# Batch ids 0..3 are tried; the log is re-read by the loader on every iteration.
for batch_id in range(4):
    print(f"\nüöÄ Ch·∫°y th·ª≠ nghi·ªám Batch {batch_id}...")
    data = load_data_by_batch(csv_path, batch_id)

    if data is None:
        continue  # the loader returned None for this batch, so skip it

    X_train, y_train, X_test, y_test, n_fft, hop_length = data
    df = run_knn_experiment(X_train, y_train, X_test, y_test, n_fft, hop_length, mfcc_steps)

    print(f"\nüìå K·∫øt qu·∫£ Batch {batch_id}:")
    print(df)

    # Relative file name: the CSV is written to the current working directory.
    result_path = f"knn_results_batch_{batch_id}.csv"
    df.to_csv(result_path, index=False)
    print(f"‚úÖ ƒê√£ l∆∞u k·∫øt qu·∫£ v√†o {result_path}")



üöÄ Ch·∫°y th·ª≠ nghi·ªám Batch 0...
‚ñ∂ ƒêang th·ª≠ nghi·ªám v·ªõi 13 MFCC...
üîπ MFCC: 13, Best K: 17
Confusion Matrix:
 [[1008   70   17]
 [ 121  364  207]
 [   2   18  953]]
‚ñ∂ ƒêang th·ª≠ nghi·ªám v·ªõi 20 MFCC...
üîπ MFCC: 20, Best K: 17
Confusion Matrix:
 [[1033   57    5]
 [ 127  357  208]
 [   0   22  951]]
‚ñ∂ ƒêang th·ª≠ nghi·ªám v·ªõi 40 MFCC...
üîπ MFCC: 40, Best K: 14
Confusion Matrix:
 [[1053   40    2]
 [ 140  351  201]
 [   0   29  944]]
‚ñ∂ ƒêang th·ª≠ nghi·ªám v·ªõi 80 MFCC...
üîπ MFCC: 80, Best K: 14
Confusion Matrix:
 [[1063   30    2]
 [ 127  365  200]
 [   2   27  944]]

üìå K·∫øt qu·∫£ Batch 0:
   numofMFCC  n_fft  Hop_length  \
0         13   1024         256   
1         20   1024         256   
2         40   1024         256   
3         80   1024         256   

                                Best_hyperparameters  test_accuracy  macro_f1  \
0  {'weights': 'distance', 'n_neighbors': 17, 'me...       0.842391  0.809512   
1  {'weights': 'distance', '