In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.ensemble import ExtraTreesClassifier  # Import Extra Trees

def load_data_by_batch(csv_path, batch_id):
    """Load the train/test arrays that the extraction log lists for one batch.

    The CSV at ``csv_path`` is read in full, its batch-id column is cast to
    ``int``, and the rows whose batch id equals ``batch_id`` are selected.
    Within those rows, the one whose split column equals ``"train"`` and the
    one whose split column equals ``"test"`` supply the ``.npy`` paths
    (``X_file_path`` / ``y_file_path``) that are loaded with ``np.load``.

    Parameters
    ----------
    csv_path : str or path-like
        Location of the extraction log CSV.
    batch_id : int
        Batch identifier to look up in the log.

    Returns
    -------
    tuple or None
        ``(X_train, y_train, X_test, y_test, n_fft, hop_length)`` where
        ``n_fft`` and ``hop_length`` come from the first row of the batch.
        ``None`` is returned when the log has no row for ``batch_id`` or the
        batch lacks a train or a test row, so callers can skip the batch
        instead of hitting an ``IndexError``.
    """
    log_df = pd.read_csv(csv_path)
    log_df["Bá»™"] = log_df["Bá»™"].astype(int)
    batch_df = log_df[log_df["Bá»™"] == batch_id]

    train_rows = batch_df[batch_df["Táº­p"] == "train"]
    test_rows = batch_df[batch_df["Táº­p"] == "test"]

    # An unknown batch id yields an empty batch_df, hence two empty selections;
    # this one check therefore covers "no such batch" and "incomplete batch".
    if train_rows.empty or test_rows.empty:
        return None

    n_fft = batch_df["n_fft"].values[0]
    hop_length = batch_df["hop_length"].values[0]

    X_train = np.load(train_rows["X_file_path"].values[0])
    y_train = np.load(train_rows["y_file_path"].values[0])
    X_test = np.load(test_rows["X_file_path"].values[0])
    y_test = np.load(test_rows["y_file_path"].values[0])

    return X_train, y_train, X_test, y_test, n_fft, hop_length

def run_et_experiment(X_train, y_train, X_test, y_test, n_fft, hop_length, mfcc_steps):
    """Score an Extra Trees classifier on growing prefixes of the feature columns.

    Every sample in ``X_train`` / ``X_test`` is flattened to a single row and
    standardised by a ``StandardScaler`` fitted on the training rows only.
    For each value ``k`` in ``mfcc_steps`` the first ``k`` flattened columns
    are kept, the classifier (100 trees, ``random_state=42``) is refit on
    them, and the test predictions are scored.  The confusion matrix of each
    step is printed.

    Parameters
    ----------
    X_train, X_test : array-like with ``shape`` and ``reshape``
        Feature arrays; the first axis indexes samples.
    y_train, y_test : array-like
        Labels matching the rows of ``X_train`` / ``X_test``.
    n_fft, hop_length
        Copied unchanged into every result row.
    mfcc_steps : iterable of int
        Column counts to evaluate, in order.

    Returns
    -------
    pandas.DataFrame
        One row per step with the columns ``numofMFCC``, ``n_fft``,
        ``Hop_length``, ``test_accuracy``, ``macro_f1`` and ``time``.
        ``time`` is wall-clock seconds covering fit, predict, scoring and the
        printing of the confusion matrix.
    """
    normalizer = StandardScaler()
    train_flat = X_train.reshape(X_train.shape[0], -1)
    test_flat = X_test.reshape(X_test.shape[0], -1)
    train_features = normalizer.fit_transform(train_flat)
    test_features = normalizer.transform(test_flat)

    # A single estimator instance is reused; each fit() call retrains it.
    forest = ExtraTreesClassifier(n_estimators=100, random_state=42)

    def evaluate(n_columns):
        """Fit and score on the first ``n_columns`` flattened columns."""
        # NOTE(review): this keeps the leading columns of the *flattened*
        # array. If the inputs have more than two dimensions these are not
        # necessarily "the first n MFCC coefficients" - confirm the layout
        # of the stored arrays.
        train_part = train_features[:, :n_columns]
        test_part = test_features[:, :n_columns]

        started = time.time()
        forest.fit(train_part, y_train)
        predicted = forest.predict(test_part)
        accuracy = accuracy_score(y_test, predicted)
        f1_macro = f1_score(y_test, predicted, average='macro')
        matrix = confusion_matrix(y_test, predicted)

        print(f"MFCC: {n_columns}")
        print(matrix)

        # The stop timestamp is taken after printing, so the reported
        # duration includes scoring and console output.
        duration = time.time() - started
        return {
            'numofMFCC': n_columns,
            'n_fft': n_fft,
            'Hop_length': hop_length,
            'test_accuracy': accuracy,
            'macro_f1': f1_macro,
            'time': duration,
        }

    return pd.DataFrame([evaluate(step) for step in mfcc_steps])

# Run the Extra Trees experiment on each feature batch (ids 0-3) in the log.
# NOTE(review): csv_path is an absolute, machine-specific location.
csv_path = r"C:\Users\manhm\Desktop\BeeSoundClassifier\data\extracted_features\mfcc\mfcc_extraction_log.csv"
mfcc_steps = [13, 20, 40, 80]  # leading feature-column counts to evaluate

for batch_id in range(4):
    data = load_data_by_batch(csv_path, batch_id)

    # Batches for which the loader reports nothing are skipped silently.
    if data is not None:
        print(f"\nðŸ“‚ Batch {batch_id}")
        print("Training Extra Trees Classifier")
        (X_train, y_train, X_test, y_test, n_fft, hop_length) = data
        df_et = run_et_experiment(
            X_train, y_train, X_test, y_test, n_fft, hop_length, mfcc_steps
        )
        print(f"\nðŸ“Œ Káº¿t quáº£ Batch {batch_id} - Extra Trees:")
        print(df_et)
        # One result file per batch, written to the current working directory.
        df_et.to_csv(f"et_results_batch_{batch_id}.csv", index=False)



ðŸ“‚ Batch 0
Training Extra Trees Classifier
MFCC: 13
[[1026   64    5]
 [ 125  379  188]
 [   2   30  941]]
MFCC: 20
[[1033   59    3]
 [ 120  379  193]
 [   0   30  943]]
MFCC: 40
[[1038   55    2]
 [ 123  374  195]
 [   0   27  946]]
MFCC: 80
[[1052   43    0]
 [ 127  369  196]
 [   0   19  954]]

ðŸ“Œ Káº¿t quáº£ Batch 0 - Extra Trees:
   numofMFCC  n_fft  Hop_length  test_accuracy  macro_f1      time
0         13   1024         256       0.850000  0.818889  1.390945
1         20   1024         256       0.853261  0.821827  1.773349
2         40   1024         256       0.854348  0.821974  2.137008
3         80   1024         256       0.860507  0.827046  2.057748

ðŸ“‚ Batch 1
Training Extra Trees Classifier
MFCC: 13
[[1029   61    5]
 [ 121  379  192]
 [   2   35  936]]
MFCC: 20
[[1034   59    2]
 [ 122  382  188]
 [   0   27  946]]
MFCC: 40
[[1042   51    2]
 [ 117  380  195]
 [   0   26  947]]
MFCC: 80
[[1049   45    1]
 [ 128  371  193]
 [   1   17  955]]

ðŸ“Œ Káº¿t quáº£ Ba