In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.linear_model import LogisticRegression  # Import Logistic Regression

def load_data_by_batch(csv_path, batch_id):
    """Load the train/test arrays recorded in the extraction log for one batch.

    Reads the CSV at ``csv_path``, keeps the rows whose ``"Bộ"`` column
    (cast to int) equals ``batch_id``, and loads the ``.npy`` files referenced
    by the rows whose ``"Tập"`` column is ``"train"`` and ``"test"``.

    Args:
        csv_path: Path to the extraction-log CSV. The code reads the columns
            ``"Bộ"``, ``"Tập"``, ``"n_fft"``, ``"hop_length"``,
            ``"X_file_path"`` and ``"y_file_path"``.
        batch_id: Integer batch identifier to look up in ``"Bộ"``.

    Returns:
        ``(X_train, y_train, X_test, y_test, n_fft, hop_length)``, or ``None``
        when the log has no row for ``batch_id`` or lacks a train or a test
        row for it. ``n_fft`` and ``hop_length`` come from the first row of
        the batch.
    """
    df = pd.read_csv(csv_path)
    batch_df = df[df["Bộ"].astype(int) == batch_id]
    if batch_df.empty:
        # Unknown batch: signal it with None instead of failing with an
        # IndexError on the ``.values[0]`` lookups below.
        return None

    train_row = batch_df[batch_df["Tập"] == "train"]
    test_row = batch_df[batch_df["Tập"] == "test"]
    if train_row.empty or test_row.empty:
        # An incomplete batch cannot be evaluated.
        return None

    n_fft = batch_df["n_fft"].values[0]
    hop_length = batch_df["hop_length"].values[0]

    # When several rows match a split, the first one is used.
    X_train = np.load(train_row["X_file_path"].values[0])
    y_train = np.load(train_row["y_file_path"].values[0])
    X_test = np.load(test_row["X_file_path"].values[0])
    y_test = np.load(test_row["y_file_path"].values[0])

    return X_train, y_train, X_test, y_test, n_fft, hop_length

def run_logistic_experiment(X_train, y_train, X_test, y_test, n_fft, hop_length, mfcc_steps):
    """Fit and score a Logistic Regression on growing prefixes of the features.

    Every sample is flattened to a 1-D vector and standardized with a
    ``StandardScaler`` fitted on the training set only. For each ``i`` in
    ``mfcc_steps`` the model is fitted on the first ``i`` columns of the
    scaled training matrix and evaluated on the same columns of the test
    matrix. The confusion matrix of each step is printed.

    Args:
        X_train: Training features; reshaped to ``(X_train.shape[0], -1)``.
        y_train: Training labels.
        X_test: Test features; reshaped to ``(X_test.shape[0], -1)``.
        y_test: Test labels.
        n_fft: Copied unchanged into the ``n_fft`` result column.
        hop_length: Copied unchanged into the ``Hop_length`` result column.
        mfcc_steps: Iterable of column counts to evaluate.

    Returns:
        ``pandas.DataFrame`` with one row per step and the columns
        ``numofMFCC``, ``n_fft``, ``Hop_length``, ``test_accuracy``,
        ``macro_f1`` and ``time`` (seconds spent in ``fit`` + ``predict``).
    """
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train.reshape(X_train.shape[0], -1))
    X_test_scaled = scaler.transform(X_test.reshape(X_test.shape[0], -1))

    log_reg = LogisticRegression(max_iter=1000, random_state=42)

    results = []
    for i in mfcc_steps:
        # NOTE(review): this keeps the first ``i`` columns of the *flattened*
        # feature matrix. That equals "the first i MFCC coefficients" only if
        # each sample is already a 1-D vector of coefficients - confirm the
        # layout of the saved arrays. If ``i`` exceeds the column count the
        # slice silently keeps every column.
        X_train_subset = X_train_scaled[:, :i]
        X_test_subset = X_test_scaled[:, :i]

        # Time only training and inference, so that scoring and console
        # output below do not inflate the reported duration.
        start_time = time.perf_counter()
        log_reg.fit(X_train_subset, y_train)
        y_pred = log_reg.predict(X_test_subset)
        elapsed_time = time.perf_counter() - start_time

        test_accuracy = accuracy_score(y_test, y_pred)
        macro_f1 = f1_score(y_test, y_pred, average='macro')
        confusion = confusion_matrix(y_test, y_pred)

        print(f"MFCC: {i}")
        print(confusion)

        results.append({
            'numofMFCC': i,
            'n_fft': n_fft,
            'Hop_length': hop_length,
            'test_accuracy': test_accuracy,
            'macro_f1': macro_f1,
            'time': elapsed_time
        })

    return pd.DataFrame(results)

# Run the Logistic Regression experiment on every extraction batch.
mfcc_steps = [13, 20, 40, 80]
csv_path = r"C:\Users\manhm\Desktop\BeeSoundClassifier\data\extracted_features\mfcc\mfcc_extraction_log.csv"

for batch_id in range(4):
    data = load_data_by_batch(csv_path, batch_id)

    # Only batches for which data was returned are evaluated.
    if data is not None:
        print(f"\n📂 Batch {batch_id}")
        print("Training Logistic Regression")
        X_train, y_train, X_test, y_test, n_fft, hop_length = data
        df_logistic = run_logistic_experiment(
            X_train, y_train, X_test, y_test, n_fft, hop_length, mfcc_steps
        )
        print(f"\n📌 Kết quả Batch {batch_id} - Logistic Regression:")
        print(df_logistic)
        # One result file per batch, written to the current working directory.
        df_logistic.to_csv(f"logistic_results_batch_{batch_id}.csv", index=False)



📂 Batch 0
Training Logistic Regression
MFCC: 13
[[833 163  99]
 [211 233 248]
 [117  81 775]]
MFCC: 20
[[865 171  59]
 [206 254 232]
 [ 49  97 827]]
MFCC: 40
[[955 130  10]
 [204 260 228]
 [ 11  92 870]]
MFCC: 80
[[971 118   6]
 [189 291 212]
 [  7  73 893]]

📌 Kết quả Batch 0 - Logistic Regression:
   numofMFCC  n_fft  Hop_length  test_accuracy  macro_f1      time
0         13   1024         256       0.667029  0.625654  0.154560
1         20   1024         256       0.705072  0.663500  0.145437
2         40   1024         256       0.755435  0.707445  0.138278
3         80   1024         256       0.780797  0.737093  0.256205

📂 Batch 1
Training Logistic Regression
MFCC: 13
[[833 162 100]
 [209 236 247]
 [112  85 776]]
MFCC: 20
[[863 173  59]
 [206 254 232]
 [ 53  95 825]]
MFCC: 40
[[960 123  12]
 [204 253 235]
 [ 13  89 871]]
MFCC: 80
[[970 119   6]
 [185 299 208]
 [  8  75 890]]

📌 Kết quả Batch 1 - Logistic Regression:
   numofMFCC  n_fft  Hop_length  test_accuracy  macro_f1     