In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.metrics import classification_report
import seaborn as sns
import matplotlib.pyplot as plt
import optuna
import pickle

In [4]:
# Load the pickled label mapping. Later cells pass integer class ids through
# it (via Series.map) to obtain the tick labels used on plots.
# NOTE: pickle.load can execute arbitrary code while unpickling, so this must
# only ever be pointed at a trusted, locally produced file.
# The context manager closes the file handle as soon as loading is done.
with open('../data/label_dict.pkl', 'rb') as label_file:
    label_dict = pickle.load(label_file)

In [5]:
def train_with_10fold(model, name, data_dir='../data/orig'):
    """Evaluate ``model`` by holding out each of ten CSV folds in turn.

    Reads ``fold0.csv`` ... ``fold9.csv`` from ``data_dir``. For every fold,
    the model is fitted on the concatenation of the other nine folds and
    scored on the held-out fold. Per-fold accuracy, the mean accuracy and the
    total number of evaluated samples are printed, a confusion-matrix heatmap
    pooled over all folds is shown, and a classification report is printed.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. The same object
        is refitted for every fold (this relies on ``fit`` discarding any
        previously learned state, as scikit-learn estimators do).
    name : str
        Display name inserted into the confusion-matrix title.
    data_dir : str, optional
        Directory containing the fold CSV files. Each file must have a
        ``label`` column (the target) and an ``audio`` column (dropped);
        all remaining columns are used as features.

    Returns
    -------
    tuple of (list, list)
        ``(all_preds, all_labels)``: predictions and true labels of every
        held-out sample, concatenated in fold order.
    """
    n_folds = 10

    # Read every fold exactly once. Re-reading the nine training folds inside
    # the loop would hit the disk 100 times for the same ten files.
    folds = [pd.read_csv(f'{data_dir}/fold{i}.csv') for i in range(n_folds)]

    accuracies = []
    all_preds = []
    all_labels = []

    for test_id, test_df in enumerate(folds):
        # Train on everything except the held-out fold.
        train_df = pd.concat(
            [fold for i, fold in enumerate(folds) if i != test_id],
            ignore_index=True,
        )

        # X, y -- drop() returns new frames, so the cached folds stay intact.
        X_train, y_train = train_df.drop(['label', 'audio'], axis=1), train_df['label']
        X_test, y_test = test_df.drop(['label', 'audio'], axis=1), test_df['label']

        # train & evaluation
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        accuracies.append(acc)

        # Pool predictions across folds for the global confusion matrix/report.
        all_preds.extend(y_pred)
        all_labels.extend(y_test)

        print(f'Fold {test_id} Accuracy: {acc:.4f}')

    print(f'\nAverage Accuracy: {np.mean(accuracies):.4f}')
    print(f'Total: {len(all_labels)}')

    cm = confusion_matrix(all_labels, all_preds)
    plt.figure(figsize=(8,6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix (10-Fold CV) Using {name}')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

    print(classification_report(all_labels, all_preds))

    return all_preds, all_labels


In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline 1: logistic regression with only `verbose` set explicitly.
# The returned (predictions, labels) pair is bound to `_` because only the
# printed metrics and the plots are of interest here.
lr = LogisticRegression(verbose=0)
_ = train_with_10fold(model=lr, name='Logistic')


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline 2: k-nearest neighbours with the constructor defaults.
knn = KNeighborsClassifier()
_ = train_with_10fold(model=knn, name='KNN')

In [None]:
from sklearn.svm import SVC

# Baseline 3: support vector classifier with an RBF kernel.
svc = SVC(kernel='rbf')
_ = train_with_10fold(model=svc, name='SVM (RBF kernel)')


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline 4: random forest of 100 trees; random_state is pinned so repeated
# runs build the same forest.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
_ = train_with_10fold(model=rf, name='Random Forest')

In [None]:
from lightgbm import LGBMClassifier

# Baseline 5: LightGBM classifier with only `verbose` set explicitly.
lgb = LGBMClassifier(verbose=0)
_ = train_with_10fold(model=lgb, name='LightGBM')

In [None]:
from xgboost import XGBClassifier

# Baseline 6: XGBoost classifier with the constructor defaults.
xgb = XGBClassifier()
_ = train_with_10fold(model=xgb, name='XGBoost')

In [56]:
# Per-class F1-scores of the untuned models, keyed first by model name and
# then by class id as a string ('0' .. '9'). Each list below holds the scores
# for classes 0-9 in order; the values are hard-coded in this cell rather than
# computed from the runs above.
results_before_tuning = {
    model_name: {str(class_id): f1 for class_id, f1 in enumerate(f1_scores)}
    for model_name, f1_scores in [
        ('Logistic', [0.36, 0.57, 0.54, 0.59, 0.60, 0.52, 0.67, 0.57, 0.69, 0.58]),
        ('KNN',      [0.35, 0.55, 0.45, 0.56, 0.48, 0.48, 0.76, 0.39, 0.64, 0.56]),
        ('SVM-RBF',  [0.38, 0.66, 0.56, 0.66, 0.64, 0.57, 0.81, 0.54, 0.71, 0.64]),
        ('RF',       [0.38, 0.59, 0.53, 0.62, 0.57, 0.56, 0.77, 0.59, 0.71, 0.60]),
        ('LGBM',     [0.34, 0.60, 0.55, 0.63, 0.58, 0.53, 0.78, 0.58, 0.71, 0.62]),
        ('XGB',      [0.35, 0.59, 0.54, 0.63, 0.58, 0.52, 0.79, 0.55, 0.70, 0.63]),
    ]
}

In [None]:
# Reshape the nested {model: {class id: F1}} dict into long form: one row per
# (Model, Class) pair carrying its F1-score.
df = (
    pd.DataFrame(results_before_tuning)
    .T  # models become the index, class ids the columns
    .reset_index()
    .melt(id_vars='index', var_name='Class', value_name='F1-score')
    .rename(columns={'index': 'Model'})
)

# Swap the class ids for the entries of label_dict
# (assumes label_dict is keyed by integer class id -- TODO confirm).
df['Class'] = df['Class'].astype(int).map(label_dict)

# Grouped bar chart: one group per class, one bar per model.
plt.figure(figsize=(14, 6))
sns.barplot(data=df, x='Class', y='F1-score', hue='Model')
plt.xticks(rotation=45)
plt.title("Per-class F1-score comparison across models")
plt.tight_layout()
plt.show()

In [58]:
# Overall accuracy of each untuned model; the values are hard-coded in this
# cell rather than computed from the runs above.
acc_before_tuning = {
    "Logistic": 0.5595,
    "KNN": 0.4962,
    "SVM-RBF": 0.6029,
    "RF": 0.5857,
    "LGBM": 0.5847,
    "XGB": 0.5813,
}

# Two-column frame (model name, accuracy), one row per model, in dict order.
df = pd.DataFrame(list(acc_before_tuning.items()), columns=['model', 'acc'])

In [None]:
# Bar chart of overall accuracy per model, with the value printed on each bar.
plt.figure(figsize=(12, 7))
# NOTE(review): newer seaborn releases appear to deprecate `palette` without
# `hue` -- confirm against the installed version before changing this call.
ax = sns.barplot(data=df, x='model', y='acc', palette='deep')

# Annotate whatever bar containers exist instead of indexing a fixed 0..5:
# how many containers seaborn creates for these bars is not guaranteed to be
# six, and indexing past the end of ax.containers raises IndexError.
for container in ax.containers:
    ax.bar_label(container, fmt='%.4f', fontsize=12)

plt.title("Overall Model Accuracy Comparison", fontsize=16, pad=20)
plt.xlabel("Model", fontsize=14)
plt.xticks(fontsize=12)
plt.ylabel("Accuracy", fontsize=14)
sns.despine()
plt.tight_layout()
plt.show()