In [None]:
import subprocess
from ast import literal_eval

def run(command):
    process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
    out, err = process.communicate()
    print(out.decode('utf-8').strip())

In [None]:
print('# CPU')
run('cat /proc/cpuinfo | egrep -m 1 "^model name"')
run('cat /proc/cpuinfo | egrep -m 1 "^cpu MHz"')
run('cat /proc/cpuinfo | egrep -m 1 "^cpu cores"')
print("")

print('# RAM')
run('cat /proc/meminfo | egrep "^MemTotal"')
print("")

print('# OS')
run('uname -a')
print("")

print('# GPU')
run('lspci | grep VGA')

# 1. Loading the dataset
the features are extracted using a slight modification of the actual code provided by [Borzi](https://github.com/UNICT-Fake-Audio/fake-audio-detector)

In [None]:
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings("ignore")

# ganti dataset ngikut upload nnti
train = pd.read_csv("/kaggle/input/borzi-full/train_set.csv")
dev = pd.read_csv("/kaggle/input/borzi-full/dev_set.csv")
eval = pd.read_csv("/kaggle/input/borzi-full/eval_set.csv")

# Ganti value label
train['label'] = train['label'].map({'bonafide': 1, 'spoof': 0})
dev['label'] = dev['label'].map({'bonafide': 1, 'spoof': 0})
eval['label'] = eval['label'].map({'bonafide': 1, 'spoof': 0})

# Drop col gk penting
train = train.drop('AUDIO_FILE_NAME', axis=1)
dev = dev.drop('AUDIO_FILE_NAME', axis=1)
eval = eval.drop('AUDIO_FILE_NAME', axis=1)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import os

# Concatenate the datasets
data = pd.concat([train, dev, eval], axis=0)

non_feature_columns = ['SYSTEM_ID', 'label', 'spectral_bandwidth']
feature_columns = [col for col in data.columns if col not in non_feature_columns]

# Calculate feature variance
feature_variances = data[feature_columns].var()

# Identify low-variance features (threshold can be adjusted; using 0.01 here as an example)
low_variance_threshold = 0.01
low_variance_features = feature_variances[feature_variances < low_variance_threshold]

# Sort SYSTEM_ID before analysis
data['SYSTEM_ID'] = pd.Categorical(data['SYSTEM_ID'], categories=sorted(data['SYSTEM_ID'].unique()), ordered=True)

# Plot feature distributions for all features
feature_distributions_dir = "/mnt/data/feature_distributions/"
os.makedirs(feature_distributions_dir, exist_ok=True)

# Get a colormap with as many unique colors as there are SYSTEM_IDs
unique_system_ids = sorted(data['SYSTEM_ID'].unique())
color_map = cm.get_cmap('tab20', len(unique_system_ids))

# Save distribution plots for each feature
for feature in feature_columns:
    plt.figure(figsize=(10, 6))
    for idx, system_id in enumerate(unique_system_ids):
        subset = data[data['SYSTEM_ID'] == system_id]
        plt.hist(subset[feature], bins=30, alpha=0.5, color=color_map(idx), label=f"SYSTEM_ID: {system_id}")
    plt.title(f"Distribution of {feature}")
    plt.xlabel(feature)
    plt.ylabel("Frequency")
    plt.legend()
    plt.tight_layout()
    plt.show()

# Return summary of low-variance features
low_variance_features

## 1.2 EER calculations

In [None]:
from sklearn.metrics import classification_report, roc_curve, accuracy_score
def eval_metr(y_true, y_pred, P_target=0.5):
    # Compute ROC curve
    fpr, tpr, thresholds = roc_curve(y_true, y_pred)
    fnr = 1 - tpr  # False Negative Rate (Miss Rate)

    # Compute EER
    abs_diff = np.abs(fpr - fnr)
    eer_index = np.argmin(abs_diff)
    eer = (fpr[eer_index] + fnr[eer_index]) / 2

    return eer, min_tdcf

# 2. Model Pipeline

## 2.1 Random Forest

### 2.1.1 Singular feature

In [None]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
def random_forest_pipeline(train, dev, eval):
    excluded_columns = ["label", "duration", "size", "spectral_bandwidth"]
    report_list = []
    csv_data = []

    # Iterate over all columns except excluded ones
    for col_name in train.columns:
        if col_name in excluded_columns:
            continue

        # Check for NaN values and print a warning if present
        if train[col_name].isna().any() or dev[col_name].isna().any() or eval[col_name].isna().any():
            print(f"Warning: Column '{col_name}' contains NaN values. Filling with mean.")

        # Step 1: Extract X_set and y_set from train, dev, and eval datasets
        X_train, y_train = train[[col_name]].fillna(train[col_name].mean()), train['label']
        X_dev, y_dev = dev[[col_name]].fillna(dev[col_name].mean()), dev['label']
        X_eval, y_eval = eval[[col_name]].fillna(eval[col_name].mean()), eval['label']

        # Step 2: Apply SMOTE to balance the training data
        smote = SMOTE(random_state=42)
        X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

        # Step 3: Create and train the Random Forest model
        rf_model = RandomForestClassifier(random_state=42)
        rf_model.fit(X_train_smote, y_train_smote)

        # Step 4: Evaluate on dev set
        y_dev_pred = rf_model.predict(X_dev)
        y_dev_prob = rf_model.predict_proba(X_dev)[:, 1]
        accuracy_dev = accuracy_score(y_dev, y_dev_pred)
        eer_dev, tdcf_dev = eval_metr(y_dev, y_dev_prob, 0.1588, 2.1007)
        dev_report = classification_report(y_dev, y_dev_pred)
    
        # Step 5: Evaluate on eval set
        y_eval_pred = rf_model.predict(X_eval)
        y_eval_prob = rf_model.predict_proba(X_eval)[:, 1]
        accuracy_eval = accuracy_score(y_eval, y_eval_pred)
        eer_eval, tdcf_eval = eval_metr(y_eval, y_eval_prob, 0.1847, 2.0173)
        eval_report = classification_report(y_eval, y_eval_pred)

        # Step 6: Store the results in the text report
        report_list.append(f"=== Evaluation for Feature: {col_name} ===\n")
        report_list.append("\n=== Evaluation on Dev Set ===")
        report_list.append(f"Accuracy: {accuracy_dev:.4f}")
        report_list.append("Classification Report:")
        report_list.append(dev_report)
        report_list.append("Custom Eval Metric:")
        report_list.append(f"EER on validation data: {eer_dev * 100:.2f}%")
        report_list.append(f"Min t-DCF on validation data: {tdcf_dev:.4f}\n")

        report_list.append("\n=== Evaluation on Eval Set ===")
        report_list.append(f"Accuracy: {accuracy_eval:.4f}")
        report_list.append("Classification Report:")
        report_list.append(eval_report)
        report_list.append("Custom Eval Metric:")
        report_list.append(f"EER on testing data: {eer_eval * 100:.2f}%")
        report_list.append(f"Min t-DCF on testing data: {tdcf_eval:.4f}\n\n")

        # Step 7: Store the results in the CSV data
        csv_data.append({
            "Feature": col_name,
            "Dev Accuracy": accuracy_dev,
            "Dev EER": eer_dev * 100,
            "Dev Min t-DCF": tdcf_dev,
            "Eval Accuracy": accuracy_eval,
            "Eval EER": eer_eval * 100,
            "Eval Min t-DCF": tdcf_eval
        })

    # Step 8: Save the detailed report to a text file
    with open('rf_evaluation_report.txt', 'w') as f:
        f.writelines("\n".join(report_list))

    # Step 9: Save the CSV data to a file
    df_csv = pd.DataFrame(csv_data)
    df_csv.to_csv('rf_evaluation_metrics.csv', index=False)

    print("Detailed evaluation report saved to rf_evaluation_report.txt")
    print("Summary metrics saved to rf_evaluation_metrics.csv")

In [None]:
random_forest_pipeline(train, dev, eval)

### 2.1.2 All Feature

In [None]:
import joblib
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
def random_forest_pipeline_all(train, dev, eval, excluded_columns):
    # Step 1: Prepare the data
    feature_columns = [col for col in train.columns if col not in excluded_columns]
    
    # Function to impute NaN values with column mean
    def impute_missing_values(df, feature_columns):
        for col in feature_columns:
            if df[col].isna().any():
                print(f"Column '{col}' contains NaN values. Filling with mean.")
                df[col].fillna(df[col].mean(), inplace=True)
        return df
    
    # Impute missing values in train, dev, and eval datasets
    train = impute_missing_values(train, feature_columns)
    dev = impute_missing_values(dev, feature_columns)
    eval = impute_missing_values(eval, feature_columns)

    # Separate features and target
    X_train, y_train = train[feature_columns], train['label']
    X_dev, y_dev = dev[feature_columns], dev['label']
    X_eval, y_eval = eval[feature_columns], eval['label']

    # Step 2: Apply SMOTE to balance the training data
    smote = SMOTE(random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

    # Step 3: Create and train the Random Forest model
    rf_model = RandomForestClassifier(random_state=42)
    rf_model.fit(X_train_smote, y_train_smote)

    # Step 4: Evaluate on dev set
    y_dev_pred = rf_model.predict(X_dev)
    y_dev_prob = rf_model.predict_proba(X_dev)[:, 1]
    accuracy_dev = accuracy_score(y_dev, y_dev_pred)
    eer_dev, tdcf_dev = eval_metr(y_dev, y_dev_prob, 0.1588, 2.1007)
    dev_report = classification_report(y_dev, y_dev_pred)
    
    # Step 5: Evaluate on eval set
    y_eval_pred = rf_model.predict(X_eval)
    y_eval_prob = rf_model.predict_proba(X_eval)[:, 1]
    accuracy_eval = accuracy_score(y_eval, y_eval_pred)
    eer_eval, tdcf_eval = eval_metr(y_eval, y_eval_prob, 0.1847, 2.0173)
    eval_report = classification_report(y_eval, y_eval_pred)

    # Step 6: Print out the results
    print("\n=== Evaluation on Dev Set ===")
    print(f"Accuracy: {accuracy_dev:.4f}")
    print("Classification Report:")
    print(dev_report)
    print("Custom Eval Metrics:")
    print(f"EER on validation data: {eer_dev * 100:.2f}%")
    print(f"Min t-DCF on validation data: {tdcf_dev:.4f}")

    print("\n=== Evaluation on Eval Set ===")
    print(f"Accuracy: {accuracy_eval:.4f}")
    print("Classification Report:")
    print(eval_report)
    print("Custom Eval Metrics:")
    print(f"EER on testing data: {eer_eval * 100:.2f}%")
    print(f"Min t-DCF on testing data: {tdcf_eval:.4f}")

    # Step 7: Save the trained model
    joblib.dump(rf_model, 'random_forest_model.pkl')
    print("Random Forest model saved as random_forest_model.pkl")

In [None]:
excluded_columns = ["label", "duration", "size", "spectral_bandwidth"]
# Assume `data` is a DataFrame that includes features and a 'label' column
random_forest_pipeline_all(train, dev, eval, excluded_columns)

## 2.2 AdaBoost

### 2.2.1 Singular Features

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from imblearn.over_sampling import SMOTE
def adaboost_pipeline(train, dev, eval):
    excluded_columns = ["label", "duration", "size", "spectral_bandwidth"]
    report_list = []
    csv_data = []

    # Iterate over all columns except excluded ones
    for col_name in train.columns:
        if col_name in excluded_columns:
            continue

        # Check for NaN values and print a warning if present
        if train[col_name].isna().any() or dev[col_name].isna().any() or eval[col_name].isna().any():
            print(f"Warning: Column '{col_name}' contains NaN values. Filling with mean.")

        # Step 1: Extract X_set and y_set from train, dev, and eval datasets
        X_train, y_train = train[[col_name]].fillna(train[col_name].mean()), train['label']
        X_dev, y_dev = dev[[col_name]].fillna(dev[col_name].mean()), dev['label']
        X_eval, y_eval = eval[[col_name]].fillna(eval[col_name].mean()), eval['label']

        # Step 2: Apply SMOTE to balance the training data
        smote = SMOTE(random_state=42)
        X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

        # Step 3: Create and train the Random Forest model
        ada_model = AdaBoostClassifier(random_state=42)
        ada_model.fit(X_train_smote, y_train_smote)

        # Step 4: Evaluate on dev set
        y_dev_pred = ada_model.predict(X_dev)
        y_dev_prob = ada_model.predict_proba(X_dev)[:, 1]
        accuracy_dev = accuracy_score(y_dev, y_dev_pred)
        eer_dev, tdcf_dev = eval_metr(y_dev, y_dev_prob, 0.1588, 2.1007)
        dev_report = classification_report(y_dev, y_dev_pred)
    
        # Step 5: Evaluate on eval set
        y_eval_pred = ada_model.predict(X_eval)
        y_eval_prob = ada_model.predict_proba(X_eval)[:, 1]
        accuracy_eval = accuracy_score(y_eval, y_eval_pred)
        eer_eval, tdcf_eval = eval_metr(y_eval, y_eval_prob, 0.1847, 2.0173)
        eval_report = classification_report(y_eval, y_eval_pred)

        # Step 6: Store the results in the text report
        report_list.append(f"=== Evaluation for Feature: {col_name} ===\n")
        report_list.append("\n=== Evaluation on Dev Set ===")
        report_list.append(f"Accuracy: {accuracy_dev:.4f}")
        report_list.append("Classification Report:")
        report_list.append(dev_report)
        report_list.append("Custom Eval Metric:")
        report_list.append(f"EER on validation data: {eer_dev * 100:.2f}%")
        report_list.append(f"Min t-DCF on validation data: {tdcf_dev:.4f}\n")

        report_list.append("\n=== Evaluation on Eval Set ===")
        report_list.append(f"Accuracy: {accuracy_eval:.4f}")
        report_list.append("Classification Report:")
        report_list.append(eval_report)
        report_list.append("Custom Eval Metric:")
        report_list.append(f"EER on testing data: {eer_eval * 100:.2f}%")
        report_list.append(f"Min t-DCF on testing data: {tdcf_eval:.4f}\n\n")

        # Step 7: Store the results in the CSV data
        csv_data.append({
            "Feature": col_name,
            "Dev Accuracy": accuracy_dev,
            "Dev EER": eer_dev * 100,
            "Dev Min t-DCF": tdcf_dev,
            "Eval Accuracy": accuracy_eval,
            "Eval EER": eer_eval * 100,
            "Eval Min t-DCF": tdcf_eval
        })

    # Step 8: Save the detailed report to a text file
    with open('ada_evaluation_report.txt', 'w') as f:
        f.writelines("\n".join(report_list))

    # Step 9: Save the CSV data to a file
    df_csv = pd.DataFrame(csv_data)
    df_csv.to_csv('ada_evaluation_metrics.csv', index=False)

    print("Detailed evaluation report saved to ada_evaluation_report.txt")
    print("Summary metrics saved to ada_evaluation_metrics.csv")

In [None]:
adaboost_pipeline(train, dev, eval)

### 2.2.2 All Features

In [None]:
import joblib
from sklearn.ensemble import AdaBoostClassifier
from imblearn.over_sampling import SMOTE
def adaboost_pipeline_all(train, dev, eval, excluded_columns):
    # Step 1: Prepare the data
    feature_columns = [col for col in train.columns if col not in excluded_columns]
    
    # Function to impute NaN values with column mean
    def impute_missing_values(df, feature_columns):
        for col in feature_columns:
            if df[col].isna().any():
                print(f"Column '{col}' contains NaN values. Filling with mean.")
                df[col].fillna(df[col].mean(), inplace=True)
        return df
    
    # Impute missing values in train, dev, and eval datasets
    train = impute_missing_values(train, feature_columns)
    dev = impute_missing_values(dev, feature_columns)
    eval = impute_missing_values(eval, feature_columns)

    # Separate features and target
    X_train, y_train = train[feature_columns], train['label']
    X_dev, y_dev = dev[feature_columns], dev['label']
    X_eval, y_eval = eval[feature_columns], eval['label']

    # Step 1: Apply SMOTE to balance the training data
    smote = SMOTE(random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

    # Step 2: Create and train the Random Forest model
    ada_model = AdaBoostClassifier(random_state=42)
    ada_model.fit(X_train_smote, y_train_smote)

    # Step 3: Evaluate on dev set
    y_dev_pred = ada_model.predict(X_dev)
    y_dev_prob = ada_model.predict_proba(X_dev)[:, 1]
    accuracy_dev = accuracy_score(y_dev, y_dev_pred)
    eer_dev, tdcf_dev = eval_metr(y_dev, y_dev_prob, 0.1588, 2.1007)
    dev_report = classification_report(y_dev, y_dev_pred)

    # Step 4: Evaluate on eval set
    y_eval_pred = ada_model.predict(X_eval)
    y_eval_prob = ada_model.predict_proba(X_eval)[:, 1]
    accuracy_eval = accuracy_score(y_eval, y_eval_pred)
    eer_eval, tdcf_eval = eval_metr(y_eval, y_eval_prob, 0.1847, 2.0173)
    eval_report = classification_report(y_eval, y_eval_pred)

    # Step 5: Print out the results
    print("\n=== Evaluation on Dev Set ===")
    print(f"Accuracy: {accuracy_dev:.4f}")
    print("Classification Report:")
    print(dev_report)
    print("Custom Eval Metrics:")
    print(f"EER on validation data: {eer_dev * 100:.2f}%")
    print(f"Min t-DCF on validation data: {tdcf_dev:.4f}")

    print("\n=== Evaluation on Eval Set ===")
    print(f"Accuracy: {accuracy_eval:.4f}")
    print("Classification Report:")
    print(eval_report)
    print("Custom Eval Metrics:")
    print(f"EER on testing data: {eer_eval * 100:.2f}%")
    print(f"Min t-DCF on testing data: {tdcf_eval:.4f}")

    # Step 6: Save the trained model
    joblib.dump(ada_model, 'adaboost_model.pkl')
    print("adaboost model saved as adaboost_model.pkl")

In [None]:
excluded_columns = ["label", "duration", "size", "spectral_bandwidth"]
adaboost_pipeline_all(train, dev, eval, excluded_columns)

## 2.3 XGBoost

### 2.3.1 Singular Features

In [None]:
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
def xgboost_pipeline(train, dev, eval):
    excluded_columns = ["label", "duration", "size", "spectral_bandwidth"]
    report_list = []
    csv_data = []

    # Iterate over all columns except excluded ones
    for col_name in train.columns:
        if col_name in excluded_columns:
            continue

        # Check for NaN values and print a warning if present
        if train[col_name].isna().any() or dev[col_name].isna().any() or eval[col_name].isna().any():
            print(f"Warning: Column '{col_name}' contains NaN values. Filling with mean.")

        # Step 1: Extract X_set and y_set from train, dev, and eval datasets
        X_train, y_train = train[[col_name]].fillna(train[col_name].mean()), train['label']
        X_dev, y_dev = dev[[col_name]].fillna(dev[col_name].mean()), dev['label']
        X_eval, y_eval = eval[[col_name]].fillna(eval[col_name].mean()), eval['label']

        # Step 2: Apply SMOTE to balance the training data
        smote = SMOTE(random_state=42)
        X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

        # Step 3: Create and train the Random Forest model
        xgb_model = AdaBoostClassifier(random_state=42)
        xgb_model.fit(X_train_smote, y_train_smote)

        # Step 4: Evaluate on dev set
        y_dev_pred = xgb_model.predict(X_dev)
        y_dev_prob = xgb_model.predict_proba(X_dev)[:, 1]
        accuracy_dev = accuracy_score(y_dev, y_dev_pred)
        eer_dev, tdcf_dev = eval_metr(y_dev, y_dev_prob, 0.1588, 2.1007)
        dev_report = classification_report(y_dev, y_dev_pred)
    
        # Step 5: Evaluate on eval set
        y_eval_pred = xgb_model.predict(X_eval)
        y_eval_prob = xgb_model.predict_proba(X_eval)[:, 1]
        accuracy_eval = accuracy_score(y_eval, y_eval_pred)
        eer_eval, tdcf_eval = eval_metr(y_eval, y_eval_prob, 0.1847, 2.0173)
        eval_report = classification_report(y_eval, y_eval_pred)

        # Step 6: Store the results in the text report
        report_list.append(f"=== Evaluation for Feature: {col_name} ===\n")
        report_list.append("\n=== Evaluation on Dev Set ===")
        report_list.append(f"Accuracy: {accuracy_dev:.4f}")
        report_list.append("Classification Report:")
        report_list.append(dev_report)
        report_list.append("Custom Eval Metric:")
        report_list.append(f"EER on validation data: {eer_dev * 100:.2f}%")
        report_list.append(f"Min t-DCF on validation data: {tdcf_dev:.4f}\n")

        report_list.append("\n=== Evaluation on Eval Set ===")
        report_list.append(f"Accuracy: {accuracy_eval:.4f}")
        report_list.append("Classification Report:")
        report_list.append(eval_report)
        report_list.append("Custom Eval Metric:")
        report_list.append(f"EER on testing data: {eer_eval * 100:.2f}%")
        report_list.append(f"Min t-DCF on testing data: {tdcf_eval:.4f}\n\n")

        # Step 7: Store the results in the CSV data
        csv_data.append({
            "Feature": col_name,
            "Dev Accuracy": accuracy_dev,
            "Dev EER": eer_dev * 100,
            "Dev Min t-DCF": tdcf_dev,
            "Eval Accuracy": accuracy_eval,
            "Eval EER": eer_eval * 100,
            "Eval Min t-DCF": tdcf_eval
        })

    # Step 8: Save the detailed report to a text file
    with open('xgb_evaluation_report.txt', 'w') as f:
        f.writelines("\n".join(report_list))

    # Step 9: Save the CSV data to a file
    df_csv = pd.DataFrame(csv_data)
    df_csv.to_csv('xgb_evaluation_metrics.csv', index=False)

    print("Detailed evaluation report saved to xgb_evaluation_report.txt")
    print("Summary metrics saved to xgb_evaluation_metrics.csv")

In [None]:
xgboost_pipeline(train, dev, eval)

### 2.3.2 All Features

In [None]:
import joblib
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
def xgboost_pipeline_all(train, dev, eval, excluded_columns):
    # Step 1: Prepare the data
    feature_columns = [col for col in train.columns if col not in excluded_columns]
    
    # Function to impute NaN values with column mean
    def impute_missing_values(df, feature_columns):
        for col in feature_columns:
            if df[col].isna().any():
                print(f"Column '{col}' contains NaN values. Filling with mean.")
                df[col].fillna(df[col].mean(), inplace=True)
        return df
    
    # Impute missing values in train, dev, and eval datasets
    train = impute_missing_values(train, feature_columns)
    dev = impute_missing_values(dev, feature_columns)
    eval = impute_missing_values(eval, feature_columns)

    # Separate features and target
    X_train, y_train = train[feature_columns], train['label']
    X_dev, y_dev = dev[feature_columns], dev['label']
    X_eval, y_eval = eval[feature_columns], eval['label']

    # Step 1: Apply SMOTE to balance the training data
    smote = SMOTE(random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

    # Step 2: Create and train the Random Forest model
    xgb_model = XGBClassifier(random_state=42)
    xgb_model.fit(X_train_smote, y_train_smote)

    # Step 3: Evaluate on dev set
    y_dev_pred = xgb_model.predict(X_dev)
    y_dev_prob = xgb_model.predict_proba(X_dev)[:, 1]
    accuracy_dev = accuracy_score(y_dev, y_dev_pred)
    dev_report = classification_report(y_dev, y_dev_pred)
    eer_dev, tdcf_dev = eval_metr(y_dev, y_dev_prob, 0.1588, 2.1007)

    # Step 4: Evaluate on eval set
    y_eval_pred = xgb_model.predict(X_eval)
    y_eval_prob = xgb_model.predict_proba(X_eval)[:, 1]
    accuracy_eval = accuracy_score(y_eval, y_eval_pred)
    eval_report = classification_report(y_eval, y_eval_pred)
    eer_eval, tdcf_eval = eval_metr(y_eval, y_eval_prob, 0.1847, 2.0173)

    # Step 5: Print out the results
    print("\n=== Evaluation on Dev Set ===")
    print(f"Accuracy: {accuracy_dev:.4f}")
    print("Classification Report:")
    print(dev_report)
    print("Custom Eval Metrics:")
    print(f"EER on validation data: {eer_dev * 100:.2f}%")
    print(f"Min t-DCF on validation data: {tdcf_dev:.4f}")

    print("\n=== Evaluation on Eval Set ===")
    print(f"Accuracy: {accuracy_eval:.4f}")
    print("Classification Report:")
    print(eval_report)
    print("Custom Eval Metrics:")
    print(f"EER on testing data: {eer_eval * 100:.2f}%")
    print(f"Min t-DCF on testing data: {tdcf_eval:.4f}")

    # Step 6: Save the trained model
    joblib.dump(xgb_model, 'xgboost_model.pkl')
    print("xgboost model saved as xgboost_model.pkl")

In [None]:
excluded_columns = ["label", "duration", "size", "spectral_bandwidth"]
xgboost_pipeline_all(train, dev, eval, excluded_columns)

# 3. Preselect Features

## 3.1 Feature initialization

In [None]:
def filter_features(train, dev, eval):
    SELECT_FEATURES = ['bit_rate', 'lfcc', 'mindom', 'size', 'spectral_flatness', 'spectral_centroid', 'spectral_mean', 'spectral_rms', 'spectral_spread']

    # Step 1: Select only the columns in SELECT_FEATURES plus 'label'
    selected_columns = SELECT_FEATURES + ['label']

    # Filter the datasets to include only the selected columns
    train_filtered = train[selected_columns]
    dev_filtered = dev[selected_columns]
    eval_filtered = eval[selected_columns]

    return train_filtered, dev_filtered, eval_filtered

train_filtered, dev_filtered, eval_filtered = filter_features(train, dev, eval)

## 3.2 Random Forest

In [None]:
import joblib
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
def rf_pipeline_select(train, dev, eval):
    # Step 1: Prepare the data
    feature_columns = [col for col in train.columns if col != 'label']
    
    # Function to impute NaN values with column mean
    def impute_missing_values(df, feature_columns):
        for col in feature_columns:
            if df[col].isna().any():
                print(f"Column '{col}' contains NaN values. Filling with mean.")
                df[col].fillna(df[col].mean(), inplace=True)
        return df
    
    # Impute missing values in train, dev, and eval datasets
    train = impute_missing_values(train, feature_columns)
    dev = impute_missing_values(dev, feature_columns)
    eval = impute_missing_values(eval, feature_columns)

    # Separate features and target
    X_train, y_train = train[feature_columns], train['label']
    X_dev, y_dev = dev[feature_columns], dev['label']
    X_eval, y_eval = eval[feature_columns], eval['label']

    # Step 2: Apply SMOTE to balance the training data
    smote = SMOTE(random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

    # Step 3: Create and train the Random Forest model
    rf_model = RandomForestClassifier(random_state=42)
    rf_model.fit(X_train_smote, y_train_smote)

    # Step 4: Evaluate on dev set
    y_dev_pred = rf_model.predict(X_dev)
    y_dev_prob = rf_model.predict_proba(X_dev)[:, 1]
    accuracy_dev = accuracy_score(y_dev, y_dev_pred)
    eer_dev, tdcf_dev = eval_metr(y_dev, y_dev_prob, 0.1588, 2.1007)
    dev_report = classification_report(y_dev, y_dev_pred)
    
    # Step 5: Evaluate on eval set
    y_eval_pred = rf_model.predict(X_eval)
    y_eval_prob = rf_model.predict_proba(X_eval)[:, 1]
    accuracy_eval = accuracy_score(y_eval, y_eval_pred)
    eer_eval, tdcf_eval = eval_metr(y_eval, y_eval_prob, 0.1847, 2.0173)
    eval_report = classification_report(y_eval, y_eval_pred)

    # Step 6: Print out the results
    print("\n=== Evaluation on Dev Set ===")
    print(f"Accuracy: {accuracy_dev:.4f}")
    print("Classification Report:")
    print(dev_report)
    print("Custom Eval Metrics:")
    print(f"EER on validation data: {eer_dev * 100:.2f}%")
    print(f"Min t-DCF on validation data: {tdcf_dev:.4f}")

    print("\n=== Evaluation on Eval Set ===")
    print(f"Accuracy: {accuracy_eval:.4f}")
    print("Classification Report:")
    print(eval_report)
    print("Custom Eval Metrics:")
    print(f"EER on testing data: {eer_eval * 100:.2f}%")
    print(f"Min t-DCF on testing data: {tdcf_eval:.4f}")

    # Step 7: Save the trained model
    joblib.dump(rf_model, 'rf_select_model.pkl')
    print("Random Forest model saved as rf_select_model.pkl")

In [None]:
rf_pipeline_select(train_filtered, dev_filtered, eval_filtered)

## 3.3 AdaBoost

In [None]:
import joblib
from sklearn.ensemble import AdaBoostClassifier
from imblearn.over_sampling import SMOTE
def ada_pipeline_select(train, dev, eval):
    # Step 1: Prepare the data
    feature_columns = [col for col in train.columns if col != 'label']
    
    # Function to impute NaN values with column mean
    def impute_missing_values(df, feature_columns):
        for col in feature_columns:
            if df[col].isna().any():
                print(f"Column '{col}' contains NaN values. Filling with mean.")
                df[col].fillna(df[col].mean(), inplace=True)
        return df
    
    # Impute missing values in train, dev, and eval datasets
    train = impute_missing_values(train, feature_columns)
    dev = impute_missing_values(dev, feature_columns)
    eval = impute_missing_values(eval, feature_columns)

    # Separate features and target
    X_train, y_train = train[feature_columns], train['label']
    X_dev, y_dev = dev[feature_columns], dev['label']
    X_eval, y_eval = eval[feature_columns], eval['label']

    # Step 2: Apply SMOTE to balance the training data
    smote = SMOTE(random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

    # Step 3: Create and train the Random Forest model
    ada_model = AdaBoostClassifier(random_state=42)
    ada_model.fit(X_train_smote, y_train_smote)

    # Step 4: Evaluate on dev set
    y_dev_pred = ada_model.predict(X_dev)
    y_dev_prob = ada_model.predict_proba(X_dev)[:, 1]
    accuracy_dev = accuracy_score(y_dev, y_dev_pred)
    eer_dev, tdcf_dev = eval_metr(y_dev, y_dev_prob, 0.1588, 2.1007)
    dev_report = classification_report(y_dev, y_dev_pred)
    
    # Step 5: Evaluate on eval set
    y_eval_pred = ada_model.predict(X_eval)
    y_eval_prob = ada_model.predict_proba(X_eval)[:, 1]
    accuracy_eval = accuracy_score(y_eval, y_eval_pred)
    eer_eval, tdcf_eval = eval_metr(y_eval, y_eval_prob, 0.1847, 2.0173)
    eval_report = classification_report(y_eval, y_eval_pred)

    # Step 6: Print out the results
    print("\n=== Evaluation on Dev Set ===")
    print(f"Accuracy: {accuracy_dev:.4f}")
    print("Classification Report:")
    print(dev_report)
    print("Custom Eval Metrics:")
    print(f"EER on validation data: {eer_dev * 100:.2f}%")
    print(f"Min t-DCF on validation data: {tdcf_dev:.4f}")

    print("\n=== Evaluation on Eval Set ===")
    print(f"Accuracy: {accuracy_eval:.4f}")
    print("Classification Report:")
    print(eval_report)
    print("Custom Eval Metrics:")
    print(f"EER on testing data: {eer_eval * 100:.2f}%")
    print(f"Min t-DCF on testing data: {tdcf_eval:.4f}")

    # Step 7: Save the trained model
    joblib.dump(ada_model, 'ada_select_model.pkl')
    print("Adaboost model saved as ada_select_model.pkl")

In [None]:
ada_pipeline_select(train_filtered, dev_filtered, eval_filtered)

## 3.4 XGBoost

In [None]:
import joblib
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
def xgb_pipeline_select(train, dev, eval):
    # Step 1: Prepare the data
    feature_columns = [col for col in train.columns if col != 'label']
    
    # Function to impute NaN values with column mean
    def impute_missing_values(df, feature_columns):
        for col in feature_columns:
            if df[col].isna().any():
                print(f"Column '{col}' contains NaN values. Filling with mean.")
                df[col].fillna(df[col].mean(), inplace=True)
        return df
    
    # Impute missing values in train, dev, and eval datasets
    train = impute_missing_values(train, feature_columns)
    dev = impute_missing_values(dev, feature_columns)
    eval = impute_missing_values(eval, feature_columns)

    # Separate features and target
    X_train, y_train = train[feature_columns], train['label']
    X_dev, y_dev = dev[feature_columns], dev['label']
    X_eval, y_eval = eval[feature_columns], eval['label']

    # Step 2: Apply SMOTE to balance the training data
    smote = SMOTE(random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

    # Step 3: Create and train the Random Forest model
    xgb_model = XGBClassifier(random_state=42)
    xgb_model.fit(X_train_smote, y_train_smote)

    # Step 4: Evaluate on dev set
    y_dev_pred = xgb_model.predict(X_dev)
    y_dev_prob = xgb_model.predict_proba(X_dev)[:, 1]
    accuracy_dev = accuracy_score(y_dev, y_dev_pred)
    eer_dev, tdcf_dev = eval_metr(y_dev, y_dev_prob, 0.1588, 2.1007)
    dev_report = classification_report(y_dev, y_dev_pred)
    
    # Step 5: Evaluate on eval set
    y_eval_pred = xgb_model.predict(X_eval)
    y_eval_prob = xgb_model.predict_proba(X_eval)[:, 1]
    accuracy_eval = accuracy_score(y_eval, y_eval_pred)
    eer_eval, tdcf_eval = eval_metr(y_eval, y_eval_prob, 0.1847, 2.0173)
    eval_report = classification_report(y_eval, y_eval_pred)

    # Step 6: Print out the results
    print("\n=== Evaluation on Dev Set ===")
    print(f"Accuracy: {accuracy_dev:.4f}")
    print("Classification Report:")
    print(dev_report)
    print("Custom Eval Metrics:")
    print(f"EER on validation data: {eer_dev * 100:.2f}%")
    print(f"Min t-DCF on validation data: {tdcf_dev:.4f}")

    print("\n=== Evaluation on Eval Set ===")
    print(f"Accuracy: {accuracy_eval:.4f}")
    print("Classification Report:")
    print(eval_report)
    print("Custom Eval Metrics:")
    print(f"EER on testing data: {eer_eval * 100:.2f}%")
    print(f"Min t-DCF on testing data: {tdcf_eval:.4f}")

    # Step 7: Save the trained model
    joblib.dump(xgb_model, 'xgb_select_model.pkl')
    print("XGBoost model saved as xgb_select_model.pkl")

In [None]:
xgb_pipeline_select(train_filtered, dev_filtered, eval_filtered)

# 4. Bayesian Optimization on Select Features
Bayesian Optimization using optuna

## 4.1 Optuna Initialization

In [None]:
import optuna
import joblib
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier

SELECT_FEATURES = ['bit_rate', 'lfcc', 'mindom', 'size', 'spectral_flatness', 'spectral_centroid', 'spectral_mean', 'spectral_rms', 'spectral_spread']

def prepare_data(train, dev):
    # Filter datasets using SELECT_FEATURES
    X_train, y_train = train[SELECT_FEATURES], train['label']
    X_dev, y_dev = dev[SELECT_FEATURES], dev['label']

    # Apply SMOTE to balance the training data
    smote = SMOTE(random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

    return X_train_smote, y_train_smote, X_dev, y_dev
    
def objective(trial, model_type, X_train, y_train, X_dev, y_dev):
    if model_type == 'RandomForest':
        # Hyperparameters for Random Forest
        n_estimators = trial.suggest_int('n_estimators', 50, 300)
        max_depth = trial.suggest_int('max_depth', 10, 100)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)
        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=42
        )

    elif model_type == 'AdaBoost':
        # Hyperparameters for AdaBoost
        n_estimators = trial.suggest_int('n_estimators', 50, 300)
        learning_rate = trial.suggest_float('learning_rate', 0.01, 1.0)
        model = AdaBoostClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            random_state=42
        )

    elif model_type == 'XGBoost':
        # Hyperparameters for XGBoost
        n_estimators = trial.suggest_int('n_estimators', 50, 300)
        max_depth = trial.suggest_int('max_depth', 3, 10)
        learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
        subsample = trial.suggest_float('subsample', 0.5, 1.0)
        model = XGBClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate,
            subsample=subsample,
            random_state=42,
            use_label_encoder=False,
            eval_metric='logloss'
        )

    # Train the model
    model.fit(X_train, y_train)
    y_dev_prob = model.predict_proba(X_dev)[:, 1]
    eer, min_tdcf = eval_metr(y_dev, y_dev_prob, C0=0.1588, C1=2.1007)

    return eer

# Function to optimize models using Optuna
def optimize_model(train, dev, model_type):
    X_train, y_train, X_dev, y_dev = prepare_data(train, dev)

    study = optuna.create_study(direction='minimize')  # Minimize EER
    study.optimize(lambda trial: objective(trial, model_type, X_train, y_train, X_dev, y_dev), n_trials=50)

    print(f"Best trial for {model_type}: {study.best_trial.params}")
    print(f"Best EER for {model_type}: {study.best_value:.4f}")
    return study

In [None]:
# Optimize Random Forest
rf_study = optimize_model(train, eval, 'RandomForest')

# Optimize AdaBoost
ada_study = optimize_model(train, eval, 'AdaBoost')

# Optimize XGBoost
xgb_study = optimize_model(train, eval, 'XGBoost')

# Save the best model
joblib.dump(rf_study.best_trial.params, 'best_rf_model_params.pkl')
joblib.dump(ada_study.best_trial.params, 'best_adaboost_model_params.pkl')
joblib.dump(xgb_study.best_trial.params, 'best_xgboost_model_params.pkl')

## 4.2 Feature Selection

In [None]:
def filter_features(train, dev, eval):
    SELECT_FEATURES = ['bit_rate', 'lfcc', 'mindom', 'spectral_flatness', 'spectral_centroid', 'spectral_mean', 'spectral_rms', 'spectral_spread']

    # Step 1: Select only the columns in SELECT_FEATURES plus 'label'
    selected_columns = SELECT_FEATURES + ['label']

    # Filter the datasets to include only the selected columns
    train_filtered = train[selected_columns]
    dev_filtered = dev[selected_columns]
    eval_filtered = eval[selected_columns]

    return train_filtered, dev_filtered, eval_filtered

train_filtered, dev_filtered, eval_filtered = filter_features(train, dev, eval)

## 4.3 Random Forest

In [None]:
params = {'n_estimators': 191, 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 5}
def rf_pipeline_select(train, dev, eval):
    # Step 1: Prepare the data
    feature_columns = [col for col in train.columns if col != 'label']
    
    # Function to impute NaN values with column mean
    def impute_missing_values(df, feature_columns):
        for col in feature_columns:
            if df[col].isna().any():
                print(f"Column '{col}' contains NaN values. Filling with mean.")
                df[col].fillna(df[col].mean(), inplace=True)
        return df
    
    # Impute missing values in train, dev, and eval datasets
    train = impute_missing_values(train, feature_columns)
    dev = impute_missing_values(dev, feature_columns)
    eval = impute_missing_values(eval, feature_columns)

    # Separate features and target
    X_train, y_train = train[feature_columns], train['label']
    X_dev, y_dev = dev[feature_columns], dev['label']
    X_eval, y_eval = eval[feature_columns], eval['label']

    # Step 2: Apply SMOTE to balance the training data
    smote = SMOTE(random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

    # Step 3: Create and train the Random Forest model
    rf_model = RandomForestClassifier(random_state=42, **params)
    rf_model.fit(X_train_smote, y_train_smote)

    # Step 4: Evaluate on dev set
    y_dev_pred = rf_model.predict(X_dev)
    y_dev_prob = rf_model.predict_proba(X_dev)[:, 1]
    accuracy_dev = accuracy_score(y_dev, y_dev_pred)
    eer_dev, tdcf_dev = eval_metr(y_dev, y_dev_prob, 0.1588, 2.1007)
    dev_report = classification_report(y_dev, y_dev_pred)
    
    # Step 5: Evaluate on eval set
    y_eval_pred = rf_model.predict(X_eval)
    y_eval_prob = rf_model.predict_proba(X_eval)[:, 1]
    accuracy_eval = accuracy_score(y_eval, y_eval_pred)
    eer_eval, tdcf_eval = eval_metr(y_eval, y_eval_prob, 0.1847, 2.0173)
    eval_report = classification_report(y_eval, y_eval_pred)

    # Step 6: Print out the results
    print("\n=== Evaluation on Dev Set ===")
    print(f"Accuracy: {accuracy_dev:.4f}")
    print("Classification Report:")
    print(dev_report)
    print("Custom Eval Metrics:")
    print(f"EER on validation data: {eer_dev * 100:.2f}%")
    print(f"Min t-DCF on validation data: {tdcf_dev:.4f}")

    print("\n=== Evaluation on Eval Set ===")
    print(f"Accuracy: {accuracy_eval:.4f}")
    print("Classification Report:")
    print(eval_report)
    print("Custom Eval Metrics:")
    print(f"EER on testing data: {eer_eval * 100:.2f}%")
    print(f"Min t-DCF on testing data: {tdcf_eval:.4f}")

    # Step 7: Save the trained model
    joblib.dump(rf_model, 'rf_select_model.pkl')
    print("Random Forest model saved as rf_select_model.pkl")

In [None]:
rf_pipeline_select(train_filtered, dev_filtered, eval_filtered)

## 4.4 AdaBoost

In [None]:
params = {'n_estimators': 116, 'learning_rate': 0.05192761614314263}
def ada_pipeline_select(train, dev, eval):
    # Step 1: Prepare the data
    feature_columns = [col for col in train.columns if col != 'label']
    
    # Function to impute NaN values with column mean
    def impute_missing_values(df, feature_columns):
        for col in feature_columns:
            if df[col].isna().any():
                print(f"Column '{col}' contains NaN values. Filling with mean.")
                df[col].fillna(df[col].mean(), inplace=True)
        return df
    
    # Impute missing values in train, dev, and eval datasets
    train = impute_missing_values(train, feature_columns)
    dev = impute_missing_values(dev, feature_columns)
    eval = impute_missing_values(eval, feature_columns)

    # Separate features and target
    X_train, y_train = train[feature_columns], train['label']
    X_dev, y_dev = dev[feature_columns], dev['label']
    X_eval, y_eval = eval[feature_columns], eval['label']

    # Step 2: Apply SMOTE to balance the training data
    smote = SMOTE(random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

    # Step 3: Create and train the Random Forest model
    ada_model = AdaBoostClassifier(random_state=42, **params)
    ada_model.fit(X_train_smote, y_train_smote)

    # Step 4: Evaluate on dev set
    y_dev_pred = ada_model.predict(X_dev)
    y_dev_prob = ada_model.predict_proba(X_dev)[:, 1]
    accuracy_dev = accuracy_score(y_dev, y_dev_pred)
    eer_dev, tdcf_dev = eval_metr(y_dev, y_dev_prob, 0.1588, 2.1007)
    dev_report = classification_report(y_dev, y_dev_pred)
    
    # Step 5: Evaluate on eval set
    y_eval_pred = ada_model.predict(X_eval)
    y_eval_prob = ada_model.predict_proba(X_eval)[:, 1]
    accuracy_eval = accuracy_score(y_eval, y_eval_pred)
    eer_eval, tdcf_eval = eval_metr(y_eval, y_eval_prob, 0.1847, 2.0173)
    eval_report = classification_report(y_eval, y_eval_pred)

    # Step 6: Print out the results
    print("\n=== Evaluation on Dev Set ===")
    print(f"Accuracy: {accuracy_dev:.4f}")
    print("Classification Report:")
    print(dev_report)
    print("Custom Eval Metrics:")
    print(f"EER on validation data: {eer_dev * 100:.2f}%")
    print(f"Min t-DCF on validation data: {tdcf_dev:.4f}")

    print("\n=== Evaluation on Eval Set ===")
    print(f"Accuracy: {accuracy_eval:.4f}")
    print("Classification Report:")
    print(eval_report)
    print("Custom Eval Metrics:")
    print(f"EER on testing data: {eer_eval * 100:.2f}%")
    print(f"Min t-DCF on testing data: {tdcf_eval:.4f}")

    # Step 7: Save the trained model
    joblib.dump(ada_model, 'ada_select_model.pkl')
    print("Adaboost model saved as ada_select_model.pkl")

In [None]:
ada_pipeline_select(train_filtered, dev_filtered, eval_filtered)

## 4.5 XGBoost

In [None]:
params = {'n_estimators': 68, 'max_depth': 7, 'learning_rate': 0.026718036292448403, 'subsample': 0.6261578971619668}
def xgb_pipeline_select(train, dev, eval):
    # Step 1: Prepare the data
    feature_columns = [col for col in train.columns if col != 'label']
    
    # Function to impute NaN values with column mean
    def impute_missing_values(df, feature_columns):
        for col in feature_columns:
            if df[col].isna().any():
                print(f"Column '{col}' contains NaN values. Filling with mean.")
                df[col].fillna(df[col].mean(), inplace=True)
        return df
    
    # Impute missing values in train, dev, and eval datasets
    train = impute_missing_values(train, feature_columns)
    dev = impute_missing_values(dev, feature_columns)
    eval = impute_missing_values(eval, feature_columns)

    # Separate features and target
    X_train, y_train = train[feature_columns], train['label']
    X_dev, y_dev = dev[feature_columns], dev['label']
    X_eval, y_eval = eval[feature_columns], eval['label']

    # Step 2: Apply SMOTE to balance the training data
    smote = SMOTE(random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

    # Step 3: Create and train the Random Forest model
    xgb_model = XGBClassifier(random_state=42, **params)
    xgb_model.fit(X_train_smote, y_train_smote)

    # Step 4: Evaluate on dev set
    y_dev_pred = xgb_model.predict(X_dev)
    y_dev_prob = xgb_model.predict_proba(X_dev)[:, 1]
    accuracy_dev = accuracy_score(y_dev, y_dev_pred)
    eer_dev, tdcf_dev = eval_metr(y_dev, y_dev_prob, 0.1588, 2.1007)
    dev_report = classification_report(y_dev, y_dev_pred)
    
    # Step 5: Evaluate on eval set
    y_eval_pred = xgb_model.predict(X_eval)
    y_eval_prob = xgb_model.predict_proba(X_eval)[:, 1]
    accuracy_eval = accuracy_score(y_eval, y_eval_pred)
    eer_eval, tdcf_eval = eval_metr(y_eval, y_eval_prob, 0.1847, 2.0173)
    eval_report = classification_report(y_eval, y_eval_pred)

    # Step 6: Print out the results
    print("\n=== Evaluation on Dev Set ===")
    print(f"Accuracy: {accuracy_dev:.4f}")
    print("Classification Report:")
    print(dev_report)
    print("Custom Eval Metrics:")
    print(f"EER on validation data: {eer_dev * 100:.2f}%")
    print(f"Min t-DCF on validation data: {tdcf_dev:.4f}")

    print("\n=== Evaluation on Eval Set ===")
    print(f"Accuracy: {accuracy_eval:.4f}")
    print("Classification Report:")
    print(eval_report)
    print("Custom Eval Metrics:")
    print(f"EER on testing data: {eer_eval * 100:.2f}%")
    print(f"Min t-DCF on testing data: {tdcf_eval:.4f}")

    # Step 7: Save the trained model
    joblib.dump(xgb_model, 'xgb_select_model.pkl')
    print("XGBoost model saved as xgb_select_model.pkl")

In [None]:
xgb_pipeline_select(train_filtered, dev_filtered, eval_filtered)

# 5. Feature Importance

In [None]:
import joblib
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import matplotlib.pyplot as plt

# Shared pipeline function for all models
def model_pipeline(train, dev, eval, excluded_columns, model, model_name, eval_metr_params):
    feature_columns = [col for col in train.columns if col not in excluded_columns]

    def impute_missing_values(df, feature_columns):
        for col in feature_columns:
            if df[col].isna().any():
                print(f"Column '{col}' contains NaN values. Filling with mean.")
                df[col].fillna(df[col].mean(), inplace=True)
        return df

    train = impute_missing_values(train, feature_columns)
    dev = impute_missing_values(dev, feature_columns)
    eval = impute_missing_values(eval, feature_columns)

    X_train, y_train = train[feature_columns], train['label']
    X_dev, y_dev = dev[feature_columns], dev['label']
    X_eval, y_eval = eval[feature_columns], eval['label']

    smote = SMOTE(random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

    model.fit(X_train_smote, y_train_smote)

    y_eval_pred = model.predict(X_eval)
    y_eval_prob = model.predict_proba(X_eval)[:, 1]
    accuracy_eval = accuracy_score(y_eval, y_eval_pred)
    eer_eval, tdcf_eval = eval_metr(y_eval, y_eval_prob, *eval_metr_params['eval'])
    eval_report = classification_report(y_eval, y_eval_pred)

    print(f"\n=== {model_name} Evaluation on Eval Set ===")
    print(f"Accuracy: {accuracy_eval:.4f}")
    print("Classification Report:")
    print(eval_report)
    print("Custom Eval Metrics:")
    print(f"EER on testing data: {eer_eval * 100:.2f}%")
    print(f"Min t-DCF on testing data: {tdcf_eval:.4f}")

    # Feature Importance
    if hasattr(model, 'feature_importances_'):
        feature_importance = pd.DataFrame({
            'Feature': feature_columns,
            'Importance': model.feature_importances_
        }).sort_values(by='Importance', ascending=False)
        print(f"\n=== {model_name} Feature Importance ===")
        print(feature_importance.head(20))
        # Plot top 20 features
        top_features = feature_importance.head(20)
        plt.figure(figsize=(10, 8))
        plt.barh(top_features['Feature'][::-1], top_features['Importance'][::-1], color='skyblue')
        plt.xlabel('Importance')
        plt.title(f'Top 20 Features - {model_name}')
        plt.tight_layout()
        plt.show()
        print(f"Feature importance plot saved as {model_name.lower()}_feature_importance_plot.png")
    else:
        print(f"{model_name} does not support feature importance extraction.")

# Example usage:
eval_metr_params = {
    'dev': [0.1588, 2.1007],
    'eval': [0.1847, 2.0173]
}

excluded_columns = ["label", "duration", "size", "spectral_bandwidth"]

## 5.1 Random Forest

In [None]:
# Random Forest
random_forest = RandomForestClassifier(random_state=42)
model_pipeline(train, dev, eval, excluded_columns, random_forest, "Random Forest", eval_metr_params)

## 5.2 AdaBoost

In [None]:
# AdaBoost
adaboost = AdaBoostClassifier(random_state=42)
model_pipeline(train, dev, eval, excluded_columns, adaboost, "AdaBoost", eval_metr_params)

## 5.3 XGBoost

In [None]:
# XGBoost
xgboost = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
model_pipeline(train, dev, eval, excluded_columns, xgboost, "XGBoost", eval_metr_params)

## 5.4 Conclusion

|Feature|XGBoost Importance|Random Forest Importance|AdaBoost Importance|Selected?|
|-|-|-|-|-|
|minfun	|0.119968	|0.100231	|0.22	|✅|
|spectral_entropy	|0.119771	|0.106699	|0.04	|✅|
|meanfun	|0.049779	|0.101235	|0.08	|✅|
|mode_frequency	|0.092469	|0.089528	|0.08	|✅|
|bit_rate	|0.027392	|0.075031	|0.06	|✅|
|peak_frequency	|N/A	|0.075427	|0.20	|✅|
|energy	|0.032100	|0.075796	|N/A	|✅|
|zcr	|0.029568	|0.046573	|0.04	|✅|
|modindex	|0.020655	|0.035107	|0.04	|✅|
|dfrange	|0.020519	|0.021332	|0.06	|✅|

# 6. Model with Top 10 Features

In [None]:
import joblib
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import matplotlib.pyplot as plt

# Shared pipeline function for all models
def model_pipeline(train, dev, eval, excluded_columns, model, model_name, eval_metr_params):
    selected_features = [
        "minfun", "spectral_entropy", "meanfun", "mode_frequency", "bit_rate", 
        "peak_frequency", "energy", "zcr", "modindex", "dfrange"
    ]

    def impute_missing_values(df, feature_columns):
        for col in feature_columns:
            if df[col].isna().any():
                print(f"Column '{col}' contains NaN values. Filling with mean.")
                df[col].fillna(df[col].mean(), inplace=True)
        return df

    train = impute_missing_values(train, selected_features)
    dev = impute_missing_values(dev, selected_features)
    eval = impute_missing_values(eval, selected_features)

    X_train, y_train = train[selected_features], train['label']
    X_dev, y_dev = dev[selected_features], dev['label']
    X_eval, y_eval = eval[selected_features], eval['label']

    smote = SMOTE(random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

    model.fit(X_train_smote, y_train_smote)

    y_dev_pred = model.predict(X_dev)
    y_dev_prob = model.predict_proba(X_dev)[:, 1]
    accuracy_dev = accuracy_score(y_dev, y_dev_pred)
    eer_dev, tdcf_dev = eval_metr(y_dev, y_dev_prob, *eval_metr_params['dev'])
    dev_report = classification_report(y_dev, y_dev_pred)

    y_eval_pred = model.predict(X_eval)
    y_eval_prob = model.predict_proba(X_eval)[:, 1]
    accuracy_eval = accuracy_score(y_eval, y_eval_pred)
    eer_eval, tdcf_eval = eval_metr(y_eval, y_eval_prob, *eval_metr_params['eval'])
    eval_report = classification_report(y_eval, y_eval_pred)

    print(f"\n=== {model_name} Evaluation on Dev Set ===")
    print(f"Accuracy: {accuracy_dev:.4f}")
    print("Classification Report:")
    print(dev_report)
    print("Custom Eval Metrics:")
    print(f"EER on validation data: {eer_dev * 100:.2f}%")
    print(f"Min t-DCF on validation data: {tdcf_dev:.4f}")

    print(f"\n=== {model_name} Evaluation on Eval Set ===")
    print(f"Accuracy: {accuracy_eval:.4f}")
    print("Classification Report:")
    print(eval_report)
    print("Custom Eval Metrics:")
    print(f"EER on testing data: {eer_eval * 100:.2f}%")
    print(f"Min t-DCF on testing data: {tdcf_eval:.4f}")

    joblib.dump(model, f'{model_name.lower()}_model.pkl')
    print(f"{model_name} model saved as {model_name.lower()}_model.pkl")

    # Feature Importance
    if hasattr(model, 'feature_importances_'):
        feature_importance = pd.DataFrame({
            'Feature': selected_features,
            'Importance': model.feature_importances_
        }).sort_values(by='Importance', ascending=False)
        print(f"\n=== {model_name} Feature Importance ===")
        print(feature_importance)
        feature_importance.to_csv(f'{model_name.lower()}_feature_importance.csv', index=False)
        print(f"Feature importance saved as {model_name.lower()}_feature_importance.csv")

        # Plot top features
        plt.figure(figsize=(10, 8))
        bars = plt.barh(feature_importance['Feature'][::-1], feature_importance['Importance'][::-1], color='skyblue')
        plt.xlabel('Importance')
        plt.title(f'Top Features - {model_name}')
        plt.tight_layout()
        plt.savefig(f'{model_name.lower()}_feature_importance_plot.png')
        plt.show()
        print(f"Feature importance plot saved as {model_name.lower()}_feature_importance_plot.png")
    else:
        print(f"{model_name} does not support feature importance extraction.")

# Example usage:
eval_metr_params = {
    'dev': [0.1588, 2.1007],
    'eval': [0.1847, 2.0173]
}

excluded_columns = ["label", "duration", "size", "spectral_bandwidth"]

## 6.1 Random Forest

In [None]:
# Random Forest
random_forest = RandomForestClassifier(random_state=42)
model_pipeline(train, dev, eval, excluded_columns, random_forest, "Random Forest", eval_metr_params)

## 6.2 AdaBoost

In [None]:
# AdaBoost
adaboost = AdaBoostClassifier(random_state=42)
model_pipeline(train, dev, eval, excluded_columns, adaboost, "AdaBoost", eval_metr_params)

## 6.3 XGBoost

In [None]:
# XGBoost
xgboost = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
model_pipeline(train, dev, eval, excluded_columns, xgboost, "XGBoost", eval_metr_params)

# 7. Oversampling

In [None]:
import joblib
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import matplotlib.pyplot as plt

# Shared pipeline function for all models with oversampling and undersampling options
def model_pipeline_balanced(train, dev, eval, excluded_columns, model, model_name, eval_metr_params, resampling_method='smote'):
    selected_features = [
        "minfun", "spectral_entropy", "meanfun", "mode_frequency", "bit_rate", 
        "peak_frequency", "energy", "zcr", "modindex", "dfrange"
    ]

    def impute_missing_values(df, feature_columns):
        for col in feature_columns:
            if df[col].isna().any():
                print(f"Column '{col}' contains NaN values. Filling with mean.")
                df[col].fillna(df[col].mean(), inplace=True)
        return df

    train = impute_missing_values(train, selected_features)
    dev = impute_missing_values(dev, selected_features)
    eval = impute_missing_values(eval, selected_features)

    X_train, y_train = train[selected_features], train['label']
    X_dev, y_dev = dev[selected_features], dev['label']
    X_eval, y_eval = eval[selected_features], eval['label']

    # Apply resampling method (SMOTE or undersampling)
    if resampling_method == 'smote':
        resampler = SMOTE(random_state=42)
        X_train_resampled, y_train_resampled = resampler.fit_resample(X_train, y_train)
    elif resampling_method == 'undersample':
        resampler = RandomUnderSampler(random_state=42)
        X_train_resampled, y_train_resampled = resampler.fit_resample(X_train, y_train)
    else:
        raise ValueError(f"Unsupported resampling method: {resampling_method}")

    # Train the model
    model.fit(X_train_resampled, y_train_resampled)

    # Evaluate on the evaluation set
    y_eval_pred = model.predict(X_eval)
    y_eval_prob = model.predict_proba(X_eval)[:, 1]
    accuracy_eval = accuracy_score(y_eval, y_eval_pred)
    eer_eval, tdcf_eval = eval_metr(y_eval, y_eval_prob, *eval_metr_params['eval'])
    eval_report = classification_report(y_eval, y_eval_pred)

    print(f"\n=== {model_name} Evaluation on Eval Set using {resampling_method.upper()} ===")
    print(f"Accuracy: {accuracy_eval:.4f}")
    print("Classification Report:")
    print(eval_report)
    print("Custom Eval Metrics:")
    print(f"EER on testing data: {eer_eval * 100:.2f}%")
    print(f"Min t-DCF on testing data: {tdcf_eval:.4f}")

# Example usage with both SMOTE and undersampling:
eval_metr_params = {
    'dev': [0.1588, 2.1007],
    'eval': [0.1847, 2.0173]
}

excluded_columns = ["label", "duration", "size", "spectral_bandwidth"]

## 7.1 Random Forest

In [None]:
rf_model = RandomForestClassifier(random_state=42)
model_pipeline_balanced(train, dev, eval, excluded_columns, rf_model, "Random Forest", eval_metr_params, resampling_method='undersample')

## 7.2 AdaBoost

In [None]:
# AdaBoost
adaboost = AdaBoostClassifier(random_state=42)
model_pipeline_balanced(train, dev, eval, excluded_columns, adaboost, "AdaBoost", eval_metr_params, resampling_method='undersample')

## 7.3 XGBoost

In [None]:
# XGBoost
xgboost = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
model_pipeline_balanced(train, dev, eval, excluded_columns, adaboost, "XGBoost", eval_metr_params, resampling_method='undersample')

# 8. System Specific Features

## 8.1 All System ID

In [2]:
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings("ignore")

# ganti dataset ngikut upload nnti
train = pd.read_csv("/kaggle/input/borzi-full/train_set.csv")
dev = pd.read_csv("/kaggle/input/borzi-full/dev_set.csv")
eval = pd.read_csv("/kaggle/input/borzi-full/eval_set.csv")

# Ganti value label
train['label'] = train['label'].map({'bonafide': 1, 'spoof': 0})
dev['label'] = dev['label'].map({'bonafide': 1, 'spoof': 0})
eval['label'] = eval['label'].map({'bonafide': 1, 'spoof': 0})

data = pd.concat([train, dev, eval], axis=0, ignore_index=True, join='outer')

# Drop col gk penting
data = data.drop(['AUDIO_FILE_NAME', "duration", "size", "spectral_bandwidth"], axis=1)

In [3]:
from sklearn.model_selection import train_test_split

X = data.drop('label', axis=1)
y = data['label']

X_train, y_train, X_test, y_test = train_test_split(X, y, test_size=0.30, random_state=42, shuffle=True)

In [4]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(85022, 47)
(36438, 47)
(85022,)
(36438,)


### 8.1.1 Evaluation & Feature Importance
- Random Forest
- AdaBoost
- XGBoost

In [5]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_curve
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt


# Handle missing values
def handle_missing_values(data):
    for column in data.columns:
        if data[column].isnull().sum() > 0:
            if data[column].dtype in ['int64', 'float64']:
                data[column].fillna(data[column].mean(), inplace=True)
            else:
                data[column].fillna(data[column].mode()[0], inplace=True)
    return data


# Evaluate model performance
def evaluate_model(model, model_name, X_train, y_train, X_test, y_test):
    print(f"\n=== Evaluating {model_name} ===")
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predictions
    y_pred = model.predict(X_test)
    y_pred_probs = model.predict_proba(X_test)[:, 1]
    
    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    fpr, tpr, _ = roc_curve(y_test, y_pred_probs)
    eer = fpr[np.nanargmin(np.abs(fpr - (1 - tpr)))]
    
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(report)
    print(f"EER: {eer * 100:.2f}%")
    
    return model

# Train and test models on all SYSTEM_IDs
def train_and_test_all_system_ids(data):
    # Handle missing values
    data = handle_missing_values(data)
    
    # Get unique SYSTEM_IDs
    system_ids = data['SYSTEM_ID'].unique()
    system_ids = [sid for sid in system_ids if sid != 'bonafide']  # Exclude 'bonafide'

    models = {
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
        "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=42),
        "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    }

    # Dictionary to store trained models for feature importance
    trained_models = {}

    for sid in system_ids:
        print(f"\n=== SYSTEM_ID: {sid} ===")
        
        # Filter data for bonafide and the current SYSTEM_ID
        subset = data[(data['SYSTEM_ID'] == sid) | (data['SYSTEM_ID'] == 'bonafide')]
        
        # Show distribution of labels
        # print(subset['SYSTEM_ID'].value_counts())
        
        # Prepare features and labels
        X = subset.drop(['label', 'SYSTEM_ID'], axis=1)
        y = subset['label']
        feature_columns = X.columns  # Dynamically extract features for this subset

        # Print feature names and preview the data
        # print(f"Features for SYSTEM_ID {sid}: {list(feature_columns)}")
        # print(f"First rows of features for SYSTEM_ID {sid}:\n{X.head()}")
        
        # Split data into train and test sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42, shuffle=True)
        
        for model_name, model in models.items():
            print(f"\n--- Evaluating {model_name} for SYSTEM_ID: {sid} ---")
            trained_model = evaluate_model(model, model_name, X_train, y_train, X_test, y_test)
            print('\n')
            # Print trained model parameters
            # print(f"Trained {model_name} parameters for SYSTEM_ID {sid}:\n{trained_model.get_params()}")
            
            # # Extract and print feature importances
            # if hasattr(trained_model, 'feature_importances_'):
            #     feature_importances = trained_model.feature_importances_
            #     feature_importance_df = pd.DataFrame({
            #         'Feature': feature_columns,
            #         'Importance': feature_importances
            #     }).sort_values(by='Importance', ascending=False).head(20)
            #     print(f"Top 20 Feature Importances for {model_name}, SYSTEM_ID {sid}:\n{feature_importance_df}")
            
            # # Store the trained model and subset-specific features
            # trained_models[(sid, model_name)] = (trained_model, feature_columns)
    
    return trained_models

In [6]:
trained_models = train_and_test_all_system_ids(data)


=== SYSTEM_ID: A01 ===

--- Evaluating Random Forest for SYSTEM_ID: A01 ---

=== Evaluating Random Forest ===
Accuracy: 0.9763
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      2279
           1       0.98      0.98      0.98      3721

    accuracy                           0.98      6000
   macro avg       0.98      0.97      0.97      6000
weighted avg       0.98      0.98      0.98      6000

EER: 2.68%



--- Evaluating AdaBoost for SYSTEM_ID: A01 ---

=== Evaluating AdaBoost ===
Accuracy: 0.9707
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96      2279
           1       0.97      0.98      0.98      3721

    accuracy                           0.97      6000
   macro avg       0.97      0.97      0.97      6000
weighted avg       0.97      0.97      0.97      6000

EER: 3.03%



--- Evaluating XGBoost for SYSTEM_ID: A01 ---

=== 

KeyboardInterrupt: 

# 9. Select SYSTEM_ID

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import warnings
from sklearn.metrics import accuracy_score, classification_report, roc_curve
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")

In [None]:
# Load datasets
train = pd.read_csv("/kaggle/input/borzi-full/train_set.csv")
dev = pd.read_csv("/kaggle/input/borzi-full/dev_set.csv")
eval = pd.read_csv("/kaggle/input/borzi-full/eval_set.csv")

# Ganti value label
train['label'] = train['label'].map({'bonafide': 1, 'spoof': 0})
dev['label'] = dev['label'].map({'bonafide': 1, 'spoof': 0})
eval['label'] = eval['label'].map({'bonafide': 1, 'spoof': 0})

# Concatenate train and eval sets
data = pd.concat([train, eval], axis=0, ignore_index=True, join='outer')

# Handle missing values function
def handle_missing_values(data):
    for column in data.columns:
        if data[column].isnull().sum() > 0:
            if data[column].dtype in ['int64', 'float64']:
                data[column].fillna(data[column].mean(), inplace=True)
            else:
                data[column].fillna(data[column].mode()[0], inplace=True)
    return data

# Handle the missing value
data = handle_missing_values(data)

# Drop unnecessary columns
data = data.drop(['AUDIO_FILE_NAME', "duration", "size", "spectral_bandwidth"], axis=1) # msrcc & psrcc karena null

# Filter SYSTEM_ID to keep only specific values
valid_system_ids = ["A06", "A19"]
data = data[~data["SYSTEM_ID"].isin(valid_system_ids)] # ~ buat exclude

# Drop SYSTEM_ID column after filtering
data = data.drop(columns=["SYSTEM_ID"])

# Split features and labels
X = data.drop('label', axis=1)
y = data['label']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=24, shuffle=True)

## 9.1 Normal set

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# List of models to evaluate
models = {
    "Random Forest": RandomForestClassifier(random_state=42, n_estimators=100),
    "AdaBoost": AdaBoostClassifier(random_state=42, n_estimators=100),
    "XGBoost": XGBClassifier(random_state=42, use_label_encoder=False, eval_metric="logloss")
}

# Loop through each model
for model_name, model in models.items():
    print(f"\n===== {model_name} =====")
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predictions
    y_pred = model.predict(X_test)
    y_pred_probs = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
    
    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    eer = eval_metr(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    # Print results
    print(f"{model_name} Accuracy: {accuracy * 100:.2f}%")
    print(f"{model_name} EER: {eer * 100:.2f}%")
    print(report)

## 9.2 Oversampling

In [None]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy=0.5, random_state=42)  # Adjust ratio as needed
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# List of models to evaluate
models = {
    "Random Forest": RandomForestClassifier(random_state=42, n_estimators=100),
    "AdaBoost": AdaBoostClassifier(random_state=42, n_estimators=100),
    "XGBoost": XGBClassifier(random_state=42, use_label_encoder=False, eval_metric="logloss")
}

# Loop through each model
for model_name, model in models.items():
    print(f"\n===== {model_name} =====")
    
    # Train the model
    model.fit(X_train_smote, y_train_smote)
    
    # Predictions
    y_pred = model.predict(X_test)
    y_pred_probs = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
    
    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    eer, _ = eval_metr(y_test, y_pred, 0.1847, 2.0173)
    report = classification_report(y_test, y_pred)
    
    # Print results
    print(f"{model_name} Accuracy: {accuracy * 100:.2f}%")
    print(f"{model_name} EER: {eer * 100:.2f}%")
    print(report)

## 9.3 Undersampling

In [None]:
from imblearn.under_sampling import RandomUnderSampler
undersampler  = RandomUnderSampler(sampling_strategy=0.5, random_state=42)  # Adjust ratio as needed
X_train_resampled, y_train_resampled = undersampler .fit_resample(X_train, y_train)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# List of models to evaluate
models = {
    "Random Forest": RandomForestClassifier(random_state=42, n_estimators=100),
    "AdaBoost": AdaBoostClassifier(random_state=42, n_estimators=100),
    "XGBoost": XGBClassifier(random_state=42, use_label_encoder=False, eval_metric="logloss")
}

# Loop through each model
for model_name, model in models.items():
    print(f"\n===== {model_name} =====")
    
    # Train the model
    model.fit(X_train_resampled, y_train_resampled)
    
    # Predictions
    y_pred = model.predict(X_test)
    y_pred_probs = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
    
    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    eer, _ = eval_metr(y_test, y_pred, 0.1847, 2.0173)
    report = classification_report(y_test, y_pred)
    
    # Print results
    print(f"{model_name} Accuracy: {accuracy * 100:.2f}%")
    print(f"{model_name} EER: {eer * 100:.2f}%")
    print(report)

# 10. Not Joining Eval to the set (Eval is for testing only)

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import warnings
from sklearn.metrics import accuracy_score, classification_report, roc_curve
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")

In [None]:
from sklearn.metrics import classification_report, roc_curve, accuracy_score
def eval_metr(y_true, y_pred, C0, C1, P_target=0.5):
    # Compute ROC curve
    fpr, tpr, thresholds = roc_curve(y_true, y_pred)
    fnr = 1 - tpr  # False Negative Rate (Miss Rate)

    # Compute EER
    abs_diff = np.abs(fpr - fnr)
    eer_index = np.argmin(abs_diff)
    eer = (fpr[eer_index] + fnr[eer_index]) / 2

    # Compute t-DCF
    pi_spoof = P_target  # Prior for spoof
    pi_bonafide = 1 - P_target  # Prior for bonafide

    # Calculate t-DCF values
    tdcf_values = (pi_bonafide * C0 * fnr + pi_spoof * C1 * fpr) / min(pi_bonafide * C0, pi_spoof * C1)

    # Find minimum t-DCF
    min_tdcf = np.min(tdcf_values)

    return eer, min_tdcf

In [None]:
# Load datasets
train = pd.read_csv("/kaggle/input/borzi-full/train_set.csv")
eval = pd.read_csv("/kaggle/input/borzi-full/eval_set.csv")

# Map labels
train['label'] = train['label'].map({'bonafide': 1, 'spoof': 0})
eval['label'] = eval['label'].map({'bonafide': 1, 'spoof': 0})
valid_system_ids = ["A06", "A17", "A19"]

# Handle missing values function
def handle_missing_values(data):
    for column in data.columns:
        if data[column].isnull().sum() > 0:
            if data[column].dtype in ['int64', 'float64']:
                data[column].fillna(data[column].mean(), inplace=True)
            else:
                data[column].fillna(data[column].mode()[0], inplace=True)
    return data

def train_test_filter(df):
    # Handle the missing value
    df = handle_missing_values(df)
    # Drop col null atau gk penting
    df = df.drop(['AUDIO_FILE_NAME', "duration", "size", "spectral_bandwidth"], axis=1) # msrcc & psrcc karena null
    # Filter System_id
    df = df[~df["SYSTEM_ID"].isin(valid_system_ids)] # ~ buat exclude
    # Drop system_id setelah di filter
    df = df.drop(columns=["SYSTEM_ID"])

    X = df.drop('label', axis=1)
    y = df['label']
    
    return X, y

X_train, y_train = train_test_filter(train)
X_test, y_test = train_test_filter(eval)

In [None]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# List of models to evaluate
models = {
    "Random Forest": RandomForestClassifier(random_state=42, n_estimators=100),
    "AdaBoost": AdaBoostClassifier(random_state=42, n_estimators=100),
    "XGBoost": XGBClassifier(random_state=42, use_label_encoder=False, eval_metric="logloss")
}

# Loop through each model
for model_name, model in models.items():
    print(f"\n===== {model_name} =====")
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predictions
    y_pred = model.predict(X_test)
    y_pred_probs = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
    
    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    eer, _ = eval_metr(y_test, y_pred, 0.1847, 2.0173)
    report = classification_report(y_test, y_pred)
    
    # Print results
    print(f"{model_name} Accuracy: {accuracy * 100:.2f}%")
    print(f"{model_name} EER: {eer * 100:.2f}%")
    print(report)

# 11. Feature Importance on Spoofing Methods Only

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier

def process_and_select_features(train, dev, eval, excluded_columns, system_id_list):
    # Combine datasets
    data = pd.concat([train, dev, eval], ignore_index=True)
    
    # Filter based on system_id_list
    data = data[data['SYSTEM_ID'].astype(str).isin(system_id_list)]
    data = data.drop(columns=['SYSTEM_ID'])
    
    # Identify feature columns
    feature_columns = [col for col in data.columns if col not in excluded_columns + ['label']]
    
    # Impute missing values
    def impute_missing_values(df, feature_columns):
        imputer = SimpleImputer(strategy='mean')
        df[feature_columns] = imputer.fit_transform(df[feature_columns])
        return df
    
    data = impute_missing_values(data, feature_columns)
    
    # Encode labels
    data['label'] = data['label'].replace({"bonafide": 1, "spoof": 0})
    data = data.dropna(subset=['label'])
    data['label'] = data['label'].astype(int)
    
    # Feature importance calculation
    models = {
        'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
        'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=42),
        'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    }
    
    X = data[feature_columns]
    y = data['label']
    
    feature_importance_df = pd.DataFrame()
    feature_importance_df['Num'] = range(1, 21)  # Top 20 features
    
    for model_name, model in models.items():
        model.fit(X, y)
        feature_importance = model.feature_importances_
        sorted_indices = np.argsort(feature_importance)[::-1][:20]  # Top 20 features
        top_features = [feature_columns[i] for i in sorted_indices]
        top_scores = [feature_importance[i] for i in sorted_indices]
        
        feature_importance_df[model_name] = [f"{f} ({s:.4f})" for f, s in zip(top_features, top_scores)]
    
    return feature_importance_df

excluded_columns = ["label", "duration", "size", "spectral_bandwidth"]
# system_id_list = ["bonafide", "A01", "A02", "A03", "A04", "A07", "A08", "A09", "A10", "A11", "A12", "A16"]  # TTS
# system_id_list = ["bonafide", "A05", "A06", "A17", "A18", "A19"]  # VC
system_id_list = ["bonafide", "A13", "A14", "A15"]  # TTS_VC

In [None]:
process_and_select_features(train, dev, eval, excluded_columns, system_id_list)