In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/audio-features-pre-transformed-transformed/enhanced_features.csv
/kaggle/input/audio-features-pre-transformed-transformed/preenhanced_features.csv


In [5]:
import pandas as pd
import numpy as np
import time
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, classification_report
)

# 1) Load your pre-enhanced features
df = pd.read_csv('/kaggle/input/audio-features-pre-transformed-transformed/preenhanced_features.csv')
X = df.drop('label', axis=1)
y = df['label']

# 2) Fixed CatBoost hyperparameters from tuning
cat_params = {
    'iterations': 944,
    'learning_rate': 0.28180829608089975,
    'depth': 10,
    'l2_leaf_reg': 7.173375161810935,
    'border_count': 163,
    'bagging_temperature': 0.1751993319934919,
    'random_state': 42,
    'verbose': 0       # silent train
}

# 3) Prepare cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
metrics = {
    'accuracy': [], 'precision': [], 'recall': [],
    'f1': [], 'roc_auc': [], 'inference_time_ms': []
}

# 4) Run 5-fold CV
for fold, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
    X_tr, X_te = X.iloc[train_idx], X.iloc[test_idx]
    y_tr, y_te = y.iloc[train_idx], y.iloc[test_idx]
    
    model = CatBoostClassifier(**cat_params)
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_te, y_te)],           # for early stopping
        early_stopping_rounds=50,          
        use_best_model=True
    )
    
    # measure inference time
    t0 = time.time()
    y_pred = model.predict(X_te)
    elapsed = (time.time() - t0) * 1000  # ms total for batch
    inf_time = elapsed / len(X_te)       # ms per sample
    
    y_proba = model.predict_proba(X_te)[:, 1]
    
    # collect metrics
    metrics['accuracy'].append(accuracy_score(y_te, y_pred))
    metrics['precision'].append(precision_score(y_te, y_pred, zero_division=0))
    metrics['recall'].append(recall_score(y_te, y_pred, zero_division=0))
    metrics['f1'].append(f1_score(y_te, y_pred, zero_division=0))
    metrics['roc_auc'].append(roc_auc_score(y_te, y_proba))
    metrics['inference_time_ms'].append(inf_time)
    
    print(f"\n=== Fold {fold} Classification Report ===")
    print(classification_report(y_te, y_pred, zero_division=0))
    print(f"Inference time per sample: {inf_time:.2f} ms")

# 5) Display average metrics
print("\n=== 10-Fold CV Average Metrics ===")
for name, scores in metrics.items():
    mean, std = np.mean(scores), np.std(scores)
    print(f"{name.replace('_',' ').capitalize():<20}: {mean:.4f} ± {std:.4f}")

# 6) Retrain on the entire dataset and save the final model
final_model = CatBoostClassifier(**cat_params)
final_model.fit(X, y, verbose=False)

# Save in CatBoost's native format
final_model.save_model('/kaggle/working/best_catboost_model.cbm')

# Optionally, also save with joblib/pickle
import joblib
joblib.dump(final_model, '/kaggle/working/best_catboost_model.pkl')

print("\nSaved final CatBoost model to:")
print(" - /kaggle/working/best_catboost_model.cbm")
print(" - /kaggle/working/best_catboost_model.pkl")



=== Fold 1 Classification Report ===
              precision    recall  f1-score   support

           0       0.92      0.94      0.93     35856
           1       0.90      0.86      0.88     22250

    accuracy                           0.91     58106
   macro avg       0.91      0.90      0.90     58106
weighted avg       0.91      0.91      0.91     58106

Inference time per sample: 0.00 ms

=== Fold 2 Classification Report ===
              precision    recall  f1-score   support

           0       0.92      0.94      0.93     35856
           1       0.90      0.86      0.88     22250

    accuracy                           0.91     58106
   macro avg       0.91      0.90      0.90     58106
weighted avg       0.91      0.91      0.91     58106

Inference time per sample: 0.00 ms

=== Fold 3 Classification Report ===
              precision    recall  f1-score   support

           0       0.92      0.94      0.93     35856
           1       0.90      0.86      0.88     22250