Train Multiple Classifiers on YAMNet Embeddings

Trains four classifiers on pre-extracted YAMNet embeddings, compares them on the validation split, and evaluates the best one on the test split:
1. Logistic Regression (baseline)
2. Random Forest
3. Support Vector Machine (SVM)
4. Deep Neural Network (DNN)


In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (classification_report, confusion_matrix, 
                            accuracy_score, f1_score, precision_score, recall_score)
import xgboost as xgb
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import joblib
from datetime import datetime
import json

# --- Notebook configuration ---
# Locations are relative to the notebook: FEATURES_DIR is read from,
# MODELS_DIR and RESULTS_DIR are written to.
RANDOM_SEED = 42
FEATURES_DIR = '../data/approach2/features'
MODELS_DIR = '../models/models_approach2/classifiers'
RESULTS_DIR = '../results/results_approach2'

# Seed the global NumPy and TensorFlow generators with the same value.
for seed_fn in (np.random.seed, tf.random.set_seed):
    seed_fn(RANDOM_SEED)

# Make sure both output folders exist; already-existing folders are accepted.
for out_dir in (MODELS_DIR, RESULTS_DIR):
    os.makedirs(out_dir, exist_ok=True)



In [None]:
# 1. Load Features and Labels

def _load_split(split_name):
    """Load the embeddings and labels stored for one data split.

    Args:
        split_name: File-name prefix of the split ('train', 'val' or 'test').

    Returns:
        Tuple ``(embeddings, labels)`` exactly as returned by ``np.load``.

    Raises:
        ValueError: If the two arrays do not hold the same number of rows.
    """
    embeddings = np.load(os.path.join(FEATURES_DIR, f'{split_name}_embeddings.npy'))
    labels = np.load(os.path.join(FEATURES_DIR, f'{split_name}_labels.npy'))
    # Fail here with a clear message instead of deep inside a classifier's fit().
    if len(embeddings) != len(labels):
        raise ValueError(
            f"{split_name}: {len(embeddings)} embeddings but {len(labels)} labels"
        )
    return embeddings, labels


print("\nLoading features and labels...")

train_X, train_y = _load_split('train')
val_X, val_y = _load_split('val')
test_X, test_y = _load_split('test')

# The scaler and classifiers are fitted on the training split and applied to
# the other two, so every split must share the training feature width.
for split_name, split_X in (('val', val_X), ('test', test_X)):
    if split_X.shape[1:] != train_X.shape[1:]:
        raise ValueError(
            f"{split_name} feature shape {split_X.shape[1:]} does not match "
            f"training feature shape {train_X.shape[1:]}"
        )

# Load label mapping
# SECURITY: allow_pickle=True unpickles the file, which can execute arbitrary
# code. Only load a label_mapping.npy produced by this project's own pipeline.
label_mapping = np.load(os.path.join(FEATURES_DIR, 'label_mapping.npy'),
                        allow_pickle=True).item()
categories = label_mapping['categories']
id_to_category = label_mapping['id_to_category']

print("Loaded features:")
print(f"  Training:   {train_X.shape}")
print(f"  Validation: {val_X.shape}")
print(f"  Test:       {test_X.shape}")
print(f"  Classes: {categories}")


In [None]:
# 2. Feature Scaling

print("\nScaling features...")

# The scaler is fitted on the training split only; the same fitted transform
# is then applied to all three splits.
scaler = StandardScaler()
scaler.fit(train_X)
train_X_scaled, val_X_scaled, test_X_scaled = (
    scaler.transform(split_X) for split_X in (train_X, val_X, test_X)
)

# Persist the fitted scaler alongside the classifiers.
scaler_path = os.path.join(MODELS_DIR, 'feature_scaler.pkl')
joblib.dump(scaler, scaler_path)
print(f"Feature scaler saved to {scaler_path}")



In [None]:

# 3. Helper Functions

def evaluate_model(model, X, y, categories, model_name):
    """Evaluate a fitted classifier on one data split.

    Args:
        model: Fitted estimator exposing ``predict(X)``. The call may return
            either class labels (1-D) or a 2-D matrix of per-class scores, in
            which case the arg-max column is taken as the predicted label.
        X: Feature matrix passed to ``model.predict``.
        y: True labels. Assumed to be integer indices into ``categories``
            (0 .. len(categories) - 1), matching the arg-max decoding above.
        categories: Class names, ordered by label id.
        model_name: Display name stored in the returned dictionary.

    Returns:
        dict with the keys 'model_name', 'accuracy', 'precision', 'recall',
        'f1_score' (the last three weighted-averaged), 'classification_report'
        (dict form), 'confusion_matrix' and 'predictions'.
    """
    y_pred = np.asarray(model.predict(X))

    # If predictions are probabilities, convert to class labels
    if y_pred.ndim > 1 and y_pred.shape[1] > 1:
        y_pred = np.argmax(y_pred, axis=1)

    # Pin the label set explicitly. Without it scikit-learn infers the labels
    # from the data, so a split that happens to miss a class would either
    # raise (target_names length mismatch) or return a confusion matrix whose
    # rows/columns no longer line up with `categories`.
    label_ids = np.arange(len(categories))

    # Calculate metrics
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y, y_pred, average='weighted', zero_division=0)

    # Per-class metrics
    class_report = classification_report(y, y_pred,
                                         labels=label_ids,
                                         target_names=categories,
                                         output_dict=True,
                                         zero_division=0)

    # Confusion matrix (always len(categories) x len(categories))
    cm = confusion_matrix(y, y_pred, labels=label_ids)

    results = {
        'model_name': model_name,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'classification_report': class_report,
        'confusion_matrix': cm,
        'predictions': y_pred
    }

    return results

def plot_confusion_matrix(cm, categories, model_name, save_path):
    """Draw a confusion matrix as an annotated heatmap and save it to a file.

    Args:
        cm: Confusion matrix of integer counts.
        categories: Class names used as tick labels on both axes.
        model_name: Name shown in the figure title.
        save_path: Destination path of the image file.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        cm,
        ax=ax,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=categories,
        yticklabels=categories,
        cbar_kws={'label': 'Count'},
    )
    ax.set_title(f'Confusion Matrix - {model_name}', fontsize=16, fontweight='bold')
    ax.set_xlabel('Predicted Label', fontsize=12)
    ax.set_ylabel('True Label', fontsize=12)
    fig.tight_layout()
    fig.savefig(save_path, dpi=150, bbox_inches='tight')
    # The figure is only written to disk; close it so it is not kept open.
    plt.close(fig)

def print_results(results, categories=None):
    """Print the headline and per-class metrics of one evaluation.

    Args:
        results: Dictionary as produced by ``evaluate_model``. The keys
            'model_name', 'accuracy', 'precision', 'recall', 'f1_score' and
            'classification_report' are read.
        categories: Optional iterable of class names selecting (and ordering)
            the per-class rows. When omitted, the rows are taken from the
            classification report itself, skipping its aggregate entries, so
            the function does not depend on a notebook-level variable.
    """
    report = results['classification_report']
    if categories is None:
        # Aggregate rows scikit-learn adds next to the per-class entries.
        aggregate_keys = {'accuracy', 'micro avg', 'macro avg',
                          'weighted avg', 'samples avg'}
        categories = [name for name in report if name not in aggregate_keys]

    print(f"\n{results['model_name']}")
    print("-" * 50)
    print(f"Accuracy:  {results['accuracy']:.4f}")
    print(f"Precision: {results['precision']:.4f}")
    print(f"Recall:    {results['recall']:.4f}")
    print(f"F1-Score:  {results['f1_score']:.4f}")

    print("\nPer-class metrics:")
    print(f"{'Class':<20s} {'Precision':>10s} {'Recall':>10s} {'F1-Score':>10s} {'Support':>10s}")
    print("-" * 65)

    for cat in categories:
        metrics = report[cat]
        print(f"{cat:<20s} {metrics['precision']:>10.4f} {metrics['recall']:>10.4f} "
              f"{metrics['f1-score']:>10.4f} {int(metrics['support']):>10d}")


In [None]:
# 4. Train Models
print("\n[3] Training classifiers...")
print("="*70)

# Validation results of every trained classifier, keyed by a short model id.
all_results = {}


# Model 1: Logistic Regression (Baseline)

print("\nTraining Logistic Regression...")

# NOTE(review): `multi_class` is reported as deprecated by recent scikit-learn
# releases; confirm against the version this notebook is pinned to.
lr_params = dict(
    solver='lbfgs',
    multi_class='multinomial',
    C=0.01,
    max_iter=1000,
    n_jobs=-1,
    verbose=1,
    random_state=RANDOM_SEED,
)
lr_model = LogisticRegression(**lr_params)
lr_model.fit(train_X_scaled, train_y)

# Score on the validation split and report.
lr_val_results = evaluate_model(
    lr_model, val_X_scaled, val_y, categories, "Logistic Regression"
)
print_results(lr_val_results)

# Persist the fitted model.
lr_path = os.path.join(MODELS_DIR, 'logistic_regression.pkl')
joblib.dump(lr_model, lr_path)
print(f"Model saved to {lr_path}")

all_results['logistic_regression'] = lr_val_results

# Confusion matrix figure for the validation split.
lr_cm_path = os.path.join(RESULTS_DIR, 'cm_logistic_regression.png')
plot_confusion_matrix(
    lr_val_results['confusion_matrix'], categories, "Logistic Regression", lr_cm_path
)


# Model 2: Random Forest
print("\nTraining Random Forest...")

rf_params = dict(
    n_estimators=200,
    max_depth=30,
    max_features='sqrt',
    min_samples_split=5,
    min_samples_leaf=2,
    n_jobs=-1,
    verbose=1,
    random_state=RANDOM_SEED,
)
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(train_X_scaled, train_y)

# Score on the validation split and report.
rf_val_results = evaluate_model(
    rf_model, val_X_scaled, val_y, categories, "Random Forest"
)
print_results(rf_val_results)

# Persist the fitted model.
rf_path = os.path.join(MODELS_DIR, 'random_forest.pkl')
joblib.dump(rf_model, rf_path)
print(f"Model saved to {rf_path}")

all_results['random_forest'] = rf_val_results

rf_cm_path = os.path.join(RESULTS_DIR, 'cm_random_forest.png')
plot_confusion_matrix(
    rf_val_results['confusion_matrix'], categories, "Random Forest", rf_cm_path
)

# Feature importance: indices of the 20 largest importances, largest first.
feature_importance = rf_model.feature_importances_
top_features_idx = np.argsort(feature_importance)[::-1][:20]

imp_fig, imp_ax = plt.subplots(figsize=(10, 6))
# Bars are drawn by rank (0 = most important), not by embedding index.
imp_ax.bar(range(len(top_features_idx)), feature_importance[top_features_idx])
imp_ax.set_title('Top 20 Most Important Features - Random Forest',
                 fontsize=14, fontweight='bold')
imp_ax.set_xlabel('Feature Index')
imp_ax.set_ylabel('Importance')
imp_fig.tight_layout()
imp_fig.savefig(os.path.join(RESULTS_DIR, 'rf_feature_importance.png'), dpi=150)
plt.close(imp_fig)


# Model 3: Support Vector Machine (SVM)

print("\nTraining SVM...")

# NOTE(review): max_iter=1000 puts a hard cap on solver iterations; confirm
# the fit actually converges within that budget on this data.
svm_params = dict(
    kernel='rbf',
    C=10.0,
    gamma='scale',
    probability=True,  # Enable probability estimates
    max_iter=1000,
    verbose=True,
    random_state=RANDOM_SEED,
)
svm_model = SVC(**svm_params)
svm_model.fit(train_X_scaled, train_y)

# Score on the validation split and report.
svm_val_results = evaluate_model(
    svm_model, val_X_scaled, val_y, categories, "SVM"
)
print_results(svm_val_results)

# Persist the fitted model.
svm_path = os.path.join(MODELS_DIR, 'svm.pkl')
joblib.dump(svm_model, svm_path)
print(f"Model saved to {svm_path}")

all_results['svm'] = svm_val_results

svm_cm_path = os.path.join(RESULTS_DIR, 'cm_svm.png')
plot_confusion_matrix(
    svm_val_results['confusion_matrix'], categories, "SVM", svm_cm_path
)


# Model 4: Deep Neural Network (DNN)
print("\nTraining Deep Neural Network...")

# Build DNN architecture
def build_dnn(input_dim, num_classes):
    """Create the (uncompiled) fully connected classifier.

    Args:
        input_dim: Number of input features per sample.
        num_classes: Size of the softmax output layer.

    Returns:
        A ``keras.Sequential`` model: four ReLU hidden blocks
        (512 -> 256 -> 128 -> 64 units) followed by a softmax layer.
    """
    # (units, batch-norm after the dense layer?, dropout rate) per hidden block.
    hidden_blocks = (
        (512, True, 0.4),
        (256, True, 0.3),
        (128, True, 0.2),
        (64, False, 0.1),
    )

    stack = [layers.Input(shape=(input_dim,))]
    for units, with_batch_norm, drop_rate in hidden_blocks:
        stack.append(layers.Dense(units, activation='relu'))
        if with_batch_norm:
            stack.append(layers.BatchNormalization())
        stack.append(layers.Dropout(drop_rate))
    stack.append(layers.Dense(num_classes, activation='softmax'))

    return keras.Sequential(stack)

dnn_model = build_dnn(train_X_scaled.shape[1], len(categories))

# Compile
# Sparse cross-entropy: the labels are integer class ids, not one-hot vectors.
dnn_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
dnn_model.summary()

# Callbacks
# Stop once validation loss has not improved for 15 epochs and roll back to
# the best weights seen.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True,
    verbose=1
)

# Halve the learning rate after 5 epochs without validation-loss improvement,
# never going below 1e-6.
lr_scheduler = keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
    min_lr=1e-6,
    verbose=1
)

# Train
history = dnn_model.fit(
    train_X_scaled, train_y,
    validation_data=(val_X_scaled, val_y),
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping, lr_scheduler],
    verbose=1
)

# evaluate_model converts the softmax output to class ids via arg-max.
dnn_val_results = evaluate_model(dnn_model, val_X_scaled, val_y, categories,
                                 "Deep Neural Network")
print_results(dnn_val_results)

# Save model
dnn_path = os.path.join(MODELS_DIR, 'dnn_model.keras')
dnn_model.save(dnn_path)
print(f"✓ Model saved to {dnn_path}")

all_results['dnn'] = dnn_val_results

plot_confusion_matrix(dnn_val_results['confusion_matrix'], categories,
                      "Deep Neural Network",
                      os.path.join(RESULTS_DIR, 'cm_dnn.png'))

# Plot training history: one panel per tracked quantity, training vs. validation.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for ax, history_key, panel_label in zip(axes, ('loss', 'accuracy'), ('Loss', 'Accuracy')):
    ax.plot(history.history[history_key], label=f'Training {panel_label}')
    ax.plot(history.history[f'val_{history_key}'], label=f'Validation {panel_label}')
    ax.set_title(f'DNN Training {panel_label}', fontsize=14, fontweight='bold')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(panel_label)
    ax.legend()
    ax.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig(os.path.join(RESULTS_DIR, 'dnn_training_history.png'), dpi=150)
plt.close(fig)



In [None]:

# 5. Compare All Models

# Display column -> key in each model's validation results dictionary.
score_columns = {
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1-Score': 'f1_score',
}

# One row per trained model, best validation F1 first.
comparison_df = pd.DataFrame([
    {
        'Model': model_key.replace('_', ' ').title(),
        **{column: model_scores[source] for column, source in score_columns.items()},
    }
    for model_key, model_scores in all_results.items()
])
comparison_df = comparison_df.sort_values('F1-Score', ascending=False)

print("\n" + comparison_df.to_string(index=False))

# Save comparison
comparison_df.to_csv(os.path.join(RESULTS_DIR, 'model_comparison.csv'), index=False)

# Visualize comparison: a 2x2 grid with one horizontal bar chart per metric.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

for ax, metric in zip(axes.flat, score_columns):
    ranked = comparison_df.sort_values(metric)
    ax.barh(ranked['Model'], ranked[metric], color='steelblue')
    ax.set_xlabel(metric, fontsize=12)
    ax.set_title(f'{metric} Comparison', fontsize=14, fontweight='bold')
    ax.set_xlim([0, 1])
    ax.grid(axis='x', alpha=0.3)

    # Write each score just to the right of its bar.
    for bar_pos, score in enumerate(ranked[metric]):
        ax.text(score + 0.01, bar_pos, f'{score:.4f}', va='center')

plt.tight_layout()
plt.savefig(os.path.join(RESULTS_DIR, 'model_comparison.png'), dpi=150, bbox_inches='tight')
plt.show()


In [None]:
# 6. Test Set Evaluation (Best Model)

# comparison_df is sorted by validation F1-Score (best first). Its 'Model'
# column is the title-cased all_results key, so lower-casing it and restoring
# the underscores gives back that key.
best_model_name = comparison_df.iloc[0]['Model'].lower().replace(' ', '_')
print(f"\nBest model: {best_model_name}")

# Keys must match the ones used in all_results. The DNN is registered as
# 'dnn'; comparing against 'deep_neural_network' never matched and silently
# evaluated the SVM instead whenever the DNN was the best model.
trained_models = {
    'logistic_regression': lr_model,
    'random_forest': rf_model,
    'svm': svm_model,
    'dnn': dnn_model,
}
if best_model_name not in trained_models:
    # Fail loudly rather than falling back to an arbitrary model.
    raise KeyError(f"No trained model registered under '{best_model_name}'")

best_model = trained_models[best_model_name]
test_results = evaluate_model(best_model, test_X_scaled, test_y, categories,
                              f"{best_model_name} (Test Set)")

print_results(test_results)

plot_confusion_matrix(test_results['confusion_matrix'], categories,
                      f"{best_model_name.replace('_', ' ').title()} (Test Set)",
                      os.path.join(RESULTS_DIR, f'cm_{best_model_name}_test.png'))

# Save test results
# Values are cast to built-in float/int so json can serialise them.
test_report = test_results['classification_report']
test_results_save = {
    'model_name': best_model_name,
    'accuracy': float(test_results['accuracy']),
    'precision': float(test_results['precision']),
    'recall': float(test_results['recall']),
    'f1_score': float(test_results['f1_score']),
    'per_class_metrics': {
        cat: {
            'precision': float(test_report[cat]['precision']),
            'recall': float(test_report[cat]['recall']),
            'f1-score': float(test_report[cat]['f1-score']),
            'support': int(test_report[cat]['support'])
        }
        for cat in categories
    }
}

with open(os.path.join(RESULTS_DIR, 'best_model_test_results.json'), 'w') as f:
    json.dump(test_results_save, f, indent=2)

print(f"\nTest results saved to {RESULTS_DIR}/best_model_test_results.json")


In [None]:
# Summary
# Collect the report first, then emit it in a single print call.
best_display_name = best_model_name.replace('_', ' ').title()
best_val_f1 = all_results[best_model_name]['f1_score']
summary_lines = [
    f"\nTrained {len(all_results)} classifiers",
    f"All models saved to: {MODELS_DIR}",
    f"Results and visualizations saved to: {RESULTS_DIR}",
    f"\nBest performing model: {best_display_name}",
    f"  - Validation F1-Score: {best_val_f1:.4f}",
    f"  - Test F1-Score: {test_results['f1_score']:.4f}",
]
print("\n".join(summary_lines))
