In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV  # Split data and search over C
from sklearn.metrics import hamming_loss
from sklearn.preprocessing import LabelEncoder, StandardScaler  # Encode labels and standardize features
from sklearn.svm import SVC, LinearSVC  # SVM and L1-penalized Linear SVM
from sklearn.metrics import accuracy_score, zero_one_loss  # Evaluation metrics

from imblearn.over_sampling import SMOTE  # For class imbalance handling
from imblearn.pipeline import Pipeline  # To create a pipeline with SMOTE
from sklearn.cluster import KMeans
from sklearn.metrics import calinski_harabasz_score
import matplotlib.pyplot as plt

Part 1. Support Vector Machine 
-
1A) Data Loading and Pre-processing
-
- Step 1: Load Data
- Step 2: Split features and multi-label outputs
- Step 3: Encode each label into integers. SVC only works with numerical labels. Cannot process strings.
- Step 4: 70/30 train test split. 

In [3]:
pd.set_option("display.width", 300)
pd.set_option("display.max_columns", 12)
pd.set_option("display.float_format", "{:.2f}".format)

#Step 1: Load Data
data = pd.read_csv("../data/Anuran Calls (MFCCs)/Frogs_MFCCs.csv")

data.info()
print(data.head())
print(data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7195 entries, 0 to 7194
Data columns (total 26 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   MFCCs_ 1  7195 non-null   float64
 1   MFCCs_ 2  7195 non-null   float64
 2   MFCCs_ 3  7195 non-null   float64
 3   MFCCs_ 4  7195 non-null   float64
 4   MFCCs_ 5  7195 non-null   float64
 5   MFCCs_ 6  7195 non-null   float64
 6   MFCCs_ 7  7195 non-null   float64
 7   MFCCs_ 8  7195 non-null   float64
 8   MFCCs_ 9  7195 non-null   float64
 9   MFCCs_10  7195 non-null   float64
 10  MFCCs_11  7195 non-null   float64
 11  MFCCs_12  7195 non-null   float64
 12  MFCCs_13  7195 non-null   float64
 13  MFCCs_14  7195 non-null   float64
 14  MFCCs_15  7195 non-null   float64
 15  MFCCs_16  7195 non-null   float64
 16  MFCCs_17  7195 non-null   float64
 17  MFCCs_18  7195 non-null   float64
 18  MFCCs_19  7195 non-null   float64
 19  MFCCs_20  7195 non-null   float64
 20  MFCCs_21  7195 non-null   floa

In [4]:
# Step 2: Split features and multi-label outputs
X = data.drop(columns=['Family', 'Genus', 'Species', 'RecordID'])  # MFCC features
Y = data[['Family', 'Genus', 'Species']].copy()  # Multi-label targets

In [5]:
# Step 3: Encode each label into integers
label_encoders = {}
for col in Y.columns:
    le = LabelEncoder()
    Y[col] = le.fit_transform(Y[col])
    label_encoders[col] = le  # Store encoder for inverse transform later

# Step 4: 70/30 train/test split with stratify for better label distribution
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42, stratify=Y)


1B) Evaluation Metrics
--
Evaluation Metrics:

- Exact Match: A prediction is counted as correct only if all 3 labels match.

- Hamming Loss: Fraction of incorrect predictions averaged over all labels.

- Hamming Score = 1 - Hamming Loss

- - Exact Match is strict (all labels must match); Hamming Score is more lenient and interpretable per-label.


Key Considerations:
- sklearn.metrics.hamming_loss() doesn’t support multiclass-multioutput. So, I computed loss per label using zero_one_loss and average.



In [6]:
# Hamming Loss for multiclass multi-label = average of per-label classification error
def hamming_loss_multiclass(y_true, y_pred):
    losses = [
        zero_one_loss(y_true[:, i], y_pred[:, i])
        for i in range(y_true.shape[1])
    ]
    return np.mean(losses)

# Hamming Score = 1 - average Hamming Loss
def hamming_score(y_true, y_pred):
    return 1 - hamming_loss_multiclass(y_true, y_pred)

# Exact Match Score
def exact_match_score(y_true, y_pred):
    return np.mean(np.all(y_true == y_pred, axis=1))



1C) SVM Model
-

Step 1: Coarse Grid Search (Stage 1)
- Define a coarse parameter grid: C = [0.1, 1, 10, 100, 1000].  gamma = [0.001, 0.01, 0.1, 1, 10]
- Use this grid to explore a wide range of penalty and kernel width values across 10-fold cross-validation.

Step 2: Identify Best Parameters for Each Label
- Train one SVM per label (Family, Genus, Species) using the coarse grid.
- Extract the best C and gamma values per label.
- Determine if best values lie at the edges of the coarse grid.

Step 3: Refine Grid Search (Stage 2)
- If best values lie on the edge, define a refined grid centered around those best values:
- coarse best was C=100, gamma=1, use refined grid: C = [50, 100, 200] gamma = [0.5, 1, 2]
- Use GridSearchCV again on the refined grid for each label.

Step 4: Train Final Models Using Refined Parameters
- Fit one SVM per label using best parameters found from refined search.

Step 5: Evaluate on Test Set
- Predict using the final models on X_test.
- Evaluate using:
- Exact Match Score
- Hamming Loss
- Hamming Score = 1 - Hamming Loss

In [7]:
pd.set_option("display.float_format", "{:.2f}".format)

# Coarse Parameter Grid (Stage 1)
# Used initially to identify a rough range of hyperparameters for each label
coarse_param_grid = {
    'C': [0.1, 1, 10, 100, 1000],  # SVM penalty weight (wide range to find boundaries)
    'gamma': [0.001, 0.01, 0.1, 1, 10],  # Gaussian kernel width (gamma = 1 / (2 * sigma^2))
    'kernel': ['rbf']
}

# Refined Parameter Grid (Stage 2)
# Narrowed grid based on coarse tuning results to better pinpoint optimal parameters
refined_param_grid = {
    'C': [50, 100, 200],  # Refined around the best found C (previously 100)
    'gamma': [0.5, 1, 2],  # Refined around the best found gamma (previously 1)
    'kernel': ['rbf']
}

# Dictionary to store final models and best parameters per label
models = {}
best_params = {}

# Train one SVM per label using GridSearchCV on refined grid
# Each label is treated independently (binary relevance for multi-label)
for label in Y.columns:
    print(f"\n[Refined Tuning] Training SVM for label: {label}")
    # Perform 10-fold cross-validation to select best hyperparameters
    grid_search = GridSearchCV(SVC(), refined_param_grid, cv=10, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, Y_train[label])
    models[label] = grid_search.best_estimator_  # Store best model for current label
    best_params[label] = grid_search.best_params_  # Store best parameters
    print(f"Best refined parameters for {label}: {grid_search.best_params_}")

# Evaluate models on test set.
# Predict each label independently using corresponding trained SVM
Y_pred = np.zeros_like(Y_test)  # Initialize prediction matrix with same shape as Y_test
for i, label in enumerate(Y.columns):
    Y_pred[:, i] = models[label].predict(X_test)

# Compute multi-label evaluation metrics
print("\nEvaluation Results on Test Set:")
print("Exact Match Score:", exact_match_score(Y_test.values, Y_pred))
print("Hamming Loss:", hamming_loss_multiclass(Y_test.values, Y_pred))
print("Hamming Score:", hamming_score(Y_test.values, Y_pred))


[Refined Tuning] Training SVM for label: Family
Best refined parameters for Family: {'C': 50, 'gamma': 2, 'kernel': 'rbf'}

[Refined Tuning] Training SVM for label: Genus
Best refined parameters for Genus: {'C': 50, 'gamma': 2, 'kernel': 'rbf'}

[Refined Tuning] Training SVM for label: Species
Best refined parameters for Species: {'C': 50, 'gamma': 2, 'kernel': 'rbf'}

Evaluation Results on Test Set:
Exact Match Score: 0.9861046780917091
Hamming Loss: 0.00910915547321293
Hamming Score: 0.9908908445267871


1D) SVM with L1-Penalized Regression
-
Steps:
Step 1: Standardize the input features using StandardScaler().
- This is required for L1-penalized models as LinearSCV
- Fit scaler on training data and use it to transform both train and test sets to prevent data leakage.

Step 2: Define grid of C values for regularization. 

Step 3: Train one model per Label with GridSearchCV

Step 4: Predict on test set and evaluate Model performance. 




In [8]:
pd.set_option("display.float_format", "{:.2f}".format)

# Step 1: Standardize features 
scaler = StandardScaler() #Initialize standard scaler 
X_train = scaler.fit_transform(X_train)  # Fit on train set (Avoid data leakage to test data) 
X_test = scaler.transform(X_test)        # Transform test set using same scaler. 


# Step 2: Define penalty weight values for L1-penalized SVM
c_values = [0.01, 0.1, 1, 10, 100]

models = {} #Dictionary to store the best model for each label
best_params = {} #Dictionary to store the best C for each label

# Step 3: Train one LinearSVC per label using GridSearchCV
for label in Y.columns:
    print(f"\nTraining L1-Penalized SVM for label: {label}")
    l1_svc = LinearSVC(penalty='l1', dual=False, max_iter=10000)
    param_grid = {'C': c_values}

    grid_search = GridSearchCV(l1_svc, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, Y_train[label])

    models[label] = grid_search.best_estimator_
    best_params[label] = grid_search.best_params_
    print(f"Best C for {label}: {grid_search.best_params_['C']}")

# Step 4: Predict on test set for each label and evaluate model performance. 
Y_pred = np.zeros_like(Y_test)
for i, label in enumerate(Y.columns):
    Y_pred[:, i] = models[label].predict(X_test)

print("\nEvaluation Results on Test Set (L1-Penalized SVM):")
print("Exact Match Score:", exact_match_score(Y_test.values, Y_pred))
print("Hamming Loss:", hamming_loss_multiclass(Y_test.values, Y_pred))
print("Hamming Score:", hamming_score(Y_test.values, Y_pred))


Training L1-Penalized SVM for label: Family
Best C for Family: 10

Training L1-Penalized SVM for label: Genus
Best C for Genus: 10

Training L1-Penalized SVM for label: Species
Best C for Species: 100

Evaluation Results on Test Set (L1-Penalized SVM):
Exact Match Score: 0.9143121815655396
Hamming Loss: 0.048633626679018084
Hamming Score: 0.951366373320982


Findings: Hamming Score is higher for RBF Gaussian Kernel (non-linear) as compared to the linear Kernel
-

1E) SMOTE to address class imbalance
-

Steps breakdown:

Step 1: Define Grid of C values. 

Step 2: Train with SMOTE + StandardScaler + LinearSVC in a pipeline (using cross-validation). 
Create a pipeline that performs:

- SMOTE to balance class distribution per label

- Standardization using StandardScaler

- L1-penalized LinearSVC

Use GridSearchCV to select the best C via 10-fold CV.

Key Considerations:

- SMOTE is applied inside each CV fold, avoiding data leakage.

- Standardization must happen after SMOTE and within the CV fold.

Step 3: Refit the Best Model on Fully SMOTE-Augmented Training Data

- After identifying best C, reapply SMOTE to the full training data for each label.

- Fit a new LinearSVC using the best C and the resampled (balanced) training set.

- Standardize the resampled training data and transform the original test set using the same scaler.

- This step ensures that the final model is trained on balanced data. We should not use best_estimator_ directly on unbalanced data — always refit.

Step 4: Predict and Evaluate on the original test set. 
- Never apply SMOTE to the test set. Keep it untouched to simulate real-world, imbalanced conditions.



In [9]:
# Step 1: Define grid of C values for regularization strength
c_values = [0.1, 1, 10]  # Reduced grid to speed up training

models = {}         # Stores final model for each label
best_params = {}    # Stores best C for each label

# Step 2: Train with Pipeline(SMOTE + StandardScaler + LinearSVC) using CV for each label. 
for label in Y.columns:
    print(f"\nTraining SMOTE-enhanced L1-SVM for label: {label}")

    # Build pipeline
    pipe = Pipeline([
        ('smote', SMOTE(random_state=42)),
        ('scaler', StandardScaler()),
                # Reduced max_iter to speed up training
        ('svc', LinearSVC(penalty='l1', dual=False, max_iter=3000, tol=1e-3, verbose=1))
    ])

    # Grid search over C values (passed into svc step in pipeline)
    param_grid = {
        'svc__C': c_values # Double underscore for nested parameter names
    }

    # Perform 10-fold cross-validation using accuracy as the scoring metric
    grid_search = GridSearchCV(pipe, param_grid, cv=10, scoring='accuracy', n_jobs=1)
    grid_search.fit(X_train, Y_train[label])

    best_C = grid_search.best_params_['svc__C']
    best_params[label] = best_C
    print(f"Best C for {label}: {best_C}")

    # Step 3: Fit best model again on full SMOTE-applied training data
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train, Y_train[label])

    scaler = StandardScaler()
    X_resampled = scaler.fit_transform(X_resampled)
    X_test_scaled = scaler.transform(X_test)

    final_model = LinearSVC(penalty='l1', dual=False, max_iter=3000, tol=1e-3, verbose=1, C=best_C)
    final_model.fit(X_resampled, y_resampled)
    models[label] = final_model

# Step 4: Predict and evaluate
Y_pred = np.zeros_like(Y_test)
for i, label in enumerate(Y.columns):
    Y_pred[:, i] = models[label].predict(X_test_scaled)

print("\nEvaluation Results on Test Set (L1-SVM with SMOTE):")
print("Exact Match Score:", exact_match_score(Y_test.values, Y_pred))
print("Hamming Loss:", hamming_loss_multiclass(Y_test.values, Y_pred))
print("Hamming Score:", hamming_score(Y_test.values, Y_pred))



Training SMOTE-enhanced L1-SVM for label: Family
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]Best C for Family: 10
[LibLinear]
Training SMOTE-enhanced L1-SVM for label: Genus
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]Best C for Genus: 10
[LibLinear]
Training SMOTE-enhanced L1-SVM for label: Species
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][Lib

Part 2) K-Means Clustering
-
Steps:
-

Step 1: Use entire Anuran MFCC dataset (no splitting) — since this is unsupervised clustering, we don’t do train/test splitting. 

- Perform encoding as metrics like Hammong loss require numeric input.
- Also we need to use functions like np.bincount to compute majority label in each cluster, whcih only work with integer. 

Step 2: Run Monte Carlo Simulation (50 times)

Step 3: For each trial, automatically choose the number of clusters using CH Score. 

Step 3: Cluster data using K-means with chosen k. Assigns cluster ID to each data point

Step 4: Assign majority label with cluster. 

Step 5:Evaluate clusters and report results. 


In [10]:
# Prepare features and labels
features = data.drop(columns=["Family", "Genus", "Species", "RecordID"])
labels = data[["Family", "Genus", "Species"]].copy() 

# Encode labels as metrics like Hammong Loss require numerical values. 
le_family = LabelEncoder()
le_genus = LabelEncoder()
le_species = LabelEncoder()
labels["Family"] = le_family.fit_transform(labels["Family"])
labels["Genus"] = le_genus.fit_transform(labels["Genus"])
labels["Species"] = le_species.fit_transform(labels["Species"])

Y_true = labels.values #Store encoded labels as a NumPy array for comparison

#Initialize lists to store Hamming losses and scores for each trial. 
hamming_losses = []
hamming_scores = []
num_trials = 50 # Number of Monte Carlo Trials
max_k = 20 #Max number of clusters to consider. 

for trial in range(num_trials):
    print(f"Running trial {trial + 1} of {num_trials}...") #Track progress 
    ch_scores = [] #Store CH index scores for different values of k. 

    #Try different values of k from 2 to max_k to find the best_k
    for i in range(2, max_k + 1):  # CH index requires at least 2 clusters
        kmeans = KMeans(n_clusters=i, n_init=10, random_state=None) 
        clusters = kmeans.fit_predict(features) #Assign clusters. 
        score = calinski_harabasz_score(features, clusters) #Compute CH score
        ch_scores.append(score) #save score 

    #Get k with max CH sccore. offset by 2 because we started at k = 2. 
    best_k = np.argmax(ch_scores) + 2  

    #Run K-means again with best number of clustesr. 
    kmeans = KMeans(n_clusters=best_k, n_init=10, random_state=None)
    clusters = kmeans.fit_predict(features) #Assign clusters to each data point. 

    # Assign majority label for each cluster for each of the 3 labels. 
    Y_pred = np.zeros_like(Y_true)
    for cluster_id in range(best_k):
        idx = np.where(clusters == cluster_id)[0] #Find indices of points in this cluster. 
        for i in range(3):
            if len(idx) == 0:
                continue #Skip empty clusters
            majority_label = np.bincount(Y_true[idx, i]).argmax() #Most common label 
            Y_pred[idx, i] = majority_label #Assign majority label to all points in this cluster. 

    ## Compute average Hamming loss across the 3 labels
    losses = [hamming_loss(Y_true[:, i], Y_pred[:, i]) for i in range(3)] #Per label Hamming loss
    hamming_losses.append(np.mean(losses)) #mean loss

    # Compute average Hamming score (correct predictions / total labels)
    correct = np.sum(Y_true == Y_pred, axis=1)  # Number of correct labels per instance
    score = np.mean(correct / Y_true.shape[1]) # Average score across instances
    hamming_scores.append(score) # Store Hamming score

# After all trials, compute mean and std for Hamming loss and score
mean_loss = np.mean(hamming_losses)  # Mean Hamming loss over trials
std_loss = np.std(hamming_losses)    # Std deviation of Hamming loss
mean_score = np.mean(hamming_scores) # Mean Hamming score
std_score = np.std(hamming_scores)   # Std deviation of Hamming score

# Store results in a DataFrame for display
results = pd.DataFrame({
    "Hamming Loss (mean)": [mean_loss],
    "Hamming Loss (std)": [std_loss],
    "Hamming Score (mean)": [mean_score],
    "Hamming Score (std)": [std_score]
})
# Display the result
print(results)



Running trial 1 of 50...
Running trial 2 of 50...
Running trial 3 of 50...
Running trial 4 of 50...
Running trial 5 of 50...
Running trial 6 of 50...
Running trial 7 of 50...
Running trial 8 of 50...
Running trial 9 of 50...
Running trial 10 of 50...
Running trial 11 of 50...
Running trial 12 of 50...
Running trial 13 of 50...
Running trial 14 of 50...
Running trial 15 of 50...
Running trial 16 of 50...
Running trial 17 of 50...
Running trial 18 of 50...
Running trial 19 of 50...
Running trial 20 of 50...
Running trial 21 of 50...
Running trial 22 of 50...
Running trial 23 of 50...
Running trial 24 of 50...
Running trial 25 of 50...
Running trial 26 of 50...
Running trial 27 of 50...
Running trial 28 of 50...
Running trial 29 of 50...
Running trial 30 of 50...
Running trial 31 of 50...
Running trial 32 of 50...
Running trial 33 of 50...
Running trial 34 of 50...
Running trial 35 of 50...
Running trial 36 of 50...
Running trial 37 of 50...
Running trial 38 of 50...
Running trial 39 of 5

Source:
- https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
- https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC
-	https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
-	https://scikit-learn.org/stable/modules/grid_search.html
-	https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html
-	https://medium.com/data-science/the-right-way-of-using-smote-with-cross-validation-92a8d09d00c7
-	https://stats.stackexchange.com/questions/417576/can-we-apply-smote-on-data-with-k-fold-cv
-	https://www.kaggle.com/discussions/questions-and-answers/427399
-	https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
-	https://medium.com/@whystudying/monte-carlo-simulation-with-python-13e09731d500
-	https://scikit-learn.org/stable/modules/generated/sklearn.metrics.calinski_harabasz_score.html
