This notebook demonstrates how to use a genetic algorithm to optimize the per-model weights of a weighted-voting ensemble for time signature detection.

Introduction:
- Ensemble voting is a technique that combines the predictions of multiple models to make a final decision.
- In this notebook, we focus on time signature detection, which is the process of determining the time signature of a musical piece.
- We use a weighted voting approach: each model's softmax output is multiplied by that model's weight, the weighted outputs are summed, and the class with the largest total becomes the final decision.
- The genetic algorithm is employed to find the optimal weights that maximize the accuracy of the ensemble voting system.

Usage:
- Before running the code, make sure you have the necessary dependencies installed.
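- The code assumes that each base model's predictions on the labeled test and validation pieces have already been exported to CSV files (with `file_name`, `softmax_output`, `predictions`, and `true_label` columns).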
- Adjust the hyperparameters and settings according to your specific needs.
- Run the code cells sequentially to load the per-model predictions, optimize the weights, and evaluate the ensemble voting system.
- The results and performance metrics will be displayed in the output.

Note: This code is provided as a starting point and can be customized and extended for different ensemble voting tasks.


In [None]:
# Remember to install the geneticalgorithm package
# uncomment the following line to install the package
# !pip install geneticalgorithm

In [None]:
import pandas as pd
import numpy as np
import ast
import glob
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import itertools
from geneticalgorithm import geneticalgorithm as ga

# Use a font size of 20 as the default for every matplotlib figure drawn below.
plt.rcParams.update({'font.size':20})

num_classes = 2  # or 4, depending on the data you're working with

# Short model key -> file-name prefix used by that model's prediction CSV.
_MODEL_FILE_PREFIXES = {
    'cnn': 'cnn',
    'crnn': 'crnn',
    'resnet': 'resnet',
    'resnetlstm': 'resnetlstm',
    'svm': 'SVM',
    'knn': 'KNN',
    'naive': 'Naive_Bayes',
    'forest': 'R-Forest',
}

# Order in which the models enter the ensemble. The position of a model in
# this list is also the position of its weight in every weight vector.
_ENSEMBLE_MODEL_ORDER = [
    'svm', 'cnn', 'knn',
    'resnetlstm', 'naive', 'forest',
    'crnn', 'resnet',
]


def _aggregated_csv_paths(split):
    """Return ``{model key: CSV path}`` for one data split ('test' or 'val')."""
    return {
        key: (
            f"Models/Mfcc/{num_classes}-CLASSES/{split}/agg/"
            f"{prefix}_mfcc_{num_classes}_cls_{split}_ensemble_agg.csv"
        )
        for key, prefix in _MODEL_FILE_PREFIXES.items()
    }


csv_test_files = _aggregated_csv_paths('test')
csv_val_files = _aggregated_csv_paths('val')

# The test files become the training files for the genetic algorithm
csv_files_train = [csv_test_files[key] for key in _ENSEMBLE_MODEL_ORDER]

# The validation files become the test files for the genetic algorithm.
# (Previously this list was built from `csv_test_files`, so the ensemble was
# evaluated on exactly the data its weights had been tuned on.)
csv_files_test = [csv_val_files[key] for key in _ENSEMBLE_MODEL_ORDER]

In [None]:
def sort_csv_files(csv_files):
    """Rewrite each CSV in place with its rows ordered by ``file_name``.

    The paths are visited in ascending order of their base name (the text
    after the last ``/``). Each file is loaded, its rows are sorted by the
    ``file_name`` column in ascending order, and the result is written back
    to the same path without an index column.

    Args:
        csv_files: Iterable of CSV paths. Every file must contain a
            ``file_name`` column.

    Returns:
        None. The files on disk are overwritten.
    """
    def base_name(path):
        return path.split('/')[-1]

    for path in sorted(csv_files, key=base_name):
        frame = pd.read_csv(path)
        ordered = frame.sort_values(by=['file_name'], ascending=True)
        ordered.to_csv(path, index=False)

def create_grouped_df(csv_files):
    """Merge per-model prediction CSVs into one row per ``file_name``.

    For every CSV the ``softmax_output`` and ``predictions`` columns are kept
    and renamed to ``<model>_softmax_output`` / ``<model>_predictions``, where
    ``<model>`` is the text before the first underscore of the file's base
    name. The per-model frames are inner-joined on ``file_name`` in the order
    given, so the resulting column order follows ``csv_files``. The
    ``true_label`` column is taken from the first CSV and appended last.

    Args:
        csv_files: Sequence of CSV paths. Each file needs ``file_name``,
            ``softmax_output`` and ``predictions`` columns; the first one
            also needs ``true_label``.

    Returns:
        A DataFrame with one row per ``file_name`` (the first non-null value
        of each column within a group is kept), sorted by ``file_name``.

    Raises:
        ValueError: If ``csv_files`` is empty.
    """
    if not csv_files:
        raise ValueError("csv_files must contain at least one CSV path.")

    # `None` marks "nothing merged yet". Testing `DataFrame.empty` instead
    # would also be true for a first CSV with zero rows and would silently
    # drop that model's columns.
    grouped_df = None
    true_labels = None

    for file in csv_files:
        df = pd.read_csv(file)
        model_name = file.split("/")[-1].split("_")[0]  # Extract the model name from the file path

        if true_labels is None:
            # Labels come from the first CSV; keep them now rather than
            # reading that file a second time afterwards.
            true_labels = df[['file_name', 'true_label']]

        # `rename` returns a new frame, so no column assignment on a slice.
        df_selected = df[['file_name', 'softmax_output', 'predictions']].rename(
            columns={
                'softmax_output': f'{model_name}_softmax_output',
                'predictions': f'{model_name}_predictions',
            }
        )

        if grouped_df is None:
            grouped_df = df_selected
        else:
            grouped_df = pd.merge(grouped_df, df_selected, on='file_name')

    grouped_df = pd.merge(grouped_df, true_labels, on='file_name')

    # Collapse duplicate file names to a single row each.
    return grouped_df.groupby('file_name').first().reset_index()

def preprocess_softmax(df, softmax_columns):
    """Parse the stringified softmax vectors of the given columns.

    Every cell of each listed column is evaluated with ``ast.literal_eval``
    and the parsed values of one column are stacked into one NumPy array.

    Args:
        df: DataFrame holding the softmax columns.
        softmax_columns: Names of the columns to parse, in the desired order.

    Returns:
        A list with one NumPy array per entry of ``softmax_columns``.
    """
    def parse_column(name):
        return np.array([ast.literal_eval(cell) for cell in df[name]])

    return [parse_column(name) for name in softmax_columns]

def calculate_weighted_sum(df, softmax_arrays, weights):
    """Accumulate the weighted softmax outputs of all models.

    Args:
        df: Object whose length gives the number of rows of the result.
        softmax_arrays: Sequence of per-model arrays; the length of the first
            row of the first array gives the number of result columns.
        weights: Indexable of weights; ``weights[i]`` scales
            ``softmax_arrays[i]``.

    Returns:
        A float array of shape ``(len(df), n_classes)`` holding the sum of
        ``softmax_arrays[i] * weights[i]`` over all models.
    """
    n_rows = len(df)
    n_classes = len(softmax_arrays[0][0])
    totals = np.zeros((n_rows, n_classes))

    for position, probabilities in enumerate(softmax_arrays):
        contribution = probabilities * weights[position]
        totals += contribution

    return totals

def get_ensemble_predictions(weighted_sums):
    """Return, for every row, the index of the column with the largest value.

    Args:
        weighted_sums: 2-D array-like of per-class scores, one row per sample.

    Returns:
        A 1-D array of column indices (ties resolve to the lowest index, as
        ``np.argmax`` does).
    """
    winning_class_indices = np.argmax(weighted_sums, axis=1)
    return winning_class_indices

def evaluate_predictions(true_labels, ensemble_predictions):
    """Return the ``'accuracy'`` entry of scikit-learn's classification report.

    Args:
        true_labels: Ground-truth labels.
        ensemble_predictions: Labels predicted by the ensemble.

    Returns:
        The value stored under ``'accuracy'`` in the report dictionary.
    """
    metrics = classification_report(
        true_labels,
        ensemble_predictions,
        output_dict=True,
    )
    overall_accuracy = metrics['accuracy']
    return overall_accuracy

def weighted_voting(df, softmax_columns, weights, return_labels=False):
    """Run weighted soft voting over the given softmax columns.

    Args:
        df: DataFrame with the softmax columns (and ``true_label`` when
            ``return_labels`` is true).
        softmax_columns: Names of the softmax columns, one per model.
        weights: One weight per softmax column, in the same order.
        return_labels: Selects the return form, see below.

    Returns:
        If ``return_labels`` is true: the tuple
        ``(ensemble_predictions, true_labels)`` where ``true_labels`` is a
        list. Otherwise: ``df`` itself, after the predictions were stored in
        a ``'weighted'`` column (the frame passed in is modified).

    Raises:
        ValueError: If ``softmax_columns`` is empty.
    """
    if not softmax_columns:
        raise ValueError("No softmax columns found in the DataFrame.")

    ensemble_predictions = get_ensemble_predictions(
        calculate_weighted_sum(
            df,
            preprocess_softmax(df, softmax_columns),
            weights,
        )
    )

    if not return_labels:
        # Note: this writes into the caller's DataFrame.
        df['weighted'] = ensemble_predictions
        return df

    return ensemble_predictions, df['true_label'].tolist()


def weighted_ensemble_voting_with_accuracy_list(df, softmax_columns, weights):
    """Score several weight vectors on the same DataFrame.

    The softmax columns are parsed once and reused for every weight vector.

    Args:
        df: DataFrame with the softmax columns and a ``true_label`` column.
        softmax_columns: Names of the softmax columns, one per model.
        weights: Iterable of weight vectors; each vector holds one weight per
            softmax column.

    Returns:
        A list with one accuracy value per weight vector, in input order.

    Raises:
        ValueError: If ``softmax_columns`` is empty.
    """
    if not softmax_columns:
        raise ValueError("No softmax columns found in the DataFrame.")

    true_labels = df['true_label'].tolist()
    softmax_arrays = preprocess_softmax(df, softmax_columns)

    def accuracy_for(weight_set):
        sums = calculate_weighted_sum(df, softmax_arrays, weight_set)
        return evaluate_predictions(true_labels, get_ensemble_predictions(sums))

    return [accuracy_for(weight_set) for weight_set in weights]

def calculate_accuracy(predictions, true_labels):
    """Return the error rate of ``predictions`` in percent (lower is better).

    Despite its name this function returns a cost, not an accuracy: it is
    ``100 - accuracy_in_percent``. The previous implementation subtracted the
    accuracy *fraction* (0..1) from 100, so every result fell in [99, 100];
    the ranking of weight vectors is the same, but the value now reads as a
    real percentage.

    Args:
        predictions: Predicted labels.
        true_labels: Ground-truth labels, aligned with ``predictions``.

    Returns:
        A float in [0, 100]: 0.0 when every prediction is correct, 100.0 when
        none is.

    Raises:
        ValueError: If ``predictions`` is empty.
    """
    total = len(predictions)
    if total == 0:
        raise ValueError("Cannot compute accuracy for an empty list of predictions.")

    correct = sum(
        1 for pred, true_label in zip(predictions, true_labels) if pred == true_label
    )
    return 100.0 - 100.0 * correct / total

def evaluate_weights(weights):
    """Objective function handed to the genetic algorithm.

    Runs weighted voting on the module-level ``df`` using the module-level
    ``softmax_columns`` and returns the value of ``calculate_accuracy`` for
    the resulting predictions.

    Args:
        weights: One weight per softmax column.

    Returns:
        The value computed by ``calculate_accuracy``.
    """
    return calculate_accuracy(
        *weighted_voting(df, softmax_columns, weights, return_labels=True)
    )

def plot_confusion_matrix(cm, classes, title, normalize=False,
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix on the current matplotlib figure.

    Args:
        cm: 2-D NumPy array of counts (rows: true class, columns: predicted).
        classes: Tick labels, one per class, in matrix order.
        title: Title of the plot.
        normalize: If true, every row is divided by its sum before plotting
            and the cell texts are shown with two decimals.
        cmap: Matplotlib colormap used for the image.

    Returns:
        None. The caller is responsible for ``plt.figure()`` / ``plt.show()``.
    """
    # Normalise BEFORE drawing so the image colours and the cell texts show
    # the same numbers (previously the image was drawn from the raw counts).
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1, keepdims=True)
        print("Normalized confusion matrix")
    else:
        print("Confusion matrix without normalization")

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # Raw counts are printed as-is; ratios are limited to two decimals.
        cell_text = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, cell_text,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel("True Meter")
    plt.xlabel("Predicted Meter")

    


In [None]:
# Sort the training CSVs on disk, then merge each split into a single frame
# with one row per file.
sort_csv_files(csv_files_train)
df = create_grouped_df(csv_files=csv_files_train)
test_df = create_grouped_df(csv_files=csv_files_test)

# Names of the per-model softmax columns, in the frame's column order.
softmax_columns = list(
    filter(lambda name: name.endswith('_softmax_output'), df.columns)
)

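The following cell runs the genetic algorithm (GA) from the `geneticalgorithm` package to optimize the weights of the ensemble voting system. The optimization is repeated several times so that the resulting weight vectors can be compared across runs.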

In [None]:
# Define the optimization parameters.
# One weight per ensemble member; derived from the data instead of a
# hard-coded 8 so the search space always matches the softmax columns.
num_models = len(softmax_columns)
varbound = np.array([[0, 1]] * num_models)  # Lower and upper bound of every weight

algorithm_param = {'max_num_iteration': 100,
                   'population_size': 100,
                   'mutation_probability': 0.6,
                   'elit_ratio': 0.01,
                   'crossover_probability': 0.2,
                   'parents_portion': 0.5,
                   'crossover_type': 'uniform',
                   'max_iteration_without_improv': None}

# Run the optimization process 50 times
num_runs = 50
optimized_weights_array = []

for _ in range(num_runs):
    # Initialize and run the genetic algorithm
    model = ga(function=evaluate_weights,
               dimension=num_models,
               variable_type='real',
               variable_boundaries=varbound,
               algorithm_parameters=algorithm_param,
               function_timeout=1000200)
    model.run()

    # Keep the best weight vector found in this run
    optimized_weights_array.append(model.output_dict['variable'])

# Stack the per-run weight vectors into one array (one row per run)
optimized_weights_array = np.array(optimized_weights_array)


In [None]:
# Tick labels for the weight columns; listed in the same order as the CSV
# paths in `csv_files_train`.
model_names = ['SVM', 'CNN', 'KNN', 'RES-LSTM', 'NB', 'RF', 'CRNN', 'RESNET']
num_weights = len(model_names)

# Heatmap of the optimized weights: one row per run, one column per model.
figure_width_in = 2070 / 100   # 20.7 inches
figure_height_in = 1570 / 100  # 15.7 inches
plt.figure(figsize=(figure_width_in, figure_height_in))
plt.imshow(optimized_weights_array, aspect='auto', cmap='viridis')
plt.xticks(ticks=np.arange(num_weights), labels=model_names)
plt.colorbar(label='Value')
plt.xlabel('Models')
plt.ylabel('Runs')

# Write the figure to disk at 200 dpi before displaying it.
plt.savefig('heatmap.png', dpi=200)
plt.show()


In [None]:
# Define the weights for the weighted ensemble
# (one weight vector picked by hand from the 50 runs above)
weights = [0.01411482, 0.33084206, 0.51650582, 0.57955419, 0.12583674, 0.38093341, 0.16874136, 0.79187015]

# Apply weighted voting; the predictions are stored in the 'weighted' column
weighted_df = weighted_voting(test_df, softmax_columns, weights, return_labels=False)

# Calculate confusion matrix
cm = confusion_matrix(weighted_df['true_label'], weighted_df['weighted'])

# Create the plot title
title = f"\nCM | MFCC | Weighted-Ensemble | {num_classes} CLASSES\n"

# Create the class labels
if num_classes == 2:
    classes = ['FOUR', 'THREE']
elif num_classes == 4:
    classes = ['FOUR', 'THREE', 'FIVE', 'SEVEN']
else:
    # Fail here with a clear message instead of a NameError on `classes` below.
    raise ValueError(f"Unsupported num_classes={num_classes!r}; expected 2 or 4.")

# Plot the confusion matrix
plt.figure()
plot_confusion_matrix(cm, classes=classes, title=title)
plt.show()

print(classification_report(weighted_df['true_label'], weighted_df['weighted'], digits=4))

In [None]:
# Example usage: Calculate accuracy for each set of optimized weights.
# All weight vectors are scored in a single call so the softmax strings of
# `test_df` are parsed once rather than once per weight vector.
# Each accuracy stays wrapped in a one-element list, which keeps the same
# nested shape (one inner list per run) this cell produced before.
accuracy_results = [
    [accuracy]
    for accuracy in weighted_ensemble_voting_with_accuracy_list(
        test_df, softmax_columns, optimized_weights_array
    )
]


In [None]:
accuracy_array = np.array(accuracy_results)

# Summary statistics of the accuracy over all optimized weight vectors
std_deviation = accuracy_array.std()
minimum = accuracy_array.min()
maximum = accuracy_array.max()
mean = accuracy_array.mean()

for label, value in (
    ("Standard Deviation:", std_deviation),
    ("Minimum:", minimum),
    ("Maximum:", maximum),
    ("Mean:", mean),
):
    print(label, value)