In [1]:
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from skopt import BayesSearchCV

In [2]:
def _fit_class_gmm(X_class, param_space, random_state):
    """Tune and fit a GaussianMixture on the samples of a single class.

    Parameters
    ----------
    X_class : array-like
        Training samples belonging to one class.
    param_space : dict
        Search space handed to ``BayesSearchCV``.
    random_state : int or None
        Seed used for both the hyper-parameter search and the mixture model
        itself, so that repeated runs give the same fitted model.

    Returns
    -------
    GaussianMixture
        The best estimator found by the search (``best_estimator_``).

    Raises
    ------
    ValueError
        If ``X_class`` contains no samples.
    """
    if len(X_class) == 0:
        raise ValueError("Cannot fit a Gaussian mixture: the class has no training samples")

    search = BayesSearchCV(
        # Seed the mixture too: seeding only the search leaves the GMM
        # initialisation random, which makes the reported accuracy unstable.
        estimator=GaussianMixture(random_state=random_state),
        search_spaces=param_space,
        n_iter=5,
        cv=4,
        random_state=random_state,
    )
    # No labels are passed; the search is scored by the estimator's own
    # ``score`` method.
    search.fit(X_class)
    return search.best_estimator_


def run_clustering(n, data_dir='../../Datasets_Train_Test_Split', random_state=42):
    """Classify the kryptonite-``n`` test set with one tuned GMM per class.

    A separate Gaussian mixture is tuned and fitted on the training samples
    of each label (0 and 1). Every test sample is then assigned to the class
    whose mixture gives it the higher log-likelihood; ties go to class 0.
    The comparison uses the likelihoods only, i.e. no class prior is applied.

    Parameters
    ----------
    n : int or str
        Identifier interpolated into the dataset file names
        (``kryptonite-<n>-X_train.npy`` etc.).
    data_dir : str, optional
        Directory containing the four ``.npy`` files. Defaults to the
        location used originally.
    random_state : int or None, optional
        Seed forwarded to the search and to the mixture models.

    Returns
    -------
    float
        Accuracy on the test set (also printed).
    """
    # load input data and labels
    prefix = f"{data_dir}/kryptonite-{n}"
    X_train = np.load(f"{prefix}-X_train.npy")
    y_train = np.load(f"{prefix}-y_train.npy")
    X_test = np.load(f"{prefix}-X_test.npy")
    y_test = np.load(f"{prefix}-y_test.npy")

    # Define parameter space for Gaussian Mixture Model
    param_space = {
        'n_components': (20, 50),
        'covariance_type': ['full', 'tied', 'diag', 'spherical'],
        'tol': (1e-4, 1e-2, 'log-uniform')
    }

    # One tuned mixture per class, each fitted only on that class's samples
    best_gmm_class0 = _fit_class_gmm(X_train[y_train == 0], param_space, random_state)
    best_gmm_class1 = _fit_class_gmm(X_train[y_train == 1], param_space, random_state)

    # Per-sample log-likelihood of the test data under each class model
    log_likelihood_class0 = best_gmm_class0.score_samples(X_test)
    log_likelihood_class1 = best_gmm_class1.score_samples(X_test)

    # Predict the class with the highest log likelihood
    y_pred = (log_likelihood_class1 > log_likelihood_class0).astype(int)

    # Calculate the accuracy of the model
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f"Test Accuracy: {test_accuracy:.4f}")
    return test_accuracy

In [3]:
# Kryptonite dataset sizes (number of features) to evaluate
possible_n_vals = [9, 12, 15, 18]

# Fit and score the per-class GMM classifier on each dataset in turn
for n_val in possible_n_vals:
    print("Running for %s features" % n_val)
    run_clustering(n_val)

Running for 9 features
Test Accuracy: 0.9567
Running for 12 features
Test Accuracy: 0.5102
Running for 15 features
Test Accuracy: 0.4998
Running for 18 features
Test Accuracy: 0.5076
