In [None]:
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from skopt import BayesSearchCV

possible_n_vals = [9, 12, 15]

def denoise_bernoulli_data(X):
    """Denoise data by converting to binary values"""
    return (X >= 0.5).astype(float)

def run_clustering(n):

    X = np.load('Datasets/kryptonite-%s-X.npy'%(n))
    y = np.load('Datasets/kryptonite-%s-y.npy'%(n))

    denoise_bernoulli_data(X)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # 80% training, 20% test

    param_space = {
        'n_components': (1, 30),
        'covariance_type': ['full', 'tied', 'diag', 'spherical'],
        'tol': (1e-4, 1e-2, 'log-uniform')
    }
    
    gmm = GaussianMixture()
    X_train_class0 = X_train[y_train == 0]
    X_train_class1 = X_train[y_train == 1]
    
    # bayes_search_class0 = BayesSearchCV(
    #     estimator = gmm,
    #     search_spaces = param_space,
    #     n_iter = 20,
    #     cv = 10,
    #     random_state = 42
    # )
    # bayes_search_class0.fit(X_train_class0)
    # best_gmm_class0 = bayes_search_class0.best_estimator_
    
    # bayes_search_class1 = BayesSearchCV(
    #     estimator = gmm,
    #     search_spaces = param_space,
    #     n_iter = 20,
    #     cv = 10,
    #     random_state = 42
    # )
    # bayes_search_class1.fit(X_train_class1)
    # best_gmm_class1 = bayes_search_class1.best_estimator_
       
    # log_likelihood_class0 = best_gmm_class0.score_samples(X_test)
    # log_likelihood_class1 = best_gmm_class1.score_samples(X_test)
    
    gmm_class0 = GaussianMixture(n_components=40, covariance_type='full', tol=1e-3, random_state=42)
    gmm_class1 = GaussianMixture(n_components=40, covariance_type='full', tol=1e-3, random_state=42) 
    
    gmm_class0.fit(X_train_class0)
    gmm_class1.fit(X_train_class1)
    log_likelihood_class0 = gmm_class0.score_samples(X_test)
    log_likelihood_class1 = gmm_class1.score_samples(X_test)
    
    y_pred = (log_likelihood_class1 > log_likelihood_class0).astype(int)
    
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f"Test Accuracy: {test_accuracy:.4f}")
    

for n_val in possible_n_vals:
    print(f"Running for {n_val} features")
    run_clustering(n_val)
        

Running for 9 features
Test Accuracy: 0.9525
Running for 12 features
Test Accuracy: 0.5602
Running for 15 features
Test Accuracy: 0.5012


In [12]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.cluster import DBSCAN
from scipy.spatial.distance import cdist
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import silhouette_score


possible_n_vals = [9, 12, 15]

def compute_avg_distance_to_clusters(dbscan_model, X_train, X_test):
    core_samples = X_train[dbscan_model.core_sample_indices_]
    distances = cdist(X_test, core_samples)
    return np.mean(distances, axis=1)
    
def denoise_bernoulli_data(X):
    """Denoise data by converting to binary values"""
    return (X >= 0.5).astype(float)

def silhouette_scorer(estimator, X):
    labels = estimator.fit_predict(X)
    if len(set(labels)) > 1:
        return silhouette_score(X, labels)
    else:
        return -1  # Penalize poor clusters
    
def run_clustering(n):

    X = np.load('Datasets/kryptonite-%s-X.npy'%(n))
    y = np.load('Datasets/kryptonite-%s-y.npy'%(n))

    denoise_bernoulli_data(X)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # 80% training, 20% test

    X_train_class0 = X_train[y_train == 0]
    X_train_class1 = X_train[y_train == 1]

    param_dist = {
        'eps': np.linspace(0.1, 2.0, 20),  # Range of possible `eps` values
        'min_samples': np.arange(3, 20, 1)  # Range of possible `min_samples`
    }
    
    dbscan_class0 = DBSCAN()
    dbscan_class1 = DBSCAN()
    
    random_search_class0 = RandomizedSearchCV(dbscan_class0, param_distributions=param_dist, 
                                   n_iter=50, scoring=silhouette_scorer, random_state=42)
    random_search_class1 = RandomizedSearchCV(dbscan_class1, param_distributions=param_dist, 
                                   n_iter=50, scoring=silhouette_scorer, random_state=42)
    
    random_search_class0.fit(X_train_class0)
    random_search_class1.fit(X_train_class1)
        
    
    # dbscan_class0.fit_predict(X_train_class0)
    # dbscan_class1.fit_predict(X_train_class1)
    
    # avg_distance_class0 = compute_avg_distance_to_clusters(dbscan_class0, X_train_class0, X_test)
    # avg_distance_class1 = compute_avg_distance_to_clusters(dbscan_class1, X_train_class1, X_test)

    # y_pred = (avg_distance_class1 < avg_distance_class0).astype(int)
    
    avg_distance_class0 = compute_avg_distance_to_clusters(random_search_class0, X_train_class0, X_test)
    avg_distance_class1 = compute_avg_distance_to_clusters(random_search_class1, X_train_class1, X_test)

    y_pred = (avg_distance_class1 < avg_distance_class0).astype(int)
    
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f"Test Accuracy: {test_accuracy:.4f}")
    

for n_val in possible_n_vals:
    print(f"Running for {n_val} features")
    run_clustering(n_val)
        

Running for 9 features


KeyboardInterrupt: 

In [5]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


possible_n_vals = [9, 12, 15]

from scipy.stats import mode

def map_clusters_to_labels(clusters, true_labels):
  labels_mapping = {}
  for cluster in np.unique(clusters):
    cluster_indices = np.where(clusters == cluster)
    majority_label = mode(true_labels[cluster_indices])
    labels_mapping[cluster] = majority_label[0]
  return labels_mapping
    
def run_clustering(n):

    X = np.load('Datasets/kryptonite-%s-X.npy'%(n))
    y = np.load('Datasets/kryptonite-%s-y.npy'%(n))

    # X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)  # 60% training
    # X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)  # 20% validation, 20% test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # 80% training, 20% test
    
    kmeans = KMeans(n_clusters=10, random_state=42)
    
    kmeans.fit(X_train)
    
    train_clusters = kmeans.predict(X_train)
    test_clusters = kmeans.predict(X_test)
    
    labels_mapping = map_clusters_to_labels(train_clusters, y_train)
    
    y_train_pred = np.array([labels_mapping[cluster] for cluster in train_clusters])
    y_test_pred = np.array([labels_mapping[cluster] for cluster in test_clusters])
    
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    
    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    
    # print("Classification Report:")
    # print(classification_report(y_test, y_pred))

for n_val in possible_n_vals:
    print(f"Running for {n_val} features")
    run_clustering(n_val)

Running for 9 features
Training Accuracy: 0.5135
Test Accuracy: 0.4869
Running for 12 features
Training Accuracy: 0.5110
Test Accuracy: 0.4990
Running for 15 features
Training Accuracy: 0.5066
Test Accuracy: 0.5112
