In [29]:
import pandas as pd
import numpy as np

In [10]:
!pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.2.0-cp39-cp39-macosx_10_9_x86_64.whl (9.1 MB)
[K     |████████████████████████████████| 9.1 MB 2.7 MB/s eta 0:00:01     |███████████████████████▌        | 6.6 MB 3.3 MB/s eta 0:00:01
Collecting joblib>=1.1.1
  Using cached joblib-1.2.0-py3-none-any.whl (297 kB)
Installing collected packages: joblib, scikit-learn
  Attempting uninstall: joblib
    Found existing installation: joblib 1.1.0
    Uninstalling joblib-1.1.0:
      Successfully uninstalled joblib-1.1.0
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.0.2
    Uninstalling scikit-learn-1.0.2:
      Successfully uninstalled scikit-learn-1.0.2
Successfully installed joblib-1.2.0 scikit-learn-1.2.0


In [30]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides any convergence or deprecation warnings the
# estimators below may raise; consider narrowing the filter.
warnings.filterwarnings("ignore")

In [31]:
from utils.em_clusterization import ExpectationMaximization, AlternatingECM
from utils.misc import cluster_accuracy, map_labels
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans

In [32]:
# Seed NumPy's global random generator so the stochastic steps below repeat.
# NOTE(review): presumably the estimators draw from this global state because
# none of them is given an explicit random_state; confirm.
np.random.seed(139)

In [33]:
from sklearn.cluster import SpectralCoclustering

class SpectralBiClustering:
    """Adapter that gives ``SpectralCoclustering`` a ``fit_predict`` method.

    Only the row labels of the fitted co-clustering are exposed, so the
    wrapper can be used wherever an object with ``fit_predict(X)`` is expected.
    """

    def __init__(self, n_clusters):
        """Create the wrapped estimator with ``n_clusters`` clusters."""
        self.model = SpectralCoclustering(n_clusters=n_clusters)

    def fit_predict(self, X):
        """Fit the wrapped estimator on ``X`` and return its ``row_labels_``."""
        estimator = self.model
        estimator.fit(X)
        return estimator.row_labels_


## 1.0 Wine dataset

In [34]:
import time
from sklearn.preprocessing import scale
from sklearn.metrics.cluster import adjusted_rand_score


def run_models(models, X, y, scaling=False, n_it=10):
    """Benchmark several clustering models against one labelled dataset.

    Every model is rebuilt and fitted ``n_it`` times; the per-run scores and
    fit times are averaged into a single row per model.

    Parameters
    ----------
    models : dict
        Maps a display name to a factory. Each factory is called with one
        positional argument (``True``) and must return a fresh object that
        exposes ``fit_predict(X)``.
    X : array-like
        Feature matrix handed to ``fit_predict``. It is passed through
        ``scale`` first when ``scaling`` is true.
    y : array-like
        Reference labels, used only for scoring the predictions.
    scaling : bool, default False
        Whether to apply ``scale`` to ``X`` before fitting.
    n_it : int, default 10
        Number of independent fits averaged per model.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``models`` (in iteration order) with the columns
        ``accuracy``, ``ARI``, ``execution time`` (seconds) and ``model_name``.
    """
    result = {"accuracy": [], "ARI": [], "execution time": [], "model_name": []}
    if scaling:
        X = scale(X)

    for model_name, make_model in models.items():
        accuracies = []
        ari_scores = []
        exec_times = []

        for _ in range(n_it):
            # The factory's argument is a placeholder; a new model per run keeps
            # the repetitions independent of each other.
            model = make_model(True)

            # perf_counter is monotonic and high-resolution, so short fits are
            # timed reliably and the elapsed time can never come out negative.
            started = time.perf_counter()
            pred = model.fit_predict(X)
            exec_times.append(time.perf_counter() - started)

            ari_scores.append(adjusted_rand_score(y, pred))
            # Element 1 of cluster_accuracy's return value is used as the accuracy.
            accuracies.append(cluster_accuracy(y, pred)[1])

        result["accuracy"].append(np.mean(accuracies))
        result["ARI"].append(np.mean(ari_scores))
        result["execution time"].append(np.mean(exec_times))
        result["model_name"].append(model_name)

    return pd.DataFrame(result)
            

            

In [35]:
from sklearn.datasets import load_wine

# Load the Wine dataset and split it into the feature matrix and the targets.
dataset = load_wine()
X = dataset["data"]
y = dataset["target"]

In [36]:
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans

# Wine benchmark: display name -> factory for a 3-cluster model. Every factory
# takes one positional placeholder argument, which it ignores.
model_to_run = {
    "gmm": lambda _: GaussianMixture(n_components=3),
    # n_init is pinned to the scikit-learn 1.2 default so results do not shift
    # when a later release changes that default to "auto".
    "kmeans": lambda _: KMeans(n_clusters=3, n_init=10),
    # "aecm": lambda _: AlternatingECM(n_clusters=3, q=5),
    "proposed": lambda _: ExpectationMaximization(
        n_clusters=3, linkage="average", shared_cov=False, group_search_rng=[2, 3, 4]
    ),
    "proposed2": lambda _: ExpectationMaximization(
        n_clusters=3, linkage="average", shared_cov=False,
        group_search_rng=[2, 3, 4], metric="precomputed",
    ),
    "spectral": lambda _: SpectralBiClustering(n_clusters=3),
}

In [37]:
# Wine benchmark: features standardised, each model averaged over 10 fits.
run_models(model_to_run, X, y, scaling=True, n_it=10)

Unnamed: 0,accuracy,ARI,execution time,model_name
0,0.962921,0.88705,0.014658,gmm
1,0.966292,0.897495,0.035226,kmeans
2,0.983708,0.947662,0.193431,proposed
3,0.97191,0.911196,0.247566,proposed2
4,0.908989,0.737256,0.069399,spectral


In [114]:
# Repeat of the Wine benchmark (features standardised, 10 fits per model).
# NOTE(review): the saved output below has an `aecm` row and no `proposed2`
# row, so it was produced with a different `model_to_run` than the one
# currently defined; re-run this cell to refresh it.
run_models(model_to_run, X, y, scaling=True, n_it=10)

Unnamed: 0,accuracy,ARI,execution time,model_name
0,0.927528,0.831308,0.009522,gmm
1,0.966292,0.897495,0.015762,kmeans
2,0.983708,0.948791,4.320042,aecm
3,0.983146,0.945885,0.12025,proposed
4,0.909551,0.738716,0.020445,spectral


## 2.0 Olive dataset

In [43]:
OLIVE_DATA_URL = 'https://www.scss.tcd.ie/~arwhite/Teaching/STU33011/olive.csv'
data = pd.read_csv(OLIVE_DATA_URL)

# Column 0 supplies the labels; columns 2 onward are the features.
# Use data[:,1] for area
X, y = data.values[:,2:], map_labels(data.values[:,0])

# Display name -> factory for a 3-cluster model; the factory argument is an
# unused placeholder.
model_to_run = {
    "gmm": lambda _: GaussianMixture(n_components=3),
    # n_init is pinned to the scikit-learn 1.2 default so results do not shift
    # when a later release changes that default to "auto".
    "kmeans": lambda _: KMeans(n_clusters=3, n_init=10),
    # "aecm": lambda _: AlternatingECM(n_clusters=3, q=3),
    "proposed": lambda _: ExpectationMaximization(
        n_clusters=3, linkage="average", group_search_rng=[2, 3, 4]
    ),
    "proposed2": lambda _: ExpectationMaximization(
        n_clusters=3, linkage="average", group_search_rng=[2, 3, 4],
        metric="precomputed",
    ),
    "spectral": lambda _: SpectralBiClustering(n_clusters=3),
}

run_models(model_to_run, X, y, n_it=15, scaling=True)

Unnamed: 0,accuracy,ARI,execution time,model_name
0,0.774476,0.653776,0.037327,gmm
1,0.765734,0.448355,0.073976,kmeans
2,0.825175,0.583364,0.220013,proposed
3,0.819231,0.608168,0.226149,proposed2
4,0.573427,0.237712,0.063781,spectral


In [16]:
# Olive benchmark variant: the proposed model is run with shared_cov=True.
# NOTE(review): this cell reloads the same data as the previous Olive cell and
# carries a lower execution count; confirm it is still wanted.
OLIVE_DATA_URL = 'https://www.scss.tcd.ie/~arwhite/Teaching/STU33011/olive.csv'
data = pd.read_csv(OLIVE_DATA_URL)

# Column 0 supplies the labels; columns 2 onward are the features.
# Use data[:,1] for area
X, y = data.values[:,2:], map_labels(data.values[:,0])

# Display name -> factory for a 3-cluster model; the factory argument is an
# unused placeholder.
model_to_run = {
    "gmm": lambda _: GaussianMixture(n_components=3),
    # n_init is pinned to the scikit-learn 1.2 default so results do not shift
    # when a later release changes that default to "auto".
    "kmeans": lambda _: KMeans(n_clusters=3, n_init=10),
    # "aecm": lambda _: AlternatingECM(n_clusters=3, q=3),
    "proposed": lambda _: ExpectationMaximization(
        n_clusters=3, shared_cov=True, linkage="average", group_search_rng=[2, 3, 4]
    ),
    "spectral": lambda _: SpectralBiClustering(n_clusters=3),
}
run_models(model_to_run, X, y, n_it=10, scaling=True)

Unnamed: 0,accuracy,ARI,execution time,model_name
0,0.709091,0.570165,0.022157,gmm
1,0.765734,0.448355,0.063835,kmeans
2,0.807692,0.528832,0.136656,proposed
3,0.573427,0.237712,0.048054,spectral


## 3.0 Ecoli dataset

In [44]:
ECOLI_DATA_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/ecoli/ecoli.data'
# The file has no header row and is whitespace-delimited. sep=r"\s+" is the
# supported spelling of the deprecated delim_whitespace=True.
data = pd.read_csv(ECOLI_DATA_URL, header=None, sep=r"\s+")

# Column 0 is skipped, the last column supplies the labels, and everything in
# between is cast to float and used as features.
X, y = data.values[:,1:-1].astype('float'), map_labels(data.values[:,-1])

# Display name -> factory for an 8-cluster model; the factory argument is an
# unused placeholder.
model_to_run = {
    "gmm": lambda _: GaussianMixture(n_components=8),
    # n_init is pinned to the scikit-learn 1.2 default so results do not shift
    # when a later release changes that default to "auto".
    "kmeans": lambda _: KMeans(n_clusters=8, n_init=10),
    # "aecm": lambda _: AlternatingECM(n_clusters=8, q=4),
    "proposed": lambda _: ExpectationMaximization(
        n_clusters=8, linkage="average", group_search_rng=[3]
    ),
    "proposed2": lambda _: ExpectationMaximization(
        n_clusters=8, linkage="average", group_search_rng=[3],
        metric="precomputed",
    ),
    "spectral": lambda _: SpectralBiClustering(n_clusters=8),
}

run_models(model_to_run, X, y, n_it=10, scaling=True)

Unnamed: 0,accuracy,ARI,execution time,model_name
0,0.74494,0.644923,0.021043,gmm
1,0.649702,0.505903,0.047007,kmeans
2,0.762202,0.655928,0.171389,proposed
3,0.76131,0.653929,0.170306,proposed2
4,0.537202,0.358701,0.069657,spectral


## 4.0 Alon dataset

In [13]:
# Read the Alon expression table and discard the row with index label 62.
data = pd.read_csv("data/alon/alon-exprs.csv").drop(index=[62])

# The column named `V461\` holds text with one trailing character: strip that
# character, parse the remainder as float and store it under the name `V461`.
data["V461"] = [float(raw[:-1]) for raw in data["V461\\"]]
data = data.drop(columns=["V461\\"])

X = data.values
# Labels come from column `x` of the class file, shifted down by one.
alon_classes = pd.read_csv("data/alon/alon-class.csv")
y = (alon_classes["x"] - 1).values

In [14]:
# Display the (rows, columns) of the cleaned Alon table as a sanity check.
data.shape

(62, 461)

In [15]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

def select_features(X, y, k=100):
    """Return the ``k`` columns of ``X`` that ``SelectKBest`` ranks highest.

    Columns are scored against the labels ``y`` with ``f_classif``.

    NOTE(review): the selection sees ``y``; if the result feeds a clustering
    benchmark that is scored against the same ``y``, label information leaks
    into the features. Confirm this is intended.
    """
    selector = SelectKBest(f_classif, k=k)
    return selector.fit_transform(X, y)

def select_random_features(X, k=100):
    """Return ``k`` distinct columns of ``X`` chosen at random.

    Column indices are drawn without replacement from NumPy's global random
    generator, and the columns are returned in the order drawn.
    """
    column_ids = np.arange(X.shape[1])
    chosen = np.random.choice(column_ids, k, replace=False)
    return X[:, chosen]

In [17]:
# Running on a subset of 100 best predictors
# Display name -> factory for a 2-cluster model; the factory argument is an
# unused placeholder.
model_to_run = {
    "gmm": lambda _: GaussianMixture(n_components=2),
    # n_init is pinned to the scikit-learn 1.2 default so results do not shift
    # when a later release changes that default to "auto".
    "kmeans": lambda _: KMeans(n_clusters=2, n_init=10),
    # "aecm": lambda _: AlternatingECM(n_clusters=2, q=5),
    "proposed": lambda _: ExpectationMaximization(
        n_clusters=2, linkage="average", group_search_rng=[3, 4, 5]
    ),
    "proposed2": lambda _: ExpectationMaximization(
        n_clusters=2, linkage="average", group_search_rng=[3, 4, 5],
        metric="precomputed",
    ),
    "spectral": lambda _: SpectralBiClustering(n_clusters=2),
}

n_features = 100
# NOTE(review): the features are selected using y, and the models are then
# scored against the same y; confirm this is intended.
X_selected = select_features(X, y, k=n_features)
run_models(model_to_run, X_selected, y, n_it=5, scaling=True)

Unnamed: 0,accuracy,ARI,execution time,model_name
0,0.874194,0.553512,0.023636,gmm
1,0.887097,0.592196,0.035228,kmeans
2,0.887097,0.592196,0.686413,proposed
3,0.887097,0.592196,0.716029,proposed2
4,0.83871,0.450288,0.047934,spectral


In [19]:
# Running clustering on whole dataset
# Display name -> factory for a 2-cluster model; the factory argument is an
# unused placeholder.
# NOTE(review): "proposed" uses linkage="complete" here while "proposed2" uses
# linkage="average"; confirm the difference is deliberate.
model_to_run = {
    "gmm": lambda _: GaussianMixture(n_components=2),
    # n_init is pinned to the scikit-learn 1.2 default so results do not shift
    # when a later release changes that default to "auto".
    "kmeans": lambda _: KMeans(n_clusters=2, n_init=10),
    # "aecm": lambda _: AlternatingECM(n_clusters=2, q=5),
    "proposed": lambda _: ExpectationMaximization(
        n_clusters=2, linkage="complete", group_search_rng=[2]
    ),
    "proposed2": lambda _: ExpectationMaximization(
        n_clusters=2, linkage="average", group_search_rng=[2],
        metric="precomputed",
    ),
    "spectral": lambda _: SpectralBiClustering(n_clusters=2),
}

run_models(model_to_run, X, y, n_it=5, scaling=True)

Unnamed: 0,accuracy,ARI,execution time,model_name
0,0.609677,0.051039,0.085572,gmm
1,0.551613,-0.004561,0.032064,kmeans
2,0.535484,-0.015192,6.425758,proposed
3,0.548387,-0.002739,5.725083,proposed2
4,0.548387,-0.006395,0.068162,spectral


## 5.0 Golub dataset

In [20]:
data = pd.read_csv("data/golub/golub.csv")
# Every column except `class` is a feature; the labels are `class` minus one.
X = data.drop(columns=["class"]).values
y = data["class"].values - 1

In [21]:
# Display the (samples, features) of the Golub feature matrix as a sanity check.
X.shape

(72, 2030)

In [26]:
# Running on whole dataset
# Display name -> factory for a 2-cluster model; the factory argument is an
# unused placeholder.
model_to_run = {
    "gmm": lambda _: GaussianMixture(n_components=2),
    # n_init is pinned to the scikit-learn 1.2 default so results do not shift
    # when a later release changes that default to "auto".
    "kmeans": lambda _: KMeans(n_clusters=2, n_init=10),
    # "aecm": lambda _: AlternatingECM(n_clusters=2, q=5),
    "proposed": lambda _: ExpectationMaximization(
        n_clusters=2, linkage="average", group_search_rng=[2, 3, 4]
    ),
    "proposed2": lambda _: ExpectationMaximization(
        n_clusters=2, linkage="average", group_search_rng=[2, 3, 4],
        metric="precomputed",
    ),
    "spectral": lambda _: SpectralBiClustering(n_clusters=2),
}

run_models(model_to_run, X, y, n_it=4, scaling=True)

Unnamed: 0,accuracy,ARI,execution time,model_name
0,0.777778,0.342555,0.835701,gmm
1,0.736111,0.213002,0.038591,kmeans
2,0.701389,0.151831,802.528918,proposed
3,0.736111,0.213802,137.855231,proposed2
4,0.722222,0.185869,0.086695,spectral


In [23]:
# Running on a subset of 100 best predictors

# Display name -> factory for a 2-cluster model; the factory argument is an
# unused placeholder.
model_to_run = {
    "gmm": lambda _: GaussianMixture(n_components=2),
    # n_init is pinned to the scikit-learn 1.2 default so results do not shift
    # when a later release changes that default to "auto".
    "kmeans": lambda _: KMeans(n_clusters=2, n_init=10),
    # "aecm": lambda _: AlternatingECM(n_clusters=2, q=3),
    "proposed": lambda _: ExpectationMaximization(
        n_clusters=2, linkage="average", group_search_rng=[3, 4, 5, 6]
    ),
    "proposed2": lambda _: ExpectationMaximization(
        n_clusters=2, linkage="average", group_search_rng=[3, 4, 5, 6],
        metric="precomputed",
    ),
    "spectral": lambda _: SpectralBiClustering(n_clusters=2),
}


n_features = 100
# NOTE(review): the features are selected using y, and the models are then
# scored against the same y; confirm this is intended.
X_selected = select_features(X, y, k=n_features)
run_models(model_to_run, X_selected, y, n_it=5, scaling=True)

Unnamed: 0,accuracy,ARI,execution time,model_name
0,0.972222,0.889738,0.020149,gmm
1,0.972222,0.889738,0.029761,kmeans
2,0.972222,0.889738,0.917322,proposed
3,0.972222,0.889738,1.409838,proposed2
4,0.958333,0.837557,0.050788,spectral
