In [2]:
import pandas as pd
import numpy as np

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
from utils.em_clusterization import ExpectationMaximization, AlternatingECM
from utils.misc import cluster_accuracy, map_labels
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans

In [5]:
# Fix the seed of NumPy's global random generator before any model is run.
np.random.seed(139)

In [6]:
from sklearn.cluster import SpectralCoclustering

class SpectralBiClustering:
    """Adapter that exposes ``SpectralCoclustering`` through ``fit_predict``.

    Only the row labels of the fitted co-clustering are reported.
    """

    def __init__(self, n_clusters):
        """Build the wrapped estimator with ``n_clusters`` clusters."""
        self.model = SpectralCoclustering(n_clusters=n_clusters)

    def fit_predict(self, X):
        """Fit the wrapped estimator on ``X`` and return its ``row_labels_``."""
        estimator = self.model
        estimator.fit(X)
        return estimator.row_labels_


## 1.0 Wine dataset

In [111]:
import time
from sklearn.preprocessing import scale
from sklearn.metrics.cluster import adjusted_rand_score


def run_models(models, X, y, scaling=False, n_it=10):
    """Benchmark several clustering models on one labelled dataset.

    Each model is rebuilt and fitted ``n_it`` times; the scores and the
    fit times of those runs are averaged per model.

    Parameters
    ----------
    models : dict
        Maps a model name to a one-argument factory. The factory is called
        with ``True`` and must return an object exposing ``fit_predict(X)``.
    X : array-like
        Observations handed to ``fit_predict``.
    y : array-like
        Reference labels, used only for scoring the predictions.
    scaling : bool, default False
        When true, ``X`` is first passed through ``scale``.
    n_it : int, default 10
        Number of independent fits per model.

    Returns
    -------
    pandas.DataFrame
        One row per model (in the iteration order of ``models``) with the
        columns ``accuracy``, ``ARI``, ``execution time`` (seconds) and
        ``model_name``.
    """
    result = {"accuracy": [], "ARI": [], "execution time": [], "model_name": []}
    if scaling:
        X = scale(X)

    for model_name, build_model in models.items():
        accuracies = []
        ari_scores = []
        exec_times = []

        for _ in range(n_it):
            # A fresh estimator per repetition so no fitted state carries over.
            model = build_model(True)

            # perf_counter is monotonic and high-resolution, which matters for
            # fits that finish in a few milliseconds; time.time() is neither.
            started = time.perf_counter()
            pred = model.fit_predict(X)
            elapsed = time.perf_counter() - started

            exec_times.append(elapsed)
            ari_scores.append(adjusted_rand_score(y, pred))
            # Index 1 of the helper's return value is used as the accuracy.
            accuracies.append(cluster_accuracy(y, pred)[1])

        result["accuracy"].append(np.mean(accuracies))
        result["ARI"].append(np.mean(ari_scores))
        result["execution time"].append(np.mean(exec_times))
        result["model_name"].append(model_name)

    return pd.DataFrame(result)
            

            

In [112]:
from sklearn.datasets import load_wine

# Load the wine data: "data" holds the features, "target" the reference labels.
dataset = load_wine()
X = dataset["data"]
y = dataset["target"]

In [113]:
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans

# Model factories for the wine benchmark (3 clusters each).
# run_models calls each factory with one positional argument, which is ignored.
model_to_run = {
    "gmm": lambda _unused: GaussianMixture(n_components=3),
    "kmeans": lambda _unused: KMeans(n_clusters=3),
    "aecm": lambda _unused: AlternatingECM(n_clusters=3, q=5),
    "proposed": lambda _unused: ExpectationMaximization(
        n_clusters=3, linkage="average", group_search_rng=[2, 3, 4]
    ),
    "spectral": lambda _unused: SpectralBiClustering(n_clusters=3),
}

In [114]:
# Benchmark every wine model: 10 fits each, with scaling of X switched on.
run_models(model_to_run, X, y, n_it=10, scaling=True)

Unnamed: 0,accuracy,ARI,execution time,model_name
0,0.927528,0.831308,0.009522,gmm
1,0.966292,0.897495,0.015762,kmeans
2,0.983708,0.948791,4.320042,aecm
3,0.983146,0.945885,0.12025,proposed
4,0.909551,0.738716,0.020445,spectral


## 2.0 Olive dataset

In [111]:
OLIVE_DATA_URL = 'https://www.scss.tcd.ie/~arwhite/Teaching/STU33011/olive.csv'
data = pd.read_csv(OLIVE_DATA_URL)

# Column 0 supplies the labels; columns 2 onward are the features.
# Use data[:,1] for area
olive_values = data.values
X = olive_values[:, 2:]
y = map_labels(olive_values[:, 0])

# Model factories for the olive benchmark (3 clusters each); the argument is ignored.
model_to_run = {
    "gmm": lambda _unused: GaussianMixture(n_components=3),
    "kmeans": lambda _unused: KMeans(n_clusters=3),
    "aecm": lambda _unused: AlternatingECM(n_clusters=3, q=3),
    "proposed": lambda _unused: ExpectationMaximization(
        n_clusters=3, linkage="average", group_search_rng=[2, 3, 4]
    ),
    "spectral": lambda _unused: SpectralBiClustering(n_clusters=3),
}

run_models(model_to_run, X, y, n_it=10, scaling=True)

Unnamed: 0,accuracy,ARI,execution time,model_name
0,0.781818,0.60109,0.019907,gmm
1,0.765734,0.448355,0.047953,kmeans
2,0.791958,0.51727,19.623902,aecm
3,0.804196,0.574351,0.16904,proposed
4,0.573427,0.237712,0.042532,spectral


## 3.0 Ecoli dataset

In [116]:
ECOLI_DATA_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/ecoli/ecoli.data'
# The file is whitespace-separated and has no header row. sep=r"\s+" is the
# documented equivalent of delim_whitespace=True, which pandas has deprecated.
data = pd.read_csv(ECOLI_DATA_URL, header=None, sep=r"\s+")

# Column 0 is skipped, the last column supplies the labels, and the columns
# in between are cast to float and used as features.
X, y = data.values[:,1:-1].astype('float'), map_labels(data.values[:,-1])

# Model factories for the ecoli benchmark (8 clusters each); the argument is ignored.
model_to_run = {"gmm" : lambda x: GaussianMixture(n_components=8),
                "kmeans": lambda x: KMeans(n_clusters=8),
               # AECM is left out of this run; entry kept for reference.
               # "aecm": lambda x: AlternatingECM(n_clusters=8, q=4),
                "proposed": lambda x: ExpectationMaximization(n_clusters=8, linkage="average", group_search_rng=[3]),
                "spectral": lambda x: SpectralBiClustering(n_clusters=8)}

run_models(model_to_run, X, y, n_it=10, scaling=True)

Unnamed: 0,accuracy,ARI,execution time,model_name
0,0.760417,0.652883,0.017651,gmm
1,0.647917,0.504246,0.053712,kmeans
2,0.7625,0.656224,0.138954,proposed
3,0.564286,0.394227,0.048215,spectral


## 4.0 Alon dataset

In [132]:
# Load the Alon expression table, drop the row labelled 62, and replace the
# text column named V461\ by a float column V461 (last character stripped).
data = (
    pd.read_csv("data/alon/alon-exprs.csv")
    .drop(index=[62])
    .assign(V461=lambda frame: frame["V461\\"].apply(lambda v: float(v[:-1])))
    .drop(columns=["V461\\"])
)

X = data.values
# Labels come from column "x" of the class file, shifted down by one.
y = (pd.read_csv("data/alon/alon-class.csv")["x"] - 1).values

In [133]:
# Display the (rows, columns) dimensions of the Alon frame.
data.shape

(62, 461)

In [118]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

def select_features(X, y, k=100):
    """Return the ``k`` columns of ``X`` chosen by ``SelectKBest`` with ``f_classif``.

    The labels ``y`` are used to score the columns, so the selection is supervised.
    """
    selector = SelectKBest(f_classif, k=k)
    return selector.fit_transform(X, y)

def select_random_features(X, k=100):
    """Return ``k`` distinct columns of ``X`` drawn with NumPy's global RNG."""
    column_ids = np.arange(X.shape[1])
    chosen = np.random.choice(column_ids, size=k, replace=False)
    return X[:, chosen]

In [119]:
# Benchmark on the 100 columns returned by select_features.
# Model factories use 2 clusters each; the argument is ignored.
model_to_run = {
    "gmm": lambda _unused: GaussianMixture(n_components=2),
    "kmeans": lambda _unused: KMeans(n_clusters=2),
    "aecm": lambda _unused: AlternatingECM(n_clusters=2, q=5),
    "proposed": lambda _unused: ExpectationMaximization(
        n_clusters=2, linkage="average", group_search_rng=[3, 4, 5]
    ),
    "spectral": lambda _unused: SpectralBiClustering(n_clusters=2),
}

n_features = 100
X_selected = select_features(X, y, k=n_features)
run_models(model_to_run, X_selected, y, n_it=5, scaling=True)

Unnamed: 0,accuracy,ARI,execution time,model_name
0,0.874194,0.553512,0.013955,gmm
1,0.887097,0.592196,0.013017,kmeans
2,0.887097,0.592196,114.222108,aecm
3,0.887097,0.592196,0.812161,proposed
4,0.83871,0.450288,0.026637,spectral


In [129]:
# Benchmark on the whole dataset (all columns of X).
# Model factories use 2 clusters each; the argument is ignored.
model_to_run = {
    "gmm": lambda _unused: GaussianMixture(n_components=2),
    "kmeans": lambda _unused: KMeans(n_clusters=2),
    # AECM is left out of this run; entry kept for reference.
    # "aecm": lambda x: AlternatingECM(n_clusters=2, q=5),
    "proposed": lambda _unused: ExpectationMaximization(
        n_clusters=2, linkage="complete", group_search_rng=[2]
    ),
    "spectral": lambda _unused: SpectralBiClustering(n_clusters=2),
}

run_models(model_to_run, X, y, n_it=5, scaling=True)

Unnamed: 0,accuracy,ARI,execution time,model_name
0,0.583871,0.04224,0.049828,gmm
1,0.551613,-0.003116,0.015926,kmeans
2,0.554839,0.001733,7.829108,proposed
3,0.548387,-0.006395,0.037958,spectral


## 5.0 Golub dataset

In [134]:
# Load the Golub table: every column except "class" is a feature, and the
# "class" column shifted down by one gives the labels.
data = pd.read_csv("data/golub/golub.csv")
X = data.drop(columns=["class"]).values
y = data["class"].values - 1

In [135]:
# Display the dimensions of the Golub feature matrix.
X.shape

(72, 2030)

In [95]:
# Benchmark on the whole dataset (all columns of X), a single fit per model.
# Model factories use 2 clusters each; the argument is ignored.
model_to_run = {
    "gmm": lambda _unused: GaussianMixture(n_components=2),
    "kmeans": lambda _unused: KMeans(n_clusters=2),
    # AECM is left out of this run; entry kept for reference.
    # "aecm": lambda x: AlternatingECM(n_clusters=2, q=5),
    "proposed": lambda _unused: ExpectationMaximization(
        n_clusters=2, linkage="average", group_search_rng=[2, 3, 4]
    ),
    "spectral": lambda _unused: SpectralBiClustering(n_clusters=2),
}

run_models(model_to_run, X, y, n_it=1, scaling=True)

[224.0, 1806.0]
[1869.0, 161.0]


Unnamed: 0,accuracy,ARI,execution time,model_name
0,0.708333,0.161019,0.83195,gmm
1,0.708333,0.162078,0.011937,kmeans
2,0.736111,0.212943,271.23691,proposed
3,0.722222,0.185869,0.18632,spectral


In [124]:
# Benchmark on the 100 columns returned by select_features.
# Model factories use 2 clusters each; the argument is ignored.
model_to_run = {
    "gmm": lambda _unused: GaussianMixture(n_components=2),
    "kmeans": lambda _unused: KMeans(n_clusters=2),
    "aecm": lambda _unused: AlternatingECM(n_clusters=2, q=3),
    "proposed": lambda _unused: ExpectationMaximization(
        n_clusters=2, linkage="average", group_search_rng=[3, 4, 5, 6]
    ),
    "spectral": lambda _unused: SpectralBiClustering(n_clusters=2),
}

n_features = 100
X_selected = select_features(X, y, k=n_features)
run_models(model_to_run, X_selected, y, n_it=5, scaling=True)

Unnamed: 0,accuracy,ARI,execution time,model_name
0,0.972222,0.889738,0.012788,gmm
1,0.972222,0.889738,0.013169,kmeans
2,0.972222,0.889738,21.734696,aecm
3,0.972222,0.889738,0.697017,proposed
4,0.958333,0.837557,0.017515,spectral
