In [1]:
import numpy as np
from sklearn import datasets
from sklearn.cluster import AgglomerativeClustering, KMeans, SpectralClustering
from sklearn.base import BaseEstimator, ClusterMixin
from scipy.sparse.csgraph import connected_components
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import minimum_spanning_tree
from sklearn.pipeline import Pipeline

In [2]:
# Load iris dataset and seperate it to features and labels
iris = datasets.load_iris()
irisX = iris.data
irisY = iris.target

In [3]:
# Load breast cancer dataset and seperate it to features and labels
breasC = datasets.load_breast_cancer()
breasX = breasC.data
breasY = breasC.target

In [4]:
# Load optidigits dataset and seperate it to features and labels
optidigits = datasets.load_digits()
optidigitsX = optidigits.data
optidigitsY = optidigits.target

In [5]:
# Load yeast dataset and seperate it to features and labels
yeastTxt = open("yeast.data", "r")
yeastLines = yeastTxt.readlines()
firstData = []
trainData = []
lastData = []
for line in yeastLines:
    line = line.replace("\n", "")
    split = line.split("  ")
    firstData.append(split[0])
    lastData.append(split[-1])
    trainData.append([split[1], split[2], split[3], split[4], split[5], split[6], split[7], split[8]])
    
yeastFirst = np.asarray(firstData)
yeastX = np.asarray(trainData)
yeastLast = np.asarray(lastData)

In [6]:
# creating coassociation matrix 
def create_coassociation_matrix(labels):
    rows = []
    cols = []
    unique_labels = set(labels)
    for label in unique_labels:
        indices = np.where(labels == label)[0]
        for index1 in indices:
            for index2 in indices:
                rows.append(index1)
                cols.append(index2)
                
    data = np.ones((len(rows),))
    return csr_matrix((data, (rows, cols)), dtype='float')

In [74]:
def _generate_similarity_mat(labels):
    l_mat = np.repeat(np.asarray(labels).reshape(-1,1), len(labels), axis=1)
    l_mat_t = l_mat.T

    sim_mat = np.equal(l_mat, l_mat_t).astype(int)
    return sim_mat


def overall_quality(y,y_hat):
    return np.mean(y == y_hat) * 100 

from scipy.cluster.hierarchy import fcluster
from scipy.cluster.hierarchy import linkage

# Class of Evidence Accumulation Cluster which has different kind of clustering algorithm
# such as kmeans single link and average link methods
class EAC(BaseEstimator, ClusterMixin):
    def __init__(self, k, parameters_SC, cut_threshold=0.5):
        self.cut_threshold = cut_threshold
        self.k = k
        self.parameters_SC = parameters_SC
    def fit(self, X, y=None, method = None):
        coassociation_algorithms = []
        for i in self.k:
            coassociation_algorithms.append(create_coassociation_matrix(self._kmeans_clustering(X,i)))
            coassociation_algorithms.append(create_coassociation_matrix(self._singleLink_clustering(X,i)))
        for parameters in self.parameters_SC:
            coassociation_algorithms.append(create_coassociation_matrix(self._spectral_clustering(X, parameters)))
        
        C = sum(coassociation_algorithms)
        
        # get the average of the similarity mat
        avgC = np.divide(C.toarray(), (len(self.k)*2 + len(self.parameters_SC)))

        # flip the similarity. smaller value implies more similarity
        avgC = np.abs(np.max(avgC) - avgC)

        # build clusters
        self.Z_ = linkage(avgC, method="single")
        self.labels_ = fcluster(self.Z_, min(self.k), criterion='inconsistent')
        
        """
        mst = minimum_spanning_tree(-C)
        mst.data[mst.data > -self.cut_threshold] = 0
        self.n_components, self.labels_ = connected_components(mst)
        self.coassociation_matrix = coassociation_algorithms
        """
        return self

    def _kmeans_clustering(self, X, k):
        km = KMeans(n_clusters=k)
        return km.fit_predict(X)
    
    def _averageLink_clustering(self, X, k):
        alCluster = AgglomerativeClustering(n_clusters = k, linkage = "average")
        alCluster.fit(X)
        return alCluster.labels_
    
    def _singleLink_clustering(self, X, k):
        slCluster = AgglomerativeClustering(n_clusters = k, linkage = "single")
        slCluster.fit(X)
        return slCluster.labels_
    
    def _spectral_clustering(self, X, parameters):
        sc = SpectralClustering(n_clusters=parameters[0])
        return sc.fit_predict(X)

In [60]:
# For iris dataset
k = [3,5,10,12,15]
parameters_SC = []
eac1 = EAC(k, parameters_SC, cut_threshold=0.9)
eac1.fit(irisX)
print(overall_quality(irisY, (eac1.labels_ - 1)))
eac1.labels_

68.66666666666667




array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 2, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], dtype=int32)

In [61]:
# For iris dataset
k = [3,5,10,12,15]
parameters_SC = [(3,0.1), (12,0.1)]
eac2 = EAC(k, parameters_SC, cut_threshold=0.1)
eac2.fit(irisX)
print(overall_quality(irisY, (eac2.labels_ - 1)))
(eac2.labels_ - 1)

68.66666666666667




array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int32)

In [75]:
# For iris dataset
k = [2, 3, 5, 10]
parameters_SC = []
eac3 = EAC(k, parameters_SC, cut_threshold = 0.5)
eac3.fit(breasX)
print(overall_quality(breasY, (eac3.labels_ - 1)))
eac3.labels_
breasY

37.258347978910365




array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

In [68]:
eac3.labels_ - 1

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
# -*- coding: utf-8 -*-
"""Combining multiple clusterings using evidence accumulation (EAC).
"""
# Author: Yue Zhao <zhaoy@cmu.edu>
# License: BSD 2 clause

import warnings
import numpy as np

from scipy.cluster.hierarchy import fcluster
from scipy.cluster.hierarchy import linkage
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted

from pyod.utils.utility import check_parameter

from .base import BaseAggregator


def _generate_similarity_mat(labels):
    l_mat = np.repeat(labels, len(labels), axis=1)
    l_mat_t = l_mat.T

    sim_mat = np.equal(l_mat, l_mat_t).astype(int)
    return sim_mat

def overall_quality(y,y_hat):
    return np.mean(y == y_hat) * 100 


class EAC(BaseAggregator):

    def __init__(self, base_estimators, n_clusters, linkage_method='single',
                 weights=None, pre_fitted=False):

        super(EAC, self).__init__(
            base_estimators=base_estimators, pre_fitted=pre_fitted)

        check_parameter(n_clusters, low=2, param_name='n_clusters')
        self.n_clusters = n_clusters

        # set estimator weights
        self._set_weights(weights)

        self.linkage_method = linkage_method

    def fit(self, X):
        # Validate inputs X
        X = check_array(X)
        n_samples = X.shape[0]

        # initialize similarity matrix
        sim_mat_all = np.zeros([n_samples, n_samples])

        if self.pre_fitted:
            print("Training Skipped")

        else:
            for clf in self.base_estimators:
                clf.fit(X)
                clf.fitted_ = True

        for i, estimator in enumerate(self.base_estimators):
            check_is_fitted(estimator, ['labels_'])

            # get the labels from each base estimator
            labels = estimator.labels_.reshape(n_samples, 1)

            # generate the similarity matrix for the current estimator
            sim_mat = _generate_similarity_mat(labels)

            # add to the main similarity mat
            sim_mat_all = sim_mat_all + sim_mat

        # get the average of the similarity mat
        sim_mat_avg = np.divide(sim_mat_all, self.n_base_estimators_)

        # flip the similarity. smaller value implies more similarity
        sim_mat_avg = np.abs(np.max(sim_mat_avg) - sim_mat_avg)

        # build clusters
        self.Z_ = linkage(sim_mat_avg, method=self.linkage_method)
        self.labels_ = fcluster(self.Z_, self.n_clusters, criterion='maxclust')

        # it may leads to different number of clusters as specified by the user
        if len(np.unique(self.labels_)) != self.n_clusters:
            warnings.warn(
                'EAC generates {n} clusters instead of {n_clusters}'.format(
                    n=len(np.unique(self.labels_)),
                    n_clusters=self.n_clusters))

        return self

    def fit_predict(self, X, y=None):
        self.fit(X)
        return self.labels_