In [1]:
import numpy as np
from sklearn import datasets
from sklearn.cluster import AgglomerativeClustering, KMeans, SpectralClustering
from sklearn.base import BaseEstimator, ClusterMixin
from scipy.sparse.csgraph import connected_components
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import minimum_spanning_tree
from sklearn.pipeline import Pipeline

In [2]:
# Load the iris dataset and separate it into features and labels
iris = datasets.load_iris()
irisX, irisY = iris.data, iris.target
iris

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [3]:
# Load the breast cancer dataset and separate it into features and labels
breasC = datasets.load_breast_cancer()
breasX, breasY = breasC.data, breasC.target

In [4]:
# Load the digits dataset (datasets.load_digits) and separate it into features and labels
optidigits = datasets.load_digits()
optidigitsX, optidigitsY = optidigits.data, optidigits.target

In [5]:
# Load the yeast dataset and separate it into record names, features and labels.
# Every non-empty line of "yeast.data" is read as whitespace-separated fields:
# field 0 goes to yeastFirst, fields 1..8 to yeastX and the last field to yeastLast.
with open("yeast.data", "r") as yeastTxt:  # context manager closes the file handle
    yeastLines = yeastTxt.readlines()

firstData = []
trainData = []
lastData = []
for lineNumber, line in enumerate(yeastLines, start=1):
    # split() with no argument tolerates any run of spaces/tabs and strips the newline
    fields = line.split()
    if not fields:
        # skip blank lines (e.g. a trailing empty line at the end of the file)
        continue
    if len(fields) < 9:
        raise ValueError(
            "yeast.data line %d has %d fields, expected at least 9" % (lineNumber, len(fields))
        )
    firstData.append(fields[0])
    lastData.append(fields[-1])
    trainData.append(fields[1:9])

yeastFirst = np.asarray(firstData)
# Convert the features to floats so the matrix can be fed to the clustering estimators
yeastX = np.asarray(trainData, dtype=float)
yeastLast = np.asarray(lastData)

In [6]:
# creating coassociation matrix 
def create_coassociation_matrix(labels):
    """Build the binary co-association matrix of a single partition.

    Entry (i, j) is 1.0 when samples i and j carry the same label, and is
    absent (implicitly 0) otherwise. The diagonal is therefore always set.

    Parameters
    ----------
    labels : array-like of shape (n_samples,)
        Cluster label of every sample. Lists and tuples are accepted as well
        as NumPy arrays.

    Returns
    -------
    scipy.sparse.csr_matrix of shape (n_samples, n_samples), dtype float
    """
    # Coerce to an array: with a plain list, ``labels == label`` would be a
    # single Python bool and no sample index would ever be found.
    labels = np.asarray(labels).ravel()
    n_samples = labels.size

    row_chunks = []
    col_chunks = []
    for label in np.unique(labels):
        indices = np.flatnonzero(labels == label)
        # All ordered pairs (i, j) inside this cluster, without a Python double loop
        row_chunks.append(np.repeat(indices, indices.size))
        col_chunks.append(np.tile(indices, indices.size))

    if row_chunks:
        rows = np.concatenate(row_chunks)
        cols = np.concatenate(col_chunks)
    else:
        # np.concatenate rejects an empty list
        rows = np.empty(0, dtype=int)
        cols = np.empty(0, dtype=int)

    data = np.ones((rows.size,))
    # Pass the shape explicitly instead of letting scipy infer it from the indices
    return csr_matrix((data, (rows, cols)), shape=(n_samples, n_samples), dtype='float')

In [32]:
def overall_quality(y,y_hat):
    """Return the percentage of positions where ``y`` and ``y_hat`` are equal.

    The comparison is literal: label ids are compared as-is, so predicted
    cluster ids must already use the same numbering as ``y`` for the score to
    be meaningful.

    Parameters
    ----------
    y : array-like
        Reference labels.
    y_hat : array-like
        Predicted labels, same shape as ``y``.

    Returns
    -------
    float
        Share of matching entries, in percent (0-100).

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # Coerce to arrays so that lists are compared element-wise rather than as
    # one Python object (which would yield only 0 or 100).
    y = np.asarray(y)
    y_hat = np.asarray(y_hat)
    if y.shape != y_hat.shape:
        # Avoid silent broadcasting, e.g. (n,) against (n, 1)
        raise ValueError(
            "y and y_hat must have the same shape, got %s and %s" % (y.shape, y_hat.shape)
        )
    return np.mean(y == y_hat) * 100

# Evidence Accumulation Clustering (EAC): an ensemble that combines k-means,
# single-link and spectral partitions through their co-association matrices.
class EAC(BaseEstimator, ClusterMixin):
    """Evidence Accumulation Clustering ensemble.

    Several base partitions of the same data are turned into co-association
    matrices and averaged. The minimum spanning tree of the negated average is
    computed, edges whose co-association is below ``cut_threshold`` are
    removed, and the remaining connected components become the final clusters.

    Parameters
    ----------
    k : iterable of int
        Cluster counts. For every value, one k-means partition and one
        single-link agglomerative partition are added to the ensemble.
    parameters_SC : iterable of sequences
        One spectral-clustering partition is added per entry; ``entry[0]`` is
        used as ``n_clusters``.
        NOTE(review): any further elements of an entry are currently ignored;
        presumably a kernel parameter was intended - confirm before wiring it in.
    cut_threshold : float, default=0.5
        Minimum fraction (0..1) of base partitions in which two samples must
        share a cluster for the spanning-tree edge between them to be kept.
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``KMeans`` and ``SpectralClustering`` so that runs can be
        made reproducible. ``None`` keeps the estimators' default behaviour.

    Attributes
    ----------
    labels_ : ndarray
        Connected-component label of every sample.
    n_components : int
        Number of connected components (final clusters).
    coassociation_matrix : list of csr_matrix
        The individual co-association matrices, one per base partition.
    """

    def __init__(self, k, parameters_SC, cut_threshold=0.5, random_state=None):
        self.cut_threshold = cut_threshold
        self.k = k
        self.parameters_SC = parameters_SC
        self.random_state = random_state

    def fit(self, X, y=None, method = None):
        """Run every base clustering on ``X`` and derive the consensus labels.

        ``y`` and ``method`` are not used; they are kept so existing calls
        keep working.

        Raises
        ------
        ValueError
            If both ``k`` and ``parameters_SC`` are empty.
        """
        coassociation_algorithms = []
        for n_clusters in self.k:
            coassociation_algorithms.append(
                create_coassociation_matrix(self._kmeans_clustering(X, n_clusters)))
            coassociation_algorithms.append(
                create_coassociation_matrix(self._singleLink_clustering(X, n_clusters)))
        for parameters in self.parameters_SC:
            coassociation_algorithms.append(
                create_coassociation_matrix(self._spectral_clustering(X, parameters)))

        if not coassociation_algorithms:
            raise ValueError(
                "EAC needs at least one base clustering: k and parameters_SC are both empty")

        # Average (not just sum) the matrices so each entry is the fraction of
        # partitions that co-cluster the pair; otherwise every existing edge has
        # weight >= 1 and a cut_threshold in [0, 1] could never remove anything.
        C = sum(coassociation_algorithms) / len(coassociation_algorithms)
        # Minimum spanning tree of the negated weights = maximum spanning tree
        mst = minimum_spanning_tree(-C)
        mst.data[mst.data > -self.cut_threshold] = 0
        # Explicitly stored zeros still count as edges for csgraph routines,
        # so they must be dropped for the cut to take effect.
        mst.eliminate_zeros()
        self.n_components, self.labels_ = connected_components(mst)
        self.coassociation_matrix = coassociation_algorithms
        return self

    def _kmeans_clustering(self, X, k):
        """Return k-means labels of ``X`` for ``k`` clusters."""
        km = KMeans(n_clusters=k, random_state=self.random_state)
        return km.fit_predict(X)

    def _averageLink_clustering(self, X, k):
        """Return average-link agglomerative labels of ``X`` for ``k`` clusters."""
        alCluster = AgglomerativeClustering(n_clusters = k, linkage = "average")
        alCluster.fit(X)
        return alCluster.labels_

    def _singleLink_clustering(self, X, k):
        """Return single-link agglomerative labels of ``X`` for ``k`` clusters."""
        slCluster = AgglomerativeClustering(n_clusters = k, linkage = "single")
        slCluster.fit(X)
        return slCluster.labels_

    def _spectral_clustering(self, X, parameters):
        """Return spectral-clustering labels of ``X``; ``parameters[0]`` is n_clusters."""
        sc = SpectralClustering(n_clusters=parameters[0], random_state=self.random_state)
        return sc.fit_predict(X)
        

In [33]:
# Iris: ensemble built from k-means and single-link partitions only
# (parameters_SC is empty, so no spectral clustering is added)
k = [3, 5, 10, 12, 15]
parameters_SC = []
eac1 = EAC(k, parameters_SC, cut_threshold=0.3).fit(irisX)
print(overall_quality(irisY, eac1.labels_))
eac1.labels_

66.66666666666666


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

In [34]:
# Iris: same k-means / single-link ensemble, plus two spectral-clustering
# partitions (the first element of each tuple is its number of clusters)
k = [3, 5, 10, 12, 15]
parameters_SC = [(3, 0.1), (12, 0.1)]
eac2 = EAC(k, parameters_SC, cut_threshold=0.1).fit(irisX)
print(overall_quality(irisY, eac2.labels_))
eac2.labels_

66.66666666666666


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)