In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import *
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from ntpath import split, basename
from os import stat

In [2]:
def readData(path, label_col = None, return_dataframe = False):
    dataframe = pd.read_csv(path)
    if(label_col is None):
        X = dataframe.to_numpy()
    else:
        X = np.array(dataframe.drop(label_col, axis = 1))
    
    if(return_dataframe):
        return X, dataframe
    return X

In [3]:
def cluster(method, parameters = {}):
    methods = {"KMeans" : KMeans,
               "AffinityPropagation" : AffinityPropagation,
               "AgglomerativeClustering" : AgglomerativeClustering,
               "Birch" : Birch,
               "DBSCAN" : DBSCAN,
               "FeatureAgglomeration" : FeatureAgglomeration,
               "MiniBatchKMeans" : MiniBatchKMeans,
               "MeanShift" : MeanShift,
               "OPTICS" : OPTICS,
               "SpectralClustering" : SpectralClustering,
               "SpectralBiclustering" : SpectralBiclustering,
               "SpectralCoclustering" : SpectralCoclustering,}
               
    return methods[method](**parameters)

In [4]:
def getScores(X, labels):
    slt_score = silhouette_score(X, labels)
    db_score = davies_bouldin_score(X, labels)
    return (slt_score, db_score)

In [5]:
def fileNameFromPath(path):
    head, tail = split(path)
    return tail or basename(head)

In [6]:
def writeData(file_name, method, slt_score, db_score, k = '-', seed = '-'): 
    file = open(file_name, 'a')
    if(stat(file_name).st_size == 0):
        file.write("method,n_cluster,seed,silhoutte,db\n")
    
    file.write(method + ',' + k + ',' + seed + ',' + slt_score + ',' + db_score + '\n')
    file.close()

In [7]:
def analyze(path, method, file_name, parameters = {}, k_range = None, seed_range = None, label_col = None):
    dataset = readData(path, label_col)
    if(k_range is not None and seed_range is not None):
        for x in range(k_range[0], k_range[1]):
            if(method == "SpectralBiclustering"):
                parameters["n_clusters"] = (x, dataset.shape[1])
            else:
                parameters["n_clusters"] = x
            for y in range(seed_range[0], seed_range[1]):
                parameters["random_state"] = y
                cluster_method = cluster(method, parameters)
                cluster_method.fit(dataset)
                if(method == "SpectralBiclustering" or method == "SpectralCoclustering"):
                    scores = getScores(dataset, cluster_method.row_labels_)
                else:
                    scores = getScores(dataset, cluster_method.labels_)
                writeData(file_name, method, k = str(x), seed = str(y), slt_score = str(scores[0]), db_score = str(scores[1]))
    elif(k_range is not None and seed_range is None):
        for x in range(k_range[0], k_range[1]):
            parameters["n_clusters"] = x
            cluster_method = cluster(method, parameters)
            cluster_method.fit(dataset)
            scores = getScores(dataset, cluster_method.labels_)
            writeData(file_name, method, k = str(x), slt_score = str(scores[0]), db_score = str(scores[1]))
    elif(k_range is None and seed_range is not None):
        for y in range(seed_range[0], seed_range[1]):
            parameters["random_state"] = y
            cluster_method = cluster(method, parameters)
            cluster_method.fit(dataset)
            scores = getScores(dataset, cluster_method.labels_)
            writeData(file_name, method, seed = str(y), slt_score = str(scores[0]), db_score = str(scores[1]))
    else:
        cluster_method = cluster(method, parameters)
        cluster_method.fit(dataset)
        scores = getScores(dataset, cluster_method.labels_)
        writeData(file_name, method, slt_score = str(scores[0]), db_score = str(scores[1]))

In [8]:
def analyzeAll(path, k_range = None, seed_range = None, label_col = None):
    file_name = fileNameFromPath(path) + '_' + 'k(' + str(k_range[0]) + ',' + str(k_range[1]) + ')' + '_' + 'seed(' + str(seed_range[0]) + ',' + str(seed_range[1]) + ').csv'
    have_none = ["AffinityPropagation", "DBSCAN", "OPTICS", "MeanShift"]
    have_k = ["AgglomerativeClustering", "Birch", "FeatureAgglomeration"]
    have_seed_and_k = ["KMeans", "MiniBatchKMeans", "SpectralClustering", "SpectralBiclustering", "SpectralCoclustering"]
    
    for method in have_seed_and_k:
        analyze(path, method, file_name, k_range = k_range, seed_range = seed_range, label_col = "variety")
    for method in have_k:
        analyze(path, method, file_name, k_range = k_range, label_col = "variety", parameters = {})
    for method in have_none:
        analyze(path, method, file_name, label_col = "variety", parameters = {})

In [9]:
def main():
    analyzeAll("iris.csv", k_range = (2, 5), seed_range = (1, 5), label_col = "variety")

In [10]:
main()

ValueError: Found input variables with inconsistent numbers of samples: [150, 4]