In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import *
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from ntpath import split, basename
from os import stat

In [2]:
def readData(path, label_col = None):
    dataframe = pd.read_csv(path)
    if(label_col is None):
        X = dataframe.to_numpy()
    else:
        X = np.array(dataframe.drop(label_col, axis = 1))
    return X

In [3]:
def cluster(method, parameters = {}):
    methods = {"KMeans" : KMeans,
               "AffinityPropagation" : AffinityPropagation,
               "AgglormerativeClustering" : AgglomerativeClustering,
               "Birch" : Birch,
               "DBSCAN" : DBSCAN,
               "FeatureAgglomeration" : FeatureAgglomeration,
               "MiniBatchKMeans" : MiniBatchKMeans,
               "MeanShift" : MeanShift,
               "OPTICS" : OPTICS,
               "SpectralClustering" : SpectralClustering,
               "SpectralBiclustering" : SpectralBiclustering,
               "SpectralCoclustering" : SpectralCoclustering,}
               
    return methods[method](**parameters)

In [4]:
def getScores(X, labels):
    slt_score = silhouette_score(X, labels)
    db_score = davies_bouldin_score(X, labels)
    return (slt_score, db_score)

In [5]:
def fileNameFromPath(path):
    head, tail = split(path)
    return tail or basename(head)

In [6]:
def writeData(file_name, slt_score, db_score, k = '-', seed = '-'): 
    file = open(file_name, 'a')
    if(stat(file_name).st_size == 0):
        file.write("n_cluster,seed,silhoutte,db\n")
    
    file.write(k + ',' + seed + ',' + slt_score + ',' + db_score + '\n')
    file.close()

In [7]:
def analyze(path, method, parameters = {}, k_range = None, seed_range = None, label_col = None, ):
    dataset = readData(path, label_col)
    if(k_range is not None and seed_range is not None):
        file_name = fileNameFromPath(path) + '_' + method + '_' + 'k(' + str(k_range[0]) + ',' + str(k_range[1]) + ')' + '_' + 'seed(' + str(seed_range[0]) + ',' + str(seed_range[1]) + ').csv'
        for x in range(k_range[0], k_range[1]):
            parameters["n_clusters"] = x
            for y in range(seed_range[0], seed_range[1]):
                parameters["random_state"] = y
                cluster_method = cluster(method, parameters)
                cluster_method.fit(dataset)
                scores = getScores(dataset, cluster_method.labels_)
                writeData(file_name, k = str(x), seed = str(y), slt_score = str(scores[0]), db_score = str(scores[1]))
    elif(k_range is not None and seed_range is None):
        file_name = fileNameFromPath(path) + '_' + method + '_' + 'k(' + str(k_range[0]) + ',' + str(k_range[1]) + ').csv'
        for x in range(k_range[0], k_range[1]):
            parameters["n_clusters"] = x
            cluster_method = cluster(method, parameters)
            cluster_method.fit(dataset)
            scores = getScores(dataset, cluster_method.labels_)
            writeData(file_name, k = str(x), slt_score = str(scores[0]), db_score = str(scores[1]))
    elif(k_range is None and seed_range is not None):
        file_name = fileNameFromPath(path) + '_' + method + '_' + 'seed(' + str(seed_range[0]) + ',' + str(seed_range[1]) + ').csv'
        for y in range(seed_range[0], seed_range[1]):
            parameters["random_state"] = y
            cluster_method = cluster(method, parameters)
            cluster_method.fit(dataset)
            scores = getScores(dataset, cluster_method.labels_)
            writeData(file_name, seed = str(y), slt_score = str(scores[0]), db_score = str(scores[1]))
    else:
        file_name = fileNameFromPath(path) + '_' + method + '.csv'
        cluster_method = cluster(method, parameters)
        cluster_method.fit(dataset)
        scores = getScores(dataset, cluster_method.labels_)
        writeData(file_name, slt_score = str(scores[0]), db_score = str(scores[1]))

In [14]:
def main():
    analyze("iris.csv", "KMeans", k_range = (2, 21), seed_range = (1, 21), label_col = "variety")

In [15]:
main()

TypeError: __init__() got an unexpected keyword argument 'n_clusters'