In [None]:
# Echo the module docstring; when run as a notebook cell this prints the
# interpreter's auto-generated module description.
print(__doc__)

# The Breast_A ("ba") and Breast_B ("bb") data sets used below come from
# http://portals.broadinstitute.org/cgi-bin/cancer/datasets.cgi

import time

import numpy as np
import matplotlib.pyplot as plt

from sklearn import cluster, datasets
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import hdbscan as hd
from Analysis.Cluster.Clusterer import IMWKRescaled

import warnings
# Blanket filter: every warning raised after this point is suppressed.
# NOTE(review): this also hides deprecation notices from NumPy/scikit-learn,
# so API removals only surface as hard errors -- consider narrowing it.
warnings.filterwarnings('ignore')

# load data
def _read_expression_matrix(gct_path):
    """Read a .gct table and return its numeric part, transposed.

    The first three lines of the file are skipped and the two leading
    columns are dropped (presumably the name/description columns of the
    GCT layout -- TODO confirm). The remaining table is transposed so that
    each original column becomes a row.
    """
    table = np.genfromtxt(gct_path, skip_header=3)
    numeric = np.delete(table, [0, 1], axis=1)
    return numeric.T


ba = _read_expression_matrix('/Volumes/RyanMData-1/GenomicClustering/Breast_A.gct')
bb = _read_expression_matrix('/Volumes/RyanMData-1/GenomicClustering/Breast_B.gct')

# Reference labels, listed in the same order as the data sets; the first two
# lines of each .cls file are skipped.
labels = [
    np.genfromtxt(cls_path, skip_header=2)
    for cls_path in (
        '/Volumes/RyanMData-1/GenomicClustering/Breast_A.cls',
        '/Volumes/RyanMData-1/GenomicClustering/Breast_B.cls',
    )
]

# NOTE: rebinding `datasets` hides the `sklearn.datasets` module imported at
# the top of the file; the name is kept because the main loop iterates it.
datasets = [ba, bb]

# Colour lookup indexed by cluster label: single-letter matplotlib colour
# codes, repeated so that large label values still map to a colour.
base_palette = np.array(list('bgrcmykbgrcmykbgrcmykbgrcmyk'))
colors = np.hstack([base_palette for _ in range(20)])

# Panel titles for the scikit-learn estimators; paired positionally with the
# estimator list built inside the main loop.
clustering_names = [
    'MiniBatchKMeans',
    'AffinityPropagation',
    'MeanShift',
    'SpectralClustering',
    'Ward',
    'AgglomerativeClustering',
    'DBSCAN',
    'Birch',
]

# Figure width scales with the number of named estimators.
figure_width = 2 * len(clustering_names) + 3
plt.figure(figsize=(figure_width, 9.5))
plt.subplots_adjust(
    left=.02,
    right=.98,
    bottom=.001,
    top=.96,
    wspace=.05,
    hspace=.01,
)

# 1-based index of the next subplot cell to draw into.
plot_num = 1

def _draw_panel(X, y_pred, score, algorithm, title, n_rows, n_cols, index):
    """Draw one clustering result into cell `index` of the subplot grid.

    Parameters
    ----------
    X : 2-D array
        Scaled samples; only the first two columns are plotted.
    y_pred : integer array
        Cluster label per row of `X`, used to index the module-level
        `colors` array (negative labels index from its end).
    score : float
        Value printed in the lower-right corner of the panel.
    algorithm : object
        Fitted estimator; its centres are drawn when it exposes
        `cluster_centers_`.
    title : str or None
        Heading drawn above the panel, or None for no heading.
    n_rows, n_cols, index : int
        Subplot grid shape and 1-based cell index.
    """
    plt.subplot(n_rows, n_cols, index)
    if title is not None:
        plt.title(title, size=18)
    plt.scatter(X[:, 0], X[:, 1], color=colors[y_pred].tolist(), s=10)

    if hasattr(algorithm, 'cluster_centers_'):
        centers = algorithm.cluster_centers_
        center_colors = colors[:len(centers)]
        plt.scatter(centers[:, 0], centers[:, 1], s=100, c=center_colors)
    plt.xlim(-2, 2)
    plt.ylim(-2, 2)
    plt.xticks(())
    plt.yticks(())
    plt.text(.99, .01, ('%.2f' % (score)).lstrip('0'),
             transform=plt.gca().transAxes, size=15,
             horizontalalignment='right')


# Grid layout: one row per data set, one column per named scikit-learn
# estimator plus the two extra methods (HDBSCAN and Amorim-Hennig). Sizing the
# grid from the real panel count keeps each method in the same column on
# every row, so the titles drawn on the first row line up with the rows below.
n_rows = len(datasets)
n_cols = len(clustering_names) + 2

for i_dataset, dataset in enumerate(datasets):
    X = dataset
    y = labels[i_dataset]
    # Titles are only drawn on the first row of panels.
    show_titles = i_dataset == 0
    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(X)

    # estimate bandwidth for mean shift
    bandwidth = cluster.estimate_bandwidth(X,)

    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)

    # create clustering estimators
    ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    two_means = cluster.MiniBatchKMeans(n_clusters=3)
    ward = cluster.AgglomerativeClustering(n_clusters=3, linkage='ward',
                                           connectivity=connectivity)
    spectral = cluster.SpectralClustering(n_clusters=3,
                                          eigen_solver='arpack',
                                          affinity="nearest_neighbors")
    dbscan = cluster.DBSCAN(eps=.2)
    affinity_propagation = cluster.AffinityPropagation(damping=.9,
                                                       preference=-200)

    # NOTE(review): newer scikit-learn releases rename `affinity` to `metric`
    # for AgglomerativeClustering; left unchanged to stay compatible with the
    # version this script was written against -- confirm before upgrading.
    average_linkage = cluster.AgglomerativeClustering(
        linkage="average", affinity="cityblock", n_clusters=3,
        connectivity=connectivity)

    birch = cluster.Birch(n_clusters=3)
    # Order must match `clustering_names`, which supplies the panel titles.
    clustering_algorithms = [
        two_means, affinity_propagation, ms, spectral, ward, average_linkage,
        dbscan, birch]

    for name, algorithm in zip(clustering_names, clustering_algorithms):
        # predict cluster memberships
        algorithm.fit(X)
        if hasattr(algorithm, 'labels_'):
            # Builtin `int` replaces the removed `np.int` alias (same dtype).
            y_pred = algorithm.labels_.astype(int)
        else:
            y_pred = algorithm.predict(X)

        # Documented argument order is (labels_true, labels_pred).
        score = metrics.adjusted_rand_score(y, y_pred)

        _draw_panel(X, y_pred, score, algorithm,
                    name if show_titles else None,
                    n_rows, n_cols, plot_num)
        plot_num += 1

    # Also do HDBSCAN and A-H
    # HDBSCAN
    algorithm = hd.HDBSCAN(min_cluster_size=2)
    y_pred = algorithm.fit_predict(X)
    score = metrics.adjusted_rand_score(y, y_pred)
    _draw_panel(X, y_pred, score, algorithm,
                'HDBSCAN' if show_titles else None,
                n_rows, n_cols, plot_num)
    plot_num += 1

    # A-H
    # NOTE(review): IMWKRescaled is a project class; this assumes `fit()`
    # returns one cluster label per row of X -- confirm against its source.
    algorithm = IMWKRescaled(X)
    y_pred = algorithm.fit()
    score = metrics.adjusted_rand_score(y, y_pred)
    _draw_panel(X, y_pred, score, algorithm,
                'Amorim-Hennig' if show_titles else None,
                n_rows, n_cols, plot_num)
    plot_num += 1

plt.show()

Automatically created module for IPython interactive environment
