In [1]:
import csv
import numpy as np
from sklearn import cluster, mixture
from sklearn.neighbors import kneighbors_graph, KNeighborsClassifier
import warnings

In [2]:
col_names = ["Class Name", "handicapped-infants", "water-project-cost-sharing", "adoption-of-the-budget-resolution", "physician-fee-freeze", "el-salvador-aid", "religious-groups-in-schools", "anti-satellite-test-ban", "aid-to-nicaraguan-contras", "mx-missile", "immigration", "synfuels-corporation-cutback", "education-spending", "superfund-right-to-sue", "crime", "duty-free-exports", "export-administration-act-south-africa"]

In [3]:
csvfile = open('data/CVRD/house-votes-84.data')
csvreader = csv.reader(csvfile, delimiter=',', quotechar='|')

def transform_yes_no(raw):
    if raw == 'y':
        return 2
    elif raw == 'n':
        return 0
    else:
        return 1

X = []
Y = []
for item in csvreader:
    if item[0] == 'republican':
        Y.append(0)
    elif item[0] == 'democrat':
        Y.append(1)
        
    entry = []
    for i in range(1, len(item)):
        entry.append(transform_yes_no(item[i]))
        
    X.append(entry)
    
X = np.array(X)
Y = np.array(Y)

In [4]:
params = {'quantile': .3,
                'eps': .3,
                'damping': .9,
                'preference': -200,
                'n_neighbors': 10,
                'n_clusters': 2,
                'min_samples': 20,
                'xi': 0.05,
                'min_cluster_size': 0.1}

# connectivity matrix for structured Ward
connectivity = kneighbors_graph(
    X, n_neighbors=params['n_neighbors'], include_self=False)
# make connectivity symmetric
connectivity = 0.5 * (connectivity + connectivity.T)

In [27]:
two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'], random_state=20)
spectral = cluster.SpectralClustering(
    n_clusters=params['n_clusters'], eigen_solver='arpack',
    affinity="nearest_neighbors", random_state=40)
ward = cluster.AgglomerativeClustering(
    n_clusters=params['n_clusters'], linkage='ward',
    connectivity=connectivity)
average_linkage = cluster.AgglomerativeClustering(
    linkage="average", affinity="cityblock",
    n_clusters=params['n_clusters'], connectivity=connectivity)
birch = cluster.Birch(n_clusters=params['n_clusters'])
gmm = mixture.GaussianMixture(
    n_components=params['n_clusters'], covariance_type='full',
    random_state=10)

clustering_algorithms = {
    'MiniBatchKMeans': two_means,
    'SpectralClustering': spectral,
    'Ward': ward,
    'AgglomerativeClustering': average_linkage,
    'Birch': birch,
    'GaussianMixture': gmm
}

In [26]:
for name, algorithm in clustering_algorithms.items():
    # catch warnings related to kneighbors_graph
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            message="the number of connected components of the " +
            "connectivity matrix is [0-9]{1,2}" +
            " > 1. Completing it to avoid stopping the tree early.",
            category=UserWarning)
        warnings.filterwarnings(
            "ignore",
            message="Graph is not fully connected, spectral embedding" +
            " may not work as expected.",
            category=UserWarning)
        algorithm.fit(X)
        
        if hasattr(algorithm, 'labels_'):
            y_pred = algorithm.labels_.astype(np.int)
        else:
            y_pred = algorithm.predict(X)
        
        print('\n', name)
        print('wrong pred :', np.sum(np.abs(y_pred - Y)), ', total :', len(Y))
        accuracy = 1 - np.sum(np.abs(y_pred - Y))/len(Y)
        print('accuracy', ('%.2f' % (accuracy * 100)).lstrip('0'), '%')
        
        neigh = KNeighborsClassifier(n_neighbors=5)
        neigh.fit(X, y_pred)
        print('KNN retrieval accuracy', ('%.2f' % (neigh.score(X,Y) * 100)).lstrip('0'), '%')


 MiniBatchKMeans
wrong pred : 54 , total : 435
accuracy 87.59 %
KNN retrieval accuracy 87.36 %

 SpectralClustering
wrong pred : 53 , total : 435
accuracy 87.82 %
KNN retrieval accuracy 88.28 %

 Ward
wrong pred : 65 , total : 435
accuracy 85.06 %
KNN retrieval accuracy 85.52 %

 AgglomerativeClustering
wrong pred : 268 , total : 435
accuracy 38.39 %
KNN retrieval accuracy 38.39 %

 Birch
wrong pred : 63 , total : 435
accuracy 85.52 %
KNN retrieval accuracy 85.98 %

 GaussianMixture
wrong pred : 378 , total : 435
accuracy 13.10 %
KNN retrieval accuracy 12.64 %
